teichmann@5844: /* Copyright (C) 2011, 2012, 2013 by Bundesanstalt für Gewässerkunde teichmann@5844: * Software engineering by Intevation GmbH teichmann@5844: * teichmann@5992: * This file is Free Software under the GNU AGPL (>=v3) teichmann@5844: * and comes with ABSOLUTELY NO WARRANTY! Check out the teichmann@5992: * documentation coming with Dive4Elements River for details. teichmann@5844: */ teichmann@5844: teichmann@5829: package org.dive4elements.river.importer.parsers; sascha@1211: sascha@1211: import java.util.ArrayList; sascha@1211: import java.util.HashSet; teichmann@6506: import java.util.Arrays; sascha@1211: sascha@1211: import java.io.File; sascha@1211: import java.io.IOException; sascha@1211: import java.io.LineNumberReader; sascha@1211: import java.io.InputStreamReader; sascha@1211: import java.io.FileInputStream; sascha@1211: sascha@1211: import java.text.NumberFormat; sascha@1211: sascha@1211: import org.apache.log4j.Logger; sascha@1211: teichmann@5829: import org.dive4elements.river.utils.StringUtil; teichmann@5829: import org.dive4elements.river.utils.DateGuesser; sascha@1211: sascha@1211: import java.util.regex.Pattern; sascha@1211: import java.util.regex.Matcher; sascha@1211: sascha@1211: import java.math.BigDecimal; sascha@1211: teichmann@5829: import org.dive4elements.river.importer.ImportWstQRange; teichmann@5829: import org.dive4elements.river.importer.ImportWstColumn; teichmann@5829: import org.dive4elements.river.importer.ImportTimeInterval; teichmann@5829: import org.dive4elements.river.importer.ImportRange; teichmann@5829: import org.dive4elements.river.importer.ImportUnit; teichmann@5829: import org.dive4elements.river.importer.ImportWst; sascha@1211: sascha@1211: public class WstParser sascha@1211: { sascha@1211: private static Logger log = Logger.getLogger(WstParser.class); sascha@1211: sascha@1211: public static final String COLUMN_BEZ_TEXT = "column-bez-text"; sascha@1211: public static final String COLUMN_BEZ_BREITE = "column-bez-breite"; sascha@1211: public static final String COLUMN_QUELLE = "column-quelle"; sascha@1211: public static final String COLUMN_DATUM = "column-datum"; sascha@1211: sascha@1211: public static final BigDecimal UNDEFINED_ZERO = sascha@1211: new BigDecimal(0.0); sascha@1211: public static final BigDecimal MIN_RANGE = sascha@1211: new BigDecimal(-Double.MAX_VALUE); sascha@1211: public static final BigDecimal MAX_RANGE = sascha@1211: new BigDecimal(Double.MAX_VALUE); sascha@1211: sascha@1211: public static final String ENCODING = "ISO-8859-1"; sascha@1211: sascha@1211: public static final Pattern UNIT_COMMENT = sascha@1211: Pattern.compile("\\*\\s*[kK][mM]\\s+(.+)"); sascha@1211: sascha@1211: public static final Pattern UNIT = sascha@1211: Pattern.compile("[^\\[]*\\[([^]]+)\\].*"); sascha@1211: teichmann@5541: public static final double INTERVAL_GAP = 0.00001d; sascha@1211: sascha@1211: protected ImportWst wst; sascha@1211: sascha@1211: protected ImportRange lastRange; sascha@1211: sascha@1211: public WstParser() { sascha@1211: } sascha@1211: teichmann@6337: public WstParser(ImportWst wst) { teichmann@6337: this.wst = wst; teichmann@6337: } teichmann@6337: sascha@1211: public ImportWst getWst() { sascha@1211: return wst; sascha@1211: } sascha@1211: sascha@1211: public void setWst(ImportWst wst) { sascha@1211: this.wst = wst; sascha@1211: } sascha@1211: teichmann@6328: public static ImportTimeInterval guessDate(String string) { sascha@2357: try { sascha@2357: return new ImportTimeInterval( sascha@2357: DateGuesser.guessDate(string)); sascha@2357: } sascha@2357: catch (IllegalArgumentException iae) { tom@6028: log.warn("WST: String '" + string + tom@6028: "' could not be interpreted as valid timestamp"); sascha@2357: } sascha@2357: return null; sascha@2357: } sascha@2357: sascha@1211: public void parse(File file) throws IOException { sascha@1211: sascha@1211: log.info("Parsing WST file '" + file + "'"); sascha@1211: teichmann@6370: if (wst == null) { teichmann@6370: wst = new ImportWst(file.getName()); teichmann@6370: } teichmann@6370: else { teichmann@6370: wst.setDescription(file.getName()); teichmann@6370: } sascha@1211: teichmann@6337: LineNumberReader in = teichmann@6337: new LineNumberReader( teichmann@6337: new InputStreamReader( teichmann@6337: new FileInputStream(file), ENCODING)); sascha@1211: try { sascha@1211: String input; sascha@1211: boolean first = true; sascha@1211: int columnCount = 0; sascha@1211: sascha@1211: String [] lsBezeichner = null; sascha@1211: String [] langBezeichner = null; sascha@1211: int [] colNaWidths = null; teichmann@6504: String [] quellen = null; sascha@1211: String [] daten = null; sascha@1211: sascha@1211: BigDecimal [] aktAbfluesse = null; sascha@1211: BigDecimal [] firstAbfluesse = null; sascha@1211: teichmann@5544: BigDecimal minKm = MAX_RANGE; teichmann@5544: BigDecimal maxKm = MIN_RANGE; teichmann@5544: BigDecimal kmHist1 = null; teichmann@5544: BigDecimal kmHist2 = null; sascha@1211: sascha@1211: boolean columnHeaderChecked = false; sascha@1211: sascha@1211: String einheit = "Wasserstand [NN + m]"; sascha@1211: sascha@1211: HashSet kms = new HashSet(); sascha@1211: sascha@1211: while ((input = in.readLine()) != null) { sascha@1211: String line = input; sascha@1211: if (first) { // fetch number of columns sascha@1211: if ((line = line.trim()).length() == 0) { sascha@1211: continue; sascha@1211: } sascha@1211: try { sascha@1211: columnCount = Integer.parseInt(line); sascha@1211: if (columnCount <= 0) { sascha@1211: throw new NumberFormatException( tom@6028: "number of columns <= 0"); sascha@1211: } sascha@1211: log.debug("Number of columns: " + columnCount); sascha@1211: wst.setNumberColumns(columnCount); sascha@1211: lsBezeichner = new String[columnCount]; sascha@1211: } sascha@1211: catch (NumberFormatException nfe) { sascha@3662: log.warn("WST: invalid number.", nfe); sascha@1211: continue; sascha@1211: } sascha@1211: first = false; sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: line = line.replace(',', '.'); sascha@1211: sascha@1211: if (line.startsWith("*\u001f")) { sascha@1211: BigDecimal [] data = sascha@1211: parseLineAsDouble(line, columnCount, false, true); sascha@1211: sascha@1211: if (aktAbfluesse != null) { teichmann@5544: if (kmHist1 != null && kmHist2 != null teichmann@5544: && kmHist1.compareTo(kmHist2) < 0) { teichmann@5544: BigDecimal t = minKm; minKm = maxKm; maxKm = t; teichmann@5544: } sascha@1211: addInterval(minKm, maxKm, aktAbfluesse); sascha@1211: minKm = MAX_RANGE; sascha@1211: maxKm = MIN_RANGE; sascha@1211: } sascha@1211: sascha@1211: aktAbfluesse = new BigDecimal[columnCount]; sascha@1211: log.debug("new q range: " + columnCount); sascha@1211: for (int i = 0; i < Math.min(columnCount, data.length); ++i) { sascha@1211: if (data[i] != null) { sascha@1211: log.debug(" column: " + data[i]); sascha@1211: aktAbfluesse[i] = data[i]; sascha@1211: } sascha@1211: } sascha@1211: sascha@1211: if (firstAbfluesse == null) { sascha@1211: firstAbfluesse = (BigDecimal [])aktAbfluesse.clone(); sascha@1211: } sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: if (line.startsWith("*!")) { sascha@1211: String spezial = line.substring(2).trim(); sascha@1211: sascha@1211: if (spezial.length() == 0) { sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: if (spezial.startsWith(COLUMN_BEZ_TEXT)) { sascha@1211: spezial = spezial.substring(COLUMN_BEZ_TEXT.length()).trim(); sascha@1211: if (spezial.length() == 0) { sascha@1211: continue; sascha@1211: } sascha@1211: langBezeichner = StringUtil.splitQuoted(spezial, '"'); sascha@1211: } sascha@1211: else if (spezial.startsWith(COLUMN_BEZ_BREITE)) { sascha@1211: spezial = spezial.substring(COLUMN_BEZ_BREITE.length()).trim(); sascha@1211: sascha@1211: if (spezial.length() == 0) { sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: String[] split = spezial.split("\\s+"); sascha@1211: sascha@1211: colNaWidths = new int[split.length]; sascha@1211: for (int i=0; i < split.length; i++) { sascha@1211: colNaWidths[i] = Integer.parseInt(split[i]); sascha@1211: } sascha@1211: } sascha@1211: else if (spezial.startsWith(COLUMN_QUELLE)) { teichmann@6506: spezial = spezial.substring(COLUMN_QUELLE.length()).trim(); sascha@1211: if (spezial.length() == 0) { sascha@1211: continue; sascha@1211: } teichmann@6504: quellen = StringUtil.splitQuoted(spezial, '"'); teichmann@6506: log.debug("sources: " + Arrays.toString(quellen)); sascha@1211: } sascha@1211: else if (spezial.startsWith(COLUMN_DATUM)) { sascha@1211: spezial = spezial.substring(COLUMN_DATUM.length()).trim(); sascha@1211: if (spezial.length() == 0) { sascha@1211: continue; sascha@1211: } sascha@1211: daten = StringUtil.splitQuoted(spezial, '"'); sascha@1211: } sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: if (line.length() < 11) { sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: if (line.startsWith("*")) { sascha@1211: Matcher m = UNIT_COMMENT.matcher(line); sascha@1211: if (m.matches()) { sascha@1211: log.debug("unit comment found"); sascha@1211: // XXX: This hack is needed because desktop sascha@1211: // FLYS is broken figuring out the unit sascha@1211: String [] units = m.group(1).split("\\s{2,}"); sascha@1211: m = UNIT.matcher(units[0]); sascha@1211: einheit = m.matches() ? m.group(1) : units[0]; sascha@1211: log.debug("unit: " + einheit); sascha@1211: } sascha@1211: continue; sascha@1211: } sascha@1211: sascha@1211: if (firstAbfluesse != null) { sascha@1211: if (!columnHeaderChecked) { sascha@1211: int unknownCount = 0; sascha@1211: HashSet uniqueColumnNames = sascha@1211: new HashSet(); tom@6029: if (langBezeichner != null) { tom@6029: // use column name from '*!column-bez-text'-line tom@6029: lsBezeichner = StringUtil.fitArray( tom@6029: langBezeichner, lsBezeichner); tom@6029: } sascha@1211: for (int i = 0; i < lsBezeichner.length; ++i) { sascha@1211: if (lsBezeichner[i] == null sascha@1211: || lsBezeichner[i].length() == 0) { tom@6029: // generate alternative column names sascha@1211: double q = firstAbfluesse[i].doubleValue(); sascha@1211: if (q < 0.001) { sascha@1211: lsBezeichner[i] = sascha@1211: ""; sascha@1211: ++unknownCount; sascha@1211: } sascha@1211: else { sascha@1211: lsBezeichner[i] = "Q="+format(q); sascha@1211: } sascha@1211: } sascha@1211: String candidate = lsBezeichner[i]; sascha@1211: int collision = 1; sascha@1211: while (!uniqueColumnNames.add(candidate)) { sascha@1211: candidate = lsBezeichner[i] + sascha@1211: " (" + collision + ")"; sascha@1211: ++collision; sascha@1211: } sascha@2357: ImportWstColumn iwc = wst.getColumn(i); sascha@2357: iwc.setName(candidate); teichmann@6504: if (quellen != null && i < quellen.length) { teichmann@6504: iwc.setSource(quellen[i]); teichmann@6504: } teichmann@5558: String potentialDate = daten != null && i < daten.length teichmann@5558: ? daten[i] teichmann@5558: : candidate; teichmann@5558: iwc.setTimeInterval(guessDate(potentialDate)); sascha@1211: } sascha@1211: columnHeaderChecked = true; sascha@1211: } sascha@1211: sascha@1211: BigDecimal [] data = sascha@1211: parseLineAsDouble(line, columnCount, true, false); sascha@1211: sascha@1211: BigDecimal kaem = data[0]; sascha@1211: sascha@1211: if (!kms.add(kaem)) { sascha@1211: log.warn( sascha@3662: "WST: km " + kaem + sascha@1211: " (line " + in.getLineNumber() + sascha@1211: ") found more than once. -> ignored"); sascha@1211: continue; sascha@1211: } sascha@1211: teichmann@5544: kmHist2 = kmHist1; teichmann@5544: kmHist1 = kaem; tom@5543: sascha@1211: if (kaem.compareTo(minKm) < 0) { sascha@1211: minKm = kaem; sascha@1211: } sascha@1211: if (kaem.compareTo(maxKm) > 0) { sascha@1211: maxKm = kaem; sascha@1211: } sascha@1211: sascha@1211: // extract values sascha@1211: for (int i = 0; i < columnCount; ++i) { sascha@1211: addValue(kaem, data[i+1], i); sascha@1211: } sascha@1211: sascha@1211: } sascha@1211: else { // firstAbfluesse == null sascha@1211: if (langBezeichner != null) { tom@6029: // nothing to do sascha@1211: } sascha@1211: else if (colNaWidths != null) { sascha@1211: for (int j = 0, i = 0, N = input.length(); sascha@1211: j < colNaWidths.length && i < N; sascha@1211: i += colNaWidths[j++] sascha@1211: ) { sascha@1211: lsBezeichner[j] = input.substring( sascha@1211: i, i+colNaWidths[j]).trim(); sascha@1211: } sascha@1211: } sascha@1211: else { sascha@1211: // first column begins at position 8 in line sascha@1211: for (int i = 8, col = 0; i < input.length(); i += 9) { sascha@1211: if ((i + 9) > input.length()) { sascha@1211: i = input.length() - 10; sascha@1211: } sascha@1211: // one column header is 9 chars wide sascha@1211: lsBezeichner[col++] = sascha@1211: input.substring(i, i + 9).trim(); sascha@1211: sascha@1211: if (col == lsBezeichner.length) { sascha@1211: break; sascha@1211: } sascha@1211: } sascha@1211: } sascha@1211: } sascha@1211: teichmann@5542: } // for all lines in WST file ingo@2346: ingo@2346: wst.setUnit(new ImportUnit(einheit)); teichmann@5544: teichmann@5544: if (kmHist1 != null && kmHist2 != null teichmann@5544: && kmHist1.compareTo(kmHist2) < 0) { teichmann@5544: BigDecimal t = minKm; minKm = maxKm; maxKm = t; teichmann@5544: } sascha@1211: addInterval(minKm, maxKm, aktAbfluesse); teichmann@5542: teichmann@5542: fixRangesOrder(); sascha@1211: } sascha@1211: finally { teichmann@6337: in.close(); sascha@1211: } sascha@1211: } sascha@1211: teichmann@5542: protected void fixRangesOrder() { teichmann@5542: wst.fixRangesOrder(); teichmann@5542: } teichmann@5542: sascha@1211: protected void addValue(BigDecimal km, BigDecimal w, int index) { sascha@1211: if (w != null) { sascha@1211: ImportWstColumn column = wst.getColumn(index); sascha@1211: column.addColumnValue(km, w); sascha@1211: } sascha@1211: } sascha@1211: sascha@1211: private static final NumberFormat NF = getNumberFormat(); sascha@1211: sascha@1211: private static final NumberFormat getNumberFormat() { sascha@1211: NumberFormat nf = NumberFormat.getInstance(); sascha@1211: nf.setMinimumFractionDigits(2); sascha@1211: nf.setMaximumFractionDigits(2); sascha@1211: return nf; sascha@1211: } sascha@1211: sascha@1211: protected static String format(double value) { sascha@1211: return NF.format(value); sascha@1211: } sascha@1211: sascha@1211: protected void addInterval( sascha@1211: BigDecimal from, sascha@1211: BigDecimal to, sascha@1211: BigDecimal [] values sascha@1211: ) { sascha@1211: log.debug("addInterval: " + from + " " + to); sascha@1211: felix@5795: if (values == null || from == MAX_RANGE || from == MIN_RANGE) { sascha@1211: return; sascha@1211: } sascha@1211: sascha@1211: ImportRange range = new ImportRange(from, to); sascha@1211: sascha@1211: // little workaround to make the q ranges tightly fit. sascha@1211: // Leave a very small gap to ensure that the range queries sascha@1211: // still work. sascha@1211: sascha@1211: if (lastRange != null) { teichmann@5541: double a1 = lastRange.getA().doubleValue(); teichmann@5541: double b1 = lastRange.getB().doubleValue(); teichmann@5541: double a2 = range.getA().doubleValue(); sascha@1211: teichmann@5541: if (a1 < b1) { teichmann@5541: lastRange.setB(new BigDecimal(a2 - INTERVAL_GAP)); sascha@1211: } teichmann@5541: else { // a1 >= b1 teichmann@5541: lastRange.setB(new BigDecimal(a2 + INTERVAL_GAP)); sascha@1211: } sascha@1211: } sascha@1211: sascha@1211: for (int i = 0; i < values.length; ++i) { sascha@1211: ImportWstColumn column = wst.getColumn(i); sascha@1211: ImportWstQRange wstQRange = new ImportWstQRange(range, values[i]); sascha@1211: column.addColumnQRange(wstQRange); sascha@1211: } sascha@1211: sascha@1211: lastRange = range; sascha@1211: } sascha@1211: sascha@1211: private static final BigDecimal [] parseLineAsDouble( sascha@1211: String line, sascha@1211: int count, sascha@1211: boolean bStation, sascha@1211: boolean bParseEmptyAsZero sascha@1211: ) { sascha@1211: String [] tokens = parseLine(line, count, bStation); sascha@1211: sascha@1211: BigDecimal [] doubles = new BigDecimal[tokens.length]; sascha@1211: sascha@1211: for (int i = 0; i < doubles.length; ++i) { sascha@1211: String token = tokens[i].trim(); sascha@1211: if (token.length() != 0) { sascha@1211: doubles[i] = new BigDecimal(token); sascha@1211: } sascha@1211: else if (bParseEmptyAsZero) { sascha@1211: doubles[i] = UNDEFINED_ZERO; sascha@1211: } sascha@1211: } sascha@1211: sascha@1211: return doubles; sascha@1211: } sascha@1211: sascha@1211: private static String [] parseLine( sascha@1211: String line, sascha@1211: int tokenCount, sascha@1211: boolean bParseStation sascha@1211: ) { sascha@1211: ArrayList strings = new ArrayList(); sascha@1211: sascha@1211: if (bParseStation) { sascha@1211: if (line.length() < 8) { sascha@1211: throw new IllegalArgumentException("station too short"); sascha@1211: } sascha@1211: strings.add(line.substring(0, 8)); sascha@1211: } sascha@1211: sascha@1211: int pos = 9; sascha@1211: for (int i = 0; i < tokenCount; ++i) { sascha@1211: if (line.length() >= pos + 8) { sascha@1211: strings.add(line.substring(pos, pos + 8)); sascha@1211: } sascha@1211: else { sascha@1211: strings.add(""); sascha@1211: } sascha@1211: pos += 9; sascha@1211: } sascha@1211: sascha@1211: return strings.toArray(new String[strings.size()]); sascha@1211: } sascha@1211: } sascha@1211: // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :