Mercurial > dive4elements > river
diff backend/src/main/java/org/dive4elements/river/importer/parsers/WstParser.java @ 5838:5aa05a7a34b7
Rename modules to more fitting names.
author | Sascha L. Teichmann <teichmann@intevation.de> |
---|---|
date | Thu, 25 Apr 2013 15:23:37 +0200 |
parents | flys-backend/src/main/java/org/dive4elements/river/importer/parsers/WstParser.java@18619c1e7c2a |
children | 4dd33b86dc61 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/backend/src/main/java/org/dive4elements/river/importer/parsers/WstParser.java Thu Apr 25 15:23:37 2013 +0200 @@ -0,0 +1,464 @@ +package org.dive4elements.river.importer.parsers; + +import java.util.ArrayList; +import java.util.HashSet; + +import java.io.File; +import java.io.IOException; +import java.io.LineNumberReader; +import java.io.InputStreamReader; +import java.io.FileInputStream; + +import java.text.NumberFormat; + +import org.apache.log4j.Logger; + +import org.dive4elements.river.utils.StringUtil; +import org.dive4elements.river.utils.DateGuesser; + +import java.util.regex.Pattern; +import java.util.regex.Matcher; + +import java.math.BigDecimal; + +import org.dive4elements.river.importer.ImportWstQRange; +import org.dive4elements.river.importer.ImportWstColumn; +import org.dive4elements.river.importer.ImportTimeInterval; +import org.dive4elements.river.importer.ImportRange; +import org.dive4elements.river.importer.ImportUnit; +import org.dive4elements.river.importer.ImportWst; + +public class WstParser +{ + private static Logger log = Logger.getLogger(WstParser.class); + + public static final String COLUMN_BEZ_TEXT = "column-bez-text"; + public static final String COLUMN_BEZ_BREITE = "column-bez-breite"; + public static final String COLUMN_QUELLE = "column-quelle"; + public static final String COLUMN_DATUM = "column-datum"; + + public static final BigDecimal UNDEFINED_ZERO = + new BigDecimal(0.0); + public static final BigDecimal MIN_RANGE = + new BigDecimal(-Double.MAX_VALUE); + public static final BigDecimal MAX_RANGE = + new BigDecimal(Double.MAX_VALUE); + + public static final String ENCODING = "ISO-8859-1"; + + public static final Pattern UNIT_COMMENT = + Pattern.compile("\\*\\s*[kK][mM]\\s+(.+)"); + + public static final Pattern UNIT = + Pattern.compile("[^\\[]*\\[([^]]+)\\].*"); + + public static final double INTERVAL_GAP = 0.00001d; + + protected ImportWst wst; + + protected ImportRange lastRange; + + public WstParser() { + } + + public ImportWst getWst() { + return wst; + } + + public void setWst(ImportWst wst) { + this.wst = wst; + } + + public ImportTimeInterval guessDate(String string) { + try { + return new ImportTimeInterval( + DateGuesser.guessDate(string)); + } + catch (IllegalArgumentException iae) { + } + return null; + } + + public void parse(File file) throws IOException { + + log.info("Parsing WST file '" + file + "'"); + + wst = new ImportWst(file.getName()); + + LineNumberReader in = null; + try { + in = + new LineNumberReader( + new InputStreamReader( + new FileInputStream(file), ENCODING)); + + String input; + boolean first = true; + int columnCount = 0; + + String [] lsBezeichner = null; + String [] langBezeichner = null; + int [] colNaWidths = null; + String [] quellen = null; + String [] daten = null; + + BigDecimal [] aktAbfluesse = null; + BigDecimal [] firstAbfluesse = null; + + BigDecimal minKm = MAX_RANGE; + BigDecimal maxKm = MIN_RANGE; + BigDecimal kmHist1 = null; + BigDecimal kmHist2 = null; + + boolean columnHeaderChecked = false; + + String einheit = "Wasserstand [NN + m]"; + + HashSet<BigDecimal> kms = new HashSet<BigDecimal>(); + + while ((input = in.readLine()) != null) { + String line = input; + if (first) { // fetch number of columns + if ((line = line.trim()).length() == 0) { + continue; + } + try { + columnCount = Integer.parseInt(line); + if (columnCount <= 0) { + throw new NumberFormatException( + "number columns <= 0"); + } + log.debug("Number of columns: " + columnCount); + wst.setNumberColumns(columnCount); + lsBezeichner = new String[columnCount]; + } + catch (NumberFormatException nfe) { + log.warn("WST: invalid number.", nfe); + continue; + } + first = false; + continue; + } + + line = line.replace(',', '.'); + + if (line.startsWith("*\u001f")) { + BigDecimal [] data = + parseLineAsDouble(line, columnCount, false, true); + + if (aktAbfluesse != null) { + if (kmHist1 != null && kmHist2 != null + && kmHist1.compareTo(kmHist2) < 0) { + BigDecimal t = minKm; minKm = maxKm; maxKm = t; + } + addInterval(minKm, maxKm, aktAbfluesse); + minKm = MAX_RANGE; + maxKm = MIN_RANGE; + } + + aktAbfluesse = new BigDecimal[columnCount]; + log.debug("new q range: " + columnCount); + for (int i = 0; i < Math.min(columnCount, data.length); ++i) { + if (data[i] != null) { + log.debug(" column: " + data[i]); + aktAbfluesse[i] = data[i]; + } + } + + if (firstAbfluesse == null) { + firstAbfluesse = (BigDecimal [])aktAbfluesse.clone(); + } + continue; + } + + if (line.startsWith("*!")) { + String spezial = line.substring(2).trim(); + + if (spezial.length() == 0) { + continue; + } + + if (spezial.startsWith(COLUMN_BEZ_TEXT)) { + spezial = spezial.substring(COLUMN_BEZ_TEXT.length()).trim(); + if (spezial.length() == 0) { + continue; + } + langBezeichner = StringUtil.splitQuoted(spezial, '"'); + } + else if (spezial.startsWith(COLUMN_BEZ_BREITE)) { + spezial = spezial.substring(COLUMN_BEZ_BREITE.length()).trim(); + + if (spezial.length() == 0) { + continue; + } + + String[] split = spezial.split("\\s+"); + + colNaWidths = new int[split.length]; + for (int i=0; i < split.length; i++) { + colNaWidths[i] = Integer.parseInt(split[i]); + } + } + else if (spezial.startsWith(COLUMN_QUELLE)) { + if (spezial.length() == 0) { + continue; + } + quellen = StringUtil.splitQuoted(spezial, '"'); + } + else if (spezial.startsWith(COLUMN_DATUM)) { + spezial = spezial.substring(COLUMN_DATUM.length()).trim(); + if (spezial.length() == 0) { + continue; + } + daten = StringUtil.splitQuoted(spezial, '"'); + } + continue; + } + + if (line.length() < 11) { + continue; + } + + if (line.startsWith("*")) { + Matcher m = UNIT_COMMENT.matcher(line); + if (m.matches()) { + log.debug("unit comment found"); + // XXX: This hack is needed because desktop + // FLYS is broken figuring out the unit + String [] units = m.group(1).split("\\s{2,}"); + m = UNIT.matcher(units[0]); + einheit = m.matches() ? m.group(1) : units[0]; + log.debug("unit: " + einheit); + } + continue; + } + + if (firstAbfluesse != null) { + if (!columnHeaderChecked) { + int unknownCount = 0; + HashSet<String> uniqueColumnNames = + new HashSet<String>(); + for (int i = 0; i < lsBezeichner.length; ++i) { + if (lsBezeichner[i] == null + || lsBezeichner[i].length() == 0) { + double q = firstAbfluesse[i].doubleValue(); + if (q < 0.001) { + lsBezeichner[i] = + "<unbekannt #" + unknownCount + ">"; + ++unknownCount; + } + else { + lsBezeichner[i] = "Q="+format(q); + } + } + String candidate = lsBezeichner[i]; + int collision = 1; + while (!uniqueColumnNames.add(candidate)) { + candidate = lsBezeichner[i] + + " (" + collision + ")"; + ++collision; + } + ImportWstColumn iwc = wst.getColumn(i); + iwc.setName(candidate); + String potentialDate = daten != null && i < daten.length + ? daten[i] + : candidate; + iwc.setTimeInterval(guessDate(potentialDate)); + } + columnHeaderChecked = true; + } + + BigDecimal [] data = + parseLineAsDouble(line, columnCount, true, false); + + BigDecimal kaem = data[0]; + + if (!kms.add(kaem)) { + log.warn( + "WST: km " + kaem + + " (line " + in.getLineNumber() + + ") found more than once. -> ignored"); + continue; + } + + kmHist2 = kmHist1; + kmHist1 = kaem; + + if (kaem.compareTo(minKm) < 0) { + minKm = kaem; + } + if (kaem.compareTo(maxKm) > 0) { + maxKm = kaem; + } + + // extract values + for (int i = 0; i < columnCount; ++i) { + addValue(kaem, data[i+1], i); + } + + } + else { // firstAbfluesse == null + if (langBezeichner != null) { + lsBezeichner = StringUtil.fitArray( + langBezeichner, lsBezeichner); + } + else if (colNaWidths != null) { + for (int j = 0, i = 0, N = input.length(); + j < colNaWidths.length && i < N; + i += colNaWidths[j++] + ) { + lsBezeichner[j] = input.substring( + i, i+colNaWidths[j]).trim(); + } + } + else { + // first column begins at position 8 in line + for (int i = 8, col = 0; i < input.length(); i += 9) { + if ((i + 9) > input.length()) { + i = input.length() - 10; + } + // one column header is 9 chars wide + lsBezeichner[col++] = + input.substring(i, i + 9).trim(); + + if (col == lsBezeichner.length) { + break; + } + } + } + } + + } // for all lines in WST file + + wst.setUnit(new ImportUnit(einheit)); + + if (kmHist1 != null && kmHist2 != null + && kmHist1.compareTo(kmHist2) < 0) { + BigDecimal t = minKm; minKm = maxKm; maxKm = t; + } + addInterval(minKm, maxKm, aktAbfluesse); + + fixRangesOrder(); + } + finally { + if (in != null) { + in.close(); + } + } + } + + protected void fixRangesOrder() { + wst.fixRangesOrder(); + } + + protected void addValue(BigDecimal km, BigDecimal w, int index) { + if (w != null) { + ImportWstColumn column = wst.getColumn(index); + column.addColumnValue(km, w); + } + } + + private static final NumberFormat NF = getNumberFormat(); + + private static final NumberFormat getNumberFormat() { + NumberFormat nf = NumberFormat.getInstance(); + nf.setMinimumFractionDigits(2); + nf.setMaximumFractionDigits(2); + return nf; + } + + protected static String format(double value) { + return NF.format(value); + } + + protected void addInterval( + BigDecimal from, + BigDecimal to, + BigDecimal [] values + ) { + log.debug("addInterval: " + from + " " + to); + + if (values == null || from == MAX_RANGE || from == MIN_RANGE) { + return; + } + + ImportRange range = new ImportRange(from, to); + + // little workaround to make the q ranges tightly fit. + // Leave a very small gap to ensure that the range queries + // still work. + + if (lastRange != null) { + double a1 = lastRange.getA().doubleValue(); + double b1 = lastRange.getB().doubleValue(); + double a2 = range.getA().doubleValue(); + + if (a1 < b1) { + lastRange.setB(new BigDecimal(a2 - INTERVAL_GAP)); + } + else { // a1 >= b1 + lastRange.setB(new BigDecimal(a2 + INTERVAL_GAP)); + } + } + + for (int i = 0; i < values.length; ++i) { + ImportWstColumn column = wst.getColumn(i); + ImportWstQRange wstQRange = new ImportWstQRange(range, values[i]); + column.addColumnQRange(wstQRange); + } + + lastRange = range; + } + + private static final BigDecimal [] parseLineAsDouble( + String line, + int count, + boolean bStation, + boolean bParseEmptyAsZero + ) { + String [] tokens = parseLine(line, count, bStation); + + BigDecimal [] doubles = new BigDecimal[tokens.length]; + + for (int i = 0; i < doubles.length; ++i) { + String token = tokens[i].trim(); + if (token.length() != 0) { + doubles[i] = new BigDecimal(token); + } + else if (bParseEmptyAsZero) { + doubles[i] = UNDEFINED_ZERO; + } + } + + return doubles; + } + + private static String [] parseLine( + String line, + int tokenCount, + boolean bParseStation + ) { + ArrayList<String> strings = new ArrayList<String>(); + + if (bParseStation) { + if (line.length() < 8) { + throw new IllegalArgumentException("station too short"); + } + strings.add(line.substring(0, 8)); + } + + int pos = 9; + for (int i = 0; i < tokenCount; ++i) { + if (line.length() >= pos + 8) { + strings.add(line.substring(pos, pos + 8)); + } + else { + strings.add(""); + } + pos += 9; + } + + return strings.toArray(new String[strings.size()]); + } +} +// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :