Mercurial > dive4elements > river
view flys-backend/src/main/java/de/intevation/flys/importer/WstParser.java @ 1197:ce3dacc6ea92
PRFParser: extract km from lines. TODO: extract data.
flys-backend/trunk@2297 c6561f87-3c4e-4783-a992-168aeb5c3f6f
author | Sascha L. Teichmann <sascha.teichmann@intevation.de> |
---|---|
date | Thu, 07 Jul 2011 09:29:31 +0000 |
parents | 6dc847194625 |
children |
line wrap: on
line source
package de.intevation.flys.importer; import java.util.ArrayList; import java.util.HashSet; import java.io.File; import java.io.IOException; import java.io.LineNumberReader; import java.io.InputStreamReader; import java.io.FileInputStream; import java.text.NumberFormat; import org.apache.log4j.Logger; import de.intevation.flys.utils.StringUtil; import java.util.regex.Pattern; import java.util.regex.Matcher; import java.math.BigDecimal; public class WstParser { private static Logger log = Logger.getLogger(WstParser.class); public static final String COLUMN_BEZ_TEXT = "column-bez-text"; public static final String COLUMN_BEZ_BREITE = "column-bez-breite"; public static final String COLUMN_QUELLE = "column-quelle"; public static final String COLUMN_DATUM = "column-datum"; public static final BigDecimal UNDEFINED_ZERO = new BigDecimal(0.0); public static final BigDecimal MIN_RANGE = new BigDecimal(-Double.MAX_VALUE); public static final BigDecimal MAX_RANGE = new BigDecimal(Double.MAX_VALUE); public static final String ENCODING = "ISO-8859-1"; public static final Pattern UNIT_COMMENT = Pattern.compile("\\*\\s*[kK][mM]\\s+(.+)"); public static final Pattern UNIT = Pattern.compile("[^\\[]*\\[([^]]+)\\].*"); public static final BigDecimal INTERVAL_GAP = new BigDecimal(0.00001); protected ImportWst wst; protected ImportRange lastRange; public WstParser() { } public ImportWst getWst() { return wst; } public void setWst(ImportWst wst) { this.wst = wst; } public void parse(File file) throws IOException { log.info("Parsing WST file '" + file + "'"); wst = new ImportWst(file.getName()); LineNumberReader in = null; try { in = new LineNumberReader( new InputStreamReader( new FileInputStream(file), ENCODING)); String input; boolean first = true; int columnCount = 0; String [] lsBezeichner = null; String [] langBezeichner = null; int [] colNaWidths = null; String [] quellen = null; String [] daten = null; BigDecimal [] aktAbfluesse = null; BigDecimal [] firstAbfluesse = null; BigDecimal minKm = MAX_RANGE; BigDecimal maxKm = MIN_RANGE; boolean columnHeaderChecked = false; String einheit = "Wasserstand [NN + m]"; HashSet<BigDecimal> kms = new HashSet<BigDecimal>(); while ((input = in.readLine()) != null) { String line = input; if (first) { // fetch number of columns if ((line = line.trim()).length() == 0) { continue; } try { columnCount = Integer.parseInt(line); if (columnCount <= 0) { throw new NumberFormatException( "number columns <= 0"); } log.debug("Number of columns: " + columnCount); wst.setNumberColumns(columnCount); lsBezeichner = new String[columnCount]; } catch (NumberFormatException nfe) { log.warn(nfe); continue; } first = false; continue; } line = line.replace(',', '.'); if (line.startsWith("*\u001f")) { BigDecimal [] data = parseLineAsDouble(line, columnCount, false, true); if (aktAbfluesse != null) { addInterval(minKm, maxKm, aktAbfluesse); minKm = MAX_RANGE; maxKm = MIN_RANGE; } aktAbfluesse = new BigDecimal[columnCount]; log.debug("new q range: " + columnCount); for (int i = 0; i < Math.min(columnCount, data.length); ++i) { if (data[i] != null) { log.debug(" column: " + data[i]); aktAbfluesse[i] = data[i]; } } if (firstAbfluesse == null) { firstAbfluesse = (BigDecimal [])aktAbfluesse.clone(); } continue; } if (line.startsWith("*!")) { String spezial = line.substring(2).trim(); if (spezial.length() == 0) { continue; } if (spezial.startsWith(COLUMN_BEZ_TEXT)) { spezial = spezial.substring(COLUMN_BEZ_TEXT.length()).trim(); if (spezial.length() == 0) { continue; } langBezeichner = StringUtil.splitQuoted(spezial, '"'); } else if (spezial.startsWith(COLUMN_BEZ_BREITE)) { spezial = spezial.substring(COLUMN_BEZ_BREITE.length()).trim(); if (spezial.length() == 0) { continue; } String[] split = spezial.split("\\s+"); colNaWidths = new int[split.length]; for (int i=0; i < split.length; i++) { colNaWidths[i] = Integer.parseInt(split[i]); } } else if (spezial.startsWith(COLUMN_QUELLE)) { if (spezial.length() == 0) { continue; } quellen = StringUtil.splitQuoted(spezial, '"'); } else if (spezial.startsWith(COLUMN_DATUM)) { spezial = spezial.substring(COLUMN_DATUM.length()).trim(); if (spezial.length() == 0) { continue; } daten = StringUtil.splitQuoted(spezial, '"'); } continue; } if (line.length() < 11) { continue; } if (line.startsWith("*")) { Matcher m = UNIT_COMMENT.matcher(line); if (m.matches()) { log.debug("unit comment found"); // XXX: This hack is needed because desktop // FLYS is broken figuring out the unit String [] units = m.group(1).split("\\s{2,}"); m = UNIT.matcher(units[0]); einheit = m.matches() ? m.group(1) : units[0]; log.debug("unit: " + einheit); } continue; } if (firstAbfluesse != null) { if (!columnHeaderChecked) { int unknownCount = 0; HashSet<String> uniqueColumnNames = new HashSet<String>(); for (int i = 0; i < lsBezeichner.length; ++i) { if (lsBezeichner[i] == null || lsBezeichner[i].length() == 0) { double q = firstAbfluesse[i].doubleValue(); if (q < 0.001) { lsBezeichner[i] = "<unbekannt #" + unknownCount + ">"; ++unknownCount; } else { lsBezeichner[i] = "Q="+format(q); } } String candidate = lsBezeichner[i]; int collision = 1; while (!uniqueColumnNames.add(candidate)) { candidate = lsBezeichner[i] + " (" + collision + ")"; ++collision; } wst.getColumn(i).setName(candidate); } columnHeaderChecked = true; } BigDecimal [] data = parseLineAsDouble(line, columnCount, true, false); BigDecimal kaem = data[0]; if (!kms.add(kaem)) { log.warn( "km " + kaem + " (line " + in.getLineNumber() + ") found more than once. -> ignored"); continue; } if (kaem.compareTo(minKm) < 0) { minKm = kaem; } if (kaem.compareTo(maxKm) > 0) { maxKm = kaem; } // extract values for (int i = 0; i < columnCount; ++i) { addValue(kaem, data[i+1], i); } } else { // firstAbfluesse == null if (langBezeichner != null) { lsBezeichner = StringUtil.fitArray( langBezeichner, lsBezeichner); } else if (colNaWidths != null) { for (int j = 0, i = 0, N = input.length(); j < colNaWidths.length && i < N; i += colNaWidths[j++] ) { lsBezeichner[j] = input.substring( i, i+colNaWidths[j]).trim(); } } else { // first column begins at position 8 in line for (int i = 8, col = 0; i < input.length(); i += 9) { if ((i + 9) > input.length()) { i = input.length() - 10; } // one column header is 9 chars wide lsBezeichner[col++] = input.substring(i, i + 9).trim(); if (col == lsBezeichner.length) { break; } } } } } addInterval(minKm, maxKm, aktAbfluesse); } finally { if (in != null) { in.close(); } } } protected void addValue(BigDecimal km, BigDecimal w, int index) { if (w != null) { ImportWstColumn column = wst.getColumn(index); column.addColumnValue(km, w); } } private static final NumberFormat NF = getNumberFormat(); private static final NumberFormat getNumberFormat() { NumberFormat nf = NumberFormat.getInstance(); nf.setMinimumFractionDigits(2); nf.setMaximumFractionDigits(2); return nf; } protected static String format(double value) { return NF.format(value); } protected void addInterval( BigDecimal from, BigDecimal to, BigDecimal [] values ) { log.debug("addInterval: " + from + " " + to); if (values == null || from == MAX_RANGE) { return; } if (to.compareTo(from) < 0) { BigDecimal t = from; from = to; to = t; } ImportRange range = new ImportRange(from, to); // little workaround to make the q ranges tightly fit. // Leave a very small gap to ensure that the range queries // still work. if (lastRange != null) { double d1 = Math.abs( lastRange.getB().doubleValue() - range.getA().doubleValue()); double d2 = Math.abs( range.getB().doubleValue() - lastRange.getA().doubleValue()); if (d1 < d2) { lastRange.setB(range.getA().subtract(INTERVAL_GAP)); } else { range.setA(lastRange.getB().subtract(INTERVAL_GAP)); } } for (int i = 0; i < values.length; ++i) { ImportWstColumn column = wst.getColumn(i); ImportWstQRange wstQRange = new ImportWstQRange(range, values[i]); column.addColumnQRange(wstQRange); } lastRange = range; } private static final BigDecimal [] parseLineAsDouble( String line, int count, boolean bStation, boolean bParseEmptyAsZero ) { String [] tokens = parseLine(line, count, bStation); BigDecimal [] doubles = new BigDecimal[tokens.length]; for (int i = 0; i < doubles.length; ++i) { String token = tokens[i].trim(); if (token.length() != 0) { doubles[i] = new BigDecimal(token); } else if (bParseEmptyAsZero) { doubles[i] = UNDEFINED_ZERO; } } return doubles; } private static String [] parseLine( String line, int tokenCount, boolean bParseStation ) { ArrayList<String> strings = new ArrayList<String>(); if (bParseStation) { if (line.length() < 8) { throw new IllegalArgumentException("station too short"); } strings.add(line.substring(0, 8)); } int pos = 9; for (int i = 0; i < tokenCount; ++i) { if (line.length() >= pos + 8) { strings.add(line.substring(pos, pos + 8)); } else { strings.add(""); } pos += 9; } return strings.toArray(new String[strings.size()]); } } // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :