view backend/src/main/java/org/dive4elements/river/importer/parsers/WstParser.java @ 6328:53d08f33d094

Backend: Moved guessing of main values and their time intervals out of the STA parser. The same code will be useful to extend the WST parser to better handle official lines.
author Sascha L. Teichmann <teichmann@intevation.de>
date Thu, 13 Jun 2013 17:15:34 +0200
parents f9c5e1a8032d
children 224d7c4b5291
line wrap: on
line source
/* Copyright (C) 2011, 2012, 2013 by Bundesanstalt für Gewässerkunde
 * Software engineering by Intevation GmbH
 *
 * This file is Free Software under the GNU AGPL (>=v3)
 * and comes with ABSOLUTELY NO WARRANTY! Check out the
 * documentation coming with Dive4Elements River for details.
 */

package org.dive4elements.river.importer.parsers;

import java.util.ArrayList;
import java.util.HashSet;

import java.io.File;
import java.io.IOException;
import java.io.LineNumberReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;

import java.text.NumberFormat;

import org.apache.log4j.Logger;

import org.dive4elements.river.utils.StringUtil;
import org.dive4elements.river.utils.DateGuesser;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

import java.math.BigDecimal;

import org.dive4elements.river.importer.ImportWstQRange;
import org.dive4elements.river.importer.ImportWstColumn;
import org.dive4elements.river.importer.ImportTimeInterval;
import org.dive4elements.river.importer.ImportRange;
import org.dive4elements.river.importer.ImportUnit;
import org.dive4elements.river.importer.ImportWst;

/**
 * Parser for WST files.
 *
 * <p>{@link #parse(File)} reads a file line by line and fills a freshly
 * created {@link ImportWst} with column names, per-km column values and
 * Q ranges. The following kinds of lines are distinguished:</p>
 * <ul>
 *   <li>the first non-empty line: the number of columns,</li>
 *   <li>lines starting with '*' followed by the U+001F control character:
 *       one Q value per column, opening a new Q range,</li>
 *   <li>lines starting with '*!': column meta data (names, header widths,
 *       sources, dates),</li>
 *   <li>other lines starting with '*': comments, possibly carrying
 *       the unit,</li>
 *   <li>all remaining lines of at least 11 characters: column headers
 *       (before the first Q line) or km/value rows (after it).</li>
 * </ul>
 *
 * <p>Instances keep mutable parsing state ({@link #wst},
 * {@link #lastRange}) in fields.</p>
 */
public class WstParser
{
    private static final Logger log = Logger.getLogger(WstParser.class);

    /** Keyword of the '*!' line carrying the quoted column names. */
    public static final String COLUMN_BEZ_TEXT   = "column-bez-text";
    /** Keyword of the '*!' line carrying the widths of the column headers. */
    public static final String COLUMN_BEZ_BREITE = "column-bez-breite";
    /** Keyword of the '*!' line carrying quoted per-column source strings. */
    public static final String COLUMN_QUELLE     = "column-quelle";
    /** Keyword of the '*!' line carrying quoted per-column date strings. */
    public static final String COLUMN_DATUM      = "column-datum";

    /** Value used for empty tokens when they should be parsed as zero. */
    public static final BigDecimal UNDEFINED_ZERO =
        new BigDecimal(0.0);
    /**
     * Sentinel: initial value of the running km maximum.
     * Compared by identity in {@link #addInterval}.
     */
    public static final BigDecimal MIN_RANGE =
        new BigDecimal(-Double.MAX_VALUE);
    /**
     * Sentinel: initial value of the running km minimum.
     * Compared by identity in {@link #addInterval}.
     */
    public static final BigDecimal MAX_RANGE =
        new BigDecimal(Double.MAX_VALUE);

    /** Character encoding used to decode the WST file. */
    public static final String ENCODING = "ISO-8859-1";

    /**
     * Matches a comment line of the form '* km ...' (case insensitive
     * 'km') and captures everything after the whitespace following 'km'.
     */
    public static final Pattern UNIT_COMMENT =
        Pattern.compile("\\*\\s*[kK][mM]\\s+(.+)");

    /** Captures the text inside the first pair of square brackets. */
    public static final Pattern UNIT =
        Pattern.compile("[^\\[]*\\[([^]]+)\\].*");

    /** Gap left between the end of one Q range and the start of the next. */
    public static final double INTERVAL_GAP = 0.00001d;

    /** The WST being filled by the current/last call of {@link #parse}. */
    protected ImportWst wst;

    /** The range created by the most recent {@link #addInterval} call. */
    protected ImportRange lastRange;

    public WstParser() {
    }

    /** @return the WST produced by the last parse run (or set explicitly). */
    public ImportWst getWst() {
        return wst;
    }

    /** @param wst the WST to work on; replaced by {@link #parse(File)}. */
    public void setWst(ImportWst wst) {
        this.wst = wst;
    }

    /**
     * Tries to interpret a string as a date.
     *
     * @param string the text handed to {@link DateGuesser#guessDate}.
     * @return a time interval built from the guessed date, or
     *         <code>null</code> (after logging a warning) if the guesser
     *         rejected the string with an IllegalArgumentException.
     */
    public static ImportTimeInterval guessDate(String string) {
        try {
            return new ImportTimeInterval(
                DateGuesser.guessDate(string));
        }
        catch (IllegalArgumentException iae) {
            log.warn("WST: String '" + string +
                     "' could not be interpreted as valid timestamp");
        }
        return null;
    }

    /**
     * Parses a WST file into a new {@link ImportWst}, which afterwards is
     * available via {@link #getWst()}.
     *
     * @param file the WST file to read (decoded as {@link #ENCODING}).
     * @throws IOException if reading the file fails.
     * @throws NumberFormatException if a value token or a column width
     *         is not a valid number.
     */
    public void parse(File file) throws IOException {

        log.info("Parsing WST file '" + file + "'");

        wst = new ImportWst(file.getName());

        // A range left over from a previous parse run belongs to another
        // ImportWst and must not be adjusted by this run's first interval.
        lastRange = null;

        LineNumberReader in = null;
        try {
            in =
                new LineNumberReader(
                new InputStreamReader(
                new FileInputStream(file), ENCODING));

            String input;
            boolean first = true;
            int columnCount = 0;

            // column names taken from the header lines
            String [] lsBezeichner   = null;
            // column names taken from the '*!column-bez-text' line
            String [] langBezeichner = null;
            // header widths taken from the '*!column-bez-breite' line
            int    [] colNaWidths    = null;
            // values of the '*!column-quelle' line (parsed, not used here)
            String [] quellen        = null;
            // values of the '*!column-datum' line
            String [] daten          = null;

            // Q values of the current and of the very first Q range
            BigDecimal [] aktAbfluesse   = null;
            BigDecimal [] firstAbfluesse = null;

            // km extent of the current Q range
            BigDecimal minKm   = MAX_RANGE;
            BigDecimal maxKm   = MIN_RANGE;
            // the last (kmHist1) and second to last (kmHist2) km read
            BigDecimal kmHist1 = null;
            BigDecimal kmHist2 = null;

            boolean columnHeaderChecked = false;

            String einheit = "Wasserstand [NN + m]";

            // kms already seen, used to drop duplicates
            HashSet<BigDecimal> kms = new HashSet<BigDecimal>();

            while ((input = in.readLine()) != null) {
                String line = input;
                if (first) { // fetch number of columns
                    if ((line = line.trim()).length() == 0) {
                        continue;
                    }
                    try {
                        columnCount = Integer.parseInt(line);
                        if (columnCount <= 0) {
                            throw new NumberFormatException(
                                "number of columns <= 0");
                        }
                        log.debug("Number of columns: " + columnCount);
                        wst.setNumberColumns(columnCount);
                        lsBezeichner = new String[columnCount];
                    }
                    catch (NumberFormatException nfe) {
                        // keep looking for the column count in next line
                        log.warn("WST: invalid number.", nfe);
                        continue;
                    }
                    first = false;
                    continue;
                }

                // accept decimal commas
                line = line.replace(',', '.');

                // Q line: one value per column, starts a new Q range
                if (line.startsWith("*\u001f")) {
                    BigDecimal [] data =
                        parseLineAsDouble(line, columnCount, false, true);

                    if (aktAbfluesse != null) {
                        // close the previous Q range; if the last two kms
                        // were read in descending order, keep that
                        // direction in the range (from > to)
                        if (kmHist1 != null && kmHist2 != null
                        && kmHist1.compareTo(kmHist2) < 0) {
                            BigDecimal t = minKm; minKm = maxKm; maxKm = t;
                        }
                        addInterval(minKm, maxKm, aktAbfluesse);
                        minKm = MAX_RANGE;
                        maxKm = MIN_RANGE;
                    }

                    aktAbfluesse = new BigDecimal[columnCount];
                    log.debug("new q range: " + columnCount);
                    for (int i = 0; i < Math.min(columnCount, data.length); ++i) {
                        if (data[i] != null) {
                            log.debug("  column: " + data[i]);
                            aktAbfluesse[i] = data[i];
                        }
                    }

                    if (firstAbfluesse == null) {
                        firstAbfluesse = (BigDecimal [])aktAbfluesse.clone();
                    }
                    continue;
                }

                // meta data line
                if (line.startsWith("*!")) {
                    String spezial = line.substring(2).trim();

                    if (spezial.length() == 0) {
                        continue;
                    }

                    if (spezial.startsWith(COLUMN_BEZ_TEXT)) {
                        spezial = spezial.substring(COLUMN_BEZ_TEXT.length()).trim();
                        if (spezial.length() == 0) {
                            continue;
                        }
                        langBezeichner = StringUtil.splitQuoted(spezial, '"');
                    }
                    else if (spezial.startsWith(COLUMN_BEZ_BREITE)) {
                        spezial = spezial.substring(COLUMN_BEZ_BREITE.length()).trim();

                        if (spezial.length() == 0) {
                            continue;
                        }

                        String[] split = spezial.split("\\s+");

                        colNaWidths = new int[split.length];
                        for (int i=0; i < split.length; i++) {
                            colNaWidths[i] = Integer.parseInt(split[i]);
                        }
                    }
                    else if (spezial.startsWith(COLUMN_QUELLE)) {
                        // strip the keyword like the other branches do
                        spezial = spezial.substring(COLUMN_QUELLE.length()).trim();
                        if (spezial.length() == 0) {
                            continue;
                        }
                        quellen = StringUtil.splitQuoted(spezial, '"');
                    }
                    else if (spezial.startsWith(COLUMN_DATUM)) {
                        spezial = spezial.substring(COLUMN_DATUM.length()).trim();
                        if (spezial.length() == 0) {
                            continue;
                        }
                        daten = StringUtil.splitQuoted(spezial, '"');
                    }
                    continue;
                }

                // too short to carry a station and at least one value
                if (line.length() < 11) {
                    continue;
                }

                // plain comment line, may contain the unit
                if (line.startsWith("*")) {
                    Matcher m = UNIT_COMMENT.matcher(line);
                    if (m.matches()) {
                        log.debug("unit comment found");
                        // XXX: This hack is needed because desktop
                        // FLYS is broken figuring out the unit
                        String [] units = m.group(1).split("\\s{2,}");
                        m = UNIT.matcher(units[0]);
                        einheit = m.matches() ? m.group(1) : units[0];
                        log.debug("unit: " + einheit);
                    }
                    continue;
                }

                if (firstAbfluesse != null) {
                    // data row; on the first one fix the column names
                    if (!columnHeaderChecked) {
                        int unknownCount = 0;
                        HashSet<String> uniqueColumnNames =
                            new HashSet<String>();
                        if (langBezeichner != null) {
                            // use column name from '*!column-bez-text'-line
                            lsBezeichner = StringUtil.fitArray(
                                langBezeichner, lsBezeichner);
                        }
                        for (int i = 0; i < lsBezeichner.length; ++i) {
                            if (lsBezeichner[i] == null
                            || lsBezeichner[i].length() == 0) {
                                // generate alternative column names
                                double q = firstAbfluesse[i].doubleValue();
                                if (q < 0.001) {
                                    lsBezeichner[i] =
                                        "<unbekannt #" + unknownCount + ">";
                                    ++unknownCount;
                                }
                                else {
                                    lsBezeichner[i] = "Q="+format(q);
                                }
                            }
                            // make the name unique by appending a counter
                            String candidate = lsBezeichner[i];
                            int collision = 1;
                            while (!uniqueColumnNames.add(candidate)) {
                                candidate = lsBezeichner[i] +
                                    " (" + collision + ")";
                                ++collision;
                            }
                            ImportWstColumn iwc = wst.getColumn(i);
                            iwc.setName(candidate);
                            // prefer the explicit date string, fall back
                            // to guessing a date from the column name
                            String potentialDate = daten != null && i < daten.length
                                ? daten[i]
                                : candidate;
                            iwc.setTimeInterval(guessDate(potentialDate));
                        }
                        columnHeaderChecked = true;
                    }

                    BigDecimal [] data =
                        parseLineAsDouble(line, columnCount, true, false);

                    BigDecimal kaem = data[0];

                    if (kaem == null) {
                        // blank station field: nothing to attach the
                        // values to, and comparing it would fail
                        log.warn(
                            "WST: no km found in line " +
                            in.getLineNumber() + ". -> ignored");
                        continue;
                    }

                    if (!kms.add(kaem)) {
                        log.warn(
                            "WST: km " + kaem +
                            " (line " + in.getLineNumber() +
                            ") found more than once. -> ignored");
                        continue;
                    }

                    kmHist2 = kmHist1;
                    kmHist1 = kaem;

                    if (kaem.compareTo(minKm) < 0) {
                        minKm = kaem;
                    }
                    if (kaem.compareTo(maxKm) > 0) {
                        maxKm = kaem;
                    }

                    // extract values
                    for (int i = 0; i < columnCount; ++i) {
                        addValue(kaem, data[i+1], i);
                    }

                }
                else { // firstAbfluesse == null
                    // header line before the first Q line
                    if (langBezeichner != null) {
                        // nothing to do
                    }
                    else if (colNaWidths != null) {
                        // cut the names according to the declared widths;
                        // never write beyond the known columns and never
                        // read beyond the end of the line
                        for (int j = 0, i = 0, N = input.length();
                             j < colNaWidths.length
                             && j < lsBezeichner.length
                             && i < N;
                             i += colNaWidths[j++]
                        ) {
                            int end = Math.min(i + colNaWidths[j], N);
                            lsBezeichner[j] = input.substring(i, end).trim();
                        }
                    }
                    else {
                        // first column begins at position 8 in line
                        // NOTE(review): once the window is clamped, i stays
                        // below input.length(), so all remaining names are
                        // read from the same tail window until every column
                        // has a name -- confirm this is intended.
                        for (int i = 8, col = 0; i < input.length(); i += 9) {
                            if ((i + 9) > input.length()) {
                                i = input.length() - 10;
                            }
                            // one column header is 9 chars wide
                            lsBezeichner[col++] =
                                input.substring(i, i + 9).trim();

                            if (col == lsBezeichner.length) {
                                break;
                            }
                        }
                    }
                }

            } // for all lines in WST file

            wst.setUnit(new ImportUnit(einheit));

            // close the last Q range, see handling of Q lines above
            if (kmHist1 != null && kmHist2 != null
            && kmHist1.compareTo(kmHist2) < 0) {
                BigDecimal t = minKm; minKm = maxKm; maxKm = t;
            }
            addInterval(minKm, maxKm, aktAbfluesse);

            fixRangesOrder();
        }
        finally {
            if (in != null) {
                in.close();
            }
        }
    }

    /** Delegates to {@link ImportWst#fixRangesOrder()} of the current WST. */
    protected void fixRangesOrder() {
        wst.fixRangesOrder();
    }

    /**
     * Adds a value to a column of the current WST.
     *
     * @param km    the km the value belongs to.
     * @param w     the value; <code>null</code> values are skipped.
     * @param index the index of the column.
     */
    protected void addValue(BigDecimal km, BigDecimal w, int index) {
        if (w != null) {
            ImportWstColumn column = wst.getColumn(index);
            column.addColumnValue(km, w);
        }
    }

    /** Shared formatter used by {@link #format(double)}. */
    private static final NumberFormat NF = getNumberFormat();

    /** Creates a default locale number format with two fraction digits. */
    private static final NumberFormat getNumberFormat() {
        NumberFormat nf = NumberFormat.getInstance();
        nf.setMinimumFractionDigits(2);
        nf.setMaximumFractionDigits(2);
        return nf;
    }

    /**
     * Formats a value with exactly two fraction digits.
     *
     * @param value the value to format.
     * @return the formatted value.
     */
    protected static String format(double value) {
        return NF.format(value);
    }

    /**
     * Adds a Q range covering [from, to] to every column of the
     * current WST and makes the previously added range end just
     * before the new one starts.
     *
     * @param from   start km of the range.
     * @param to     end km of the range.
     * @param values one Q value per column; if <code>null</code>
     *               nothing is added.
     */
    protected void addInterval(
        BigDecimal    from,
        BigDecimal    to,
        BigDecimal [] values
    ) {
        log.debug("addInterval: " + from + " " + to);

        // Identity comparison is intended: 'from' still being one of
        // the sentinel objects means that no km was recorded.
        if (values == null || from == MAX_RANGE || from == MIN_RANGE) {
            return;
        }

        ImportRange range = new ImportRange(from, to);

        // little workaround to make the q ranges tightly fit.
        // Leave a very small gap to ensure that the range queries
        // still work.

        if (lastRange != null) {
            double a1 = lastRange.getA().doubleValue();
            double b1 = lastRange.getB().doubleValue();
            double a2 = range.getA().doubleValue();

            if (a1 < b1) {
                // previous range ascends: end slightly below new start
                lastRange.setB(new BigDecimal(a2 - INTERVAL_GAP));
            }
            else { // a1 >= b1
                // previous range descends: end slightly above new start
                lastRange.setB(new BigDecimal(a2 + INTERVAL_GAP));
            }
        }

        for (int i = 0; i < values.length; ++i) {
            ImportWstColumn column = wst.getColumn(i);
            ImportWstQRange wstQRange = new ImportWstQRange(range, values[i]);
            column.addColumnQRange(wstQRange);
        }

        lastRange = range;
    }

    /**
     * Splits a line with {@link #parseLine} and converts the tokens
     * to numbers.
     *
     * @param line              the line to parse.
     * @param count             the number of value tokens to extract.
     * @param bStation          if true the first element of the result
     *                          is the station found at the line start.
     * @param bParseEmptyAsZero if true empty tokens result in
     *                          {@link #UNDEFINED_ZERO}, else in
     *                          <code>null</code>.
     * @return the parsed numbers, one element per token.
     * @throws NumberFormatException if a non-empty token is not a
     *         valid number.
     */
    private static final BigDecimal [] parseLineAsDouble(
        String  line,
        int     count,
        boolean bStation,
        boolean bParseEmptyAsZero
    ) {
        String [] tokens = parseLine(line, count, bStation);

        BigDecimal [] doubles = new BigDecimal[tokens.length];

        for (int i = 0; i < doubles.length; ++i) {
            String token = tokens[i].trim();
            if (token.length() != 0) {
                doubles[i] = new BigDecimal(token);
            }
            else if (bParseEmptyAsZero) {
                doubles[i] = UNDEFINED_ZERO;
            }
        }

        return doubles;
    }

    /**
     * Cuts a line into fixed width tokens. Value tokens are 8 characters
     * wide; the first one starts at position 9 and each following one
     * starts 9 characters later. Tokens not completely contained in
     * the line are returned as empty strings.
     *
     * @param line          the line to split.
     * @param tokenCount    the number of value tokens to extract.
     * @param bParseStation if true the first 8 characters of the line
     *                      are prepended to the result as station token.
     * @return the tokens, untrimmed.
     * @throws IllegalArgumentException if a station is requested but
     *         the line has less than 8 characters.
     */
    private static String [] parseLine(
        String  line,
        int     tokenCount,
        boolean bParseStation
    ) {
        ArrayList<String> strings = new ArrayList<String>();

        if (bParseStation) {
            if (line.length() < 8) {
                throw new IllegalArgumentException("station too short");
            }
            strings.add(line.substring(0, 8));
        }

        int pos = 9;
        for (int i = 0; i < tokenCount; ++i) {
            if (line.length() >= pos + 8) {
                strings.add(line.substring(pos, pos + 8));
            }
            else {
                strings.add("");
            }
            pos += 9;
        }

        return strings.toArray(new String[strings.size()]);
    }
}
// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :

http://dive4elements.wald.intevation.org