view backend/src/main/java/org/dive4elements/river/importer/parsers/WstParser.java @ 9650:a2a42a6bac6b

Importer (s/u-info) extensions: outer try/catch for parse and log of line no, catching parsing exception if not enough value fields, parsing error and warning log messages with line number, detecting and rejecting duplicate data series, better differentiation between error and warning log messages
author mschaefer
date Mon, 23 Mar 2020 14:57:03 +0100
parents 5ff8ce9a2e06
children
line wrap: on
line source
/* Copyright (C) 2011, 2012, 2013 by Bundesanstalt für Gewässerkunde
 * Software engineering by Intevation GmbH
 *
 * This file is Free Software under the GNU AGPL (>=v3)
 * and comes with ABSOLUTELY NO WARRANTY! Check out the
 * documentation coming with Dive4Elements River for details.
 */

package org.dive4elements.river.importer.parsers;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.math.BigDecimal;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.dive4elements.river.backend.utils.DateGuesser;
import org.dive4elements.river.backend.utils.StringUtil;
import org.dive4elements.river.importer.ImportRange;
import org.dive4elements.river.importer.ImportTimeInterval;
import org.dive4elements.river.importer.ImportUnit;
import org.dive4elements.river.importer.ImportWst;
import org.dive4elements.river.importer.ImportWstColumn;
import org.dive4elements.river.importer.ImportWstQRange;

public class WstParser
{
    /** Logger shared by all instances of this parser. */
    private static Logger log = Logger.getLogger(WstParser.class);

    // Keywords that may follow the "*!" prefix of a special line;
    // parse() dispatches on them to fill column names, column name
    // widths, column sources and column dates respectively.
    public static final String COLUMN_BEZ_TEXT   = "column-bez-text";
    public static final String COLUMN_BEZ_BREITE = "column-bez-breite";
    public static final String COLUMN_QUELLE     = "column-quelle";
    public static final String COLUMN_DATUM      = "column-datum";

    /** Value substituted for empty fields when empty fields
     * are to be parsed as zero. */
    public static final BigDecimal UNDEFINED_ZERO =
            new BigDecimal(0.0);
    // Sentinels: parse() starts the running maximum km at MIN_RANGE and
    // the running minimum km at MAX_RANGE and later checks for these
    // very objects to detect that no station was seen.
    public static final BigDecimal MIN_RANGE =
            new BigDecimal(-Double.MAX_VALUE);
    public static final BigDecimal MAX_RANGE =
            new BigDecimal(Double.MAX_VALUE);

    /** Character encoding used to read WST files. */
    public static final String ENCODING = "ISO-8859-1";

    /** Matches a comment line consisting of '*', optional white space,
     * "km" in any letter case and white space; group 1 captures
     * the rest of the line. */
    public static final Pattern UNIT_COMMENT =
            Pattern.compile("\\*\\s*[kK][mM]\\s+(.+)");

    /** Matches a string containing a bracketed part; group 1 captures
     * the text between the first '[' and the next ']'. */
    public static final Pattern UNIT =
            Pattern.compile("[^\\[]*\\[([^]]+)\\].*");

    /** Matches two four-digit numbers separated by '-' or '/'
     * (optionally surrounded by white space); groups 1 and 2
     * capture the two numbers. */
    public static final Pattern YEAR_INTERVAL =
            Pattern.compile("(\\d{4})\\s*[-/]\\s*(\\d{4})");

    /** Small distance used by addInterval() to give single-station
     * ranges a length and to keep consecutive ranges from touching. */
    public static final BigDecimal INTERVAL_GAP = new BigDecimal("0.00001");

    /** The WST that is filled by parse(); created there if unset. */
    protected ImportWst wst;

    // State of the most recent addInterval() call: the range created
    // there and its 'from' (lastA) and 'to' (lastB) bounds as doubles.
    protected ImportRange lastRange;
    protected Double lastA;
    protected Double lastB;

    /** Creates a parser without a target WST;
     * parse() creates one on demand. */
    public WstParser() {
    }

    /**
     * Creates a parser that fills the given WST.
     *
     * @param wst the WST to store the parsed data in.
     */
    public WstParser(final ImportWst wst) {
        this.wst = wst;
    }

    /** @return the WST this parser fills; may be null as long as
     * none was set and parse() has not been called. */
    public ImportWst getWst() {
        return this.wst;
    }

    /** @param wst the WST to be filled by subsequent parsing. */
    public void setWst(final ImportWst wst) {
        this.wst = wst;
    }

    /** Checked exception raised when a WST file is rejected,
     * e.g. by parse() if the stations are not ordered. */
    public static final class ParseException extends Exception {
        /** Creates an exception without a detail message. */
        public ParseException() {
        }

        /** @param msg the detail message describing the problem. */
        public ParseException(final String msg) {
            super(msg);
        }
    } // class ParseException

    /**
     * Builds an ImportTimeInterval from a textual date description.
     *
     * If the whole string matches {@link #YEAR_INTERVAL}, both captured
     * years are passed to the DateGuesser and form start and end of the
     * interval; otherwise the complete string is passed to the
     * DateGuesser and the interval is built from that single date.
     *
     * @param string the text to interpret.
     * @return the interval, or null (after logging a warning) if an
     *         IllegalArgumentException was raised while building it.
     */
    public static ImportTimeInterval guessDate(final String string) {
        ImportTimeInterval interval = null;
        try {
            final Matcher years = YEAR_INTERVAL.matcher(string);
            interval = years.matches()
                    ? new ImportTimeInterval(
                            DateGuesser.guessDate(years.group(1)),
                            DateGuesser.guessDate(years.group(2)))
                    : new ImportTimeInterval(
                            DateGuesser.guessDate(string));
        }
        catch (final IllegalArgumentException unparseable) {
            log.warn("WST: String '" + string +
                    "' could not be interpreted as valid timestamp");
        }
        return interval;
    }

    /**
     * Parses a WST file and stores the result in the {@link ImportWst}
     * held by this parser; a new one named after the file is created
     * if none has been set.
     *
     * <p>The lines of the file are classified as follows:</p>
     * <ul>
     *   <li>the first non-blank line holds the number of columns;</li>
     *   <li>lines starting with '*' followed by the unit separator
     *       character (U+001F) hold the Q values of the Q range
     *       starting there;</li>
     *   <li>lines starting with "*!" hold column meta data (names,
     *       name widths, sources, dates);</li>
     *   <li>remaining lines shorter than 11 characters are skipped;</li>
     *   <li>other lines starting with '*' are comments; the unit is
     *       taken from a comment matching {@link #UNIT_COMMENT};</li>
     *   <li>all other lines are column name headers before the first
     *       Q line and station/value lines after it.</li>
     * </ul>
     *
     * <p>Stations found more than once and lines without a station
     * are skipped with a warning.</p>
     *
     * @param file the WST file to read (decoded as {@link #ENCODING}).
     * @throws IOException if reading the file fails.
     * @throws ParseException if the stations are not consistently
     *         ordered or a column name width is invalid; also
     *         propagated from the line parsing helper.
     */
    public void parse(final File file) throws IOException, ParseException {

        log.info("Parsing WST file '" + file + "'");

        if (this.wst == null) {
            this.wst = new ImportWst(file.getName());
        }
        else {
            this.wst.setDescription(file.getName());
        }

        // try-with-resources closes the stream even if wrapping it in
        // the reader fails and keeps a failing close() from masking an
        // exception thrown while parsing.
        try (FileInputStream fis = new FileInputStream(file);
                LineNumberReader in = new LineNumberReader(
                        new InputStreamReader(fis, ENCODING))) {
            String input;
            boolean first = true;
            int columnCount = 0;

            String [] lsBezeichner   = null;
            String [] langBezeichner = null;
            int    [] colNaWidths    = null;
            String [] quellen        = null;
            String [] daten          = null;

            BigDecimal [] aktAbfluesse   = null;
            BigDecimal [] firstAbfluesse = null;

            BigDecimal minKm   = MAX_RANGE;
            BigDecimal maxKm   = MIN_RANGE;
            BigDecimal kmHist1 = null;
            BigDecimal kmHist2 = null;

            boolean columnHeaderChecked = false;

            /* Default string for altitude reference
             * if none is found in WST-file.
             * Use in case no unit comment is found in file */
            String einheit = "m ü. unbekannte Referenz";
            boolean unitFound = false;

            final HashSet<BigDecimal> kms = new HashSet<>();

            while ((input = in.readLine()) != null) {
                String line = input;
                if (first) { // fetch number of columns
                    if ((line = line.trim()).length() == 0) {
                        continue;
                    }
                    try {
                        columnCount = Integer.parseInt(line);
                        if (columnCount <= 0) {
                            throw new NumberFormatException(
                                    "number of columns <= 0");
                        }
                        log.debug("Number of columns: " + columnCount);
                        this.wst.setNumberColumns(columnCount);
                        lsBezeichner = new String[columnCount];
                    }
                    catch (final NumberFormatException nfe) {
                        // 'first' stays set: the next non-blank line
                        // is tried as number of columns again
                        log.warn("WST: invalid number.", nfe);
                        continue;
                    }
                    first = false;
                    continue;
                }

                // decimal commas -> decimal points
                // NOTE(review): this also rewrites commas inside the
                // quoted texts of '*!'-lines, which are taken from
                // 'line' below -- confirm that this is intended.
                line = line.replace(',', '.');

                // handle Q-lines
                if (line.startsWith("*\u001f")) {
                    final BigDecimal [] data = parseLineAsDouble(
                            line, columnCount, false, true);

                    if (aktAbfluesse != null) {
                        // add Q-ranges obtained from previous lines
                        if (kmHist1 != null && kmHist2 != null
                                && kmHist1.compareTo(kmHist2) < 0) {
                            // stations descending in file
                            final BigDecimal t = minKm; minKm = maxKm; maxKm = t;
                        }
                        addInterval(minKm, maxKm, aktAbfluesse);
                        minKm = MAX_RANGE;
                        maxKm = MIN_RANGE;
                    }

                    // obtain Q-values from current line
                    aktAbfluesse = new BigDecimal[data.length];
                    log.debug("new q range: " + columnCount);
                    for (int i = 0; i < data.length; ++i) {
                        if (data[i] != null) {
                            log.debug("  column: " + data[i]);
                            aktAbfluesse[i] = data[i];
                        }
                    }

                    // remember Q-values from first Q-line
                    // for header generation
                    if (firstAbfluesse == null) {
                        firstAbfluesse = aktAbfluesse.clone();
                    }
                    continue;
                }

                // handle special column identifiers
                if (line.startsWith("*!")) {
                    String spezial = line.substring(2).trim();

                    if (spezial.length() == 0) {
                        continue;
                    }

                    if (spezial.startsWith(COLUMN_BEZ_TEXT)) {
                        spezial = spezial.substring(
                                COLUMN_BEZ_TEXT.length()).trim();
                        if (spezial.length() == 0) {
                            continue;
                        }
                        langBezeichner = StringUtil.splitQuoted(spezial, '"');
                    }
                    else if (spezial.startsWith(COLUMN_BEZ_BREITE)) {
                        spezial = spezial.substring(
                                COLUMN_BEZ_BREITE.length()).trim();

                        if (spezial.length() == 0) {
                            continue;
                        }

                        final String[] split = spezial.split("\\s+");

                        colNaWidths = new int[split.length];
                        for (int i=0; i < split.length; i++) {
                            try {
                                final int width = Integer.parseInt(split[i]);
                                if (width < 0) {
                                    // would move the read position
                                    // backwards in the header line
                                    throw new NumberFormatException(
                                            "column width < 0");
                                }
                                colNaWidths[i] = width;
                            }
                            catch (final NumberFormatException nfe) {
                                // report as declared checked exception
                                // instead of an unchecked one
                                final ParseException pe = new ParseException(
                                        "WST: invalid column width '"
                                                + split[i] + "' in " + file
                                                + " line " + in.getLineNumber()
                                                + ". File rejected.");
                                pe.initCause(nfe);
                                throw pe;
                            }
                        }
                    }
                    else if (spezial.startsWith(COLUMN_QUELLE)) {
                        spezial = spezial.substring(
                                COLUMN_QUELLE.length()).trim();
                        if (spezial.length() == 0) {
                            continue;
                        }
                        quellen = StringUtil.splitQuoted(spezial, '"');
                        log.debug("sources: " + Arrays.toString(quellen));
                    }
                    else if (spezial.startsWith(COLUMN_DATUM)) {
                        spezial = spezial.substring(
                                COLUMN_DATUM.length()).trim();
                        if (spezial.length() == 0) {
                            continue;
                        }
                        daten = StringUtil.splitQuoted(spezial, '"');
                    }
                    continue;
                }

                if (line.length() < 11) {
                    continue;
                }

                // handle comment lines to fetch unit
                if (line.startsWith("*")) {
                    Matcher m = UNIT_COMMENT.matcher(line);
                    if (m.matches()) {
                        log.debug("unit comment found");
                        // XXX: This hack is needed because desktop
                        // FLYS is broken figuring out the unit
                        final String [] units = m.group(1).split("\\s{2,}");
                        m = UNIT.matcher(units[0]);
                        einheit = m.matches() ? m.group(1) : units[0];
                        log.debug("unit: " + einheit);
                        unitFound = true;
                    }

                    continue;
                }

                if (firstAbfluesse != null) {
                    // line follows a Q-line: station with values
                    if (!columnHeaderChecked) {
                        // finalize the column headers once, when the
                        // first such line is encountered
                        int unknownCount = 0;
                        final HashSet<String> uniqueColumnNames =
                                new HashSet<>();
                        if (langBezeichner != null) {
                            // use column name from '*!column-bez-text'-line
                            lsBezeichner = StringUtil.fitArray(
                                    langBezeichner, lsBezeichner);
                        }
                        for (int i = 0; i < lsBezeichner.length; ++i) {
                            if (lsBezeichner[i] == null
                                    || lsBezeichner[i].length() == 0) {
                                // generate alternative column names
                                final double q = firstAbfluesse.length > i ?
                                        firstAbfluesse[i].doubleValue() : 0d;
                                if (q < 0.001) {
                                    lsBezeichner[i] =
                                            "<unbekannt #" + unknownCount + ">";
                                    ++unknownCount;
                                }
                                else {
                                    lsBezeichner[i] = "Q="+format(q);
                                }
                            }
                            // make the column name unique by appending
                            // a running number on collision
                            String candidate = lsBezeichner[i];
                            int collision = 1;
                            while (!uniqueColumnNames.add(candidate)) {
                                candidate = lsBezeichner[i] +
                                        " (" + collision + ")";
                                ++collision;
                            }
                            final ImportWstColumn iwc = this.wst.getColumn(i);
                            iwc.setName(candidate);
                            if (quellen != null && i < quellen.length) {
                                iwc.setSource(quellen[i]);
                            }
                            // without explicit date try to guess one
                            // from the column name
                            final String potentialDate =
                                    daten != null && i < daten.length
                                    ? daten[i]
                                    : candidate;
                            iwc.setTimeInterval(guessDate(potentialDate));
                        }
                        columnHeaderChecked = true;
                    }

                    final BigDecimal [] data = parseLineAsDouble(
                            line, columnCount, true, false);

                    final BigDecimal kaem = data[0];

                    if (kaem == null) {
                        // blank station field: nothing to attach the
                        // values to (and comparing would fail)
                        log.warn(
                                "WST: line " + in.getLineNumber() +
                                " has no station. -> ignored");
                        continue;
                    }

                    if (!kms.add(kaem)) {
                        log.warn(
                                "WST: km " + kaem +
                                " (line " + in.getLineNumber() +
                                ") found more than once. -> ignored");
                        continue;
                    }

                    // check consistency of station ordering in file:
                    // the direction between the last two stations must
                    // equal the direction towards the current one
                    if (kmHist2 != null &&
                            kmHist2.compareTo(kmHist1) != kmHist1.compareTo(kaem)
                            ) {
                        throw new ParseException("WST: Stations in " + file +
                                " near line " + in.getLineNumber() +
                                " not ordered. File rejected.");
                    }

                    // remember stations in two previous lines
                    kmHist2 = kmHist1;
                    kmHist1 = kaem;

                    // iteratively determine actual km-range
                    if (kaem.compareTo(minKm) < 0) {
                        minKm = kaem;
                    }
                    if (kaem.compareTo(maxKm) > 0) {
                        maxKm = kaem;
                    }

                    // extract values
                    for (int i = 0; i < data.length - 1; ++i) {
                        addValue(kaem, data[i+1], i);
                    }

                }
                else { // firstAbfluesse == null
                    if (langBezeichner != null) {
                        // nothing to do
                    }
                    else if (colNaWidths != null) {
                        // cut column names of the given widths out of
                        // the header line; never read beyond the end of
                        // the line or the number of columns
                        for (int j = 0, i = 0, N = input.length();
                                j < colNaWidths.length
                                && j < lsBezeichner.length
                                && i < N;
                                i += colNaWidths[j++]
                                ) {
                            lsBezeichner[j] = input.substring(
                                    i, Math.min(i + colNaWidths[j], N)).trim();
                        }
                    }
                    else { // fetch column names from non-comment header line
                        // (above first Qs)
                        // first column begins at position 8 in line
                        for (int i = 8, col = 0; i < input.length(); i += 9) {
                            // one column header is 9 chars wide
                            // but the last one may be shorter
                            if (col < lsBezeichner.length) {
                                lsBezeichner[col++] =
                                        input.substring(
                                                i,
                                                Math.min(i + 9, input.length())
                                                ).trim();
                            }
                            if (col == lsBezeichner.length) {
                                break;
                            }
                        }
                    }
                }

            } // for all lines in WST file

            if (!unitFound) {
                log.warn("no unit and height reference found. Using default.");
            }
            this.wst.setUnit(new ImportUnit(einheit));

            // add Q-ranges obtained from previous lines
            // in case there was no further Q-line
            // but only if there were values following the last Q-line
            // (identity comparison is intended here: the sentinels are
            // only ever assigned by reference)
            if (minKm != MAX_RANGE && maxKm != MIN_RANGE) {
                if (kmHist1 != null && kmHist2 != null
                        && kmHist1.compareTo(kmHist2) < 0) {
                    // stations descending in file
                    final BigDecimal t = minKm; minKm = maxKm; maxKm = t;
                }
                addInterval(minKm, maxKm, aktAbfluesse);
            }
        }
    }

    /**
     * Stores a single value in a column of the WST.
     *
     * @param km    the station the value belongs to.
     * @param w     the value; nothing is stored if it is null.
     * @param index the index of the column to add the value to.
     */
    protected void addValue(final BigDecimal km, final BigDecimal w, final int index) {
        if (w == null) {
            // empty field in the file: no value for this column
            return;
        }
        this.wst.getColumn(index).addColumnValue(km, w);
    }

    private static final NumberFormat NF = getNumberFormat();

    private static final NumberFormat getNumberFormat() {
        final NumberFormat nf = NumberFormat.getInstance();
        nf.setMinimumFractionDigits(2);
        nf.setMaximumFractionDigits(2);
        return nf;
    }

    protected static String format(final double value) {
        return NF.format(value);
    }

    /**
     * Adds a Q range with the given Q values to the columns of the WST.
     *
     * Nothing is added if no values or bounds are given or if
     * <code>from</code> still equals one of the sentinels
     * {@link #MAX_RANGE} / {@link #MIN_RANGE}, i.e. no station was
     * seen for the range. A range of length zero is expanded by
     * {@link #INTERVAL_GAP}, following the direction of the previously
     * added range. The previously added range is adjusted to end
     * {@link #INTERVAL_GAP} apart from the new one.
     *
     * @param from   the station the range starts at.
     * @param to     the station the range ends at.
     * @param values the Q values; the value at index i is added
     *               to column i.
     */
    protected void addInterval(
            final BigDecimal from,
            BigDecimal to,
            final BigDecimal[] values) {
        log.debug("addInterval: " + from + " " + to);

        if (values == null || from == null || to == null) {
            return;
        }

        // Compare by value, not by identity: a BigDecimal equal to a
        // sentinel is as invalid as the sentinel object itself.
        if (from.compareTo(MAX_RANGE) == 0
                || from.compareTo(MIN_RANGE) == 0) {
            return;
        }

        // expand single-line i.e. 0-length Q-range to minimal length
        // (compareTo also treats e.g. 1.0 and 1.00 as the same station)
        if (from.compareTo(to) == 0) {
            if (this.lastRange != null && this.lastA > this.lastB) {
                // previous range was descending: expand downwards
                to = from.subtract(INTERVAL_GAP);
            }
            else {
                to = from.add(INTERVAL_GAP);
            }
        }

        final ImportRange range = new ImportRange(from, to);

        // little workaround to make the q ranges tightly fit.
        // Leave a very small gap to ensure that the range queries
        // still work.

        if (this.lastRange != null) {
            if (this.lastA < this.lastB) {
                this.lastRange.setB(range.getA().subtract(INTERVAL_GAP));
            }
            else { // lastA >= lastB
                this.lastRange.setA(range.getB().add(INTERVAL_GAP));
            }
        }

        for (int i = 0; i < values.length; ++i) {
            final ImportWstColumn column = this.wst.getColumn(i);
            final ImportWstQRange wstQRange = new ImportWstQRange(range, values[i]);
            column.addColumnQRange(wstQRange);
        }

        // remember this range for the adjustment in the next call
        this.lastA = from.doubleValue();
        this.lastB = to.doubleValue();
        this.lastRange = range;
    }

    /**
     * Splits a line into its fixed-width fields and converts
     * them to numbers.
     *
     * @param line              the line to parse.
     * @param count             the maximum number of value fields
     *                          to extract.
     * @param bStation          whether the line starts with a station
     *                          field, which then becomes element 0.
     * @param bParseEmptyAsZero whether blank fields yield
     *                          {@link #UNDEFINED_ZERO} instead of null.
     * @return one entry per extracted field; null for blank fields
     *         unless bParseEmptyAsZero is set.
     * @throws ParseException if a non-blank field is not a valid number
     *         or the line cannot be split.
     */
    private static final BigDecimal [] parseLineAsDouble(
            final String  line,
            final int     count,
            final boolean bStation,
            final boolean bParseEmptyAsZero
            ) throws ParseException {
        final String [] tokens = parseLine(line, count, bStation);

        final BigDecimal [] doubles = new BigDecimal[tokens.length];

        for (int i = 0; i < doubles.length; ++i) {
            final String token = tokens[i].trim();
            if (token.length() != 0) {
                try {
                    doubles[i] = new BigDecimal(token);
                }
                catch (final NumberFormatException nfe) {
                    // report malformed numbers as the declared checked
                    // exception instead of an unchecked one; the
                    // exception class has no cause constructor
                    final ParseException pe = new ParseException(
                            "WST: '" + token + "' (field " + (i + 1)
                            + ") is not a valid number in line '"
                            + line + "'");
                    pe.initCause(nfe);
                    throw pe;
                }
            }
            else if (bParseEmptyAsZero) {
                doubles[i] = UNDEFINED_ZERO;
            }
        }

        return doubles;
    }

    /**
     * Splits a line into its fixed-width fields.
     *
     * If a station is requested, the first 8 characters form the first
     * field. The value fields are 8 characters wide and start at the
     * positions 9, 18, 27, ...; the last one may be shorter. Splitting
     * stops at the end of the line.
     *
     * @param line          the line to split.
     * @param tokenCount    the maximum number of value fields to extract.
     * @param bParseStation whether to extract the station field, too.
     * @return the untrimmed fields found in the line.
     * @throws ParseException if a station is requested but the line is
     *         shorter than the station field.
     */
    private static String [] parseLine(
            final String  line,
            final int     tokenCount,
            final boolean bParseStation
            ) throws ParseException {
        final ArrayList<String> strings = new ArrayList<>();

        if (bParseStation) {
            if (line.length() < 8) {
                // malformed input is reported with the declared checked
                // exception, like the other rejections of this parser
                throw new ParseException("station too short");
            }
            strings.add(line.substring(0, 8));
        }

        int pos = 0;
        for (int i = 0; i < tokenCount; ++i) {
            pos += 9;
            if (pos >= line.length()) {
                break;
            }
            strings.add(line.substring(pos,
                    Math.min(pos + 8, line.length())));
        }

        return strings.toArray(new String[strings.size()]);
    }
}
// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :

http://dive4elements.wald.intevation.org