diff backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 8971:50416a0df385

Importer for the Schifffahrt (S-INFO) and Oekologie (U-INFO) files
author mschaefer
date Tue, 03 Apr 2018 10:18:30 +0200
parents
children 2693bfaf503d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java	Tue Apr 03 10:18:30 2018 +0200
@@ -0,0 +1,408 @@
+/* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde
+ * Software engineering by
+ *  Björnsen Beratende Ingenieure GmbH
+ *  Dr. Schumacher Ingenieurbüro für Wasser und Umwelt
+ *
+ * This file is Free Software under the GNU AGPL (>=v3)
+ * and comes with ABSOLUTELY NO WARRANTY! Check out the
+ * documentation coming with Dive4Elements River for details.
+ */
+
+package org.dive4elements.river.importer.common;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.text.NumberFormat;
+import java.text.ParseException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.log4j.Logger;
+import org.dive4elements.river.backend.utils.EpsilonComparator;
+import org.dive4elements.river.importer.ImportRiver;
+
+/**
+ * Abstract base class for a parser of one FLYS csv data file.<br />
+ * The {@link #parse()} method creates a HEADER object for the meta data
+ * and a list of KMLINE objects for the km value lines read from the file.<br />
+ * The {@link #store()} method gets or creates the corresponding database objects
+ * via the hibernate binding classes DB_SERIES and DB_KMTUPLE,
+ * and updates or inserts them in the database.
+ * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br />
+ * <br />
+ * The structure of the file is as follows:<br />
+ * <ul>
+ * <li>one or more comment lines (#) with the meta info of the data series</li>
+ * <li>the comment line with the column titles of the values table, starting with the km column</li>
+ * <li>the rows of the values table, each one on its own line</li>
+ * </ul>
+ *
+ * @author Matthias Schäfer
+ *
+ */
+public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser {
+
+    /***** FIELDS *****/
+
+    public static final String ENCODING = "ISO-8859-1"; // charset name used by parse() to read the import file
+
+    protected static final Locale DEFAULT_LOCALE = Locale.GERMAN; // not referenced in this class; visible to subclasses
+
+    public static final String START_META_CHAR = "#"; // prefix of a meta info (comment) line
+
+    protected static final String SEPARATOR_CHAR = ";"; // column separator of the column titles line and the data lines
+
+    protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); // meta line with the river name (group 4)
+
+    protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); // meta line with the km range info (group 1)
+
+    protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); // meta line with further comments (group 1)
+
+    private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE); // column titles line, which ends the header part
+
+    private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE); // lines starting with ## are skipped by handleMetaLine
+
+    private static NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT); // used by parseDouble(); NOTE(review): shared static, never reassigned here (could be final) - confirm single-threaded use
+
+    /**
+     * Path of the file or directory to import from;
+     * {@link #parse()} opens it as a file and derives the series name from its file name
+     */
+    protected final File importPath;
+
+    /**
+     * Part of {@link #importPath} without the river root dir;
+     * appended to the error and warning log messages of this parser
+     */
+    protected final File rootRelativePath;
+
+    /**
+     * River for which the import runs; {@link #store()} stores the series for its peer
+     */
+    protected final ImportRiver river;
+
+    /**
+     * Reader during parse; set by {@link #parse()} when the file is opened,
+     * closed and reset to null when parsing ends
+     */
+    protected LineNumberReader in;
+
+    /**
+     * Last line read from {@link #in}, trimmed; null once the end of the file has been reached
+     */
+    protected String currentLine;
+
+    /**
+     * State of the header lines parse loop.
+     * CONTINUE while the header lines are read, DONE once the column titles line has been handled,
+     * STOP after a severe error (in which case {@link #store()} does not store the series)
+     */
+    protected ParsingState headerParsingState;
+
+    /**
+     * Series header of the stations table, with the imported meta info.
+     * Created anew by each {@link #parse()} run.
+     */
+    protected HEADER seriesHeader;
+
+    /**
+     * List of meta info Pattern matched during {@link #handleMetaLine()};
+     * cleared at the start of each {@link #parse()} run
+     */
+    protected final List<Pattern> metaPatternsMatched;
+
+    /**
+     * Column titles of the stations table, starting with the km column.
+     * All strings have been trimmed.
+     * NOTE(review): the titles line is only split and trimmed, so the first title keeps
+     * any leading '#' characters of that line.
+     */
+    protected final List<String> columnTitles;
+
+    /**
+     * List of the km value tuples imported, no duplicate km.
+     * NOTE(review): neither filled nor read within this class (the parsed lines are added to
+     * {@link #seriesHeader} instead) - verify whether subclasses use it.
+     */
+    protected final List<KMLINE> values;
+
+    /**
+     * Sorted set (ordered by {@link EpsilonComparator#CMP}) of the km imported so far,
+     * used to check for duplicates.
+     */
+    protected final TreeSet<Double> kmExists;
+
+
+    /***** CONSTRUCTORS *****/
+
+    /**
+     * Creates a parser for one import file; all result collections start out empty.
+     *
+     * @param importPath path of the file to parse
+     * @param rootRelativePath importPath without the river root dir, used in log messages
+     * @param river river the import runs for
+     */
+    public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) {
+        // Containers filled while parsing
+        this.columnTitles = new ArrayList<>();
+        this.values = new ArrayList<>();
+        this.metaPatternsMatched = new ArrayList<>();
+        this.kmExists = new TreeSet<>(EpsilonComparator.CMP);
+        // Import context
+        this.river = river;
+        this.rootRelativePath = rootRelativePath;
+        this.importPath = importPath;
+    }
+
+
+    /***** METHODS *****/
+
+    /**
+     * Lists all files from a directory having a type extension (starting with dot)
+     */
+    protected static List<File> listFiles(final File importDir, final String extension) {
+        final File[] files = importDir.listFiles(new FilenameFilter() {
+            @Override
+            public boolean accept(final File dir, final String name) {
+                return name.toLowerCase().endsWith(extension);
+            }
+        });
+        final List<File> fl = new ArrayList<>();
+        if (files != null)
+            for (final File file : files)
+                fl.add(file);
+        return fl;
+    }
+
+    /**
+     * Reads the import file line by line and fills the series of this parser.<br />
+     * The header (meta) lines are handled first; once the column titles line has been found,
+     * all following lines are handled as km value lines. Empty lines are skipped.
+     * If the file cannot be opened, or a line handler sets the STOP state,
+     * parsing ends and an error is logged.
+     *
+     * @throws IOException if reading or closing the file fails
+     */
+    @Override
+    public void parse() throws IOException {
+        logStartInfo();
+        final String seriesName = this.importPath.getName().replaceAll("\\.csv", "");
+        this.seriesHeader = createSeriesImport(seriesName);
+        this.metaPatternsMatched.clear();
+        this.kmExists.clear();
+        this.headerParsingState = ParsingState.CONTINUE;
+        try {
+            try {
+                final FileInputStream stream = new FileInputStream(this.importPath);
+                this.in = new LineNumberReader(new InputStreamReader(stream, ENCODING));
+            }
+            catch (final Exception e) {
+                logError("Could not open (" + e.getMessage() + ")");
+                this.headerParsingState = ParsingState.STOP;
+            }
+            this.currentLine = null;
+            // Nothing is read any more once a handler (or the failed open) has set STOP
+            while ((this.headerParsingState != ParsingState.STOP) && ((this.currentLine = this.in.readLine()) != null)) {
+                this.currentLine = this.currentLine.trim();
+                if (!this.currentLine.isEmpty()) {
+                    if (this.headerParsingState != ParsingState.CONTINUE)
+                        handleDataLine();
+                    else
+                        handleMetaLine();
+                }
+            }
+            if (this.headerParsingState != ParsingState.STOP)
+                getLog().info("Number of values found: " + this.seriesHeader.getValueCount());
+        }
+        finally {
+            if (this.in != null) {
+                this.in.close();
+                this.in = null;
+            }
+        }
+        if (this.headerParsingState == ParsingState.STOP)
+            logError("Parsing of the file stopped due to a severe error");
+    }
+
+    /**
+     * Writes the parse start info to the log
+     */
+    protected void logStartInfo() {
+        getLog().info("Start parsing:;'" + this.rootRelativePath + "'");
+    }
+
+    /**
+     * Stores the parsed series and its values in the database, and logs the parse/insert/update counts.<br />
+     * Nothing is stored (only a warning is logged) if parsing ended in the STOP state.
+     */
+    @Override
+    public void store() {
+        if (this.headerParsingState == ParsingState.STOP) {
+            logWarning("Severe parsing errors, not storing series '" + this.seriesHeader.getFilename() + "'");
+            return;
+        }
+        this.seriesHeader.store(this.river.getPeer());
+        final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(),
+                this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE));
+        // Warn if not every parsed value led to a database insert
+        final boolean fewerInserts = this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT);
+        if (fewerInserts)
+            logWarning("Number of value inserts less than number parsed: " + counts);
+        else
+            getLog().info("Number of values records: " + counts);
+    }
+
+    /**
+     * Removes all separator chars from a meta info text, then trims leading and trailing whitespace
+     *
+     * @param text meta info text as read from the file
+     * @return the cleaned text
+     */
+    public static String parseMetaInfo(final String text) {
+        final String withoutSeparators = text.replace(SEPARATOR_CHAR, "");
+        return withoutSeparators.trim();
+    }
+
+    /**
+     * Parses a number string with dot or comma as decimal char like {@link #parseDouble(String)},
+     * but returns null instead of throwing if the text cannot be parsed (or is null)
+     *
+     * @param text number string
+     * @return the parsed number, or null in case of any error
+     */
+    public static Number parseDoubleWithNull(final String text) {
+        Number parsed = null;
+        try {
+            parsed = parseDouble(text);
+        }
+        catch (final Exception ignored) {
+            // Any failure is reported to the caller as null
+        }
+        return parsed;
+    }
+
+    /**
+     * Parses a number string with dot or comma as decimal char; every comma is
+     * replaced by a dot before the text is parsed with the shared number format.<br />
+     * NOTE(review): NumberFormat.parse(String) is documented to possibly stop before the end
+     * of the text, so trailing garbage or a second decimal char may be accepted silently - confirm
+     * whether callers rely on that.
+     *
+     * @param text number string
+     * @return the parsed number
+     * @throws ParseException if the beginning of the text cannot be parsed
+     */
+    public static Number parseDouble(final String text) throws ParseException {
+        final String dotted = text.replace(',', '.');
+        return numberFormat.parse(dotted);
+    }
+
+    /**
+     * Gets the class's logger; the info, warning and error messages
+     * of this base class are written to it
+     *
+     * @return the logger to use
+     */
+    protected abstract Logger getLog();
+
+    /**
+     * Logs an error message, appending the relative file path
+     */
+    protected void logError(final String message) {
+        getLog().error(message + ";" + this.rootRelativePath);
+    }
+
+    /**
+     * Logs a warning message, appending the relative file path
+     */
+    protected void logWarning(final String message) {
+        getLog().warn(message + ";" + this.rootRelativePath);
+    }
+
+    /**
+     * Creates a new series import object; called by {@link #parse()} at the start of each run
+     *
+     * @param filename name of the import file with every ".csv" occurrence removed
+     * @return the new series import object, kept in {@link #seriesHeader}
+     */
+    protected abstract HEADER createSeriesImport(final String filename);
+
+    /**
+     * Handles currentLine as a header line: the first handler that recognizes the line wins
+     * (sub group lines are skipped, then river name, km range, comments, other meta info, column titles).<br />
+     * The column titles line switches the state to DONE, unless a handler has set STOP.
+     * An unrecognized meta line is logged as a warning, or, if the state is IGNORE,
+     * only resets the state to CONTINUE. Unrecognized lines without the meta char are skipped silently.
+     */
+    protected void handleMetaLine() {
+        if (META_SUBGROUP.matcher(this.currentLine).matches())
+            return;
+        if (handleMetaRivername() || handleMetaKmrange_info() || handleMetaComment() || handleMetaOther())
+            return;
+        if (handleMetaColumnTitles()) {
+            if (this.headerParsingState != ParsingState.STOP)
+                this.headerParsingState = ParsingState.DONE;
+            return;
+        }
+        if (!this.currentLine.startsWith(START_META_CHAR))
+            return;
+        if (this.headerParsingState == ParsingState.IGNORE)
+            this.headerParsingState = ParsingState.CONTINUE;
+        else
+            logWarning("Not matching any known meta type in line " + this.in.getLineNumber() + ", ignored");
+    }
+
+    private boolean handleMetaRivername() {
+        if (META_RIVERNAME.matcher(this.currentLine).matches()) {
+            this.metaPatternsMatched.add(META_RIVERNAME);
+            return true;
+        }
+        else
+            return false;
+    }
+
+    /**
+     * Checks currentLine for the km range meta info and, if found, sets it in the series header
+     *
+     * @return Whether the line has been handled
+     */
+    private boolean handleMetaKmrange_info() {
+        final Matcher kmrange = META_KMRANGE_INFO.matcher(this.currentLine);
+        if (!kmrange.matches())
+            return false;
+        this.metaPatternsMatched.add(META_KMRANGE_INFO);
+        final String info = parseMetaInfo(kmrange.group(1));
+        this.seriesHeader.setKmrange_info(info);
+        return true;
+    }
+
+    /**
+     * Checks currentLine for the comments meta info and, if found, sets it in the series header
+     *
+     * @return Whether the line has been handled
+     */
+    private boolean handleMetaComment() {
+        final Matcher comments = META_COMMENTS.matcher(this.currentLine);
+        if (!comments.matches())
+            return false;
+        this.metaPatternsMatched.add(META_COMMENTS);
+        final String comment = parseMetaInfo(comments.group(1));
+        this.seriesHeader.setComment(comment);
+        return true;
+    }
+
+    /**
+     * Parses currentLine for non-default meta info.<br />
+     * This default implementation handles nothing. It is called by handleMetaLine after the
+     * river name, km range and comments checks, and before the column titles check.
+     *
+     * @return Whether the line has been handled
+     */
+    protected boolean handleMetaOther() {
+        return false;
+    }
+
+    /**
+     * Checks currentLine for the column titles line of the km table; if it matches,
+     * the column titles list is replaced by the trimmed titles of this line
+     *
+     * @return Whether the line has been handled and we are ready for reading the km values lines
+     */
+    protected boolean handleMetaColumnTitles() {
+        if (!META_COLUMNTITLES.matcher(this.currentLine).matches())
+            return false;
+        this.metaPatternsMatched.add(META_COLUMNTITLES);
+        this.columnTitles.clear();
+        for (final String title : this.currentLine.split(SEPARATOR_CHAR, 0))
+            this.columnTitles.add(title.trim());
+        return true;
+    }
+
+    /**
+     * Handles currentLine as a km value line: parses the km from the first field, rejects
+     * duplicate km (if required by {@link #kmMustBeUnique()}) with a warning and unparseable km
+     * with an error, and adds the value item created from the line to the series header
+     */
+    private void handleDataLine() {
+        final String[] cells = this.currentLine.split(SEPARATOR_CHAR, 0);
+        // Nothing to import from a line without data or with only the km
+        if (cells.length < 2)
+            return;
+        Double km;
+        try {
+            km = Double.valueOf(parseDouble(cells[0]).doubleValue());
+            // add returns false, leaving the set unchanged, if the km is already present
+            if (kmMustBeUnique() && !this.kmExists.add(km)) {
+                logWarning("Ignoring duplicate station '" + cells[0] + "' in line " + this.in.getLineNumber());
+                return;
+            }
+        }
+        catch (final Exception e) {
+            logError("Not parseable km in line " + this.in.getLineNumber() + ": " + e.getMessage());
+            return;
+        }
+        final KMLINE kmLine = createKmLineImport(km, cells);
+        if (kmLine != null)
+            this.seriesHeader.addValue(kmLine);
+    }
+
+    /**
+     * Whether handleDataLine shall check for and reject km duplicates.<br />
+     * This default implementation always requires unique km.
+     *
+     * @return true if a line with an already imported km is to be ignored (with a warning)
+     */
+    protected boolean kmMustBeUnique() {
+        return true;
+    }
+
+    /**
+     * Creates a value import item with the km and other fields of the current line;
+     * the km has been validated
+     *
+     * @param km station parsed from the first field of the line
+     * @param values all fields of the line as split at the separator char, the km field at index 0
+     *            included; NOTE(review): the split presumably drops trailing empty fields, so the
+     *            array can be shorter than the column titles list
+     * @return value item, or null if parse error (the line is then not added to the series)
+     */
+    protected abstract KMLINE createKmLineImport(final Double km, final String[] values);
+}

http://dive4elements.wald.intevation.org