view backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 9650:a2a42a6bac6b

Importer (s/u-info) extensions: outer try/catch for parse and log of line no, catching parsing exception if not enough value fields, parsing error and warning log messages with line number, detecting and rejecting duplicate data series, better differentiation between error and warning log messages
author mschaefer
date Mon, 23 Mar 2020 14:57:03 +0100
parents ddebd4c2fe93
children
line wrap: on
line source
/* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde
 * Software engineering by
 *  Björnsen Beratende Ingenieure GmbH
 *  Dr. Schumacher Ingenieurbüro für Wasser und Umwelt
 *
 * This file is Free Software under the GNU AGPL (>=v3)
 * and comes with ABSOLUTELY NO WARRANTY! Check out the
 * documentation coming with Dive4Elements River for details.
 */

package org.dive4elements.river.importer.common;

import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.math.BigDecimal;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.dive4elements.river.backend.utils.EpsilonComparator;
import org.dive4elements.river.importer.ImportRiver;
import org.dive4elements.river.importer.ImporterSession;
import org.hibernate.Session;

/**
 * Abstract base class for a parser of one FLYS csv data file.<br />
 * The {@link #parse()} method creates a SERIES object for the meta data
 * and a list of KMLINE objects for the km value lines read from the file.<br />
 * The {@link #store()} method gets or creates the corresponding database objects
 * by the hibernate binding classes DB_SERIES and DB_KMTUPLE,
 * and updates or inserts them in the database.
 * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br />
 * <br />
 * The structure of the file is as follows:<br />
 * <ul>
 * <li>one or more comment lines (#) with the meta info of the data series</li>
 * <li>the comment line with the column titles of values table, starting with the km column</li>
 * <li>the rows of the values table, each one on its own line</li>
 * </ul>
 *
 * @author Matthias Schäfer
 *
 */
public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser {

    /***** FIELDS *****/

    /** Character encoding used by {@link #parse()} to read the import file */
    public static final String ENCODING = "ISO-8859-1";

    /** Default locale offered to subclasses; not used by the code of this class itself */
    protected static final Locale DEFAULT_LOCALE = Locale.GERMAN;

    /** Prefix of a meta info (comment) line */
    public static final String START_META_CHAR = "#";

    /** Column separator of the csv lines (title line and value lines) */
    protected static final String SEPARATOR_CHAR = ";";

    /** Meta line with the river name; the '.' in "Gew.sser" accepts any character in place of the umlaut */
    protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    /** Meta line with the km range info; group 1 is the text up to the first separator */
    protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    /** Meta line with additional comments; group 1 is the text up to the first separator */
    protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    /** Column titles line: optional '#' chars, the km column title, and at least one more column */
    private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE);

    /** Line starting with "##"; such lines are skipped by {@link #handleMetaLine()} */
    private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE);

    /** Locale independent number format used by parseDouble (after replacing a decimal comma with a dot) */
    private static NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT);

    /** Locale independent number format configured in the static initializer to parse into BigDecimal */
    private static DecimalFormat bigDecimalFormat;

    /** Message format for an invalid or missing value; %s is a placeholder for the value's name */
    protected static final String INVALID_VALUE_ERROR_FORMAT = "Invalid or missing %s value";

    // Sets up the BigDecimal parsing format used by parseDecimal
    static {
        bigDecimalFormat = (DecimalFormat) NumberFormat.getInstance(Locale.ROOT);
        bigDecimalFormat.setParseBigDecimal(true);
    }

    /**
     * How the km column and its content are expected (evaluated in {@link #handleDataLine()}):
     * <ul>
     * <li>NONE: the first column is not parsed as km, NaN is passed on as km</li>
     * <li>UNIQUE: the km is parsed, and a line with a km already read is skipped with a warning</li>
     * <li>DUPLICATES: the km is parsed, but not checked for duplicates</li>
     * </ul>
     */
    protected enum KmMode {
        NONE, UNIQUE, DUPLICATES
    }

    /**
     * Path of the file or directory to import from
     */
    protected final File importPath;

    /**
     * Part of {@link #importPath} without the river root dir; appended to the log messages
     */
    protected final File rootRelativePath;

    /**
     * River for which the import runs
     */
    protected final ImportRiver river;

    /**
     * Reader during parse; null when no parse is running (it is closed and reset at the end of {@link #parse()})
     */
    protected LineNumberReader in;

    /**
     * Last line read from in (trimmed)
     */
    protected String currentLine;

    /**
     * State of the header lines parse loop
     */
    protected ParsingState headerParsingState;

    /**
     * Series header of the stations table, with the imported meta info.
     */
    protected HEADER seriesHeader;

    /**
     * List of meta info Pattern matched during {@link #handleMetaLine()}
     */
    protected final List<Pattern> metaPatternsMatched;

    /**
     * Column titles of the stations table, starting with the km column.
     * All strings have been trimmed.
     */
    protected final List<String> columnTitles;

    /**
     * List of the km value tuples imported, no duplicate km.<br />
     * NOTE(review): this class only creates the list; presumably it is filled by subclasses - confirm
     */
    protected final List<KMLINE> values;

    /**
     * Sorted set (ordered by EpsilonComparator.CMP) of the km already imported, to check for duplicates.
     */
    protected final TreeSet<Double> kmExists;


    /***** CONSTRUCTORS *****/

    /**
     * Creates a parser for one import file of a river, with empty meta and value collections.
     *
     * @param importPath path of the file to import
     * @param rootRelativePath the import path relative to the river root dir (used in log messages)
     * @param river river for which the import runs
     */
    public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) {
        // collections filled during parse
        this.columnTitles = new ArrayList<>();
        this.values = new ArrayList<>();
        this.metaPatternsMatched = new ArrayList<>();
        this.kmExists = new TreeSet<>(EpsilonComparator.CMP);
        // import context
        this.river = river;
        this.importPath = importPath;
        this.rootRelativePath = rootRelativePath;
    }


    /***** FILE-METHODS *****/

    /**
     * Lists all files from a directory having a type extension (starting with dot).<br />
     * The comparison ignores the case of both the file name and the extension,
     * independent of the default locale of the JVM.
     *
     * @param importDir directory to search (not recursively)
     * @param extension file name ending to look for, e.g. ".csv"
     * @return the matching files; empty if there are none or the directory cannot be listed
     */
    protected static List<File> listFiles(final File importDir, final String extension) {
        // Locale.ROOT avoids locale specific case mappings (e.g. the Turkish dotless i)
        final String lowerExtension = extension.toLowerCase(Locale.ROOT);
        final File[] files = importDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(final File dir, final String name) {
                return name.toLowerCase(Locale.ROOT).endsWith(lowerExtension);
            }
        });
        final List<File> fl = new ArrayList<>();
        // listFiles returns null if importDir is not a directory or an I/O error occurred
        if (files != null)
            for (final File file : files)
                fl.add(file);
        return fl;
    }

    /**
     * Lists all files from a directory whose complete file name matches a pattern.
     *
     * @param importDir directory to search (not recursively)
     * @param filenamePattern pattern the whole file name has to match
     * @return the matching files; empty if there are none or the directory cannot be listed
     */
    protected static List<File> listFiles(final File importDir, final Pattern filenamePattern) {
        final FilenameFilter nameMatcher = new FilenameFilter() {
            @Override
            public boolean accept(final File dir, final String name) {
                final Matcher matcher = filenamePattern.matcher(name);
                return matcher.matches();
            }
        };
        final List<File> result = new ArrayList<>();
        final File[] matches = importDir.listFiles(nameMatcher);
        // null means: not a directory, or an I/O error
        if (matches == null)
            return result;
        for (int i = 0; i < matches.length; i++)
            result.add(matches[i]);
        return result;
    }

    /***** PARSE-METHODS *****/

    /**
     * Parses a file and adds series and values to the parser's collection.<br />
     * The file is read line by line (trimmed, empty lines skipped): as long as the header parsing state
     * is CONTINUE a line is handled as meta line, otherwise as data line.
     * The state STOP (set if the file cannot be opened or the meta data check fails) ends the parsing.<br />
     * The reader is always closed and reset at the end.
     *
     * @throws Exception wrapping any exception raised while reading or handling a line;
     *             the message contains the number of the last line read
     */
    @Override
    public void parse() throws Exception {
        logStartInfo();
        this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", ""));
        this.metaPatternsMatched.clear();
        this.kmExists.clear();
        this.headerParsingState = ParsingState.CONTINUE;
        try {
            try {
                this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING));
            }
            catch (final Exception e) {
                logError("Could not open (%s)", e.getMessage());
                this.headerParsingState = ParsingState.STOP;
            }
            try {
                this.currentLine = null;
                while (this.headerParsingState != ParsingState.STOP) {
                    this.currentLine = this.in.readLine();
                    if (this.currentLine == null)
                        break;
                    this.currentLine = this.currentLine.trim();
                    if (this.currentLine.isEmpty())
                        continue;
                    if (this.headerParsingState == ParsingState.CONTINUE) {
                        handleMetaLine();
                        if (this.headerParsingState == ParsingState.DONE)
                            checkMetaData();
                    }
                    else
                        handleDataLine();
                }
                if (this.headerParsingState != ParsingState.STOP)
                    getLog().info(String.format("Number of values found: %d", this.seriesHeader.getValueCount()));
            }
            catch (final Exception e) {
                // getLineNumber() is incremented with every line returned by readLine(), so it already is
                // the 1-based number of the last line read (same numbering as the line log messages)
                throw new Exception(String.format("Parsing error (last read line: %d)", this.in.getLineNumber()), e);
            }
        }
        finally {
            if (this.in != null) {
                this.in.close();
                this.in = null;
            }
        }
        if (this.headerParsingState == ParsingState.STOP)
            logError("Parsing of the file stopped due to a severe error");
    }

    /**
     * Logs (info level) that the parsing of the file starts, with its relative path
     */
    protected void logStartInfo() {
        final Logger log = getLog();
        log.info(String.format("Start parsing:;'%s'", this.rootRelativePath));
    }

    /**
     * Removes all separator chars from a meta info text, then trims leading and trailing whitespace
     *
     * @param text meta info text as matched from a meta line
     * @return the cleaned text
     */
    public static String parseMetaInfo(final String text) {
        final String withoutSeparators = text.replace(SEPARATOR_CHAR, "");
        return withoutSeparators.trim();
    }

    /**
     * Parses one field of a values line as a number, with dot or comma as decimal char
     *
     * @param values fields of the line
     * @param index index of the field to parse
     * @return the number, or null if the index is beyond the last field or the field cannot be parsed
     */
    public static Number parseDoubleCheckNull(final String[] values, final int index) {
        if (index >= values.length)
            return null;
        Number parsed;
        try {
            parsed = parseDouble(values[index]);
        }
        catch (final Exception e) {
            // any parse problem counts as "no value"
            parsed = null;
        }
        return parsed;
    }

    /**
     * Parses a number string after replacing every comma with a dot
     *
     * @param text number string with dot or comma as decimal char
     * @return the number parsed by the locale independent number format
     * @throws ParseException if the beginning of the text cannot be parsed as a number
     */
    private static Number parseDouble(final String text) throws ParseException {
        final String normalized = text.replace(',', '.');
        return numberFormat.parse(normalized);
    }

    /**
     * Parses one field of a values line as an integer number
     *
     * @param values fields of the line
     * @param index index of the field to parse
     * @return the integer, or null if the index is beyond the last field or the field is not a valid integer
     */
    public static Integer parseIntegerCheckNull(final String[] values, final int index) {
        if (index >= values.length)
            return null;
        Integer parsed;
        try {
            parsed = Integer.valueOf(values[index]);
        }
        catch (final Exception e) {
            // any parse problem counts as "no value"
            parsed = null;
        }
        return parsed;
    }

    /**
     * Parses a number string as a BigDecimal, replacing a comma with a dot first
     *
     * @param text number string with dot or comma as decimal char
     * @return the parsed decimal number
     * @throws ParseException if the text cannot be parsed, or denotes NaN or an infinite value
     */
    public static BigDecimal parseDecimal(final String text) throws ParseException {
        final Number parsed = bigDecimalFormat.parse(text.replace(',', '.'));
        // Even with parseBigDecimal set, DecimalFormat returns Double instances for NaN and infinity;
        // report them as parse error instead of failing with a ClassCastException
        if (!(parsed instanceof BigDecimal))
            throw new ParseException(String.format("Not a finite decimal number: '%s'", text), 0);
        return (BigDecimal) parsed;
    }

    /**
     * Creates a new series import object; called at the start of {@link #parse()}
     *
     * @param filename name of the import file with the ".csv" removed
     * @return the new series header object for the file
     */
    protected abstract HEADER createSeriesImport(final String filename);


    /***** METAHEADER-PARSE-METHODS *****/

    /**
     * Handles the current line as a line of the meta info header.<br />
     * Subgroup lines (##) are skipped; otherwise the line is offered to the meta handlers in a fixed order
     * (river name, km range, comments, other, column titles) until one of them accepts it.
     * The column titles line sets the state to DONE (unless the handler stopped the parsing).<br />
     * A line not accepted by any handler either is an unknown meta line (warning; or reset of the
     * state IGNORE to CONTINUE), or not a meta line at all, which sets the state to DONE.
     */
    protected void handleMetaLine() {
        if (META_SUBGROUP.matcher(this.currentLine).matches())
            return;
        // the handlers are tried in this order, stopping at the first one accepting the line
        if (handleMetaRivername() || handleMetaKmrange_info() || handleMetaComment() || handleMetaOther())
            return;
        if (handleMetaColumnTitles()) {
            if (this.headerParsingState != ParsingState.STOP)
                this.headerParsingState = ParsingState.DONE;
            return;
        }
        if (!this.currentLine.startsWith(START_META_CHAR)) {
            // no more meta data expected, if neither meta line nor empty line
            this.headerParsingState = ParsingState.DONE;
            return;
        }
        if (this.headerParsingState == ParsingState.IGNORE)
            this.headerParsingState = ParsingState.CONTINUE;
        else
            logLineWarning("Not matching any known meta type");
    }

    /**
     * Checks the current line for the river name meta info and registers the pattern if matched;
     * the name itself is not evaluated here
     *
     * @return Whether the line has been handled
     */
    private boolean handleMetaRivername() {
        final boolean matched = META_RIVERNAME.matcher(this.currentLine).matches();
        if (matched)
            this.metaPatternsMatched.add(META_RIVERNAME);
        return matched;
    }

    /**
     * Checks the current line for the km range meta info and, if matched, stores it in the series header
     *
     * @return Whether the line has been handled
     */
    private boolean handleMetaKmrange_info() {
        final Matcher rangeMatcher = META_KMRANGE_INFO.matcher(this.currentLine);
        if (!rangeMatcher.matches())
            return false;
        this.metaPatternsMatched.add(META_KMRANGE_INFO);
        final String rangeInfo = parseMetaInfo(rangeMatcher.group(1));
        this.seriesHeader.setKmrange_info(rangeInfo);
        return true;
    }

    /**
     * Checks the current line for the comments meta info and, if matched, stores it as notes in the series header
     *
     * @return Whether the line has been handled
     */
    private boolean handleMetaComment() {
        final Matcher commentMatcher = META_COMMENTS.matcher(this.currentLine);
        if (!commentMatcher.matches())
            return false;
        this.metaPatternsMatched.add(META_COMMENTS);
        final String notes = parseMetaInfo(commentMatcher.group(1));
        this.seriesHeader.setNotes(notes);
        return true;
    }

    /**
     * Parses currentLine for non-default meta info.<br />
     * Called by {@link #handleMetaLine()} after the default meta handlers (river name, km range, comments)
     * and before the column titles handler. This default implementation handles nothing.
     *
     * @return Whether the line has been handled
     */
    protected boolean handleMetaOther() {
        return false;
    }

    /**
     * Parses a header line for the km table column header line.<br />
     * If the current line matches, the column titles list is replaced by the trimmed titles of the line.
     *
     * @return Whether the line has been handled (also in case of State=STOP),<br>
     *         and we are ready for reading the km values lines (or cancel parsing)
     */
    protected boolean handleMetaColumnTitles() {
        if (!META_COLUMNTITLES.matcher(this.currentLine).matches())
            return false;
        this.metaPatternsMatched.add(META_COLUMNTITLES);
        this.columnTitles.clear();
        for (final String title : this.currentLine.split(SEPARATOR_CHAR, 0))
            this.columnTitles.add(title.trim());
        return true;
    }

    /**
     * Check meta data after all meta data lines (#) have been read.<br />
     * Fails if less than two column titles have been found, or if the series exists already in the database;
     * a failure is logged as error and sets the parsing state to STOP.
     *
     * @return Whether the meta data are valid
     */
    protected boolean checkMetaData() {
        String problem = null;
        if (this.columnTitles.size() <= 1)
            problem = "No valid header line with column titles found";
        else if (checkSeriesExistsAlready())
            problem = "Data series/filename exists already in the database";
        if (problem == null)
            return true;
        logError(problem);
        this.headerParsingState = ParsingState.STOP;
        return false;
    }

    /**
     * Checks the existence of the active series in the database
     *
     * @return true if the river exists and the series query returns at least one row
     */
    protected boolean checkSeriesExistsAlready() {
        if (checkRiverExists()) {
            final Session dbSession = ImporterSession.getInstance().getDatabaseSession();
            final List<DB_SERIES> found = this.seriesHeader.querySeriesItem(dbSession, this.river.getPeer(), true);
            return found.size() > 0;
        }
        // without the river there cannot be any series of it
        return false;
    }

    /**
     * Checks the existence of the active river in the database
     *
     * @return Whether {@code river.getPeer(false)} delivers a river object
     */
    protected boolean checkRiverExists() {
        final boolean exists = this.river.getPeer(false) != null;
        return exists;
    }


    /***** VALUELINE-PARSE-METHODS *****/

    /**
     * Parses a values line and adds the values record.<br />
     * The line is skipped with a warning if it has less than two fields, if the km (first field) is
     * invalid, or - in km mode UNIQUE - if the km has been read already.
     * In km mode NONE the first field is not parsed, and NaN is passed on as km.
     */
    protected void handleDataLine() {
        final String[] fields = this.currentLine.split(SEPARATOR_CHAR, 0);
        // Skip import line without data or only km
        if (fields.length <= 1) {
            logLineWarning("Too few data");
            return;
        }
        Double station = Double.NaN;
        if (kmMode() != KmMode.NONE) {
            try {
                station = Double.valueOf(parseDouble(fields[0]).doubleValue());
                // add() leaves the set unchanged and returns false if the km is in the set already
                if (kmMode() == KmMode.UNIQUE && !this.kmExists.add(station)) {
                    logLineWarning("Duplicate km '%s'", fields[0]);
                    return;
                }
            }
            catch (final Exception e) {
                logLineWarning("Invalid km: %s", e.getMessage());
                return;
            }
        }
        final KMLINE lineImport = createKmLineImport(station, fields);
        if (lineImport == null)
            return;
        if (!this.seriesHeader.addValue(lineImport))
            logLineWarning("Duplicate data line");
    }

    /**
     * How {@link #handleDataLine()} shall handle the km column (if any).<br />
     * This default implementation requires unique km values.
     */
    protected KmMode kmMode() {
        return KmMode.UNIQUE;
    }

    /**
     * Creates a value import item with the km and other fields of the current line;
     * the km has been validated
     *
     * @param km km parsed from the first field, or NaN in km mode NONE
     * @param values all fields of the current line (split at the separator char), including the first one
     * @return value item, or null if parse error
     */
    protected abstract KMLINE createKmLineImport(final Double km, final String[] values);


    /***** STORE-METHODS *****/

    /**
     * Stores the parsed series and values in the database, and logs the numbers of values parsed and stored.<br />
     * Nothing is stored (only a warning is logged) if the parsing has been stopped.
     */
    @Override
    public void store() {
        if (this.headerParsingState == ParsingState.STOP) {
            logWarning("Severe parsing errors, not storing series '%s'", this.seriesHeader.getFilename());
            return;
        }
        this.seriesHeader.store(this.river.getPeer());
        final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(),
                this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE));
        // warn if not every parsed value has been inserted
        final boolean fewerInserts = this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT);
        if (fewerInserts)
            logWarning("Number of value inserts less than number parsed: %s", counts);
        else
            getLog().info("Number of values records: " + counts);
    }


    /***** LOG-METHODS *****/

    /**
     * Gets the class's logger; all log methods of this class write to it
     */
    protected abstract Logger getLog();

    /**
     * Writes an error to the log; the relative file path is appended to the message
     */
    protected void logError(final String message) {
        final Logger log = getLog();
        log.error(buildLogMessage(message));
    }

    /**
     * Writes a formatted error to the log; the relative file path is appended to the message
     */
    protected void logError(final String format, final Object... args) {
        final Logger log = getLog();
        final String message = String.format(format, args);
        log.error(buildLogMessage(message));
    }

    /**
     * Writes an error to the log, with the current line number in front and the relative file path appended
     */
    protected void logLineError(final String message) {
        final Logger log = getLog();
        log.error(buildLineLogMessage(message));
    }

    /**
     * Writes a formatted error to the log, with the current line number in front and the relative file path appended
     */
    protected void logLineError(final String format, final Object... args) {
        final Logger log = getLog();
        final String message = String.format(format, args);
        log.error(buildLineLogMessage(message));
    }

    /**
     * Writes a warning to the log; the relative file path is appended to the message
     */
    protected void logWarning(final String message) {
        final Logger log = getLog();
        log.warn(buildLogMessage(message));
    }

    /**
     * Writes a formatted warning to the log; the relative file path is appended to the message
     */
    protected void logWarning(final String format, final Object... args) {
        final Logger log = getLog();
        final String message = String.format(format, args);
        log.warn(buildLogMessage(message));
    }

    /**
     * Writes a warning to the log, with the current line number in front and the relative file path appended
     */
    protected void logLineWarning(final String message) {
        final Logger log = getLog();
        log.warn(buildLineLogMessage(message));
    }

    /**
     * Writes a formatted warning to the log, with the current line number in front and the relative file path appended
     */
    protected void logLineWarning(final String format, final Object... args) {
        final Logger log = getLog();
        final String message = String.format(format, args);
        log.warn(buildLineLogMessage(message));
    }

    /**
     * Writes an info to the log; the relative file path is appended to the message
     */
    protected void logInfo(final String message) {
        final Logger log = getLog();
        log.info(buildLogMessage(message));
    }

    /**
     * Writes a debug message to the log; the relative file path is appended to the message
     */
    protected void logDebug(final String message) {
        final Logger log = getLog();
        log.debug(buildLogMessage(message));
    }

    /**
     * Writes a trace message to the log; the relative file path is appended to the message
     */
    protected void logTrace(final String message) {
        final Logger log = getLog();
        log.trace(buildLogMessage(message));
    }

    /**
     * Builds a log message of the message text and the relative file path, separated by a semicolon
     */
    private String buildLogMessage(final String message) {
        final StringBuilder text = new StringBuilder();
        text.append(message).append(';').append(this.rootRelativePath);
        return text.toString();
    }

    /**
     * Builds a log message with the number of the last line read in front,
     * followed by the message text and the relative file path.<br />
     * If no reader is open (the reader only exists while a file is being parsed),
     * the message is built without line number instead of failing with a NullPointerException.
     */
    private String buildLineLogMessage(final String message) {
        final LineNumberReader reader = this.in;
        if (reader == null)
            return String.format("%s;%s", message, this.rootRelativePath);
        return String.format("Line %d: %s;%s", reader.getLineNumber(), message, this.rootRelativePath);
    }
}

http://dive4elements.wald.intevation.org