view backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 9709:b74f817435fe

comment removed
author dnt_bjoernsen <d.tironi@bjoernsen.de>
date Wed, 27 Jan 2021 11:47:38 +0100
parents a2a42a6bac6b
children
line wrap: on
line source
/* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde
 * Software engineering by
 *  Björnsen Beratende Ingenieure GmbH
 *  Dr. Schumacher Ingenieurbüro für Wasser und Umwelt
 *
 * This file is Free Software under the GNU AGPL (>=v3)
 * and comes with ABSOLUTELY NO WARRANTY! Check out the
 * documentation coming with Dive4Elements River for details.
 */

package org.dive4elements.river.importer.common;

import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.math.BigDecimal;
import java.text.DecimalFormat;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.dive4elements.river.backend.utils.EpsilonComparator;
import org.dive4elements.river.importer.ImportRiver;
import org.dive4elements.river.importer.ImporterSession;
import org.hibernate.Session;

/**
 * Abstract base class for a parser of one FLYS csv data file.<br />
 * The {@link #parse()} method creates a SERIES object for the meta data
 * and a list of KMLINE objects for the km value lines read from the file.<br />
 * The {@link #store()} method gets or creates the corresponding database objects
 * by the hibernate binding classes DB_SERIES and DB_KMTUPLE,
 * and updates or inserts them in the database.
 * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br />
 * <br />
 * The structure of the file is as follows:<br />
 * <ul>
 * <li>one or more comment lines (#) with the meta info of the data series</li>
 * <li>the comment line with the column titles of values table, starting with the km column</li>
 * <li>the rows of the values table, each one on its own line</li>
 * </ul>
 * Empty lines are skipped, and lines starting with ## are ignored while the header is read.
 *
 * @author Matthias Schäfer
 *
 */
public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser {

    /***** FIELDS *****/

    /**
     * Character encoding used to read the import file
     */
    public static final String ENCODING = "ISO-8859-1";

    protected static final Locale DEFAULT_LOCALE = Locale.GERMAN;

    /**
     * Leading character of a meta info (header) line
     */
    public static final String START_META_CHAR = "#";

    /**
     * Separator of the columns of a line
     */
    protected static final String SEPARATOR_CHAR = ";";

    protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE);

    private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE);

    /**
     * Shared number parser (dot as decimal char).<br />
     * Like any {@link NumberFormat} it is not synchronized; no locking is done here.
     */
    private static final NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT);

    /**
     * Shared number parser (dot as decimal char) set up to return {@link BigDecimal} values.<br />
     * Not synchronized either.
     */
    private static final DecimalFormat bigDecimalFormat;

    protected static final String INVALID_VALUE_ERROR_FORMAT = "Invalid or missing %s value";

    static {
        bigDecimalFormat = (DecimalFormat) NumberFormat.getInstance(Locale.ROOT);
        bigDecimalFormat.setParseBigDecimal(true);
    }

    /**
     * How the km column and its content are expected:<br />
     * NONE = the km column is not parsed at all,<br />
     * UNIQUE = the km is parsed, and lines with a km already read are skipped,<br />
     * DUPLICATES = the km is parsed without checking for duplicates
     */
    protected enum KmMode {
        NONE, UNIQUE, DUPLICATES
    }

    /**
     * Path of the file or directory to import from
     */
    protected final File importPath;

    /**
     * Part of {@link #importPath} without the river root dir
     */
    protected final File rootRelativePath;

    /**
     * River for which the import runs
     */
    protected final ImportRiver river;

    /**
     * Reader during parse; null before and after {@link #parse()}
     */
    protected LineNumberReader in;

    /**
     * Last line read from in (trimmed)
     */
    protected String currentLine;

    /**
     * State of the header lines parse loop
     */
    protected ParsingState headerParsingState;

    /**
     * Series header of the stations table, with the imported meta info.
     */
    protected HEADER seriesHeader;

    /**
     * List of meta info Pattern matched during {@link #handleMetaLine()}
     */
    protected final List<Pattern> metaPatternsMatched;

    /**
     * Column titles of the stations table, starting with the km column.
     * All strings have been trimmed.
     */
    protected final List<String> columnTitles;

    /**
     * List of the km value tuples imported, no duplicate km.<br />
     * NOTE(review): this class itself never adds to this list (the values are added to
     * {@link #seriesHeader}); presumably it is meant for subclasses - confirm.
     */
    protected final List<KMLINE> values;

    /**
     * Sorted set (ordered by EpsilonComparator.CMP) with the imported km to check for duplicates.
     */
    protected final TreeSet<Double> kmExists;


    /***** CONSTRUCTORS *****/

    /**
     * Constructs a parser for an import file
     *
     * @param importPath path of the file to parse
     * @param rootRelativePath the import path without the river root dir, used in log messages
     * @param river river for which the import runs
     */
    public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) {
        this.importPath = importPath;
        this.rootRelativePath = rootRelativePath;
        this.river = river;
        this.metaPatternsMatched = new ArrayList<>();
        this.kmExists = new TreeSet<>(EpsilonComparator.CMP);
        this.columnTitles = new ArrayList<>();
        this.values = new ArrayList<>();
    }


    /***** FILE-METHODS *****/

    /**
     * Lists all files from a directory having a type extension (starting with dot).<br />
     * The file names are compared in lower case, so the extension has to be passed in lower case.
     *
     * @return the matching directory entries; an empty list if the directory could not be listed
     */
    protected static List<File> listFiles(final File importDir, final String extension) {
        final File[] files = importDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(final File dir, final String name) {
                // Locale.ROOT: the default-locale lower case may change characters (e.g. Turkish dotless i)
                return name.toLowerCase(Locale.ROOT).endsWith(extension);
            }
        });
        return toFileList(files);
    }

    /**
     * Lists all files from a directory matching a file name pattern (matched against the whole name)
     *
     * @return the matching directory entries; an empty list if the directory could not be listed
     */
    protected static List<File> listFiles(final File importDir, final Pattern filenamePattern) {
        final File[] files = importDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(final File dir, final String name) {
                return filenamePattern.matcher(name).matches();
            }
        });
        return toFileList(files);
    }

    /**
     * Copies a (possibly null) file array as returned by {@link File#listFiles(FilenameFilter)}
     * into a new modifiable list
     */
    private static List<File> toFileList(final File[] files) {
        final List<File> fileList = new ArrayList<>();
        if (files != null)
            for (final File file : files)
                fileList.add(file);
        return fileList;
    }

    /***** PARSE-METHODS *****/

    /**
     * Parses a file and adds series and values to the parser's collection.<br />
     * A file that cannot be opened, or a failed meta data check, is logged as an error
     * and sets the parsing state to STOP (see {@link #store()}).
     *
     * @throws Exception wrapping any exception raised while reading or handling the lines
     */
    @Override
    public void parse() throws Exception {
        logStartInfo();
        // NOTE: removes every occurrence of ".csv" (case-sensitive), not only a trailing extension
        this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", ""));
        this.metaPatternsMatched.clear();
        this.kmExists.clear();
        this.headerParsingState = ParsingState.CONTINUE;
        try {
            try {
                this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING));
            }
            catch (final Exception e) {
                logError("Could not open (%s)", e.getMessage());
                this.headerParsingState = ParsingState.STOP;
            }
            try {
                this.currentLine = null;
                while (this.headerParsingState != ParsingState.STOP) {
                    this.currentLine = this.in.readLine();
                    if (this.currentLine == null)
                        break;
                    this.currentLine = this.currentLine.trim();
                    if (this.currentLine.isEmpty())
                        continue;
                    if (this.headerParsingState == ParsingState.CONTINUE) {
                        // still in the header
                        handleMetaLine();
                        if (this.headerParsingState == ParsingState.DONE)
                            checkMetaData();
                    }
                    else
                        handleDataLine();
                }
                if (this.headerParsingState != ParsingState.STOP)
                    getLog().info(String.format("Number of values found: %d", this.seriesHeader.getValueCount()));
            }
            catch (final Exception e) {
                // getLineNumber() counts the lines read so far, i.e. it already is the 1-based number of the last line read
                final int lastLineRead = (this.in != null) ? this.in.getLineNumber() : 0;
                throw new Exception(String.format("Parsing error (last read line: %d)", lastLineRead), e);
            }
        }
        finally {
            if (this.in != null) {
                this.in.close();
                this.in = null;
            }
        }
        if (this.headerParsingState == ParsingState.STOP)
            logError("Parsing of the file stopped due to a severe error");
    }

    /**
     * Writes the parse start info to the log
     */
    protected void logStartInfo() {
        getLog().info(String.format("Start parsing:;'%s'", this.rootRelativePath));
    }

    /**
     * Strips separator chars from a meta info text, and trims leading and trailing whitespace
     */
    public static String parseMetaInfo(final String text) {
        return text.replace(SEPARATOR_CHAR, "").trim();
    }

    /**
     * Checks whether an array has a non-null element at an index
     */
    private static boolean hasValueAt(final String[] values, final int index) {
        return (values != null) && (index >= 0) && (index < values.length) && (values[index] != null);
    }

    /**
     * Parses a number string with dot or comma as decimal char, and returning null in case of an error
     *
     * @param values the column strings of a line
     * @param index index of the column to parse
     * @return the number, or null if the index is out of range, or the column is missing or not a number
     */
    public static Number parseDoubleCheckNull(final String[] values, final int index) {
        if (!hasValueAt(values, index))
            return null;
        try {
            return parseDouble(values[index]);
        }
        catch (final ParseException e) {
            return null;
        }
    }

    /**
     * Parses a number string with dot or comma as decimal char
     *
     * @throws ParseException if the beginning of the text cannot be parsed as a number
     */
    private static Number parseDouble(final String text) throws ParseException {
        return numberFormat.parse(text.replace(',', '.'));
    }

    /**
     * Parses an integer number string , and returning null in case of an error
     *
     * @param values the column strings of a line
     * @param index index of the column to parse
     * @return the integer, or null if the index is out of range, or the column is missing or not an integer
     */
    public static Integer parseIntegerCheckNull(final String[] values, final int index) {
        if (!hasValueAt(values, index))
            return null;
        try {
            return Integer.valueOf(values[index]);
        }
        catch (final NumberFormatException e) {
            return null;
        }
    }

    /**
     * Parses a number string as a BigDecimal, replacing a comma with a dot first
     *
     * @throws ParseException if the text cannot be parsed, or is a special value (infinity, NaN)
     *             that has no BigDecimal representation
     */
    public static BigDecimal parseDecimal(final String text) throws ParseException {
        final Number parsed = bigDecimalFormat.parse(text.replace(',', '.'));
        // Even with parseBigDecimal=true the special values infinity and NaN are returned as Double
        if (!(parsed instanceof BigDecimal))
            throw new ParseException(String.format("Not a decimal number: \"%s\"", text), 0);
        return (BigDecimal) parsed;
    }

    /**
     * Creates a new series import object
     *
     * @param filename name of the import file with ".csv" removed (see {@link #parse()})
     */
    protected abstract HEADER createSeriesImport(final String filename);


    /***** METAHEADER-PARSE-METHODS *****/

    /**
     * Handles the current line as a header line: tries the known meta info types one after the other,
     * and switches the parsing state to DONE with the column titles line
     * or with the first line that is not a meta line.
     */
    protected void handleMetaLine() {
        // Lines starting with ## are ignored
        if (META_SUBGROUP.matcher(this.currentLine).matches())
            return;
        if (handleMetaRivername() || handleMetaKmrange_info() || handleMetaComment() || handleMetaOther())
            return;
        if (handleMetaColumnTitles()) {
            if (this.headerParsingState != ParsingState.STOP)
                this.headerParsingState = ParsingState.DONE;
            return;
        }
        if (!this.currentLine.startsWith(START_META_CHAR)) {
            // no more meta data expected, if neither meta line nor empty line
            this.headerParsingState = ParsingState.DONE;
            return;
        }
        // Unknown meta line: warn, unless one of the handlers has set the state to IGNORE
        if (this.headerParsingState != ParsingState.IGNORE)
            logLineWarning("Not matching any known meta type");
        else
            this.headerParsingState = ParsingState.CONTINUE;
    }

    /**
     * Checks the current line for the river name meta info (the name itself is not evaluated)
     */
    private boolean handleMetaRivername() {
        if (!META_RIVERNAME.matcher(this.currentLine).matches())
            return false;
        this.metaPatternsMatched.add(META_RIVERNAME);
        return true;
    }

    /**
     * Checks the current line for the km range meta info and sets it in the series header
     */
    private boolean handleMetaKmrange_info() {
        final Matcher m = META_KMRANGE_INFO.matcher(this.currentLine);
        if (!m.matches())
            return false;
        this.metaPatternsMatched.add(META_KMRANGE_INFO);
        this.seriesHeader.setKmrange_info(parseMetaInfo(m.group(1)));
        return true;
    }

    /**
     * Checks the current line for the comments meta info and sets it as notes in the series header
     */
    private boolean handleMetaComment() {
        final Matcher m = META_COMMENTS.matcher(this.currentLine);
        if (!m.matches())
            return false;
        this.metaPatternsMatched.add(META_COMMENTS);
        this.seriesHeader.setNotes(parseMetaInfo(m.group(1)));
        return true;
    }

    /**
     * Parses currentLine for non-default meta info.<br />
     * This default implementation handles nothing.
     *
     * @return Whether the line has been handled
     */
    protected boolean handleMetaOther() {
        return false;
    }

    /**
     * Parses a header line for the km table column header line
     *
     * @return Whether the line has been handled (also in case of State=STOP),<br>
     *         and we are ready for reading the km values lines (or cancel parsing)
     */
    protected boolean handleMetaColumnTitles() {
        if (!META_COLUMNTITLES.matcher(this.currentLine).matches())
            return false;
        this.metaPatternsMatched.add(META_COLUMNTITLES);
        this.columnTitles.clear();
        for (final String title : this.currentLine.split(SEPARATOR_CHAR, 0))
            this.columnTitles.add(title.trim());
        return true;
    }

    /**
     * Check meta data after all meta data lines (#) have been read.<br />
     * Sets the parsing state to STOP if the check fails.
     *
     * @return Whether column titles have been found and the series does not exist in the database yet
     */
    protected boolean checkMetaData() {
        if (this.columnTitles.size() <= 1) {
            logError("No valid header line with column titles found");
            this.headerParsingState = ParsingState.STOP;
            return false;
        }
        if (checkSeriesExistsAlready()) {
            logError("Data series/filename exists already in the database");
            this.headerParsingState = ParsingState.STOP;
            return false;
        }
        return true;
    }

    /**
     * Checks the existence of the active series in the database
     *
     * @return false if the river does not exist in the database, or no series row has been found
     */
    protected boolean checkSeriesExistsAlready() {
        if (!checkRiverExists())
            return false;
        final Session session = ImporterSession.getInstance().getDatabaseSession();
        final List<DB_SERIES> rows = this.seriesHeader.querySeriesItem(session, this.river.getPeer(), true);
        return !rows.isEmpty();
    }

    /**
     * Checks the existence of the active river in the database
     */
    protected boolean checkRiverExists() {
        return (this.river.getPeer(false) != null);
    }


    /***** VALUELINE-PARSE-METHODS *****/

    /**
     * Parses a values line and adds the values record.<br />
     * Lines with less than two columns, with an invalid km, or (in mode UNIQUE) with a km already read
     * are skipped with a warning.
     */
    protected void handleDataLine() {
        final String[] values = this.currentLine.split(SEPARATOR_CHAR, 0);
        // Skip import line without data or only km
        if (values.length < 2) {
            logLineWarning("Too few data");
            return;
        }
        // The km stays NaN if the km column is not to be parsed
        Double km = Double.NaN;
        if (kmMode() != KmMode.NONE) {
            try {
                km = Double.valueOf(parseDouble(values[0]).doubleValue());
                if (kmMode() == KmMode.UNIQUE) {
                    if (this.kmExists.contains(km)) {
                        logLineWarning("Duplicate km '%s'", values[0]);
                        return;
                    }
                    this.kmExists.add(km);
                }
            }
            catch (final Exception e) {
                logLineWarning("Invalid km: %s", e.getMessage());
                return;
            }
        }
        final KMLINE value = createKmLineImport(km, values);
        if (value != null) {
            final boolean added = this.seriesHeader.addValue(value);
            if (!added)
                logLineWarning("Duplicate data line");
        }
    }

    /**
     * How {@link #handleDataLine()} shall handle the km column (if any);
     * UNIQUE by default
     */
    protected KmMode kmMode() {
        return KmMode.UNIQUE;
    }

    /**
     * Creates a value import item with the km and other fields of the current line;
     * the km has been validated
     *
     * @param km the parsed km of the line, or NaN in km mode NONE
     * @param values the column strings of the line (split by the separator char, not trimmed)
     * @return value item, or null if parse error
     */
    protected abstract KMLINE createKmLineImport(final Double km, final String[] values);


    /***** STORE-METHODS *****/

    /**
     * Stores the parsed series and values in the database, and logs the parse/insert/update counts.<br />
     * Nothing is stored if the parsing has been stopped due to an error.
     */
    @Override
    public void store() {
        if (this.headerParsingState != ParsingState.STOP) {
            this.seriesHeader.store(this.river.getPeer());
            final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(),
                    this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE));
            if (this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT))
                logWarning("Number of value inserts less than number parsed: %s", counts);
            else
                getLog().info("Number of values records: " + counts);
        }
        else
            logWarning("Severe parsing errors, not storing series '%s'", this.seriesHeader.getFilename());
    }


    /***** LOG-METHODS *****/

    /**
     * Gets the class's logger
     */
    protected abstract Logger getLog();

    /**
     * Logs an error message, appending the relative file path
     */
    protected void logError(final String message) {
        getLog().error(buildLogMessage(message));
    }

    /**
     * Logs a formatted error message, appending the relative file path
     */
    protected void logError(final String format, final Object... args) {
        getLog().error(buildLogMessage(String.format(format, args)));
    }

    /**
     * Logs an error message with current line number, appending the relative file path
     */
    protected void logLineError(final String message) {
        getLog().error(buildLineLogMessage(message));
    }

    /**
     * Logs a formatted error message with current line number, appending the relative file path
     */
    protected void logLineError(final String format, final Object... args) {
        getLog().error(buildLineLogMessage(String.format(format, args)));
    }

    /**
     * Logs a warning message, appending the relative file path
     */
    protected void logWarning(final String message) {
        getLog().warn(buildLogMessage(message));
    }

    /**
     * Logs a formatted warning message, appending the relative file path
     */
    protected void logWarning(final String format, final Object... args) {
        getLog().warn(buildLogMessage(String.format(format, args)));
    }

    /**
     * Logs a warning message with current line number, appending the relative file path
     */
    protected void logLineWarning(final String message) {
        getLog().warn(buildLineLogMessage(message));
    }

    /**
     * Logs a formatted warning message with current line number, appending the relative file path
     */
    protected void logLineWarning(final String format, final Object... args) {
        getLog().warn(buildLineLogMessage(String.format(format, args)));
    }

    /**
     * Logs an info message, appending the relative file path
     */
    protected void logInfo(final String message) {
        getLog().info(buildLogMessage(message));
    }

    /**
     * Logs a debug message, appending the relative file path
     */
    protected void logDebug(final String message) {
        getLog().debug(buildLogMessage(message));
    }

    /**
     * Logs a trace message, appending the relative file path
     */
    protected void logTrace(final String message) {
        getLog().trace(buildLogMessage(message));
    }

    /**
     * Appends the relative file path to a message, separated by a semicolon
     */
    private String buildLogMessage(final String message) {
        return String.format("%s;%s", message, this.rootRelativePath);
    }

    /**
     * Prepends the number of the last line read, and appends the relative file path to a message.<br />
     * Without an open reader (i.e. outside of {@link #parse()}) no line number is available,
     * and the message is built without it.
     */
    private String buildLineLogMessage(final String message) {
        if (this.in == null)
            return buildLogMessage(message);
        return String.format("Line %d: %s;%s", this.in.getLineNumber(), message, this.rootRelativePath);
    }
}

http://dive4elements.wald.intevation.org