Mercurial > dive4elements > river
diff backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 8971:50416a0df385
Importer for the Schifffahrt (S-INFO) and Oekologie (U-INFO) files
author | mschaefer |
---|---|
date | Tue, 03 Apr 2018 10:18:30 +0200 |
parents | |
children | 2693bfaf503d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java Tue Apr 03 10:18:30 2018 +0200 @@ -0,0 +1,408 @@ +/* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde + * Software engineering by + * Björnsen Beratende Ingenieure GmbH + * Dr. Schumacher Ingenieurbüro für Wasser und Umwelt + * + * This file is Free Software under the GNU AGPL (>=v3) + * and comes with ABSOLUTELY NO WARRANTY! Check out the + * documentation coming with Dive4Elements River for details. + */ + +package org.dive4elements.river.importer.common; + +import java.io.File; +import java.io.FileInputStream; +import java.io.FilenameFilter; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.LineNumberReader; +import java.text.NumberFormat; +import java.text.ParseException; +import java.util.ArrayList; +import java.util.List; +import java.util.Locale; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.log4j.Logger; +import org.dive4elements.river.backend.utils.EpsilonComparator; +import org.dive4elements.river.importer.ImportRiver; + +/** + * Abstract base class for a parser of one FLYS csv data file.<br /> + * The {@link parse} method creates a SERIES object for the meta data + * and a list of KMLINE objects for the km value lines read from the file.<br /> + * The {@link store} method gets or creates the corresponding database objects + * by the hibernate binding classes DB_SERIES and DB_KMTUPLE, + * and updates or inserts them in the database. + * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br /> + * <br /> + * The structure of the file is as follows:<br /> + * <ul> + * <li>one or more comment lines (#) with the meta info of the data series</li> + * <li>the comment line with the column titles of values table, starting with the km column</li> + * <li>the rows of the values table, each one on its own line</li> + * </ul> + * + * @author Matthias Schäfer + * + */ +public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser { + + /***** FIELDS *****/ + + public static final String ENCODING = "ISO-8859-1"; + + protected static final Locale DEFAULT_LOCALE = Locale.GERMAN; + + public static final String START_META_CHAR = "#"; + + protected static final String SEPARATOR_CHAR = ";"; + + protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); + + protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); + + protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); + + private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE); + + private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE); + + private static NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT); + + /** + * Path of the file or directory to import from + */ + protected final File importPath; + + /** + * Part of {@link importPath} without the river root dir + */ + protected final File rootRelativePath; + + /** + * River for which the import runs + */ + protected final ImportRiver river; + + /** + * Reader during parse + */ + protected LineNumberReader in; + + /** + * Last line read from in + */ + protected String currentLine; + + /** + * State of the header lines parse loop + */ + protected ParsingState headerParsingState; + + /** + * Series header of the stations table, with the imported meta info. + */ + protected HEADER seriesHeader; + + /** + * List of meta info Pattern matched during {@link handleMetaLine} + */ + protected final List<Pattern> metaPatternsMatched; + + /** + * Column titles of the stations table, starting with the km column. + * All strings have been trimmed. + */ + protected final List<String> columnTitles; + + /** + * List of the km value tuples imported, no duplicate km + */ + protected final List<KMLINE> values; + + /** + * Ordered list with the imported km to check for duplicates. + */ + protected final TreeSet<Double> kmExists; + + + /***** CONSTRUCTORS *****/ + + /** + * Constructs a parser for an import file + */ + public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) { + this.importPath = importPath; + this.rootRelativePath = rootRelativePath; + this.river = river; + this.metaPatternsMatched = new ArrayList<>(); + this.kmExists = new TreeSet<>(EpsilonComparator.CMP); + this.columnTitles = new ArrayList<>(); + this.values = new ArrayList<>(); + } + + + /***** METHODS *****/ + + /** + * Lists all files from a directory having a type extension (starting with dot) + */ + protected static List<File> listFiles(final File importDir, final String extension) { + final File[] files = importDir.listFiles(new FilenameFilter() { + @Override + public boolean accept(final File dir, final String name) { + return name.toLowerCase().endsWith(extension); + } + }); + final List<File> fl = new ArrayList<>(); + if (files != null) + for (final File file : files) + fl.add(file); + return fl; + } + + /** + * Parses a file and adds series and values to the parser's collection + */ + @Override + public void parse() throws IOException { + logStartInfo(); + this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", "")); + this.metaPatternsMatched.clear(); + this.kmExists.clear(); + this.headerParsingState = ParsingState.CONTINUE; + try { + try { + this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING)); + } + catch (final Exception e) { + logError("Could not open (" + e.getMessage() + ")"); + this.headerParsingState = ParsingState.STOP; + } + this.currentLine = null; + while (this.headerParsingState != ParsingState.STOP) { + this.currentLine = this.in.readLine(); + if (this.currentLine == null) + break; + this.currentLine = this.currentLine.trim(); + if (this.currentLine.isEmpty()) + continue; + if (this.headerParsingState == ParsingState.CONTINUE) + handleMetaLine(); + else + handleDataLine(); + } + if (this.headerParsingState != ParsingState.STOP) + getLog().info("Number of values found: " + this.seriesHeader.getValueCount()); + } + finally { + if (this.in != null) { + this.in.close(); + this.in = null; + } + } + if (this.headerParsingState == ParsingState.STOP) + logError("Parsing of the file stopped due to a severe error"); + } + + /** + * Writes the parse start info to the log + */ + protected void logStartInfo() { + getLog().info("Start parsing:;'" + this.rootRelativePath + "'"); + } + + /** + * Stores the parsed series and values in the database + */ + @Override + public void store() { + if (this.headerParsingState != ParsingState.STOP) { + this.seriesHeader.store(this.river.getPeer()); + final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(), + this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE)); + if (this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT)) + logWarning("Number of value inserts less than number parsed: " + counts); + else + getLog().info("Number of values records: " + counts); + } + else + logWarning("Severe parsing errors, not storing series '" + this.seriesHeader.getFilename() + "'"); + } + + /** + * Strips separator chars from a meta info text, and trims leading and trailing whitespace + */ + public static String parseMetaInfo(final String text) { + return text.replace(SEPARATOR_CHAR, "").trim(); + } + + /** + * Parses a number string with dot or comma as decimal char, and returning null in case of an error + */ + public static Number parseDoubleWithNull(final String text) { + try { + return parseDouble(text); + } + catch (final Exception e) { + return null; + } + } + + /** + * Parses a number string with dot or comma as decimal char + * + * @throws ParseException + */ + public static Number parseDouble(final String text) throws ParseException { + return numberFormat.parse(text.replace(',', '.')); + } + + /** + * Gets the class's logger + */ + protected abstract Logger getLog(); + + /** + * Logs an error message, appending the relative file path + */ + protected void logError(final String message) { + getLog().error(message + ";" + this.rootRelativePath); + } + + /** + * Logs a warning message, appending the relative file path + */ + protected void logWarning(final String message) { + getLog().warn(message + ";" + this.rootRelativePath); + } + + /** + * Creates a new series import object + */ + protected abstract HEADER createSeriesImport(final String filename); + + protected void handleMetaLine() { + if (META_SUBGROUP.matcher(this.currentLine).matches()) + return; + else if (handleMetaRivername()) + return; + else if (handleMetaKmrange_info()) + return; + else if (handleMetaComment()) + return; + else if (handleMetaOther()) + return; + else if (handleMetaColumnTitles()) { + if (this.headerParsingState != ParsingState.STOP) + this.headerParsingState = ParsingState.DONE; + return; + } + else { + if (this.currentLine.startsWith(START_META_CHAR)) { + if (this.headerParsingState != ParsingState.IGNORE) + logWarning("Not matching any known meta type in line " + this.in.getLineNumber() + ", ignored"); + else + this.headerParsingState = ParsingState.CONTINUE; + } + } + } + + private boolean handleMetaRivername() { + if (META_RIVERNAME.matcher(this.currentLine).matches()) { + this.metaPatternsMatched.add(META_RIVERNAME); + return true; + } + else + return false; + } + + private boolean handleMetaKmrange_info() { + final Matcher m = META_KMRANGE_INFO.matcher(this.currentLine); + if (m.matches()) { + this.metaPatternsMatched.add(META_KMRANGE_INFO); + this.seriesHeader.setKmrange_info(parseMetaInfo(m.group(1))); + return true; + } + return false; + } + + private boolean handleMetaComment() { + final Matcher m = META_COMMENTS.matcher(this.currentLine); + if (m.matches()) { + this.metaPatternsMatched.add(META_COMMENTS); + this.seriesHeader.setComment(parseMetaInfo(m.group(1))); + return true; + } + return false; + } + + /** + * Parses currentLine for non-default meta info + * + * @return Whether the line has been handled + */ + protected boolean handleMetaOther() { + return false; + } + + /** + * Parses a header line for the km table column header line + * + * @return Whether the line has been handled and we are ready for reading the km values lines + */ + protected boolean handleMetaColumnTitles() { + if (META_COLUMNTITLES.matcher(this.currentLine).matches()) { + this.metaPatternsMatched.add(META_COLUMNTITLES); + this.columnTitles.clear(); + final String[] titles = this.currentLine.split(SEPARATOR_CHAR, 0); + for (int i = 0; i <= titles.length - 1; i++) + this.columnTitles.add(titles[i].trim()); + return true; + } + return false; + } + + private void handleDataLine() { + final String[] values = this.currentLine.split(SEPARATOR_CHAR, 0); + // Skip import line without data or only km + if (values.length < 2) + return; + Double km; + try { + km = Double.valueOf(parseDouble(values[0]).doubleValue()); + if (kmMustBeUnique()) { + if (this.kmExists.contains(km)) { + logWarning("Ignoring duplicate station '" + values[0] + "' in line " + this.in.getLineNumber()); + return; + } + this.kmExists.add(km); + } + } + catch (final Exception e) { + logError("Not parseable km in line " + this.in.getLineNumber() + ": " + e.getMessage()); + return; + } + final KMLINE value = createKmLineImport(km, values); + if (value != null) + this.seriesHeader.addValue(value); + } + + /** + * Whether {@link handleDataLine} shall check for and reject km duplicates + */ + protected boolean kmMustBeUnique() { + return true; + } + + /** + * Creates a value import item with the km and other fields of the current line; + * the km has been validated + * + * @return value item, or null if parse error + */ + protected abstract KMLINE createKmLineImport(final Double km, final String[] values); +}