/* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde
 * Software engineering by
 *  Björnsen Beratende Ingenieure GmbH
 *  Dr. Schumacher Ingenieurbüro für Wasser und Umwelt
 *
 * This file is Free Software under the GNU AGPL (>=v3)
 * and comes with ABSOLUTELY NO WARRANTY! Check out the
 * documentation coming with Dive4Elements River for details.
 */

package org.dive4elements.river.importer.common;

import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
import org.dive4elements.river.backend.utils.EpsilonComparator;
import org.dive4elements.river.importer.ImportRiver;

/**
 * Abstract base class for a parser of one FLYS csv data file.
 * The {@link #parse()} method creates a SERIES object for the meta data
 * and a list of KMLINE objects for the km value lines read from the file.
 * The {@link #store()} method gets or creates the corresponding database objects
 * by the hibernate binding classes DB_SERIES and DB_KMTUPLE,
 * and updates or inserts them in the database.
 * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.
 *
 * The structure of the file is as follows (encoding {@link #ENCODING}, columns separated by ';'):
 * <pre>
 * # Gewässer: river name                  (recognized, value not stored by this class)
 * # Strecke: km range info                (stored in the series header)
 * # weitere Bemerkungen: comment          (stored in the series header)
 * ## ...                                  (sub group line, skipped)
 * # ...                                   (other meta lines, see handleMetaOther)
 * Fluss-km; column title; ...             (column titles, optionally preceded by '#'; ends the header)
 * km; value; ...                          (data lines; decimal comma or dot)
 * </pre>
 * Empty lines are skipped everywhere.
 *
 * NOTE(review): the type parameter list below was garbled in the reviewed text and has been
 * reconstructed from the type names used in this class; confirm the bounds against
 * AbstractKmLineImport and AbstractSeriesImport.
 *
 * @param <DB_SERIES> hibernate binding class of the series (see above)
 * @param <DB_KMTUPLE> hibernate binding class of a km value tuple (see above)
 * @param <KMLINE> import class of one km value line
 * @param <HEADER> import class of the series (meta data plus its km value lines)
 *
 * @author Matthias Schäfer
 *
 */
public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser {

    /***** FIELDS *****/

    public static final String ENCODING = "ISO-8859-1";

    protected static final Locale DEFAULT_LOCALE = Locale.GERMAN;

    public static final String START_META_CHAR = "#";

    protected static final String SEPARATOR_CHAR = ";";

    protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

    private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE);

    private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE);

    /**
     * Shared number format used by {@link #parseDouble(String)}.
     * {@link NumberFormat} instances are not synchronized, so the static parse methods
     * must not be called concurrently from several threads.
     */
    private static final NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT);

    /**
     * Path of the file or directory to import from
     */
    protected final File importPath;

    /**
     * Part of {@link #importPath} without the river root dir
     */
    protected final File rootRelativePath;

    /**
     * River for which the import runs
     */
    protected final ImportRiver river;

    /**
     * Reader during parse; null outside of {@link #parse()}
     */
    protected LineNumberReader in;

    /**
     * Last line read from in
     */
    protected String currentLine;

    /**
     * State of the header lines parse loop
     */
    protected ParsingState headerParsingState;

    /**
     * Series header of the stations table, with the imported meta info.
     */
    protected HEADER seriesHeader;

    /**
     * List of meta info Pattern matched during {@link #handleMetaLine()}
     */
    protected final List<Pattern> metaPatternsMatched;

    /**
     * Column titles of the stations table, starting with the km column.
     * All strings have been trimmed.
     */
    protected final List<String> columnTitles;

    /**
     * List of the km value tuples imported, no duplicate km
     * (NOTE(review): element type assumed to be KMLINE; this class never adds to the list itself)
     */
    protected final List<KMLINE> values;

    /**
     * Ordered list with the imported km to check for duplicates.
     */
    protected final TreeSet<Double> kmExists;


    /***** CONSTRUCTORS *****/

    /**
     * Constructs a parser for an import file
     *
     * @param importPath path of the file to import from
     * @param rootRelativePath the import path without the river root dir, used in log messages
     * @param river river for which the import runs
     */
    public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) {
        this.importPath = importPath;
        this.rootRelativePath = rootRelativePath;
        this.river = river;
        this.metaPatternsMatched = new ArrayList<>();
        this.kmExists = new TreeSet<>(EpsilonComparator.CMP);
        this.columnTitles = new ArrayList<>();
        this.values = new ArrayList<>();
    }


    /***** METHODS *****/

    /**
     * Lists all files from a directory having a type extension (starting with dot).
     * The file name is lower-cased (locale-independently) before the comparison,
     * so the extension must be passed in lower case.
     *
     * @return the matching files; an empty list if the directory cannot be listed
     */
    protected static List<File> listFiles(final File importDir, final String extension) {
        final File[] files = importDir.listFiles(new FilenameFilter() {
            @Override
            public boolean accept(final File dir, final String name) {
                // Locale.ROOT keeps the match independent of the JVM default locale
                return name.toLowerCase(Locale.ROOT).endsWith(extension);
            }
        });
        final List<File> fl = new ArrayList<>();
        // File.listFiles returns null if importDir is not a directory or an I/O error occurs
        if (files != null) {
            for (final File file : files) {
                fl.add(file);
            }
        }
        return fl;
    }

    /**
     * Parses a file and adds series and values to the parser's collection.
     * A failure to open the file is logged and sets the parsing state to STOP instead of throwing.
     *
     * @throws IOException if reading from or closing the file fails
     */
    @Override
    public void parse() throws IOException {
        logStartInfo();
        this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", ""));
        this.metaPatternsMatched.clear();
        this.kmExists.clear();
        this.headerParsingState = ParsingState.CONTINUE;
        try {
            try {
                this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING));
            }
            catch (final Exception e) {
                logError("Could not open (" + e.getMessage() + ")");
                this.headerParsingState = ParsingState.STOP;
            }
            this.currentLine = null;
            while (this.headerParsingState != ParsingState.STOP) {
                this.currentLine = this.in.readLine();
                if (this.currentLine == null) {
                    break;
                }
                this.currentLine = this.currentLine.trim();
                if (this.currentLine.isEmpty()) {
                    continue;
                }
                // Header lines are handled as long as the state is CONTINUE, any other state switches to data lines
                if (this.headerParsingState == ParsingState.CONTINUE) {
                    handleMetaLine();
                }
                else {
                    handleDataLine();
                }
            }
            if (this.headerParsingState != ParsingState.STOP) {
                getLog().info("Number of values found: " + this.seriesHeader.getValueCount());
            }
        }
        finally {
            if (this.in != null) {
                this.in.close();
                this.in = null;
            }
        }
        if (this.headerParsingState == ParsingState.STOP) {
            logError("Parsing of the file stopped due to a severe error");
        }
    }

    /**
     * Writes the parse start info to the log
     */
    protected void logStartInfo() {
        getLog().info("Start parsing:;'" + this.rootRelativePath + "'");
    }

    /**
     * Stores the parsed series and values in the database, unless parsing has been stopped,
     * and logs the parse/insert/update counts.
     */
    @Override
    public void store() {
        if (this.headerParsingState != ParsingState.STOP) {
            this.seriesHeader.store(this.river.getPeer());
            final int parsedCount = this.seriesHeader.getValueCount();
            final int insertedCount = this.seriesHeader.getValueStoreCount(StoreMode.INSERT);
            final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", parsedCount,
                    insertedCount, this.seriesHeader.getValueStoreCount(StoreMode.UPDATE));
            if (parsedCount > insertedCount) {
                logWarning("Number of value inserts less than number parsed: " + counts);
            }
            else {
                getLog().info("Number of values records: " + counts);
            }
        }
        else {
            logWarning("Severe parsing errors, not storing series '" + this.seriesHeader.getFilename() + "'");
        }
    }

    /**
     * Strips separator chars from a meta info text, and trims leading and trailing whitespace
     */
    public static String parseMetaInfo(final String text) {
        return text.replace(SEPARATOR_CHAR, "").trim();
    }

    /**
     * Parses a number string with dot or comma as decimal char, and returning null in case of an error
     * (including a null text)
     */
    public static Number parseDoubleWithNull(final String text) {
        try {
            return parseDouble(text);
        }
        catch (final Exception e) {
            return null;
        }
    }

    /**
     * Parses a number string with dot or comma as decimal char.
     * Parsing is lenient at the end of the text: {@link NumberFormat#parse(String)} reads from the
     * beginning of the string and does not necessarily use all of it, so trailing characters
     * after a valid number are ignored.
     *
     * @throws ParseException if the beginning of the text cannot be parsed as a number
     */
    public static Number parseDouble(final String text) throws ParseException {
        return numberFormat.parse(text.replace(',', '.'));
    }

    /**
     * Gets the class's logger
     */
    protected abstract Logger getLog();

    /**
     * Logs an error message, appending the relative file path
     */
    protected void logError(final String message) {
        getLog().error(message + ";" + this.rootRelativePath);
    }

    /**
     * Logs a warning message, appending the relative file path
     */
    protected void logWarning(final String message) {
        getLog().warn(message + ";" + this.rootRelativePath);
    }

    /**
     * Creates a new series import object
     */
    protected abstract HEADER createSeriesImport(final String filename);

    /**
     * Handles the current line as a header line: tries the known meta types in a fixed order
     * and switches the parsing state to DONE once the column titles line has been read.
     */
    protected void handleMetaLine() {
        if (META_SUBGROUP.matcher(this.currentLine).matches()) {
            return;
        }
        if (handleMetaRivername() || handleMetaKmrange_info() || handleMetaComment() || handleMetaOther()) {
            return;
        }
        if (handleMetaColumnTitles()) {
            // A handler may have requested STOP; do not overwrite that
            if (this.headerParsingState != ParsingState.STOP) {
                this.headerParsingState = ParsingState.DONE;
            }
            return;
        }
        if (this.currentLine.startsWith(START_META_CHAR)) {
            if (this.headerParsingState != ParsingState.IGNORE) {
                logWarning("Not matching any known meta type in line " + this.in.getLineNumber() + ", ignored");
            }
            else {
                // IGNORE only suppresses the warning for this line
                this.headerParsingState = ParsingState.CONTINUE;
            }
        }
    }

    /**
     * Checks currentLine for the river name meta info; the match is only recorded, the name is not stored
     */
    private boolean handleMetaRivername() {
        if (META_RIVERNAME.matcher(this.currentLine).matches()) {
            this.metaPatternsMatched.add(META_RIVERNAME);
            return true;
        }
        return false;
    }

    /**
     * Parses currentLine for the km range meta info and stores it in the series header
     */
    private boolean handleMetaKmrange_info() {
        final Matcher m = META_KMRANGE_INFO.matcher(this.currentLine);
        if (m.matches()) {
            this.metaPatternsMatched.add(META_KMRANGE_INFO);
            this.seriesHeader.setKmrange_info(parseMetaInfo(m.group(1)));
            return true;
        }
        return false;
    }

    /**
     * Parses currentLine for the comment meta info and stores it in the series header
     */
    private boolean handleMetaComment() {
        final Matcher m = META_COMMENTS.matcher(this.currentLine);
        if (m.matches()) {
            this.metaPatternsMatched.add(META_COMMENTS);
            this.seriesHeader.setComment(parseMetaInfo(m.group(1)));
            return true;
        }
        return false;
    }

    /**
     * Parses currentLine for non-default meta info
     *
     * @return Whether the line has been handled
     */
    protected boolean handleMetaOther() {
        return false;
    }

    /**
     * Parses a header line for the km table column header line
     *
     * @return Whether the line has been handled and we are ready for reading the km values lines
     */
    protected boolean handleMetaColumnTitles() {
        if (META_COLUMNTITLES.matcher(this.currentLine).matches()) {
            this.metaPatternsMatched.add(META_COLUMNTITLES);
            this.columnTitles.clear();
            // Limit 0: trailing empty columns are dropped
            for (final String title : this.currentLine.split(SEPARATOR_CHAR, 0)) {
                this.columnTitles.add(title.trim());
            }
            return true;
        }
        return false;
    }

    /**
     * Handles the current line as a km values line: validates the km in the first column,
     * rejects duplicate km if required, and adds the created value item to the series header.
     */
    private void handleDataLine() {
        final String[] values = this.currentLine.split(SEPARATOR_CHAR, 0);
        // Skip import line without data or only km
        if (values.length < 2) {
            return;
        }
        Double km;
        try {
            km = Double.valueOf(parseDouble(values[0]).doubleValue());
            if (kmMustBeUnique()) {
                if (this.kmExists.contains(km)) {
                    logWarning("Ignoring duplicate station '" + values[0] + "' in line " + this.in.getLineNumber());
                    return;
                }
                this.kmExists.add(km);
            }
        }
        catch (final Exception e) {
            logError("Not parseable km in line " + this.in.getLineNumber() + ": " + e.getMessage());
            return;
        }
        final KMLINE value = createKmLineImport(km, values);
        if (value != null) {
            this.seriesHeader.addValue(value);
        }
    }

    /**
     * Whether {@code handleDataLine} shall check for and reject km duplicates
     */
    protected boolean kmMustBeUnique() {
        return true;
    }

    /**
     * Creates a value import item with the km and other fields of the current line;
     * the km has been validated
     *
     * @return value item, or null if parse error
     */
    protected abstract KMLINE createKmLineImport(final Double km, final String[] values);
}