Mercurial > dive4elements > river
view backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 8989:2693bfaf503d
Fixed several BigDecimal(double) creations by BigDecimal(String) parsing to avoid unnecessary decimal digits
author | mschaefer |
---|---|
date | Mon, 09 Apr 2018 09:07:00 +0200 |
parents | 50416a0df385 |
children | c43d8c1a4455 |
line wrap: on
line source
/* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde * Software engineering by * Björnsen Beratende Ingenieure GmbH * Dr. Schumacher Ingenieurbüro für Wasser und Umwelt * * This file is Free Software under the GNU AGPL (>=v3) * and comes with ABSOLUTELY NO WARRANTY! Check out the * documentation coming with Dive4Elements River for details. */ package org.dive4elements.river.importer.common; import java.io.File; import java.io.FileInputStream; import java.io.FilenameFilter; import java.io.IOException; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.math.BigDecimal; import java.text.NumberFormat; import java.text.ParseException; import java.util.ArrayList; import java.util.List; import java.util.Locale; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; import org.dive4elements.river.backend.utils.EpsilonComparator; import org.dive4elements.river.importer.ImportRiver; /** * Abstract base class for a parser of one FLYS csv data file.<br /> * The {@link parse} method creates a SERIES object for the meta data * and a list of KMLINE objects for the km value lines read from the file.<br /> * The {@link store} method gets or creates the corresponding database objects * by the hibernate binding classes DB_SERIES and DB_KMTUPLE, * and updates or inserts them in the database. * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br /> * <br /> * The structure of the file is as follows:<br /> * <ul> * <li>one or more comment lines (#) with the meta info of the data series</li> * <li>the comment line with the column titles of values table, starting with the km column</li> * <li>the rows of the values table, each one on its own line</li> * </ul> * * @author Matthias Schäfer * */ public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser { /***** FIELDS *****/ public static final String ENCODING = "ISO-8859-1"; protected static final Locale DEFAULT_LOCALE = Locale.GERMAN; public static final String START_META_CHAR = "#"; protected static final String SEPARATOR_CHAR = ";"; protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE); private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE); private static NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT); /** * Path of the file or directory to import from */ protected final File importPath; /** * Part of {@link importPath} without the river root dir */ protected final File rootRelativePath; /** * River for which the import runs */ protected final ImportRiver river; /** * Reader during parse */ protected LineNumberReader in; /** * Last line read from in */ protected String currentLine; /** * State of the header lines parse loop */ protected ParsingState headerParsingState; /** * Series header of the stations table, with the imported meta info. */ protected HEADER seriesHeader; /** * List of meta info Pattern matched during {@link handleMetaLine} */ protected final List<Pattern> metaPatternsMatched; /** * Column titles of the stations table, starting with the km column. * All strings have been trimmed. */ protected final List<String> columnTitles; /** * List of the km value tuples imported, no duplicate km */ protected final List<KMLINE> values; /** * Ordered list with the imported km to check for duplicates. */ protected final TreeSet<Double> kmExists; /***** CONSTRUCTORS *****/ /** * Constructs a parser for an import file */ public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) { this.importPath = importPath; this.rootRelativePath = rootRelativePath; this.river = river; this.metaPatternsMatched = new ArrayList<>(); this.kmExists = new TreeSet<>(EpsilonComparator.CMP); this.columnTitles = new ArrayList<>(); this.values = new ArrayList<>(); } /***** METHODS *****/ /** * Lists all files from a directory having a type extension (starting with dot) */ protected static List<File> listFiles(final File importDir, final String extension) { final File[] files = importDir.listFiles(new FilenameFilter() { @Override public boolean accept(final File dir, final String name) { return name.toLowerCase().endsWith(extension); } }); final List<File> fl = new ArrayList<>(); if (files != null) for (final File file : files) fl.add(file); return fl; } /** * Parses a file and adds series and values to the parser's collection */ @Override public void parse() throws IOException { logStartInfo(); this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", "")); this.metaPatternsMatched.clear(); this.kmExists.clear(); this.headerParsingState = ParsingState.CONTINUE; try { try { this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING)); } catch (final Exception e) { logError("Could not open (" + e.getMessage() + ")"); this.headerParsingState = ParsingState.STOP; } this.currentLine = null; while (this.headerParsingState != ParsingState.STOP) { this.currentLine = this.in.readLine(); if (this.currentLine == null) break; this.currentLine = this.currentLine.trim(); if (this.currentLine.isEmpty()) continue; if (this.headerParsingState == ParsingState.CONTINUE) handleMetaLine(); else handleDataLine(); } if (this.headerParsingState != ParsingState.STOP) getLog().info("Number of values found: " + this.seriesHeader.getValueCount()); } finally { if (this.in != null) { this.in.close(); this.in = null; } } if (this.headerParsingState == ParsingState.STOP) logError("Parsing of the file stopped due to a severe error"); } /** * Writes the parse start info to the log */ protected void logStartInfo() { getLog().info("Start parsing:;'" + this.rootRelativePath + "'"); } /** * Stores the parsed series and values in the database */ @Override public void store() { if (this.headerParsingState != ParsingState.STOP) { this.seriesHeader.store(this.river.getPeer()); final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(), this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE)); if (this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT)) logWarning("Number of value inserts less than number parsed: " + counts); else getLog().info("Number of values records: " + counts); } else logWarning("Severe parsing errors, not storing series '" + this.seriesHeader.getFilename() + "'"); } /** * Strips separator chars from a meta info text, and trims leading and trailing whitespace */ public static String parseMetaInfo(final String text) { return text.replace(SEPARATOR_CHAR, "").trim(); } /** * Parses a number string with dot or comma as decimal char, and returning null in case of an error */ public static Number parseDoubleWithNull(final String text) { try { return parseDouble(text); } catch (final Exception e) { return null; } } /** * Parses a number string with dot or comma as decimal char * * @throws ParseException */ public static Number parseDouble(final String text) throws ParseException { return numberFormat.parse(text.replace(',', '.')); } /** * Parses a number string as a BigDecimal, replacing a comma with a dot first */ public static BigDecimal parseDecimal(final String text) throws NumberFormatException { return new BigDecimal(text.replace(',', '.')); } /** * Gets the class's logger */ protected abstract Logger getLog(); /** * Logs an error message, appending the relative file path */ protected void logError(final String message) { getLog().error(message + ";" + this.rootRelativePath); } /** * Logs a warning message, appending the relative file path */ protected void logWarning(final String message) { getLog().warn(message + ";" + this.rootRelativePath); } /** * Creates a new series import object */ protected abstract HEADER createSeriesImport(final String filename); protected void handleMetaLine() { if (META_SUBGROUP.matcher(this.currentLine).matches()) return; else if (handleMetaRivername()) return; else if (handleMetaKmrange_info()) return; else if (handleMetaComment()) return; else if (handleMetaOther()) return; else if (handleMetaColumnTitles()) { if (this.headerParsingState != ParsingState.STOP) this.headerParsingState = ParsingState.DONE; return; } else { if (this.currentLine.startsWith(START_META_CHAR)) { if (this.headerParsingState != ParsingState.IGNORE) logWarning("Not matching any known meta type in line " + this.in.getLineNumber() + ", ignored"); else this.headerParsingState = ParsingState.CONTINUE; } } } private boolean handleMetaRivername() { if (META_RIVERNAME.matcher(this.currentLine).matches()) { this.metaPatternsMatched.add(META_RIVERNAME); return true; } else return false; } private boolean handleMetaKmrange_info() { final Matcher m = META_KMRANGE_INFO.matcher(this.currentLine); if (m.matches()) { this.metaPatternsMatched.add(META_KMRANGE_INFO); this.seriesHeader.setKmrange_info(parseMetaInfo(m.group(1))); return true; } return false; } private boolean handleMetaComment() { final Matcher m = META_COMMENTS.matcher(this.currentLine); if (m.matches()) { this.metaPatternsMatched.add(META_COMMENTS); this.seriesHeader.setComment(parseMetaInfo(m.group(1))); return true; } return false; } /** * Parses currentLine for non-default meta info * * @return Whether the line has been handled */ protected boolean handleMetaOther() { return false; } /** * Parses a header line for the km table column header line * * @return Whether the line has been handled and we are ready for reading the km values lines */ protected boolean handleMetaColumnTitles() { if (META_COLUMNTITLES.matcher(this.currentLine).matches()) { this.metaPatternsMatched.add(META_COLUMNTITLES); this.columnTitles.clear(); final String[] titles = this.currentLine.split(SEPARATOR_CHAR, 0); for (int i = 0; i <= titles.length - 1; i++) this.columnTitles.add(titles[i].trim()); return true; } return false; } private void handleDataLine() { final String[] values = this.currentLine.split(SEPARATOR_CHAR, 0); // Skip import line without data or only km if (values.length < 2) return; Double km; try { km = Double.valueOf(parseDouble(values[0]).doubleValue()); if (kmMustBeUnique()) { if (this.kmExists.contains(km)) { logWarning("Ignoring duplicate station '" + values[0] + "' in line " + this.in.getLineNumber()); return; } this.kmExists.add(km); } } catch (final Exception e) { logError("Not parseable km in line " + this.in.getLineNumber() + ": " + e.getMessage()); return; } final KMLINE value = createKmLineImport(km, values); if (value != null) this.seriesHeader.addValue(value); } /** * Whether {@link handleDataLine} shall check for and reject km duplicates */ protected boolean kmMustBeUnique() { return true; } /** * Creates a value import item with the km and other fields of the current line; * the km has been validated * * @return value item, or null if parse error */ protected abstract KMLINE createKmLineImport(final Double km, final String[] values); }