comparison backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 8971:50416a0df385

Importer for the Schifffahrt (S-INFO) and Oekologie (U-INFO) files
author mschaefer
date Tue, 03 Apr 2018 10:18:30 +0200
parents
children 2693bfaf503d
comparison
equal deleted inserted replaced
8970:da5dc7446652 8971:50416a0df385
1 /* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde
2 * Software engineering by
3 * Björnsen Beratende Ingenieure GmbH
4 * Dr. Schumacher Ingenieurbüro für Wasser und Umwelt
5 *
6 * This file is Free Software under the GNU AGPL (>=v3)
7 * and comes with ABSOLUTELY NO WARRANTY! Check out the
8 * documentation coming with Dive4Elements River for details.
9 */
10
11 package org.dive4elements.river.importer.common;
12
13 import java.io.File;
14 import java.io.FileInputStream;
15 import java.io.FilenameFilter;
16 import java.io.IOException;
17 import java.io.InputStreamReader;
18 import java.io.LineNumberReader;
19 import java.text.NumberFormat;
20 import java.text.ParseException;
21 import java.util.ArrayList;
22 import java.util.List;
23 import java.util.Locale;
24 import java.util.TreeSet;
25 import java.util.regex.Matcher;
26 import java.util.regex.Pattern;
27
28 import org.apache.log4j.Logger;
29 import org.dive4elements.river.backend.utils.EpsilonComparator;
30 import org.dive4elements.river.importer.ImportRiver;
31
/**
 * Abstract base class for a parser of one FLYS csv data file.<br />
 * The {@link #parse()} method creates a series import object (type parameter HEADER) for the meta data
 * and adds one KMLINE object to it for each km value line read from the file.<br />
 * The {@link #store()} method gets or creates the corresponding database objects
 * by the hibernate binding classes DB_SERIES and DB_KMTUPLE,
 * and updates or inserts them in the database.
 * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br />
 * <br />
 * The structure of the file is as follows:<br />
 * <ul>
 * <li>one or more comment lines (#) with the meta info of the data series</li>
 * <li>the comment line with the column titles of the values table, starting with the km column</li>
 * <li>the rows of the values table, each one on its own line</li>
 * </ul>
 *
 * @author Matthias Schäfer
 *
 */
51 public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser {
52
53 /***** FIELDS *****/
54
/** Character encoding used by {@link #parse()} to read the import file. */
public static final String ENCODING = "ISO-8859-1";

/** Locale constant offered to subclasses; it is not referenced within this class itself. */
protected static final Locale DEFAULT_LOCALE = Locale.GERMAN;

/** Prefix of a meta (comment) line. */
public static final String START_META_CHAR = "#";

/** Column separator of the csv lines. */
protected static final String SEPARATOR_CHAR = ";";

/**
 * Meta line with the river name: '#', then "Gew?sser" (any single char at the umlaut position)
 * or "Gewaesser", a colon, and the name; group 4 holds the name up to the first separator.
 */
protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

/** Meta line "# Strecke: ..."; group 1 holds the km range text up to the first separator. */
protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

/** Meta line "# weitere Bemerkungen: ..."; group 1 holds the comment text up to the first separator. */
protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE);

/**
 * Column titles line: any number of '#', then "Fluss", one arbitrary char, "km",
 * followed by a separator and at least one more character.
 */
private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE);

/** Line starting with "##"; such lines are skipped by {@link #handleMetaLine()}. */
private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE);

/** Locale-neutral number format (dot as decimal char) used by {@link #parseDouble(String)}. */
private static NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT);
74
/**
 * Path of the file or directory to import from
 */
protected final File importPath;

/**
 * Part of {@link #importPath} without the river root dir; appended to the log messages
 */
protected final File rootRelativePath;

/**
 * River for which the import runs
 */
protected final ImportRiver river;

/**
 * Reader during parse; null outside of {@link #parse()}
 */
protected LineNumberReader in;

/**
 * Last line read from in (trimmed)
 */
protected String currentLine;

/**
 * State of the header lines parse loop
 */
protected ParsingState headerParsingState;

/**
 * Series header of the stations table, with the imported meta info.
 */
protected HEADER seriesHeader;

/**
 * List of meta info Pattern matched during {@link #handleMetaLine()}
 */
protected final List<Pattern> metaPatternsMatched;

/**
 * Column titles of the stations table, starting with the km column.
 * All strings have been trimmed.
 */
protected final List<String> columnTitles;

/**
 * List of the km value tuples imported, no duplicate km.
 * NOTE(review): this class only creates the list; parsed lines are added to
 * {@link #seriesHeader} instead - confirm whether subclasses fill it.
 */
protected final List<KMLINE> values;

/**
 * Sorted set (epsilon comparison) of the km imported so far, used to check for duplicates.
 */
protected final TreeSet<Double> kmExists;
130
131
132 /***** CONSTRUCTORS *****/
133
/**
 * Creates a parser for a single import file of a river.
 *
 * @param importPath file (or directory) to import from
 * @param rootRelativePath the import path relative to the river root dir, used in log messages
 * @param river river the import runs for
 */
public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) {
    // Import context
    this.river = river;
    this.importPath = importPath;
    this.rootRelativePath = rootRelativePath;

    // Parse result containers, all empty until parse() runs
    this.columnTitles = new ArrayList<>();
    this.values = new ArrayList<>();
    this.metaPatternsMatched = new ArrayList<>();
    this.kmExists = new TreeSet<>(EpsilonComparator.CMP);
}
146
147
148 /***** METHODS *****/
149
150 /**
151 * Lists all files from a directory having a type extension (starting with dot)
152 */
153 protected static List<File> listFiles(final File importDir, final String extension) {
154 final File[] files = importDir.listFiles(new FilenameFilter() {
155 @Override
156 public boolean accept(final File dir, final String name) {
157 return name.toLowerCase().endsWith(extension);
158 }
159 });
160 final List<File> fl = new ArrayList<>();
161 if (files != null)
162 for (final File file : files)
163 fl.add(file);
164 return fl;
165 }
166
167 /**
168 * Parses a file and adds series and values to the parser's collection
169 */
170 @Override
171 public void parse() throws IOException {
172 logStartInfo();
173 this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", ""));
174 this.metaPatternsMatched.clear();
175 this.kmExists.clear();
176 this.headerParsingState = ParsingState.CONTINUE;
177 try {
178 try {
179 this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING));
180 }
181 catch (final Exception e) {
182 logError("Could not open (" + e.getMessage() + ")");
183 this.headerParsingState = ParsingState.STOP;
184 }
185 this.currentLine = null;
186 while (this.headerParsingState != ParsingState.STOP) {
187 this.currentLine = this.in.readLine();
188 if (this.currentLine == null)
189 break;
190 this.currentLine = this.currentLine.trim();
191 if (this.currentLine.isEmpty())
192 continue;
193 if (this.headerParsingState == ParsingState.CONTINUE)
194 handleMetaLine();
195 else
196 handleDataLine();
197 }
198 if (this.headerParsingState != ParsingState.STOP)
199 getLog().info("Number of values found: " + this.seriesHeader.getValueCount());
200 }
201 finally {
202 if (this.in != null) {
203 this.in.close();
204 this.in = null;
205 }
206 }
207 if (this.headerParsingState == ParsingState.STOP)
208 logError("Parsing of the file stopped due to a severe error");
209 }
210
211 /**
212 * Writes the parse start info to the log
213 */
214 protected void logStartInfo() {
215 getLog().info("Start parsing:;'" + this.rootRelativePath + "'");
216 }
217
218 /**
219 * Stores the parsed series and values in the database
220 */
221 @Override
222 public void store() {
223 if (this.headerParsingState != ParsingState.STOP) {
224 this.seriesHeader.store(this.river.getPeer());
225 final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(),
226 this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE));
227 if (this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT))
228 logWarning("Number of value inserts less than number parsed: " + counts);
229 else
230 getLog().info("Number of values records: " + counts);
231 }
232 else
233 logWarning("Severe parsing errors, not storing series '" + this.seriesHeader.getFilename() + "'");
234 }
235
236 /**
237 * Strips separator chars from a meta info text, and trims leading and trailing whitespace
238 */
239 public static String parseMetaInfo(final String text) {
240 return text.replace(SEPARATOR_CHAR, "").trim();
241 }
242
243 /**
244 * Parses a number string with dot or comma as decimal char, and returning null in case of an error
245 */
246 public static Number parseDoubleWithNull(final String text) {
247 try {
248 return parseDouble(text);
249 }
250 catch (final Exception e) {
251 return null;
252 }
253 }
254
255 /**
256 * Parses a number string with dot or comma as decimal char
257 *
258 * @throws ParseException
259 */
260 public static Number parseDouble(final String text) throws ParseException {
261 return numberFormat.parse(text.replace(',', '.'));
262 }
263
/**
 * Gets the logger of the concrete parser class; all log output of this base class
 * (info, warning and error messages) is written to it
 *
 * @return the logger to use
 */
protected abstract Logger getLog();
268
269 /**
270 * Logs an error message, appending the relative file path
271 */
272 protected void logError(final String message) {
273 getLog().error(message + ";" + this.rootRelativePath);
274 }
275
276 /**
277 * Logs a warning message, appending the relative file path
278 */
279 protected void logWarning(final String message) {
280 getLog().warn(message + ";" + this.rootRelativePath);
281 }
282
/**
 * Creates a new series import object; called once at the start of {@link #parse()}
 *
 * @param filename name of the import file with ".csv" removed
 * @return the series header object that will receive the meta info and the km value lines
 */
protected abstract HEADER createSeriesImport(final String filename);
287
288 protected void handleMetaLine() {
289 if (META_SUBGROUP.matcher(this.currentLine).matches())
290 return;
291 else if (handleMetaRivername())
292 return;
293 else if (handleMetaKmrange_info())
294 return;
295 else if (handleMetaComment())
296 return;
297 else if (handleMetaOther())
298 return;
299 else if (handleMetaColumnTitles()) {
300 if (this.headerParsingState != ParsingState.STOP)
301 this.headerParsingState = ParsingState.DONE;
302 return;
303 }
304 else {
305 if (this.currentLine.startsWith(START_META_CHAR)) {
306 if (this.headerParsingState != ParsingState.IGNORE)
307 logWarning("Not matching any known meta type in line " + this.in.getLineNumber() + ", ignored");
308 else
309 this.headerParsingState = ParsingState.CONTINUE;
310 }
311 }
312 }
313
314 private boolean handleMetaRivername() {
315 if (META_RIVERNAME.matcher(this.currentLine).matches()) {
316 this.metaPatternsMatched.add(META_RIVERNAME);
317 return true;
318 }
319 else
320 return false;
321 }
322
323 private boolean handleMetaKmrange_info() {
324 final Matcher m = META_KMRANGE_INFO.matcher(this.currentLine);
325 if (m.matches()) {
326 this.metaPatternsMatched.add(META_KMRANGE_INFO);
327 this.seriesHeader.setKmrange_info(parseMetaInfo(m.group(1)));
328 return true;
329 }
330 return false;
331 }
332
333 private boolean handleMetaComment() {
334 final Matcher m = META_COMMENTS.matcher(this.currentLine);
335 if (m.matches()) {
336 this.metaPatternsMatched.add(META_COMMENTS);
337 this.seriesHeader.setComment(parseMetaInfo(m.group(1)));
338 return true;
339 }
340 return false;
341 }
342
/**
 * Parses currentLine for non-default meta info.<br />
 * Hook for subclasses, called by {@link #handleMetaLine()} after the river name, km range and
 * comment handlers and before the column titles handler; this default implementation handles nothing.
 *
 * @return Whether the line has been handled
 */
protected boolean handleMetaOther() {
return false;
}
351
352 /**
353 * Parses a header line for the km table column header line
354 *
355 * @return Whether the line has been handled and we are ready for reading the km values lines
356 */
357 protected boolean handleMetaColumnTitles() {
358 if (META_COLUMNTITLES.matcher(this.currentLine).matches()) {
359 this.metaPatternsMatched.add(META_COLUMNTITLES);
360 this.columnTitles.clear();
361 final String[] titles = this.currentLine.split(SEPARATOR_CHAR, 0);
362 for (int i = 0; i <= titles.length - 1; i++)
363 this.columnTitles.add(titles[i].trim());
364 return true;
365 }
366 return false;
367 }
368
369 private void handleDataLine() {
370 final String[] values = this.currentLine.split(SEPARATOR_CHAR, 0);
371 // Skip import line without data or only km
372 if (values.length < 2)
373 return;
374 Double km;
375 try {
376 km = Double.valueOf(parseDouble(values[0]).doubleValue());
377 if (kmMustBeUnique()) {
378 if (this.kmExists.contains(km)) {
379 logWarning("Ignoring duplicate station '" + values[0] + "' in line " + this.in.getLineNumber());
380 return;
381 }
382 this.kmExists.add(km);
383 }
384 }
385 catch (final Exception e) {
386 logError("Not parseable km in line " + this.in.getLineNumber() + ": " + e.getMessage());
387 return;
388 }
389 final KMLINE value = createKmLineImport(km, values);
390 if (value != null)
391 this.seriesHeader.addValue(value);
392 }
393
/**
 * Whether {@link #handleDataLine()} shall check for and reject km duplicates.<br />
 * This default implementation always requires unique km; subclasses may override it.
 *
 * @return true if a line repeating an already imported km shall be ignored
 */
protected boolean kmMustBeUnique() {
return true;
}
400
/**
 * Creates a value import item with the km and other fields of the current line;
 * the km has been validated
 *
 * @param km the km parsed from the first column of the line
 * @param values all columns of the line (split at the separator char), with the km column at index 0;
 *            the columns are not trimmed
 * @return value item, or null if parse error (the line is not added to the series then)
 */
protected abstract KMLINE createKmLineImport(final Double km, final String[] values);
408 }

http://dive4elements.wald.intevation.org