Mercurial > dive4elements > river
comparison backend/src/main/java/org/dive4elements/river/importer/common/AbstractParser.java @ 8971:50416a0df385
Importer for the Schifffahrt (S-INFO) and Oekologie (U-INFO) files
author | mschaefer |
---|---|
date | Tue, 03 Apr 2018 10:18:30 +0200 |
parents | |
children | 2693bfaf503d |
comparison
equal
deleted
inserted
replaced
8970:da5dc7446652 | 8971:50416a0df385 |
---|---|
1 /* Copyright (C) 2017 by Bundesanstalt für Gewässerkunde | |
2 * Software engineering by | |
3 * Björnsen Beratende Ingenieure GmbH | |
4 * Dr. Schumacher Ingenieurbüro für Wasser und Umwelt | |
5 * | |
6 * This file is Free Software under the GNU AGPL (>=v3) | |
7 * and comes with ABSOLUTELY NO WARRANTY! Check out the | |
8 * documentation coming with Dive4Elements River for details. | |
9 */ | |
10 | |
11 package org.dive4elements.river.importer.common; | |
12 | |
13 import java.io.File; | |
14 import java.io.FileInputStream; | |
15 import java.io.FilenameFilter; | |
16 import java.io.IOException; | |
17 import java.io.InputStreamReader; | |
18 import java.io.LineNumberReader; | |
19 import java.text.NumberFormat; | |
20 import java.text.ParseException; | |
21 import java.util.ArrayList; | |
22 import java.util.List; | |
23 import java.util.Locale; | |
24 import java.util.TreeSet; | |
25 import java.util.regex.Matcher; | |
26 import java.util.regex.Pattern; | |
27 | |
28 import org.apache.log4j.Logger; | |
29 import org.dive4elements.river.backend.utils.EpsilonComparator; | |
30 import org.dive4elements.river.importer.ImportRiver; | |
31 | |
/**
 * Abstract base class for a parser of one FLYS csv data file.<br />
 * The {@link #parse()} method creates a SERIES object for the meta data
 * and a list of KMLINE objects for the km value lines read from the file.<br />
 * The {@link #store()} method gets or creates the corresponding database objects
 * via the hibernate binding classes DB_SERIES and DB_KMTUPLE,
 * and updates or inserts them in the database.
 * DB_SERIES has a one-to-many relationship with DB_KMTUPLE.<br />
 * <br />
 * The structure of the file is as follows:<br />
 * <ul>
 * <li>one or more comment lines (#) with the meta info of the data series</li>
 * <li>the comment line with the column titles of the values table, starting with the km column</li>
 * <li>the rows of the values table, each one on its own line</li>
 * </ul>
 *
 * @author Matthias Schäfer
 *
 */
51 public abstract class AbstractParser<DB_SERIES, DB_KMTUPLE, KMLINE extends AbstractKmLineImport<DB_SERIES, DB_KMTUPLE>, HEADER extends AbstractSeriesImport<DB_SERIES, DB_KMTUPLE, KMLINE>> implements ImportParser { | |
52 | |
53 /***** FIELDS *****/ | |
54 | |
55 public static final String ENCODING = "ISO-8859-1"; | |
56 | |
57 protected static final Locale DEFAULT_LOCALE = Locale.GERMAN; | |
58 | |
59 public static final String START_META_CHAR = "#"; | |
60 | |
61 protected static final String SEPARATOR_CHAR = ";"; | |
62 | |
63 protected static final Pattern META_RIVERNAME = Pattern.compile("^#\\s*((Gew.sser)|(Gewaesser)):\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); | |
64 | |
65 protected static final Pattern META_KMRANGE_INFO = Pattern.compile("^#\\s*Strecke:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); | |
66 | |
67 protected static final Pattern META_COMMENTS = Pattern.compile("^#\\s*weitere Bemerkungen:\\s*(\\S[^;]*).*", Pattern.CASE_INSENSITIVE); | |
68 | |
69 private static final Pattern META_COLUMNTITLES = Pattern.compile("^#*\\s*Fluss.km\\s*;.+", Pattern.CASE_INSENSITIVE); | |
70 | |
71 private static final Pattern META_SUBGROUP = Pattern.compile("^##.*", Pattern.CASE_INSENSITIVE); | |
72 | |
73 private static NumberFormat numberFormat = NumberFormat.getInstance(Locale.ROOT); | |
74 | |
75 /** | |
76 * Path of the file or directory to import from | |
77 */ | |
78 protected final File importPath; | |
79 | |
80 /** | |
81 * Part of {@link importPath} without the river root dir | |
82 */ | |
83 protected final File rootRelativePath; | |
84 | |
85 /** | |
86 * River for which the import runs | |
87 */ | |
88 protected final ImportRiver river; | |
89 | |
90 /** | |
91 * Reader during parse | |
92 */ | |
93 protected LineNumberReader in; | |
94 | |
95 /** | |
96 * Last line read from in | |
97 */ | |
98 protected String currentLine; | |
99 | |
100 /** | |
101 * State of the header lines parse loop | |
102 */ | |
103 protected ParsingState headerParsingState; | |
104 | |
105 /** | |
106 * Series header of the stations table, with the imported meta info. | |
107 */ | |
108 protected HEADER seriesHeader; | |
109 | |
110 /** | |
111 * List of meta info Pattern matched during {@link handleMetaLine} | |
112 */ | |
113 protected final List<Pattern> metaPatternsMatched; | |
114 | |
115 /** | |
116 * Column titles of the stations table, starting with the km column. | |
117 * All strings have been trimmed. | |
118 */ | |
119 protected final List<String> columnTitles; | |
120 | |
121 /** | |
122 * List of the km value tuples imported, no duplicate km | |
123 */ | |
124 protected final List<KMLINE> values; | |
125 | |
126 /** | |
127 * Ordered list with the imported km to check for duplicates. | |
128 */ | |
129 protected final TreeSet<Double> kmExists; | |
130 | |
131 | |
132 /***** CONSTRUCTORS *****/ | |
133 | |
/**
 * Creates a parser for one import file and sets up its (initially empty) working collections.
 *
 * @param importPath path of the file or directory to import from
 * @param rootRelativePath the import path without the river root dir; appended to log messages
 * @param river river for which the import runs
 */
public AbstractParser(final File importPath, final File rootRelativePath, final ImportRiver river) {
    // Working collections, all empty at the start
    this.values = new ArrayList<>();
    this.columnTitles = new ArrayList<>();
    this.metaPatternsMatched = new ArrayList<>();
    // Duplicate km detection compares with EpsilonComparator.CMP instead of the natural order
    this.kmExists = new TreeSet<>(EpsilonComparator.CMP);
    // Import source and target river
    this.river = river;
    this.rootRelativePath = rootRelativePath;
    this.importPath = importPath;
}
146 | |
147 | |
148 /***** METHODS *****/ | |
149 | |
150 /** | |
151 * Lists all files from a directory having a type extension (starting with dot) | |
152 */ | |
153 protected static List<File> listFiles(final File importDir, final String extension) { | |
154 final File[] files = importDir.listFiles(new FilenameFilter() { | |
155 @Override | |
156 public boolean accept(final File dir, final String name) { | |
157 return name.toLowerCase().endsWith(extension); | |
158 } | |
159 }); | |
160 final List<File> fl = new ArrayList<>(); | |
161 if (files != null) | |
162 for (final File file : files) | |
163 fl.add(file); | |
164 return fl; | |
165 } | |
166 | |
167 /** | |
168 * Parses a file and adds series and values to the parser's collection | |
169 */ | |
170 @Override | |
171 public void parse() throws IOException { | |
172 logStartInfo(); | |
173 this.seriesHeader = createSeriesImport(this.importPath.getName().replaceAll("\\.csv", "")); | |
174 this.metaPatternsMatched.clear(); | |
175 this.kmExists.clear(); | |
176 this.headerParsingState = ParsingState.CONTINUE; | |
177 try { | |
178 try { | |
179 this.in = new LineNumberReader(new InputStreamReader(new FileInputStream(this.importPath), ENCODING)); | |
180 } | |
181 catch (final Exception e) { | |
182 logError("Could not open (" + e.getMessage() + ")"); | |
183 this.headerParsingState = ParsingState.STOP; | |
184 } | |
185 this.currentLine = null; | |
186 while (this.headerParsingState != ParsingState.STOP) { | |
187 this.currentLine = this.in.readLine(); | |
188 if (this.currentLine == null) | |
189 break; | |
190 this.currentLine = this.currentLine.trim(); | |
191 if (this.currentLine.isEmpty()) | |
192 continue; | |
193 if (this.headerParsingState == ParsingState.CONTINUE) | |
194 handleMetaLine(); | |
195 else | |
196 handleDataLine(); | |
197 } | |
198 if (this.headerParsingState != ParsingState.STOP) | |
199 getLog().info("Number of values found: " + this.seriesHeader.getValueCount()); | |
200 } | |
201 finally { | |
202 if (this.in != null) { | |
203 this.in.close(); | |
204 this.in = null; | |
205 } | |
206 } | |
207 if (this.headerParsingState == ParsingState.STOP) | |
208 logError("Parsing of the file stopped due to a severe error"); | |
209 } | |
210 | |
211 /** | |
212 * Writes the parse start info to the log | |
213 */ | |
214 protected void logStartInfo() { | |
215 getLog().info("Start parsing:;'" + this.rootRelativePath + "'"); | |
216 } | |
217 | |
218 /** | |
219 * Stores the parsed series and values in the database | |
220 */ | |
221 @Override | |
222 public void store() { | |
223 if (this.headerParsingState != ParsingState.STOP) { | |
224 this.seriesHeader.store(this.river.getPeer()); | |
225 final String counts = String.format("parse=%d, insert=%d, update/ignore=%d", this.seriesHeader.getValueCount(), | |
226 this.seriesHeader.getValueStoreCount(StoreMode.INSERT), this.seriesHeader.getValueStoreCount(StoreMode.UPDATE)); | |
227 if (this.seriesHeader.getValueCount() > this.seriesHeader.getValueStoreCount(StoreMode.INSERT)) | |
228 logWarning("Number of value inserts less than number parsed: " + counts); | |
229 else | |
230 getLog().info("Number of values records: " + counts); | |
231 } | |
232 else | |
233 logWarning("Severe parsing errors, not storing series '" + this.seriesHeader.getFilename() + "'"); | |
234 } | |
235 | |
236 /** | |
237 * Strips separator chars from a meta info text, and trims leading and trailing whitespace | |
238 */ | |
239 public static String parseMetaInfo(final String text) { | |
240 return text.replace(SEPARATOR_CHAR, "").trim(); | |
241 } | |
242 | |
243 /** | |
244 * Parses a number string with dot or comma as decimal char, and returning null in case of an error | |
245 */ | |
246 public static Number parseDoubleWithNull(final String text) { | |
247 try { | |
248 return parseDouble(text); | |
249 } | |
250 catch (final Exception e) { | |
251 return null; | |
252 } | |
253 } | |
254 | |
255 /** | |
256 * Parses a number string with dot or comma as decimal char | |
257 * | |
258 * @throws ParseException | |
259 */ | |
260 public static Number parseDouble(final String text) throws ParseException { | |
261 return numberFormat.parse(text.replace(',', '.')); | |
262 } | |
263 | |
/**
 * Gets the class's logger; all info, warning and error messages of this parser are written to it.
 *
 * @return the logger of the concrete parser class
 */
protected abstract Logger getLog();
268 | |
269 /** | |
270 * Logs an error message, appending the relative file path | |
271 */ | |
272 protected void logError(final String message) { | |
273 getLog().error(message + ";" + this.rootRelativePath); | |
274 } | |
275 | |
276 /** | |
277 * Logs a warning message, appending the relative file path | |
278 */ | |
279 protected void logWarning(final String message) { | |
280 getLog().warn(message + ";" + this.rootRelativePath); | |
281 } | |
282 | |
/**
 * Creates a new series import object; called once at the start of each {@link #parse()} run.
 *
 * @param filename name of the import file with every ".csv" occurrence removed
 * @return the new series header, stored in seriesHeader
 */
protected abstract HEADER createSeriesImport(final String filename);
287 | |
288 protected void handleMetaLine() { | |
289 if (META_SUBGROUP.matcher(this.currentLine).matches()) | |
290 return; | |
291 else if (handleMetaRivername()) | |
292 return; | |
293 else if (handleMetaKmrange_info()) | |
294 return; | |
295 else if (handleMetaComment()) | |
296 return; | |
297 else if (handleMetaOther()) | |
298 return; | |
299 else if (handleMetaColumnTitles()) { | |
300 if (this.headerParsingState != ParsingState.STOP) | |
301 this.headerParsingState = ParsingState.DONE; | |
302 return; | |
303 } | |
304 else { | |
305 if (this.currentLine.startsWith(START_META_CHAR)) { | |
306 if (this.headerParsingState != ParsingState.IGNORE) | |
307 logWarning("Not matching any known meta type in line " + this.in.getLineNumber() + ", ignored"); | |
308 else | |
309 this.headerParsingState = ParsingState.CONTINUE; | |
310 } | |
311 } | |
312 } | |
313 | |
314 private boolean handleMetaRivername() { | |
315 if (META_RIVERNAME.matcher(this.currentLine).matches()) { | |
316 this.metaPatternsMatched.add(META_RIVERNAME); | |
317 return true; | |
318 } | |
319 else | |
320 return false; | |
321 } | |
322 | |
323 private boolean handleMetaKmrange_info() { | |
324 final Matcher m = META_KMRANGE_INFO.matcher(this.currentLine); | |
325 if (m.matches()) { | |
326 this.metaPatternsMatched.add(META_KMRANGE_INFO); | |
327 this.seriesHeader.setKmrange_info(parseMetaInfo(m.group(1))); | |
328 return true; | |
329 } | |
330 return false; | |
331 } | |
332 | |
333 private boolean handleMetaComment() { | |
334 final Matcher m = META_COMMENTS.matcher(this.currentLine); | |
335 if (m.matches()) { | |
336 this.metaPatternsMatched.add(META_COMMENTS); | |
337 this.seriesHeader.setComment(parseMetaInfo(m.group(1))); | |
338 return true; | |
339 } | |
340 return false; | |
341 } | |
342 | |
/**
 * Parses currentLine for non-default meta info.<br />
 * Hook for subclasses: this default implementation handles nothing and returns false,
 * so that {@link #handleMetaLine()} continues with the column titles check.
 *
 * @return Whether the line has been handled
 */
protected boolean handleMetaOther() {
    return false;
}
351 | |
352 /** | |
353 * Parses a header line for the km table column header line | |
354 * | |
355 * @return Whether the line has been handled and we are ready for reading the km values lines | |
356 */ | |
357 protected boolean handleMetaColumnTitles() { | |
358 if (META_COLUMNTITLES.matcher(this.currentLine).matches()) { | |
359 this.metaPatternsMatched.add(META_COLUMNTITLES); | |
360 this.columnTitles.clear(); | |
361 final String[] titles = this.currentLine.split(SEPARATOR_CHAR, 0); | |
362 for (int i = 0; i <= titles.length - 1; i++) | |
363 this.columnTitles.add(titles[i].trim()); | |
364 return true; | |
365 } | |
366 return false; | |
367 } | |
368 | |
369 private void handleDataLine() { | |
370 final String[] values = this.currentLine.split(SEPARATOR_CHAR, 0); | |
371 // Skip import line without data or only km | |
372 if (values.length < 2) | |
373 return; | |
374 Double km; | |
375 try { | |
376 km = Double.valueOf(parseDouble(values[0]).doubleValue()); | |
377 if (kmMustBeUnique()) { | |
378 if (this.kmExists.contains(km)) { | |
379 logWarning("Ignoring duplicate station '" + values[0] + "' in line " + this.in.getLineNumber()); | |
380 return; | |
381 } | |
382 this.kmExists.add(km); | |
383 } | |
384 } | |
385 catch (final Exception e) { | |
386 logError("Not parseable km in line " + this.in.getLineNumber() + ": " + e.getMessage()); | |
387 return; | |
388 } | |
389 final KMLINE value = createKmLineImport(km, values); | |
390 if (value != null) | |
391 this.seriesHeader.addValue(value); | |
392 } | |
393 | |
/**
 * Whether handleDataLine shall check for and reject km duplicates.<br />
 * This default implementation always demands unique km; subclasses may override it.
 *
 * @return true if lines with an already imported km are to be ignored
 */
protected boolean kmMustBeUnique() {
    return true;
}
400 | |
/**
 * Creates a value import item with the km and other fields of the current line;
 * the km has been validated
 *
 * @param km the km parsed from the first field of the line
 * @param values all fields of the current line as split at the separator char, values[0] being the km text
 * @return value item, or null if parse error (a null item is not added to the series)
 */
protected abstract KMLINE createKmLineImport(final Double km, final String[] values);
408 } |