Mercurial > dive4elements > river
comparison flys-backend/src/main/java/org/dive4elements/river/importer/parsers/WstParser.java @ 5828:dfb26b03b179
Moved directories to org.dive4elements.river
author | Sascha L. Teichmann <teichmann@intevation.de> |
---|---|
date | Thu, 25 Apr 2013 11:53:11 +0200 |
parents | flys-backend/src/main/java/de/intevation/flys/importer/parsers/WstParser.java@202a73ce6704 |
children | 18619c1e7c2a |
comparison
equal
deleted
inserted
replaced
5827:e308d4ecd35a | 5828:dfb26b03b179 |
---|---|
1 package de.intevation.flys.importer.parsers; | |
2 | |
3 import java.util.ArrayList; | |
4 import java.util.HashSet; | |
5 | |
6 import java.io.File; | |
7 import java.io.IOException; | |
8 import java.io.LineNumberReader; | |
9 import java.io.InputStreamReader; | |
10 import java.io.FileInputStream; | |
11 | |
12 import java.text.NumberFormat; | |
13 | |
14 import org.apache.log4j.Logger; | |
15 | |
16 import de.intevation.flys.utils.StringUtil; | |
17 import de.intevation.flys.utils.DateGuesser; | |
18 | |
19 import java.util.regex.Pattern; | |
20 import java.util.regex.Matcher; | |
21 | |
22 import java.math.BigDecimal; | |
23 | |
24 import de.intevation.flys.importer.ImportWstQRange; | |
25 import de.intevation.flys.importer.ImportWstColumn; | |
26 import de.intevation.flys.importer.ImportTimeInterval; | |
27 import de.intevation.flys.importer.ImportRange; | |
28 import de.intevation.flys.importer.ImportUnit; | |
29 import de.intevation.flys.importer.ImportWst; | |
30 | |
31 public class WstParser | |
32 { | |
33 private static Logger log = Logger.getLogger(WstParser.class); | |
34 | |
35 public static final String COLUMN_BEZ_TEXT = "column-bez-text"; | |
36 public static final String COLUMN_BEZ_BREITE = "column-bez-breite"; | |
37 public static final String COLUMN_QUELLE = "column-quelle"; | |
38 public static final String COLUMN_DATUM = "column-datum"; | |
39 | |
40 public static final BigDecimal UNDEFINED_ZERO = | |
41 new BigDecimal(0.0); | |
42 public static final BigDecimal MIN_RANGE = | |
43 new BigDecimal(-Double.MAX_VALUE); | |
44 public static final BigDecimal MAX_RANGE = | |
45 new BigDecimal(Double.MAX_VALUE); | |
46 | |
47 public static final String ENCODING = "ISO-8859-1"; | |
48 | |
49 public static final Pattern UNIT_COMMENT = | |
50 Pattern.compile("\\*\\s*[kK][mM]\\s+(.+)"); | |
51 | |
52 public static final Pattern UNIT = | |
53 Pattern.compile("[^\\[]*\\[([^]]+)\\].*"); | |
54 | |
55 public static final double INTERVAL_GAP = 0.00001d; | |
56 | |
57 protected ImportWst wst; | |
58 | |
59 protected ImportRange lastRange; | |
60 | |
61 public WstParser() { | |
62 } | |
63 | |
64 public ImportWst getWst() { | |
65 return wst; | |
66 } | |
67 | |
68 public void setWst(ImportWst wst) { | |
69 this.wst = wst; | |
70 } | |
71 | |
72 public ImportTimeInterval guessDate(String string) { | |
73 try { | |
74 return new ImportTimeInterval( | |
75 DateGuesser.guessDate(string)); | |
76 } | |
77 catch (IllegalArgumentException iae) { | |
78 } | |
79 return null; | |
80 } | |
81 | |
82 public void parse(File file) throws IOException { | |
83 | |
84 log.info("Parsing WST file '" + file + "'"); | |
85 | |
86 wst = new ImportWst(file.getName()); | |
87 | |
88 LineNumberReader in = null; | |
89 try { | |
90 in = | |
91 new LineNumberReader( | |
92 new InputStreamReader( | |
93 new FileInputStream(file), ENCODING)); | |
94 | |
95 String input; | |
96 boolean first = true; | |
97 int columnCount = 0; | |
98 | |
99 String [] lsBezeichner = null; | |
100 String [] langBezeichner = null; | |
101 int [] colNaWidths = null; | |
102 String [] quellen = null; | |
103 String [] daten = null; | |
104 | |
105 BigDecimal [] aktAbfluesse = null; | |
106 BigDecimal [] firstAbfluesse = null; | |
107 | |
108 BigDecimal minKm = MAX_RANGE; | |
109 BigDecimal maxKm = MIN_RANGE; | |
110 BigDecimal kmHist1 = null; | |
111 BigDecimal kmHist2 = null; | |
112 | |
113 boolean columnHeaderChecked = false; | |
114 | |
115 String einheit = "Wasserstand [NN + m]"; | |
116 | |
117 HashSet<BigDecimal> kms = new HashSet<BigDecimal>(); | |
118 | |
119 while ((input = in.readLine()) != null) { | |
120 String line = input; | |
121 if (first) { // fetch number of columns | |
122 if ((line = line.trim()).length() == 0) { | |
123 continue; | |
124 } | |
125 try { | |
126 columnCount = Integer.parseInt(line); | |
127 if (columnCount <= 0) { | |
128 throw new NumberFormatException( | |
129 "number columns <= 0"); | |
130 } | |
131 log.debug("Number of columns: " + columnCount); | |
132 wst.setNumberColumns(columnCount); | |
133 lsBezeichner = new String[columnCount]; | |
134 } | |
135 catch (NumberFormatException nfe) { | |
136 log.warn("WST: invalid number.", nfe); | |
137 continue; | |
138 } | |
139 first = false; | |
140 continue; | |
141 } | |
142 | |
143 line = line.replace(',', '.'); | |
144 | |
145 if (line.startsWith("*\u001f")) { | |
146 BigDecimal [] data = | |
147 parseLineAsDouble(line, columnCount, false, true); | |
148 | |
149 if (aktAbfluesse != null) { | |
150 if (kmHist1 != null && kmHist2 != null | |
151 && kmHist1.compareTo(kmHist2) < 0) { | |
152 BigDecimal t = minKm; minKm = maxKm; maxKm = t; | |
153 } | |
154 addInterval(minKm, maxKm, aktAbfluesse); | |
155 minKm = MAX_RANGE; | |
156 maxKm = MIN_RANGE; | |
157 } | |
158 | |
159 aktAbfluesse = new BigDecimal[columnCount]; | |
160 log.debug("new q range: " + columnCount); | |
161 for (int i = 0; i < Math.min(columnCount, data.length); ++i) { | |
162 if (data[i] != null) { | |
163 log.debug(" column: " + data[i]); | |
164 aktAbfluesse[i] = data[i]; | |
165 } | |
166 } | |
167 | |
168 if (firstAbfluesse == null) { | |
169 firstAbfluesse = (BigDecimal [])aktAbfluesse.clone(); | |
170 } | |
171 continue; | |
172 } | |
173 | |
174 if (line.startsWith("*!")) { | |
175 String spezial = line.substring(2).trim(); | |
176 | |
177 if (spezial.length() == 0) { | |
178 continue; | |
179 } | |
180 | |
181 if (spezial.startsWith(COLUMN_BEZ_TEXT)) { | |
182 spezial = spezial.substring(COLUMN_BEZ_TEXT.length()).trim(); | |
183 if (spezial.length() == 0) { | |
184 continue; | |
185 } | |
186 langBezeichner = StringUtil.splitQuoted(spezial, '"'); | |
187 } | |
188 else if (spezial.startsWith(COLUMN_BEZ_BREITE)) { | |
189 spezial = spezial.substring(COLUMN_BEZ_BREITE.length()).trim(); | |
190 | |
191 if (spezial.length() == 0) { | |
192 continue; | |
193 } | |
194 | |
195 String[] split = spezial.split("\\s+"); | |
196 | |
197 colNaWidths = new int[split.length]; | |
198 for (int i=0; i < split.length; i++) { | |
199 colNaWidths[i] = Integer.parseInt(split[i]); | |
200 } | |
201 } | |
202 else if (spezial.startsWith(COLUMN_QUELLE)) { | |
203 if (spezial.length() == 0) { | |
204 continue; | |
205 } | |
206 quellen = StringUtil.splitQuoted(spezial, '"'); | |
207 } | |
208 else if (spezial.startsWith(COLUMN_DATUM)) { | |
209 spezial = spezial.substring(COLUMN_DATUM.length()).trim(); | |
210 if (spezial.length() == 0) { | |
211 continue; | |
212 } | |
213 daten = StringUtil.splitQuoted(spezial, '"'); | |
214 } | |
215 continue; | |
216 } | |
217 | |
218 if (line.length() < 11) { | |
219 continue; | |
220 } | |
221 | |
222 if (line.startsWith("*")) { | |
223 Matcher m = UNIT_COMMENT.matcher(line); | |
224 if (m.matches()) { | |
225 log.debug("unit comment found"); | |
226 // XXX: This hack is needed because desktop | |
227 // FLYS is broken figuring out the unit | |
228 String [] units = m.group(1).split("\\s{2,}"); | |
229 m = UNIT.matcher(units[0]); | |
230 einheit = m.matches() ? m.group(1) : units[0]; | |
231 log.debug("unit: " + einheit); | |
232 } | |
233 continue; | |
234 } | |
235 | |
236 if (firstAbfluesse != null) { | |
237 if (!columnHeaderChecked) { | |
238 int unknownCount = 0; | |
239 HashSet<String> uniqueColumnNames = | |
240 new HashSet<String>(); | |
241 for (int i = 0; i < lsBezeichner.length; ++i) { | |
242 if (lsBezeichner[i] == null | |
243 || lsBezeichner[i].length() == 0) { | |
244 double q = firstAbfluesse[i].doubleValue(); | |
245 if (q < 0.001) { | |
246 lsBezeichner[i] = | |
247 "<unbekannt #" + unknownCount + ">"; | |
248 ++unknownCount; | |
249 } | |
250 else { | |
251 lsBezeichner[i] = "Q="+format(q); | |
252 } | |
253 } | |
254 String candidate = lsBezeichner[i]; | |
255 int collision = 1; | |
256 while (!uniqueColumnNames.add(candidate)) { | |
257 candidate = lsBezeichner[i] + | |
258 " (" + collision + ")"; | |
259 ++collision; | |
260 } | |
261 ImportWstColumn iwc = wst.getColumn(i); | |
262 iwc.setName(candidate); | |
263 String potentialDate = daten != null && i < daten.length | |
264 ? daten[i] | |
265 : candidate; | |
266 iwc.setTimeInterval(guessDate(potentialDate)); | |
267 } | |
268 columnHeaderChecked = true; | |
269 } | |
270 | |
271 BigDecimal [] data = | |
272 parseLineAsDouble(line, columnCount, true, false); | |
273 | |
274 BigDecimal kaem = data[0]; | |
275 | |
276 if (!kms.add(kaem)) { | |
277 log.warn( | |
278 "WST: km " + kaem + | |
279 " (line " + in.getLineNumber() + | |
280 ") found more than once. -> ignored"); | |
281 continue; | |
282 } | |
283 | |
284 kmHist2 = kmHist1; | |
285 kmHist1 = kaem; | |
286 | |
287 if (kaem.compareTo(minKm) < 0) { | |
288 minKm = kaem; | |
289 } | |
290 if (kaem.compareTo(maxKm) > 0) { | |
291 maxKm = kaem; | |
292 } | |
293 | |
294 // extract values | |
295 for (int i = 0; i < columnCount; ++i) { | |
296 addValue(kaem, data[i+1], i); | |
297 } | |
298 | |
299 } | |
300 else { // firstAbfluesse == null | |
301 if (langBezeichner != null) { | |
302 lsBezeichner = StringUtil.fitArray( | |
303 langBezeichner, lsBezeichner); | |
304 } | |
305 else if (colNaWidths != null) { | |
306 for (int j = 0, i = 0, N = input.length(); | |
307 j < colNaWidths.length && i < N; | |
308 i += colNaWidths[j++] | |
309 ) { | |
310 lsBezeichner[j] = input.substring( | |
311 i, i+colNaWidths[j]).trim(); | |
312 } | |
313 } | |
314 else { | |
315 // first column begins at position 8 in line | |
316 for (int i = 8, col = 0; i < input.length(); i += 9) { | |
317 if ((i + 9) > input.length()) { | |
318 i = input.length() - 10; | |
319 } | |
320 // one column header is 9 chars wide | |
321 lsBezeichner[col++] = | |
322 input.substring(i, i + 9).trim(); | |
323 | |
324 if (col == lsBezeichner.length) { | |
325 break; | |
326 } | |
327 } | |
328 } | |
329 } | |
330 | |
331 } // for all lines in WST file | |
332 | |
333 wst.setUnit(new ImportUnit(einheit)); | |
334 | |
335 if (kmHist1 != null && kmHist2 != null | |
336 && kmHist1.compareTo(kmHist2) < 0) { | |
337 BigDecimal t = minKm; minKm = maxKm; maxKm = t; | |
338 } | |
339 addInterval(minKm, maxKm, aktAbfluesse); | |
340 | |
341 fixRangesOrder(); | |
342 } | |
343 finally { | |
344 if (in != null) { | |
345 in.close(); | |
346 } | |
347 } | |
348 } | |
349 | |
350 protected void fixRangesOrder() { | |
351 wst.fixRangesOrder(); | |
352 } | |
353 | |
354 protected void addValue(BigDecimal km, BigDecimal w, int index) { | |
355 if (w != null) { | |
356 ImportWstColumn column = wst.getColumn(index); | |
357 column.addColumnValue(km, w); | |
358 } | |
359 } | |
360 | |
361 private static final NumberFormat NF = getNumberFormat(); | |
362 | |
363 private static final NumberFormat getNumberFormat() { | |
364 NumberFormat nf = NumberFormat.getInstance(); | |
365 nf.setMinimumFractionDigits(2); | |
366 nf.setMaximumFractionDigits(2); | |
367 return nf; | |
368 } | |
369 | |
370 protected static String format(double value) { | |
371 return NF.format(value); | |
372 } | |
373 | |
374 protected void addInterval( | |
375 BigDecimal from, | |
376 BigDecimal to, | |
377 BigDecimal [] values | |
378 ) { | |
379 log.debug("addInterval: " + from + " " + to); | |
380 | |
381 if (values == null || from == MAX_RANGE || from == MIN_RANGE) { | |
382 return; | |
383 } | |
384 | |
385 ImportRange range = new ImportRange(from, to); | |
386 | |
387 // little workaround to make the q ranges tightly fit. | |
388 // Leave a very small gap to ensure that the range queries | |
389 // still work. | |
390 | |
391 if (lastRange != null) { | |
392 double a1 = lastRange.getA().doubleValue(); | |
393 double b1 = lastRange.getB().doubleValue(); | |
394 double a2 = range.getA().doubleValue(); | |
395 | |
396 if (a1 < b1) { | |
397 lastRange.setB(new BigDecimal(a2 - INTERVAL_GAP)); | |
398 } | |
399 else { // a1 >= b1 | |
400 lastRange.setB(new BigDecimal(a2 + INTERVAL_GAP)); | |
401 } | |
402 } | |
403 | |
404 for (int i = 0; i < values.length; ++i) { | |
405 ImportWstColumn column = wst.getColumn(i); | |
406 ImportWstQRange wstQRange = new ImportWstQRange(range, values[i]); | |
407 column.addColumnQRange(wstQRange); | |
408 } | |
409 | |
410 lastRange = range; | |
411 } | |
412 | |
413 private static final BigDecimal [] parseLineAsDouble( | |
414 String line, | |
415 int count, | |
416 boolean bStation, | |
417 boolean bParseEmptyAsZero | |
418 ) { | |
419 String [] tokens = parseLine(line, count, bStation); | |
420 | |
421 BigDecimal [] doubles = new BigDecimal[tokens.length]; | |
422 | |
423 for (int i = 0; i < doubles.length; ++i) { | |
424 String token = tokens[i].trim(); | |
425 if (token.length() != 0) { | |
426 doubles[i] = new BigDecimal(token); | |
427 } | |
428 else if (bParseEmptyAsZero) { | |
429 doubles[i] = UNDEFINED_ZERO; | |
430 } | |
431 } | |
432 | |
433 return doubles; | |
434 } | |
435 | |
436 private static String [] parseLine( | |
437 String line, | |
438 int tokenCount, | |
439 boolean bParseStation | |
440 ) { | |
441 ArrayList<String> strings = new ArrayList<String>(); | |
442 | |
443 if (bParseStation) { | |
444 if (line.length() < 8) { | |
445 throw new IllegalArgumentException("station too short"); | |
446 } | |
447 strings.add(line.substring(0, 8)); | |
448 } | |
449 | |
450 int pos = 9; | |
451 for (int i = 0; i < tokenCount; ++i) { | |
452 if (line.length() >= pos + 8) { | |
453 strings.add(line.substring(pos, pos + 8)); | |
454 } | |
455 else { | |
456 strings.add(""); | |
457 } | |
458 pos += 9; | |
459 } | |
460 | |
461 return strings.toArray(new String[strings.size()]); | |
462 } | |
463 } | |
464 // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : |