# HG changeset patch # User Sascha L. Teichmann # Date 1298370893 0 # Node ID 80669241956c1a8184d8c2d086e192949bdfd1b2 # Parent 119048655872fb032618f6cdee863cc1b5b17f04 Initial database import scripts. Not finished, yet. flys-backend/trunk@1333 c6561f87-3c4e-4783-a992-168aeb5c3f6f diff -r 119048655872 -r 80669241956c flys-backend/ChangeLog --- a/flys-backend/ChangeLog Thu Feb 10 12:07:35 2011 +0000 +++ b/flys-backend/ChangeLog Tue Feb 22 10:34:53 2011 +0000 @@ -1,4 +1,9 @@ -2011-02-10 Sascha L. Teichmann: +2011-01-22 Sascha L. Teichmann + + * contrib/import-kms.py, contrib/import-gew.py: Initial scripts + to import data into SQLite database. They still need some work. + +2011-02-10 Sascha L. Teichmann : * doc/schema/sqlite.sql: Added initial schema for FLYS database. diff -r 119048655872 -r 80669241956c flys-backend/contrib/import-gew.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flys-backend/contrib/import-gew.py Tue Feb 22 10:34:53 2011 +0000 @@ -0,0 +1,223 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import sys +import os +import codecs +import re + +HAUPTWERT = re.compile(r"\s*([^\s]+)\s+([^\s+]+)\s+([QWDT-])") +WHITESPACE = re.compile(r"\s+") + +class KM(object): + + def __init__(self, filename): + self.filename = filename + self.load_values() + + def load_values(self): + with codecs.open(self.filename, "r", "latin-1") as f: + for line in f: + line = line.strip() + if not line or line.startswith("*"): + parts = [s.strip() for s in line.split(";")] + # TODO: Use code from import-kms.py + +class AbflussTafel(object): + + def __init__(self, filename): + self.filename = filename + self.name = "" + self.values = [] + self.load_values() + + def load_values(self): + with codecs.open(self.filename, "r", "latin-1") as f: + first = True + for line in f: + line = line.strip() + if not line: continue + if line.startswith("#! name="): + self.name = line[8:] + continue + if line.startswith("#") or line.startswith("*"): + continue + line = line.replace(",", ".") + splits = WHITESPACE.split(line) + + if len(splits) < 2 or len(splits) > 11: + continue + + w = float(splits[0]) + + shift = 0 + + if len(splits) != 11 and first: + shift = 11 - len(splits) + + for idx, q in enumerate(splits[1:]): + i_w = w + shift + idx + i_q = float(q) + w_q = (i_w/100.0, i_q/100.0) + self.values.append(w_q) + + first = False + + +class Hauptwert(object): + def __init__(self, name, value, kind): + self.name = name + self.extra = value + self.kind = kind + +class Pegel(object): + def __init__(self, name, start, stop, sta, at, html): + self.name = name + self.start = start + self.stop = stop + self.sta = sta + self.at = at + self.html = html + self.aeo = 0.0 + self.nullpunkt = 0.0 + self.km = 0.0 + self.hauptwerte = [] + self.load_hauptwerte() + self.at_data = AbflussTafel(self.at) + + def load_hauptwerte(self): + with codecs.open(self.sta, "r", "latin-1") as f: + for line_no, line in enumerate(f): + line = line.rstrip() + if line_no == 0: + first = False + name = line[16:37].strip() + line = [s.replace(",", ".") for s in line[37:].split()] + self.aeo = float(line[0]) + self.nullpunkt = float(line[1]) + print >> sys.stderr, "pegel name: '%s'" % name + print >> sys.stderr, "pegel aeo: '%f'" % self.aeo + print >> sys.stderr, "pegel nullpunkt: '%f'" % self.nullpunkt + elif line_no == 1: + self.km = float(line[29:36].strip().replace(",", ".")) + print >> sys.stderr, "km: '%f'" % self.km + else: + if not line: continue + line = line.replace(",", ".") + m = HAUPTWERT.match(line) + if not m: continue + self.hauptwerte.append(Hauptwert( + m.group(1), float(m.group(2)), m.group(3))) + +class Gewaesser(object): + + def __init__(self, name=None, b_b=None, wst=None): + self.name = name + self.b_b = b_b + self.wst = wst + self.pegel = [] + + def load_pegel(self): + dir_name = os.path.dirname(self.wst) + pegel_glt = find_file(dir_name, "PEGEL.GLT") + if not pegel_glt: + print >> sys.stderr, "Missing PEGEL.GLT for %r" % self.name + return + + print >> sys.stderr, "pegel_glt: %r" % pegel_glt + + with codecs.open(pegel_glt, "r", "latin-1") as f: + for line in f: + line = line.strip() + if not line or line.startswith("#"): + continue + # using re to cope with quoted columns, + # shlex has unicode problems. + parts = [p for p in re.split("( |\\\".*?\\\"|'.*?')", line) + if p.strip()] + if len(parts) < 7: + print >> sys.stderr, "too less colums (need 7): %r" % line + continue + + print >> sys.stderr, "%r" % parts + self.pegel.append(Pegel( + parts[0], + min(float(parts[2]), float(parts[3])), + max(float(parts[2]), float(parts[3])), + norm_path(parts[4], dir_name), + norm_path(parts[5], dir_name), + parts[6])) + + + def __repr__(self): + return u"Gewaesser(name=%r, b_b=%r, wst=%r)" % ( + self.name, self.b_b, self.wst) + +def norm_path(path, ref): + if not os.path.isabs(path): + path = os.path.normpath(os.path.join(ref, path)) + return path + +def find_file(path, what): + what = what.lower() + for filename in os.listdir(path): + p = os.path.join(path, filename) + if os.path.isfile(p) and filename.lower() == what: + return p + return None + + +def read_gew(filename): + + gewaesser = [] + + current = Gewaesser() + + filename = os.path.abspath(filename) + dirname = os.path.dirname(filename) + + with codecs.open(filename, "r", "latin-1") as f: + for line in f: + line = line.strip() + if not line or line.startswith("*"): + continue + + if line.startswith(u"Gewässer:"): + if current.name: + gewaesser.append(current) + current = Gewaesser() + current.name = line[len(u"Gewässer:"):].strip() + elif line.startswith(u"B+B-Info:"): + current.b_b = norm_path(line[len(u"B+B-Info:"):].strip(), + dirname) + elif line.startswith(u"WSTDatei:"): + current.wst = norm_path(line[len(u"WSTDatei:"):].strip(), + dirname) + + if current.name: + gewaesser.append(current) + + return gewaesser + +def main(): + + if len(sys.argv) < 2: + print >> sys.stderr, "missing gew file" + sys.exit(1) + + gew_filename = sys.argv[1] + + if not os.path.isfile(gew_filename): + print >> sys.stderr, "'%s' is not a file" % gew_filename + sys.exit(1) + + gewaesser = read_gew(gew_filename) + + for gew in gewaesser: + gew.load_pegel() + + + +if __name__ == '__main__': + main() +# vim: set fileencoding=utf-8 : diff -r 119048655872 -r 80669241956c flys-backend/contrib/import-kms.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flys-backend/contrib/import-kms.py Tue Feb 22 10:34:53 2011 +0000 @@ -0,0 +1,213 @@ +#!/usr/bin/env python + +import sys +import logging +import re +import os + +import sqlite3 as db +import locale +import codecs + +from optparse import OptionParser + +log = logging.getLogger(__name__) +log.setLevel(logging.WARNING) +log.addHandler(logging.StreamHandler(sys.stderr)) + +RANGE = re.compile("([^#]*)#(.*)") +DEFAULT_DATABASE = "flys.db" + +SQL_NEXT_ID = "SELECT coalesce(max(id), -1) + 1 FROM %s" +SQL_SELECT_ID = "SELECT id FROM %s WHERE %s = ?" +SQL_INSERT_ID = "INSERT INTO %s (id, %s) VALUES (?, ?)" + +SQL_SELECT_RANGE_ID = """ +SELECT id FROM ranges WHERE river_id = ? AND a = ? AND b = ? +""" +SQL_INSERT_RANGE_ID = """ +INSERT INTO ranges (id, river_id, a, b) VALUES (?, ?, ?, ?) +""" +SQL_SELECT_ANNOTATION_ID = """ +SELECT id FROM annotations +WHERE range_id = ? AND attribute_id = ? AND position_id = ? +""" +SQL_INSERT_ANNOTATION_ID = """ +INSERT INTO annotations (id, range_id, attribute_id, position_id) +VALUES (?, ?, ?, ?) +""" + +def encode(s): + try: + return unicode(s, "latin-1") + except UnicodeDecodeError: + return unicode.encode(s, locale.getpreferredencoding()) + +class hashabledict(dict): + def __key(self): + return tuple((k, self[k]) for k in sorted(self)) + def __hash__(self): + return hash(self.__key()) + def __eq__(self, other): + return self.__key() == other.__key() + +def cache(f): + def func(*args, **kw): + key = (args, hashabledict(kw)) + try: + return f.__cache__[key] + except KeyError: + value = f(*args, **kw) + f.__cache__[key] = value + return value + f.__cache__ = {} + return func + +NEXT_IDS = {} +def next_id(cur, relation): + idx = NEXT_IDS.get(relation) + if idx is None: + cur.execute(SQL_NEXT_ID % relation) + idx = cur.fetchone()[0] + NEXT_IDS[relation] = idx + 1 + return idx + +def get_id(cur, relation, attribute, value): + select_stmt = SQL_SELECT_ID % (relation, attribute) + #log.debug(select_stmt) + cur.execute(select_stmt, (value,)) + row = cur.fetchone() + if row: return row[0] + idx = next_id(cur, relation) + insert_stmnt = SQL_INSERT_ID % (relation, attribute) + #log.debug(insert_stmnt) + cur.execute(insert_stmnt, (idx, value)) + cur.connection.commit() + log.debug("insert %s '%s' id: '%d'" % (relation, value, idx)) + return idx + +#@cache +def get_river_id(cur, name): + return get_id(cur, "rivers", "name", name) + +#@cache +def get_attribute_id(cur, value): + return get_id(cur, "attributes", "value", value) + +#@cache +def get_position_id(cur, value): + return get_id(cur, "positions", "value", value) + +#@cache +def get_range_id(cur, river_id, a, b): + cur.execute(SQL_SELECT_RANGE_ID, (river_id, a, b)) + row = cur.fetchone() + if row: return row[0] + idx = next_id(cur, "ranges") + cur.execute(SQL_INSERT_RANGE_ID, (idx, river_id, a, b)) + cur.connection.commit() + return idx + +#@cache +def get_annotation_id(cur, range_id, attribute_id, position_id): + cur.execute(SQL_SELECT_ANNOTATION_ID, ( + range_id, attribute_id, position_id)) + row = cur.fetchone() + if row: return row[0] + idx = next_id(cur, "annotations") + cur.execute(SQL_INSERT_ANNOTATION_ID, ( + idx, range_id, attribute_id, position_id)) + cur.connection.commit() + return idx + +def files(root, accept=lambda x: True): + if os.path.isfile(root): + if accept(root): yield root + elif os.path.isdir(root): + stack = [ root ] + while stack: + cur = stack.pop() + for f in os.listdir(cur): + p = os.path.join(cur, f) + if os.path.isdir(p): + stack.append(p) + elif os.path.isfile(p) and accept(p): + yield p + +def feed_km(cur, river_id, km_file): + + log.info("processing: %s" % km_file) + + for line in codecs.open(km_file, "r", "latin-1"): + line = line.strip() + if not line or line.startswith('*'): + continue + parts = [x.strip() for x in line.split(';')] + if len(parts) < 3: + log.error("cannot process: '%s'" % line) + continue + m = RANGE.match(parts[2]) + try: + if m: + x = [float(x.replace(",", ".")) for x in m.groups()] + a, b = min(x), max(x) + if a == b: b = None + else: + a, b = float(parts[2].replace(",", ".")), None + except ValueError: + log.error("cannot process: '%s'" % line) + continue + + attribute = parts[0] + position = parts[1] + attribute_id = get_attribute_id(cur, attribute) if attribute else None + position_id = get_position_id(cur, position) if position else None + + range_id = get_range_id(cur, river_id, a, b) + + get_annotation_id(cur, range_id, attribute_id, position_id) + +def main(): + + usage = "usage: %prog [options] river km-file ..." + parser = OptionParser(usage=usage) + parser.add_option( + "-v", "--verbose", action="store_true", + dest="verbose", + help="verbose output") + parser.add_option( + "-r", "--recursive", action="store_true", + dest="recursive", default=False, + help="recursive") + parser.add_option( + "-d", "--database", action="store", + dest="database", + help="database to connect with", + default=DEFAULT_DATABASE) + + options, args = parser.parse_args() + + if options.verbose: + log.setLevel(logging.INFO) + + if len(args) < 1: + log.error("missing river argument") + sys.exit(1) + + river = unicode(args[0], locale.getpreferredencoding()) + + with db.connect(options.database) as con: + cur = con.cursor() + river_id = get_river_id(cur, river) + + for arg in args[1:]: + if options.recursive: + for km_file in files( + arg, lambda x: x.lower().endswith(".km")): + feed_km(cur, river_id, km_file) + else: + feed_km(cur, river_id, arg) + + +if __name__ == '__main__': + main()