changeset 162:80669241956c

Initial database import scripts. Not finished, yet. flys-backend/trunk@1333 c6561f87-3c4e-4783-a992-168aeb5c3f6f
author Sascha L. Teichmann <sascha.teichmann@intevation.de>
date Tue, 22 Feb 2011 10:34:53 +0000
parents 119048655872
children 4a3b2912a0cd
files flys-backend/ChangeLog flys-backend/contrib/import-gew.py flys-backend/contrib/import-kms.py
diffstat 3 files changed, 442 insertions(+), 1 deletions(-) [+]
line wrap: on
line diff
--- a/flys-backend/ChangeLog	Thu Feb 10 12:07:35 2011 +0000
+++ b/flys-backend/ChangeLog	Tue Feb 22 10:34:53 2011 +0000
@@ -1,4 +1,9 @@
-2011-02-10	Sascha L. Teichmann:
+2011-01-22	Sascha L. Teichmann	<sascha.teichmann@intevation.de>
+
+	* contrib/import-kms.py, contrib/import-gew.py: Initial scripts
+	to import data into SQLite database. They still need some work.
+
+2011-02-10	Sascha L. Teichmann	<sascha.teichmann@intevation.de>:
 
 	* doc/schema/sqlite.sql: Added initial schema for
 	FLYS database.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flys-backend/contrib/import-gew.py	Tue Feb 22 10:34:53 2011 +0000
@@ -0,0 +1,223 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+import codecs
+import re
+
+HAUPTWERT  = re.compile(r"\s*([^\s]+)\s+([^\s+]+)\s+([QWDT-])")
+WHITESPACE = re.compile(r"\s+")
+
+class KM(object):
+
+    def __init__(self, filename):
+        self.filename = filename
+        self.load_values()
+
+    def load_values(self):
+        with codecs.open(self.filename, "r", "latin-1") as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("*"):
+                    parts = [s.strip() for s in line.split(";")]
+                    # TODO: Use code from import-kms.py
+
+class AbflussTafel(object):
+
+    def __init__(self, filename):
+        self.filename = filename
+        self.name     = ""
+        self.values = []
+        self.load_values()
+
+    def load_values(self):
+        with codecs.open(self.filename, "r", "latin-1") as f:
+            first = True
+            for line in f:
+                line = line.strip()
+                if not line: continue
+                if line.startswith("#! name="):
+                    self.name = line[8:]
+                    continue
+                if line.startswith("#") or line.startswith("*"):
+                    continue
+                line = line.replace(",", ".")
+                splits = WHITESPACE.split(line)
+
+                if len(splits) < 2 or len(splits) > 11:
+                    continue
+
+                w = float(splits[0])
+
+                shift = 0
+
+                if len(splits) != 11 and first:
+                    shift = 11 - len(splits)
+
+                for idx, q in enumerate(splits[1:]):
+                    i_w = w + shift + idx
+                    i_q = float(q)
+                    w_q = (i_w/100.0, i_q/100.0)
+                    self.values.append(w_q)
+
+                first = False
+
+
+class Hauptwert(object):
+    def __init__(self, name, value, kind):
+        self.name  = name
+        self.extra = value
+        self.kind  = kind
+
+class Pegel(object):
+    def __init__(self, name, start, stop, sta, at, html):
+        self.name       = name
+        self.start      = start
+        self.stop       = stop
+        self.sta        = sta
+        self.at         = at
+        self.html       = html
+        self.aeo        = 0.0
+        self.nullpunkt  = 0.0
+        self.km         = 0.0
+        self.hauptwerte = []
+        self.load_hauptwerte()
+        self.at_data = AbflussTafel(self.at)
+
+    def load_hauptwerte(self):
+        with codecs.open(self.sta, "r", "latin-1") as f:
+            for line_no, line in enumerate(f):
+                line = line.rstrip()
+                if line_no == 0:
+                    first = False
+                    name = line[16:37].strip()
+                    line = [s.replace(",", ".") for s in line[37:].split()]
+                    self.aeo = float(line[0])
+                    self.nullpunkt = float(line[1])
+                    print >> sys.stderr, "pegel name: '%s'" % name
+                    print >> sys.stderr, "pegel aeo: '%f'" % self.aeo
+                    print >> sys.stderr, "pegel nullpunkt: '%f'" % self.nullpunkt
+                elif line_no == 1:
+                    self.km = float(line[29:36].strip().replace(",", "."))
+                    print >> sys.stderr, "km: '%f'" % self.km
+                else:
+                    if not line: continue
+                    line = line.replace(",", ".")
+                    m = HAUPTWERT.match(line)
+                    if not m: continue
+                    self.hauptwerte.append(Hauptwert(
+                        m.group(1), float(m.group(2)), m.group(3)))
+
+class Gewaesser(object):
+
+    def __init__(self, name=None, b_b=None, wst=None):
+        self.name = name
+        self.b_b = b_b
+        self.wst = wst
+        self.pegel = []
+
+    def load_pegel(self):
+        dir_name = os.path.dirname(self.wst)
+        pegel_glt = find_file(dir_name, "PEGEL.GLT")
+        if not pegel_glt:
+            print >> sys.stderr, "Missing PEGEL.GLT for %r" % self.name
+            return
+
+        print >> sys.stderr, "pegel_glt: %r" % pegel_glt
+
+        with codecs.open(pegel_glt, "r", "latin-1") as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith("#"):
+                    continue
+                # using re to cope with quoted columns,
+                # shlex has unicode problems.
+                parts = [p for p in re.split("( |\\\".*?\\\"|'.*?')", line) 
+                         if p.strip()]
+                if len(parts) < 7:
+                    print >> sys.stderr, "too less colums (need 7): %r" % line
+                    continue
+
+                print >> sys.stderr, "%r" % parts
+                self.pegel.append(Pegel(
+                    parts[0],
+                    min(float(parts[2]), float(parts[3])),
+                    max(float(parts[2]), float(parts[3])),
+                    norm_path(parts[4], dir_name),
+                    norm_path(parts[5], dir_name),
+                    parts[6]))
+
+
+    def __repr__(self):
+        return u"Gewaesser(name=%r, b_b=%r, wst=%r)" % (
+            self.name, self.b_b, self.wst)
+
+def norm_path(path, ref):
+    if not os.path.isabs(path):
+        path = os.path.normpath(os.path.join(ref, path))
+    return path
+
+def find_file(path, what):
+    what = what.lower()
+    for filename in os.listdir(path):
+        p = os.path.join(path, filename)
+        if os.path.isfile(p) and filename.lower() == what:
+            return p
+    return None
+    
+
+def read_gew(filename):
+
+    gewaesser = []
+
+    current = Gewaesser()
+
+    filename = os.path.abspath(filename)
+    dirname = os.path.dirname(filename)
+
+    with codecs.open(filename, "r", "latin-1") as f:
+        for line in f:
+            line = line.strip()
+            if not line or line.startswith("*"):
+                continue
+
+            if line.startswith(u"Gewässer:"):
+                if current.name:
+                    gewaesser.append(current)
+                    current = Gewaesser()
+                current.name = line[len(u"Gewässer:"):].strip()
+            elif line.startswith(u"B+B-Info:"):
+                current.b_b = norm_path(line[len(u"B+B-Info:"):].strip(),
+                                        dirname)
+            elif line.startswith(u"WSTDatei:"):
+                current.wst = norm_path(line[len(u"WSTDatei:"):].strip(),
+                                        dirname)
+
+        if current.name:
+            gewaesser.append(current)
+
+    return gewaesser
+
+def main():
+
+    if len(sys.argv) < 2:
+        print >> sys.stderr, "missing gew file"
+        sys.exit(1)
+
+    gew_filename = sys.argv[1]
+
+    if not os.path.isfile(gew_filename):
+        print >> sys.stderr, "'%s' is not a file" % gew_filename
+        sys.exit(1)
+
+    gewaesser = read_gew(gew_filename)
+
+    for gew in gewaesser:
+        gew.load_pegel()
+
+    
+
+if __name__ == '__main__':
+    main()
+# vim: set fileencoding=utf-8 :
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flys-backend/contrib/import-kms.py	Tue Feb 22 10:34:53 2011 +0000
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+
+import sys
+import logging
+import re
+import os
+
+import sqlite3 as db
+import locale
+import codecs
+
+from optparse import OptionParser
+
+log = logging.getLogger(__name__) 
+log.setLevel(logging.WARNING)
+log.addHandler(logging.StreamHandler(sys.stderr))
+
+RANGE = re.compile("([^#]*)#(.*)")
+DEFAULT_DATABASE = "flys.db"
+
+SQL_NEXT_ID   = "SELECT coalesce(max(id), -1) + 1 FROM %s"
+SQL_SELECT_ID = "SELECT id FROM %s WHERE %s = ?"
+SQL_INSERT_ID = "INSERT INTO %s (id, %s) VALUES (?, ?)"
+
+SQL_SELECT_RANGE_ID = """
+SELECT id FROM ranges WHERE river_id = ? AND a = ? AND b = ?
+"""
+SQL_INSERT_RANGE_ID = """
+INSERT INTO ranges (id, river_id, a, b) VALUES (?, ?, ?, ?)
+"""
+SQL_SELECT_ANNOTATION_ID = """
+SELECT id FROM annotations
+WHERE range_id = ? AND attribute_id = ? AND position_id = ?
+"""
+SQL_INSERT_ANNOTATION_ID = """
+INSERT INTO annotations (id, range_id, attribute_id, position_id) 
+VALUES (?, ?, ?, ?)
+"""
+
+def encode(s):
+    try:
+        return unicode(s, "latin-1")
+    except UnicodeDecodeError:
+        return unicode.encode(s, locale.getpreferredencoding())
+
+class hashabledict(dict):
+    def __key(self):
+        return tuple((k, self[k]) for k in sorted(self))
+    def __hash__(self):
+        return hash(self.__key())
+    def __eq__(self, other):
+        return self.__key() == other.__key()
+
+def cache(f):
+    def func(*args, **kw):
+        key = (args, hashabledict(kw))
+        try:
+            return f.__cache__[key]
+        except KeyError:
+            value = f(*args, **kw)
+            f.__cache__[key] = value
+            return value
+    f.__cache__ = {}
+    return func
+
+NEXT_IDS = {}
+def next_id(cur, relation):
+    idx = NEXT_IDS.get(relation)
+    if idx is None:
+        cur.execute(SQL_NEXT_ID % relation)
+        idx = cur.fetchone()[0]
+    NEXT_IDS[relation] = idx + 1
+    return idx
+
+def get_id(cur, relation, attribute, value):
+    select_stmt = SQL_SELECT_ID % (relation, attribute)
+    #log.debug(select_stmt)
+    cur.execute(select_stmt, (value,))
+    row = cur.fetchone()
+    if row: return row[0]
+    idx = next_id(cur, relation)
+    insert_stmnt = SQL_INSERT_ID % (relation, attribute)
+    #log.debug(insert_stmnt)
+    cur.execute(insert_stmnt, (idx, value))
+    cur.connection.commit()
+    log.debug("insert %s '%s' id: '%d'" % (relation, value, idx))
+    return idx
+
+#@cache
+def get_river_id(cur, name):
+    return get_id(cur, "rivers", "name", name)
+
+#@cache
+def get_attribute_id(cur, value):
+    return get_id(cur, "attributes", "value", value)
+
+#@cache
+def get_position_id(cur, value):
+    return get_id(cur, "positions", "value", value)
+
+#@cache
+def get_range_id(cur, river_id, a, b):
+    cur.execute(SQL_SELECT_RANGE_ID, (river_id, a, b))
+    row = cur.fetchone()
+    if row: return row[0]
+    idx = next_id(cur, "ranges")
+    cur.execute(SQL_INSERT_RANGE_ID, (idx, river_id, a, b))
+    cur.connection.commit()
+    return idx
+
+#@cache
+def get_annotation_id(cur, range_id, attribute_id, position_id):
+    cur.execute(SQL_SELECT_ANNOTATION_ID, (
+        range_id, attribute_id, position_id))
+    row = cur.fetchone()
+    if row: return row[0]
+    idx = next_id(cur, "annotations")
+    cur.execute(SQL_INSERT_ANNOTATION_ID, (
+        idx, range_id, attribute_id, position_id))
+    cur.connection.commit()
+    return idx
+
+def files(root, accept=lambda x: True):
+    if os.path.isfile(root):
+        if accept(root): yield root
+    elif os.path.isdir(root):
+        stack = [ root ]
+        while stack:
+            cur = stack.pop()
+            for f in os.listdir(cur):
+                p = os.path.join(cur, f)
+                if os.path.isdir(p):
+                    stack.append(p)
+                elif os.path.isfile(p) and accept(p):
+                    yield p
+
+def feed_km(cur, river_id, km_file):
+
+    log.info("processing: %s" % km_file)
+
+    for line in codecs.open(km_file, "r", "latin-1"):
+        line = line.strip()
+        if not line or line.startswith('*'):
+            continue
+        parts = [x.strip() for x in line.split(';')]
+        if len(parts) < 3:
+            log.error("cannot process: '%s'" % line)
+            continue
+        m = RANGE.match(parts[2])
+        try:
+            if m:
+                x = [float(x.replace(",", ".")) for x in m.groups()]
+                a, b = min(x), max(x)
+                if a == b: b = None
+            else:
+                a, b = float(parts[2].replace(",", ".")), None
+        except ValueError:
+            log.error("cannot process: '%s'" % line)
+            continue
+
+        attribute = parts[0]
+        position  = parts[1]
+        attribute_id = get_attribute_id(cur, attribute) if attribute else None
+        position_id  = get_position_id(cur, position) if position else None
+
+        range_id = get_range_id(cur, river_id, a, b)
+
+        get_annotation_id(cur, range_id, attribute_id, position_id)
+
+def main():
+
+    usage = "usage: %prog [options] river km-file ..."
+    parser = OptionParser(usage=usage)
+    parser.add_option(
+        "-v", "--verbose", action="store_true",
+        dest="verbose",
+        help="verbose output")
+    parser.add_option(
+        "-r", "--recursive", action="store_true",
+        dest="recursive", default=False,
+        help="recursive")
+    parser.add_option(
+        "-d", "--database", action="store",
+        dest="database",
+        help="database to connect with",
+        default=DEFAULT_DATABASE)
+
+    options, args = parser.parse_args()
+
+    if options.verbose:
+        log.setLevel(logging.INFO)
+    
+    if len(args) < 1:
+        log.error("missing river argument")
+        sys.exit(1)
+
+    river = unicode(args[0], locale.getpreferredencoding())
+
+    with db.connect(options.database) as con:
+        cur = con.cursor()
+        river_id = get_river_id(cur, river)
+
+        for arg in args[1:]:
+            if options.recursive:
+                for km_file in files(
+                    arg, lambda x: x.lower().endswith(".km")):
+                    feed_km(cur, river_id, km_file)
+            else:
+                feed_km(cur, river_id, arg)
+        
+
+if __name__ == '__main__':
+    main()

http://dive4elements.wald.intevation.org