Mercurial > dive4elements > river

#!/usr/bin/env python

import sys
import logging
import re
import os

import sqlite3 as db
import locale
import codecs

from optparse import OptionParser

log = logging.getLogger(__name__)
log.setLevel(logging.WARNING)
log.addHandler(logging.StreamHandler(sys.stderr))

RANGE = re.compile("([^#]*)#(.*)")
DEFAULT_DATABASE = "flys.db"

SQL_NEXT_ID   = "SELECT coalesce(max(id), -1) + 1 FROM %s"
SQL_SELECT_ID = "SELECT id FROM %s WHERE %s = ?"
SQL_INSERT_ID = "INSERT INTO %s (id, %s) VALUES (?, ?)"

SQL_SELECT_RANGE_ID = """
SELECT id FROM ranges WHERE river_id = ? AND a = ? AND b = ?
"""
SQL_INSERT_RANGE_ID = """
INSERT INTO ranges (id, river_id, a, b) VALUES (?, ?, ?, ?)
"""
SQL_SELECT_ANNOTATION_ID = """
SELECT id FROM annotations
WHERE range_id = ? AND attribute_id = ? AND position_id = ?
"""
SQL_INSERT_ANNOTATION_ID = """
INSERT INTO annotations (id, range_id, attribute_id, position_id)
VALUES (?, ?, ?, ?)
"""

def encode(s):
    try:
        return unicode(s, "latin-1")
    except UnicodeDecodeError:
        return unicode.encode(s, locale.getpreferredencoding())

class hashabledict(dict):
    def __key(self):
        return tuple((k, self[k]) for k in sorted(self))
    def __hash__(self):
        return hash(self.__key())
    def __eq__(self, other):
        return self.__key() == other.__key()

def cache(f):
    def func(*args, **kw):
        key = (args, hashabledict(kw))
        try:
            return f.__cache__[key]
        except KeyError:
            value = f(*args, **kw)
            f.__cache__[key] = value
            return value
    f.__cache__ = {}
    return func

NEXT_IDS = {}
def next_id(cur, relation):
    idx = NEXT_IDS.get(relation)
    if idx is None:
        cur.execute(SQL_NEXT_ID % relation)
        idx = cur.fetchone()[0]
    NEXT_IDS[relation] = idx + 1
    return idx

def get_id(cur, relation, attribute, value):
    select_stmt = SQL_SELECT_ID % (relation, attribute)
    #log.debug(select_stmt)
    cur.execute(select_stmt, (value,))
    row = cur.fetchone()
    if row: return row[0]
    idx = next_id(cur, relation)
    insert_stmnt = SQL_INSERT_ID % (relation, attribute)
    #log.debug(insert_stmnt)
    cur.execute(insert_stmnt, (idx, value))
    cur.connection.commit()
    log.debug("insert %s '%s' id: '%d'" % (relation, value, idx))
    return idx

#@cache
def get_river_id(cur, name):
    return get_id(cur, "rivers", "name", name)

#@cache
def get_attribute_id(cur, value):
    return get_id(cur, "attributes", "value", value)

#@cache
def get_position_id(cur, value):
    return get_id(cur, "positions", "value", value)

#@cache
def get_range_id(cur, river_id, a, b):
    cur.execute(SQL_SELECT_RANGE_ID, (river_id, a, b))
    row = cur.fetchone()
    if row: return row[0]
    idx = next_id(cur, "ranges")
    cur.execute(SQL_INSERT_RANGE_ID, (idx, river_id, a, b))
    cur.connection.commit()
    return idx

#@cache
def get_annotation_id(cur, range_id, attribute_id, position_id):
    cur.execute(SQL_SELECT_ANNOTATION_ID, (
        range_id, attribute_id, position_id))
    row = cur.fetchone()
    if row: return row[0]
    idx = next_id(cur, "annotations")
    cur.execute(SQL_INSERT_ANNOTATION_ID, (
        idx, range_id, attribute_id, position_id))
    cur.connection.commit()
    return idx

def files(root, accept=lambda x: True):
    if os.path.isfile(root):
        if accept(root): yield root
    elif os.path.isdir(root):
        stack = [ root ]
        while stack:
            cur = stack.pop()
            for f in os.listdir(cur):
                p = os.path.join(cur, f)
                if os.path.isdir(p):
                    stack.append(p)
                elif os.path.isfile(p) and accept(p):
                    yield p

def feed_km(cur, river_id, km_file):

    log.info("processing: %s" % km_file)

    for line in codecs.open(km_file, "r", "latin-1"):
        line = line.strip()
        if not line or line.startswith('*'):
            continue
        parts = [x.strip() for x in line.split(';')]
        if len(parts) < 3:
            log.error("cannot process: '%s'" % line)
            continue
        m = RANGE.match(parts[2])
        try:
            if m:
                x = [float(x.replace(",", ".")) for x in m.groups()]
                a, b = min(x), max(x)
                if a == b: b = None
            else:
                a, b = float(parts[2].replace(",", ".")), None
        except ValueError:
            log.error("cannot process: '%s'" % line)
            continue

        attribute = parts[0]
        position  = parts[1]
        attribute_id = get_attribute_id(cur, attribute) if attribute else None
        position_id  = get_position_id(cur, position) if position else None

        range_id = get_range_id(cur, river_id, a, b)

        get_annotation_id(cur, range_id, attribute_id, position_id)

def main():

    usage = "usage: %prog [options] river km-file ..."
    parser = OptionParser(usage=usage)
    parser.add_option(
        "-v", "--verbose", action="store_true",
        dest="verbose",
        help="verbose output")
    parser.add_option(
        "-r", "--recursive", action="store_true",
        dest="recursive", default=False,
        help="recursive")
    parser.add_option(
        "-d", "--database", action="store",
        dest="database",
        help="database to connect with",
        default=DEFAULT_DATABASE)

    options, args = parser.parse_args()

    if options.verbose:
        log.setLevel(logging.INFO)

    if len(args) < 1:
        log.error("missing river argument")
        sys.exit(1)

    river = unicode(args[0], locale.getpreferredencoding())

    with db.connect(options.database) as con:
        cur = con.cursor()
        river_id = get_river_id(cur, river)

        for arg in args[1:]:
            if options.recursive:
                for km_file in files(
                    arg, lambda x: x.lower().endswith(".km")):
                    feed_km(cur, river_id, km_file)
            else:
                feed_km(cur, river_id, arg)


if __name__ == '__main__':
    main()
author	Ingo Weinzierl <weinzierl.ingo@googlemail.com>
date	Thu, 25 Oct 2012 14:01:46 +0200
parents	80669241956c
children