changeset 765:763c4137d6e1

Added classification of annotation types. Needs testing! flys-backend/trunk@2162 c6561f87-3c4e-4783-a992-168aeb5c3f6f
author Sascha L. Teichmann <sascha.teichmann@intevation.de>
date Sun, 19 Jun 2011 12:26:12 +0000 (2011-06-19)
parents e09f00ecb915
children aa9e3da95c31
files flys-backend/ChangeLog flys-backend/doc/annotation-types.xml flys-backend/src/main/java/de/intevation/flys/importer/AnnotationClassifier.java flys-backend/src/main/java/de/intevation/flys/importer/AnnotationsParser.java flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java flys-backend/src/main/java/de/intevation/flys/importer/Importer.java flys-backend/src/main/java/de/intevation/flys/importer/InfoGewParser.java
diffstat 7 files changed, 404 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/flys-backend/ChangeLog	Sat Jun 18 20:13:31 2011 +0000
+++ b/flys-backend/ChangeLog	Sun Jun 19 12:26:12 2011 +0000
@@ -1,4 +1,50 @@
-2011-06-14	Sascha L. Teichmann	<sascha.teichmann@intevation.de>
+2011-06-19	Sascha L. Teichmann	<sascha.teichmann@intevation.de>
+
+	Added classification of annotation types. Needs testing!
+
+	* doc/annotation-types.xml: New. Rules to classify the different
+	  types of annotations. The classification works like this:
+
+	  There are unique types like 'Bruecke', 'Pegel' and so on.
+	  They are defined in the /annotation/types section and
+	  identified by their name. One of the types can be set 
+	  as the default type if no rule applies.
+
+	  In the /annotation/patterns section there are two types of patterns.
+
+	  1 - file pattern: If a KM file is opened its filename is
+	      matched against the regular expressions of these
+		  patterns. If a match is found the corresponding type
+		  is used as the default type in the open file.
+		  If no match is found the global default type is used
+		  as the default type.
+
+	  2 - line patterns: For each line of an open KM file these
+	      patterns are applied to find a match. If a match is
+		  found the corresponding type is used as the type of
+		  the annotation. If no match is found the default
+		  type of the file is assumed to be the right type.
+		  For the default type of a file see 1.
+
+	* src/main/java/de/intevation/flys/importer/Importer.java:
+	  To activate the annotation type classification set
+	  the system property
+
+	      'flys.backend.importer.annotation.types'
+
+	  to the path of an XML file structured like the annotation-types.xml
+	  file. If the system property is not set no classification
+	  is done.
+
+	* src/main/java/de/intevation/flys/importer/AnnotationClassifier.java:
+	  New. Implements the classification.
+	  
+	* src/main/java/de/intevation/flys/importer/AnnotationsParser.java,
+	  src/main/java/de/intevation/flys/importer/InfoGewParser.java,
+	  src/main/java/de/intevation/flys/importer/ImportRiver.java:
+	  Looped through the annotation type classification.
+
+2011-06-18	Sascha L. Teichmann	<sascha.teichmann@intevation.de>
 
 	* src/main/java/de/intevation/flys/model/River.java:
 	  Added method to find gauge by a position lying in its range.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flys-backend/doc/annotation-types.xml	Sun Jun 19 12:26:12 2011 +0000
@@ -0,0 +1,56 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<annotation>
+    <types>
+        <type name="Abzweigung"/>
+        <type name="Berechnungsstrecke"/>
+        <type name="Brücke"/>
+        <type name="Bundesland"/>
+        <type name="Deich"/>
+        <type name="Einmündung"/>
+        <type name="Fähre"/>
+        <type name="Gemeinde"/>
+        <type name="Grenze"/>
+        <type name="Hafen"/>
+        <type name="HW-Schutz"/>
+        <type name="Landkreis"/>
+        <type name="Meldestelle"/>
+        <type name="Messstelle"/>
+        <type name="Pegel"/>
+        <type name="Stauwehr"/>
+        <type name="Staatsgrenze"/>
+        <type name="Staat"/>
+        <type name="WSA"/>
+        <type name="Zufluß"/>
+        <type name="Sonstige" default="true"/>
+    </types>
+
+    <patterns>
+        <file pattern="^Brücken$" type="Brücke"/>
+        <file pattern="^Deich.*$" type="Deich"/>
+        <file pattern="^Hafen$" type="Hafen"/>
+        <file pattern="^Pegel-alle$" type="Pegel"/>
+        <file pattern="^Pegel$" type="Pegel"/>
+        <file pattern="^Wehr$" type="Stauwehr"/>
+        <file pattern="^Stauwehr$" type="Stauwehr"/>
+        <file pattern="^Zufluß$" type="Zufluß"/>
+
+        <line pattern="^Abz\.?[:\s].*$" type="Abzweigung"/>
+        <line pattern="^Berechnungsstrecke.*$" type="Berechnungsstrecke"/>
+        <line pattern="^Brücke[:\s].*$" type="Brücke"/>
+        <line pattern="^Bundesland[:\s].*$" type="Bundesland"/>
+        <line pattern="^Einmündung[:\s].*$" type="Einmündung"/>
+        <line pattern="^Fähre[:\s].*$" type="Fähre"/>
+        <line pattern="^Gemeinde[:\s].*$" type="Gemeinde"/>
+        <line pattern="^Grenze[:\s].*$" type="Grenze"/>
+        <line pattern="^Hafen[:\s].*$" type="Hafen"/>
+        <line pattern="^HW-Schutz[:\s].*$" type="HW-Schutz"/>
+        <line pattern="^Landkreis[:\s].*$" type="Landkreis"/>
+        <line pattern="^Meldestelle[:\s].*$" type="Meldestelle"/>
+        <line pattern="^Pegel[:\s].*$" type="Pegel"/>
+        <line pattern="^Staatsgrenze[:\s].*$" type="Staatsgrenze"/>
+        <line pattern="^Staat[:\s].*$" type="Staat"/>
+        <line pattern="^Wehr[:\s].*$" type="Stauwehr"/>
+        <line pattern="^WSA[:\s].*$" type="WSA"/>
+        <line pattern="^Zufluß[:\s].*$" type="Zufluß"/>
+    </patterns>
+</annotation>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/AnnotationClassifier.java	Sun Jun 19 12:26:12 2011 +0000
@@ -0,0 +1,233 @@
+package de.intevation.flys.importer;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Element;
+
+import javax.xml.xpath.XPathConstants;
+
+import java.io.File;
+import java.io.IOException;
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.List;
+import java.util.ArrayList;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+import org.apache.log4j.Logger;
+
+import de.intevation.artifacts.common.utils.XMLUtils;
+
+/**
+ * Classifies annotations by matching KM file names and annotation
+ * lines against the regular expressions of an XML rules document.
+ */
+public class AnnotationClassifier
+{
+    private static final Logger log =
+        Logger.getLogger(AnnotationClassifier.class);
+
+    // The root element of the rules document is <annotation>.
+    public static final String TYPES_XPATH =
+        "/annotation/types/type";
+
+    public static final String FILE_PATTERNS_XPATH =
+        "/annotation/patterns/file";
+
+    public static final String DESCRIPTION_PATTERNS_XPATH =
+        "/annotation/patterns/line";
+
+    /** A compiled pattern together with the type it stands for. */
+    public static class Pair {
+
+        protected Pattern              pattern;
+        protected ImportAnnotationType annType;
+
+        public Pair(Pattern pattern, ImportAnnotationType annType) {
+            this.pattern = pattern;
+            this.annType = annType;
+        }
+
+        /** Returns the type if s matches as a whole, null otherwise. */
+        public ImportAnnotationType match(String s) {
+            Matcher m = pattern.matcher(s);
+            return m.matches() ? annType : null;
+        }
+    } // class Pair
+
+    // Never null, so the no-arg constructor yields an empty rule set.
+    protected Map<String, ImportAnnotationType> types =
+        new HashMap<String, ImportAnnotationType>();
+    protected List<Pair> filePatterns = new ArrayList<Pair>();
+    protected List<Pair> descPatterns = new ArrayList<Pair>();
+
+    /** Type marked default="true" in the rules; null if none is. */
+    protected ImportAnnotationType defaultType;
+
+    /** Creates a classifier without any rules. */
+    public AnnotationClassifier() {
+    }
+
+    /** Creates a classifier from the given rules document. */
+    public AnnotationClassifier(Document rules) {
+        buildRules(rules);
+    }
+
+    /** Types must be built first: the patterns refer to them by name. */
+    protected void buildRules(Document rules) {
+        buildTypes(rules);
+        buildFilePatterns(rules);
+        buildDescriptionPatterns(rules);
+    }
+
+    /** Registers every named type element and remembers the default. */
+    protected void buildTypes(Document rules) {
+
+        NodeList typeList = (NodeList)XMLUtils.xpath(
+            rules, TYPES_XPATH, XPathConstants.NODESET, null);
+
+        if (typeList == null) {
+            log.info("no rules found.");
+            return;
+        }
+
+        for (int i = 0, N = typeList.getLength(); i < N; ++i) {
+            Element typeElement = (Element)typeList.item(i);
+            String name = typeElement.getAttribute("name");
+            if (name.length() == 0) {
+                log.warn("rule has no name");
+                continue;
+            }
+
+            ImportAnnotationType aic = new ImportAnnotationType(name);
+
+            types.put(name, aic);
+
+            if (typeElement.getAttribute("default").equals("true")) {
+                defaultType = aic;
+            }
+        }
+    }
+
+    /** Builds the patterns applied to the names of KM files. */
+    protected void buildFilePatterns(Document rules) {
+        buildPatterns(rules, FILE_PATTERNS_XPATH, "file", filePatterns);
+    }
+
+    /** Builds the patterns applied to the lines of KM files. */
+    protected void buildDescriptionPatterns(Document rules) {
+        buildPatterns(
+            rules, DESCRIPTION_PATTERNS_XPATH, "line", descPatterns);
+    }
+
+    /** Compiles the elements selected by xpath into target. */
+    private void buildPatterns(
+        Document rules, String xpath, String kind, List<Pair> target
+    ) {
+        NodeList patternList = (NodeList)XMLUtils.xpath(
+            rules, xpath, XPathConstants.NODESET, null);
+
+        if (patternList == null) {
+            log.info("no " + kind + " patterns found.");
+            return;
+        }
+
+        for (int i = 0, N = patternList.getLength(); i < N; ++i) {
+            Pair pair = buildPair((Element)patternList.item(i));
+            if (pair != null) {
+                target.add(pair);
+            }
+        }
+    }
+
+    /**
+     * Builds a pair from the 'pattern' and 'type' attributes of element.
+     * Returns null (after logging why) if the element is unusable.
+     */
+    protected Pair buildPair(Element element) {
+        String pattern = element.getAttribute("pattern");
+        String type    = element.getAttribute("type");
+
+        if (pattern.length() == 0) {
+            log.warn("pattern has no 'pattern' attribute.");
+            return null;
+        }
+
+        if (type.length() == 0) {
+            log.warn("pattern has no 'type' attribute.");
+            return null;
+        }
+
+        ImportAnnotationType annType = types.get(type);
+
+        if (annType == null) {
+            log.warn("pattern has unknown type '" + type + "'");
+            return null;
+        }
+
+        try {
+            Pattern p = Pattern.compile(pattern,
+                Pattern.CASE_INSENSITIVE|Pattern.UNICODE_CASE);
+            return new Pair(p, annType);
+        }
+        catch (IllegalArgumentException iae) {
+            log.warn("pattern '" + pattern + "' is invalid.", iae);
+            return null;
+        }
+    }
+
+    public ImportAnnotationType getDefaultType() {
+        return defaultType;
+    }
+
+    public ImportAnnotationType classifyFile(String filename) {
+        return classifyFile(filename, null);
+    }
+
+    /**
+     * Returns the type of the first file pattern matching filename
+     * (without a trailing ".km", in any case), or def if none matches.
+     */
+    public ImportAnnotationType classifyFile(
+        String               filename,
+        ImportAnnotationType def
+    ) {
+        if (filename.toLowerCase().endsWith(".km")) {
+            filename = filename.substring(0, filename.length()-3);
+        }
+        return classify(filePatterns, filename, def);
+    }
+
+    public ImportAnnotationType classifyDescription(String description) {
+        return classifyDescription(description, null);
+    }
+
+    /**
+     * Classifies one annotation line of a KM file.
+     *
+     * @return the type of the first matching pattern, else def
+     */
+    public ImportAnnotationType classifyDescription(
+        String               description,
+        ImportAnnotationType def
+    ) {
+        return classify(descPatterns, description, def);
+    }
+
+    /** Returns the type of the first pair matching s, else def. */
+    private static ImportAnnotationType classify(
+        List<Pair> pairs, String s, ImportAnnotationType def
+    ) {
+        for (Pair pair: pairs) {
+            ImportAnnotationType annType = pair.match(s);
+            if (annType != null) {
+                return annType;
+            }
+        }
+        return def;
+    }
+}
+// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :
--- a/flys-backend/src/main/java/de/intevation/flys/importer/AnnotationsParser.java	Sat Jun 18 20:13:31 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/AnnotationsParser.java	Sun Jun 19 12:26:12 2011 +0000
@@ -31,15 +31,30 @@
     protected HashMap<String, ImportAttribute> attributes;
     protected HashMap<String, ImportPosition>  positions;
     protected TreeSet<ImportAnnotation>        annotations;
+    protected AnnotationClassifier             classifier;
 
     public AnnotationsParser() {
+        this(null); // no classifier: annotation types stay null
+    }
+
+    public AnnotationsParser(AnnotationClassifier classifier) {
         attributes  = new HashMap<String, ImportAttribute>();
         positions   = new HashMap<String, ImportPosition>();
         annotations = new TreeSet<ImportAnnotation>();
+        this.classifier = classifier; // null disables classification
     }
 
     public void parseFile(File file) throws IOException {
         log.info("parsing km file: '" + file + "'");
+
+        ImportAnnotationType defaultIAT = null;
+
+        if (classifier != null) {
+            defaultIAT = classifier.classifyFile(
+                file.getName(),
+                classifier.getDefaultType());
+        }
+
         LineNumberReader in = null;
         try {
             in =
@@ -124,7 +139,9 @@
 
                 ImportRange range = new ImportRange(from, to);
 
-                ImportAnnotationType type = null; // TODO: do classification
+                ImportAnnotationType type = classifier != null
+                    ? classifier.classifyDescription(line, defaultIAT)
+                    : null;
 
                 ImportAnnotation annotation = new ImportAnnotation(
                     attribute, position, range, edge, type);
--- a/flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java	Sat Jun 18 20:13:31 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java	Sun Jun 19 12:26:12 2011 +0000
@@ -60,14 +60,21 @@
 
     protected ImportWst wst;
 
+    protected AnnotationClassifier annotationClassifier;
+
     protected River peer;
 
     public ImportRiver() {
+        this(null); // null: annotations are parsed without a classifier
+    }
+
+    public ImportRiver(AnnotationClassifier annotationClassifier) {
         extraWsts       = new ArrayList<ImportWst>();
         fixations       = new ArrayList<ImportWst>();
         officialLines   = new ArrayList<ImportWst>();
         floodWater      = new ArrayList<ImportWst>();
         floodProtection = new ArrayList<ImportWst>();
+        this.annotationClassifier = annotationClassifier;
     }
 
     public ImportRiver(String name, File wstFile, File bbInfoFile) {
@@ -330,7 +337,8 @@
 
     public void parseAnnotations() throws IOException {
         File riverDir = wstFile.getParentFile().getParentFile();
-        AnnotationsParser aparser = new AnnotationsParser();
+        AnnotationsParser aparser =
+            new AnnotationsParser(annotationClassifier);
         aparser.parse(riverDir);
 
         annotations = aparser.getAnnotations();
--- a/flys-backend/src/main/java/de/intevation/flys/importer/Importer.java	Sat Jun 18 20:13:31 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/Importer.java	Sun Jun 19 12:26:12 2011 +0000
@@ -1,5 +1,7 @@
 package de.intevation.flys.importer;
 
+import de.intevation.artifacts.common.utils.XMLUtils;
+
 import java.io.File;
 import java.io.IOException;
 
@@ -12,6 +14,8 @@
 import org.hibernate.Transaction;
 import org.hibernate.HibernateException;
 
+import org.w3c.dom.Document;
+
 public class Importer
 {
     private static Logger log = Logger.getLogger(Importer.class);
@@ -19,6 +23,9 @@
     public static final boolean DRY_RUN =
         Boolean.getBoolean("flys.backend.importer.dry.run");
 
+    public static final String ANNOTATION_TYPES =
+        "flys.backend.importer.annotation.types";
+
     protected List<ImportRiver> rivers;
 
     public Importer() {
@@ -79,9 +86,34 @@
         }
     }
 
+    /**
+     * Builds the classifier from the XML file named by the system property
+     * ANNOTATION_TYPES. Returns null if it is unset or the file is unusable.
+     */
+    public static AnnotationClassifier getAnnotationClassifier() {
+        String path = System.getProperty(ANNOTATION_TYPES);
+        if (path == null) {
+            return null;
+        }
+
+        File xml = new File(path);
+        if (!xml.isFile() || !xml.canRead()) {
+            log.warn("annotation type file '" + xml + "' is not readable.");
+            return null;
+        }
+
+        Document rules = XMLUtils.parseDocument(xml);
+        if (rules != null) {
+            return new AnnotationClassifier(rules);
+        }
+        log.warn("cannot parse annotation types file.");
+        return null;
+    }
+
     public static void main(String [] args) {
 
-        InfoGewParser infoGewParser = new InfoGewParser();
+        InfoGewParser infoGewParser = new InfoGewParser(
+            getAnnotationClassifier());
 
         for (String gew: args) {
             log.info("parsing info gew file: " + gew);
--- a/flys-backend/src/main/java/de/intevation/flys/importer/InfoGewParser.java	Sat Jun 18 20:13:31 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/InfoGewParser.java	Sun Jun 19 12:26:12 2011 +0000
@@ -34,8 +34,15 @@
 
     protected ArrayList<ImportRiver> rivers;
 
+    protected AnnotationClassifier annotationClassifier;
+
     public InfoGewParser() {
+        this(null); // null: rivers are created without a classifier
+    }
+
+    public InfoGewParser(AnnotationClassifier annotationClassifier) {
         rivers = new ArrayList<ImportRiver>();
+        this.annotationClassifier = annotationClassifier;
     }
 
     public List<ImportRiver> getRivers() {
@@ -52,7 +59,7 @@
 
         File root = file.getParentFile();
 
-        ImportRiver importRiver = new ImportRiver();
+        ImportRiver importRiver = new ImportRiver(annotationClassifier);
         try {
             in =
                 new LineNumberReader(

http://dive4elements.wald.intevation.org