changeset 1206:c7370734b872

Prevent parsing and storing PRF duplicates. flys-backend/trunk@2315 c6561f87-3c4e-4783-a992-168aeb5c3f6f
author Sascha L. Teichmann <sascha.teichmann@intevation.de>
date Mon, 11 Jul 2011 09:31:52 +0000
parents 5f1506fc7636
children 7121a40671ff
files flys-backend/ChangeLog flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java flys-backend/src/main/java/de/intevation/flys/importer/PRFParser.java flys-backend/src/main/java/de/intevation/flys/utils/FileTools.java
diffstat 4 files changed, 178 insertions(+), 26 deletions(-)
--- a/flys-backend/ChangeLog	Fri Jul 08 09:45:48 2011 +0000
+++ b/flys-backend/ChangeLog	Mon Jul 11 09:31:52 2011 +0000
@@ -1,3 +1,19 @@
+2011-07-08	Sascha L. Teichmann	<sascha.teichmann@intevation.de>
+
+	* src/main/java/de/intevation/flys/utils/FileTools.java:
+	  Added a class HashedFile to compare files by their length
+	  and a message digest. The digest can be set with the system
+	  property "flys.backend.file.cmp.digest" and defaults to MD5.
+	  Useful to detect file duplicates.
+
+	* src/main/java/de/intevation/flys/importer/PRFParser.java: Added
+	  a method prfAccept(File) to the Callback interface to check if
+	  a found PRF file should be parsed. Useful to skip file duplicates.
+
+	* src/main/java/de/intevation/flys/importer/ImportRiver.java:
+	  Use HashedFile and a PRFParser.Callback to prevent parsing
+	  of PRF duplicates.
+
 2011-07-08	Sascha L. Teichmann	<sascha.teichmann@intevation.de>
 
 	* doc/schema/postgresql.sql: Misspelled sequence.
--- a/flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java	Fri Jul 08 09:45:48 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java	Mon Jul 11 09:31:52 2011 +0000
@@ -4,6 +4,8 @@
 
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
+import java.util.HashSet;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.Calendar;
@@ -16,12 +18,12 @@
 import de.intevation.flys.model.River;
 
 import de.intevation.flys.utils.FileTools;
+import de.intevation.flys.utils.FileTools.HashedFile;
 
 import org.hibernate.Session;
 import org.hibernate.Query;
 
 public class ImportRiver
-implements   PRFParser.Callback
 {
     private static Logger log = Logger.getLogger(ImportRiver.class);
 
@@ -362,7 +364,44 @@
             .getParentFile()  // Basisdaten
             .getParentFile()  // Hydrologie
             .getParentFile(); // <river>
-        parser.parsePRFs(riverDir, this);
+
+        parser.parsePRFs(riverDir, new PRFParser.Callback() {
+
+            Set<HashedFile> prfs = new HashSet<HashedFile>();
+
+            @Override
+            public boolean prfAccept(File file) {
+                HashedFile hf = new HashedFile(file);
+                boolean success = prfs.add(hf);
+                if (!success) {
+                    log.warn("PRF file '" + file + "' seems to be a duplicate.");
+                }
+                return success;
+            }
+
+            @Override
+            public void prfParsed(PRFParser parser) {
+                log.debug("callback from PRF parser");
+
+                String  description = parser.getDescription();
+                Integer year        = parser.getYear();
+                ImportTimeInterval ti = year != null
+                    ? new ImportTimeInterval(yearToDate(year))
+                    : null;
+
+                List<ImportCrossSectionLine> lines =
+                    new ArrayList<ImportCrossSectionLine>();
+
+                for (Map.Entry<Double, List<XY>> entry: parser.getData().entrySet()) {
+                    BigDecimal km     = new BigDecimal(entry.getKey());
+                    List<XY>   points = entry.getValue();
+                    lines.add(new ImportCrossSectionLine(km, points));
+                }
+
+                crossSections.add(new ImportCrossSection(
+                    ImportRiver.this, description, ti, lines));
+            }
+        });
     }
 
     public static Date yearToDate(int year) {
@@ -373,29 +412,6 @@
         return cal.getTime();
     }
 
-    @Override
-    public void prfParsed(PRFParser parser) {
-        log.debug("callback from PRF parser");
-
-        String  description = parser.getDescription();
-        Integer year        = parser.getYear();
-        ImportTimeInterval ti = year != null
-            ? new ImportTimeInterval(yearToDate(year))
-            : null;
-
-        List<ImportCrossSectionLine> lines =
-            new ArrayList<ImportCrossSectionLine>();
-
-        for (Map.Entry<Double, List<XY>> entry: parser.getData().entrySet()) {
-            BigDecimal km     = new BigDecimal(entry.getKey());
-            List<XY>   points = entry.getValue();
-            lines.add(new ImportCrossSectionLine(km, points));
-        }
-
-        crossSections.add(new ImportCrossSection(
-            this, description, ti, lines));
-    }
-
     public void storeDependencies() {
         storeAnnotations();
         storeCrossSections();
--- a/flys-backend/src/main/java/de/intevation/flys/importer/PRFParser.java	Fri Jul 08 09:45:48 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/importer/PRFParser.java	Mon Jul 11 09:31:52 2011 +0000
@@ -40,7 +40,8 @@
     public static final int MAX_YEAR = 2100;
 
     public interface Callback {
-        void prfParsed(PRFParser parser);
+        boolean prfAccept(File file);
+        void    prfParsed(PRFParser parser);
     } // interface Callback
 
     public static class DataFormat {
@@ -397,6 +398,7 @@
             }
             else if (file.isFile()
                 && file.getName().toLowerCase().endsWith(".prf")
+                && (callback == null || callback.prfAccept(file))
             ) {
                 reset();
                 boolean success = parse(file);
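
Since the Callback interface now declares two methods, every implementation outside of ImportRiver also has to provide prfAccept(File). The following sketch is not part of this changeset; the class name AcceptAllCallback is made up for illustration and simply keeps the old parse-everything behaviour:

    import java.io.File;

    import de.intevation.flys.importer.PRFParser;

    // Hypothetical example: a Callback that accepts every found PRF file,
    // i.e. the behaviour before prfAccept() was introduced.
    public class AcceptAllCallback implements PRFParser.Callback {

        @Override
        public boolean prfAccept(File file) {
            // No duplicate filtering; returning false would skip the file.
            return true;
        }

        @Override
        public void prfParsed(PRFParser parser) {
            // Consume the parsed data here, e.g. parser.getDescription(),
            // parser.getYear() and parser.getData().
        }
    }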
--- a/flys-backend/src/main/java/de/intevation/flys/utils/FileTools.java	Fri Jul 08 09:45:48 2011 +0000
+++ b/flys-backend/src/main/java/de/intevation/flys/utils/FileTools.java	Mon Jul 11 09:31:52 2011 +0000
@@ -1,15 +1,28 @@
 package de.intevation.flys.utils;
 
 import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.FileInputStream;
 
 import java.util.Stack;
+import java.util.List;
+import java.util.Set;
+import java.util.HashSet;
+import java.util.ArrayList;
 
 import org.apache.log4j.Logger;
 
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
 public class FileTools
 {
     private static Logger log = Logger.getLogger(FileTools.class);
 
+    public static final String DIGEST =
+        System.getProperty("flys.backend.file.cmp.digest", "MD5");
+
     private FileTools() {
     }
 
@@ -74,5 +87,110 @@
 
         return curr;
     }
+
+    public static class HashedFile 
+    implements Comparable<HashedFile>
+    {
+        protected File    file;
+        protected long    length;
+        protected byte [] hash;
+
+        public HashedFile(File file) {
+            this.file = file;
+            length = file.length();
+        }
+
+        public File getFile() {
+            return file;
+        }
+
+        protected byte [] getHash() {
+            if (hash == null) {
+                InputStream in = null;
+
+                try {
+                    in = new FileInputStream(file);
+
+                    MessageDigest digest = MessageDigest.getInstance(DIGEST);
+
+                    byte [] buf = new byte[40*1024];
+                    int r;
+
+                    while ((r = in.read(buf)) >= 0) {
+                        digest.update(buf, 0, r);
+                    }
+
+                    hash = digest.digest();
+                }
+                catch (IOException ioe) {
+                    log.error(ioe);
+                    hash = new byte[0];
+                }
+                catch (NoSuchAlgorithmException nsae) {
+                    log.error(nsae);
+                    hash = new byte[0];
+                }
+                finally {
+                    if (in != null) {
+                        try {
+                            in.close();
+                        }
+                        catch (IOException ioe) {
+                            log.error(ioe);
+                        }
+                    }
+                }
+            }
+            return hash;
+        }
+
+        @Override
+        public int compareTo(HashedFile other) {
+            if (length < other.length) return -1;
+            if (length > other.length) return +1;
+            // Same length: fall back to comparing the message digests,
+            // otherwise distinct files of equal size would count as equal.
+            return compare(getHash(), other.getHash());
+        }
+
+        private static int compare(byte [] a, byte [] b) {
+            if (a.length < b.length) return -1;
+            if (a.length > b.length) return +1;
+            for (int i = 0; i < a.length; ++i) {
+                int x = a[i] & 0xff;
+                int y = b[i] & 0xff;
+                if (x < y) return -1;
+                if (x > y) return +1;
+            }
+            return 0;
+        }
+
+        @Override
+        public boolean equals(Object other) {
+            return other instanceof HashedFile 
+                && ((HashedFile)other).compareTo(this) == 0;
+        }
+
+        @Override
+        public int hashCode() {
+            return (int)(length ^ (length >>> 32));
+        }
+    } // class HashedFile
+
+    public static List<File> uniqueFiles(List<File> files) {
+
+        Set<HashedFile> set = new HashSet<HashedFile>();
+
+        for (File file: files) {
+            if (!set.add(new HashedFile(file))) {
+                log.warn("file '" + file + "' is a duplicate.");
+            }
+        }
+
+        ArrayList<File> out = new ArrayList<File>(set.size());
+        for (HashedFile hf: set) {
+            out.add(hf.file);
+        }
+
+        return out;
+    }
 }
 // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :
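
The new helpers in FileTools can also be used on their own. The sketch below is only an illustration and not part of this changeset (the class DedupExample and the file names are made up); it shows how the digest algorithm is selected via the system property and how a list of files is reduced to unique entries. Note that DIGEST is read once when FileTools is initialized, so the property has to be set before the class is first used.

    import java.io.File;

    import java.util.Arrays;
    import java.util.List;

    import de.intevation.flys.utils.FileTools;
    import de.intevation.flys.utils.FileTools.HashedFile;

    public class DedupExample {

        public static void main(String[] args) {
            // Select the digest before FileTools is loaded; without this
            // property the comparison falls back to MD5.
            System.setProperty("flys.backend.file.cmp.digest", "SHA-1");

            List<File> candidates = Arrays.asList(
                new File("profiles/a.prf"),
                new File("profiles/copy-of-a.prf"),
                new File("profiles/b.prf"));

            // Files with the same length and the same digest are logged as
            // duplicates and kept only once in the result.
            List<File> unique = FileTools.uniqueFiles(candidates);
            System.out.println(unique.size() + " unique file(s)");

            // HashedFile can also be used directly for pairwise comparison.
            HashedFile a = new HashedFile(candidates.get(0));
            HashedFile b = new HashedFile(candidates.get(1));
            System.out.println("duplicate: " + a.equals(b));
        }
    }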

http://dive4elements.wald.intevation.org