# HG changeset patch # User Sascha L. Teichmann # Date 1310376712 0 # Node ID c7370734b872693034bb8f3e26033904c5ad3dc6 # Parent 5f1506fc763633918dcbdea0c4effcc874ed7f32 Prevent parsing and storing PRF duplicates. flys-backend/trunk@2315 c6561f87-3c4e-4783-a992-168aeb5c3f6f diff -r 5f1506fc7636 -r c7370734b872 flys-backend/ChangeLog --- a/flys-backend/ChangeLog Fri Jul 08 09:45:48 2011 +0000 +++ b/flys-backend/ChangeLog Mon Jul 11 09:31:52 2011 +0000 @@ -1,3 +1,19 @@ +2011-07-08 Sascha L. Teichmann + + * src/main/java/de/intevation/flys/utils/FileTools.java: + Added a class HashedFile to compare files by their length + and a message digest. Digest can be set with system property + "flys.backend.file.cmp.digest" and defaults to MD5. Useful to + detect file duplicates. + + * src/main/java/de/intevation/flys/importer/PRFParser.java: Added + method prfAccept(File) to callback to check if a found PRF file + should be parsed. Useful to prevent parsing file duplicates. + + * src/main/java/de/intevation/flys/importer/ImportRiver.java: + Use the HashedFile and the PRFParser.Callback to prevent + parsing of PRF duplicates. + 2011-07-08 Sascha L. Teichmann * doc/schema/postgresql.sql: Misspelled sequence. 
diff -r 5f1506fc7636 -r c7370734b872 flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java --- a/flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java Fri Jul 08 09:45:48 2011 +0000 +++ b/flys-backend/src/main/java/de/intevation/flys/importer/ImportRiver.java Mon Jul 11 09:31:52 2011 +0000 @@ -4,6 +4,8 @@ import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.HashSet; import java.util.ArrayList; import java.util.Date; import java.util.Calendar; @@ -16,12 +18,12 @@ import de.intevation.flys.model.River; import de.intevation.flys.utils.FileTools; +import de.intevation.flys.utils.FileTools.HashedFile; import org.hibernate.Session; import org.hibernate.Query; public class ImportRiver -implements PRFParser.Callback { private static Logger log = Logger.getLogger(ImportRiver.class); @@ -362,7 +364,44 @@ .getParentFile() // Basisdaten .getParentFile() // Hydrologie .getParentFile(); // - parser.parsePRFs(riverDir, this); + + parser.parsePRFs(riverDir, new PRFParser.Callback() { + + Set prfs = new HashSet(); + + @Override + public boolean prfAccept(File file) { + HashedFile hf = new HashedFile(file); + boolean success = prfs.add(hf); + if (!success) { + log.warn("PRF file '" + file + "' seems to be a duplicate."); + } + return success; + } + + @Override + public void prfParsed(PRFParser parser) { + log.debug("callback from PRF parser"); + + String description = parser.getDescription(); + Integer year = parser.getYear(); + ImportTimeInterval ti = year != null + ? 
new ImportTimeInterval(yearToDate(year)) + : null; + + List lines = + new ArrayList(); + + for (Map.Entry> entry: parser.getData().entrySet()) { + BigDecimal km = new BigDecimal(entry.getKey()); + List points = entry.getValue(); + lines.add(new ImportCrossSectionLine(km, points)); + } + + crossSections.add(new ImportCrossSection( + ImportRiver.this, description, ti, lines)); + } + }); } public static Date yearToDate(int year) { @@ -373,29 +412,6 @@ return cal.getTime(); } - @Override - public void prfParsed(PRFParser parser) { - log.debug("callback from PRF parser"); - - String description = parser.getDescription(); - Integer year = parser.getYear(); - ImportTimeInterval ti = year != null - ? new ImportTimeInterval(yearToDate(year)) - : null; - - List lines = - new ArrayList(); - - for (Map.Entry> entry: parser.getData().entrySet()) { - BigDecimal km = new BigDecimal(entry.getKey()); - List points = entry.getValue(); - lines.add(new ImportCrossSectionLine(km, points)); - } - - crossSections.add(new ImportCrossSection( - this, description, ti, lines)); - } - public void storeDependencies() { storeAnnotations(); storeCrossSections(); diff -r 5f1506fc7636 -r c7370734b872 flys-backend/src/main/java/de/intevation/flys/importer/PRFParser.java --- a/flys-backend/src/main/java/de/intevation/flys/importer/PRFParser.java Fri Jul 08 09:45:48 2011 +0000 +++ b/flys-backend/src/main/java/de/intevation/flys/importer/PRFParser.java Mon Jul 11 09:31:52 2011 +0000 @@ -40,7 +40,8 @@ public static final int MAX_YEAR = 2100; public interface Callback { - void prfParsed(PRFParser parser); + boolean prfAccept(File file); + void prfParsed(PRFParser parser); } // interface Parser public static class DataFormat { @@ -397,6 +398,7 @@ } else if (file.isFile() && file.getName().toLowerCase().endsWith(".prf") + && (callback == null || callback.prfAccept(file)) ) { reset(); boolean success = parse(file); diff -r 5f1506fc7636 -r c7370734b872 
flys-backend/src/main/java/de/intevation/flys/utils/FileTools.java --- a/flys-backend/src/main/java/de/intevation/flys/utils/FileTools.java Fri Jul 08 09:45:48 2011 +0000 +++ b/flys-backend/src/main/java/de/intevation/flys/utils/FileTools.java Mon Jul 11 09:31:52 2011 +0000 @@ -1,15 +1,28 @@ package de.intevation.flys.utils; import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.FileInputStream; import java.util.Stack; +import java.util.List; +import java.util.Set; +import java.util.HashSet; +import java.util.ArrayList; import org.apache.log4j.Logger; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + public class FileTools { private static Logger log = Logger.getLogger(FileTools.class); + public static final String DIGEST = + System.getProperty("flys.backend.file.cmp.digest", "MD5"); + private FileTools() { } @@ -74,5 +87,110 @@ return curr; } + + public static class HashedFile + implements Comparable + { + protected File file; + protected long length; + protected byte [] hash; + + public HashedFile(File file) { + this.file = file; + length = file.length(); + } + + public File getFile() { + return file; + } + + protected byte [] getHash() { + if (hash == null) { + InputStream in = null; + + try { + in = new FileInputStream(file); + + MessageDigest digest = MessageDigest.getInstance(DIGEST); + + byte [] buf = new byte[40*1024]; + int r; + + while ((r = in.read(buf)) >= 0) { + digest.update(buf, 0, r); + } + + hash = digest.digest(); + } + catch (IOException ioe) { + log.error(ioe); + hash = new byte[0]; + } + catch (NoSuchAlgorithmException nsae) { + log.error(nsae); + hash = new byte[0]; + } + finally { + if (in != null) { + try { + in.close(); + } + catch (IOException ioe) { + log.error(ioe); + } + } + } + } + return hash; + } + + @Override + public int compareTo(HashedFile other) { + if (length < other.length) return -1; + if (length > other.length) return +1; + return 0; + } + + private 
static int compare(byte [] a, byte [] b) { + if (a.length < b.length) return -1; + if (a.length > b.length) return +1; + for (int i = 0; i < a.length; ++i) { + int x = a[i] & 0xff; + int y = b[i] & 0xff; + if (x < y) return -1; + if (x > y) return +1; + } + return 0; + } + + @Override + public boolean equals(Object other) { + return other instanceof HashedFile + && ((HashedFile)other).compareTo(this) == 0; + } + + @Override + public int hashCode() { + return (int)(length ^ (length >>> 32)); + } + } // class HashedFile + + public static List uniqueFiles(List files) { + + Set set = new HashSet(); + + for (File file: files) { + if (!set.add(new HashedFile(file))) { + log.warn("file '" + file + "' is a duplicate."); + } + } + + ArrayList out = new ArrayList(set.size()); + for (HashedFile hf: set) { + out.add(hf.file); + } + + return out; + } } // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :