# HG changeset patch # User Sascha L. Teichmann # Date 1343751257 0 # Node ID e01b9d1bc941849721114e2727023d1a737cac8a # Parent 5063c93dfb8ee2b39008838c6f9cca7558b0aafa FixA: Corrected the formulas of Grubbs' test for outliers. Still a bit broken. flys-artifacts/trunk@5162 c6561f87-3c4e-4783-a992-168aeb5c3f6f diff -r 5063c93dfb8e -r e01b9d1bc941 flys-artifacts/ChangeLog --- a/flys-artifacts/ChangeLog Mon Jul 30 11:25:19 2012 +0000 +++ b/flys-artifacts/ChangeLog Tue Jul 31 16:14:17 2012 +0000 @@ -1,3 +1,11 @@ +2012-07-30 Sascha L. Teichmann + + * src/main/java/de/intevation/flys/artifacts/math/Outlier.java: + Corrected the formulas of Grubbs' test for outliers. + TODO: Remove only one(!) data point. Currently it removes + more than on point without recalculating the fitting curve. + This leads to too much removed points. + 2012-07-30 Sascha L. Teichmann * src/main/java/de/intevation/flys/artifacts/states/SQRelation.java: diff -r 5063c93dfb8e -r e01b9d1bc941 flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java --- a/flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java Mon Jul 30 11:25:19 2012 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java Tue Jul 31 16:14:17 2012 +0000 @@ -15,6 +15,8 @@ public class Outlier { + public static final double EPSILON = 1e-5; + public static final double DEFAULT_ALPHA = 0.05; private static Logger log = Logger.getLogger(Outlier.class); @@ -52,8 +54,8 @@ @Override public int compareTo(IndexedValue other) { int diff = index - other.index; - if (index < 0) return -1; - return index > 0 ? +1 : 0; + if (diff < 0) return -1; + return diff > 0 ? +1 : 0; } } // class IndexedValue @@ -105,6 +107,14 @@ List inputValues, double alpha ) { + boolean debug = log.isDebugEnabled(); + + if (debug) { + log.debug("outliers significance: " + alpha); + } + + alpha = 1d - alpha; + ArrayList outliers = new ArrayList(); ArrayList values = @@ -113,6 +123,10 @@ for (;;) { int N = values.size(); + if (debug) { + log.debug("Values to check: " + N); + } + if (N < 4) { break; } @@ -128,35 +142,60 @@ double m = mean.getResult(); double s = std.getResult(); + if (debug) { + log.debug("mean: " + m); + log.debug("std dev: " + s); + } + double maxZ = -Double.MAX_VALUE; int iv = -1; for (int i = N-1; i >= 0; --i) { IndexedValue v = values.get(i); - double z = Math.abs(m - v.getValue())/s; + double z = Math.abs(v.getValue()-m); + if (debug) { + log.debug("z candidate: " + z); + } if (z > maxZ) { maxZ = z; iv = i; } } - double t = Math.sqrt((N*(N-2)*maxZ*maxZ) - /((N-1)*(N-1) - N*maxZ*maxZ)); + if (Math.abs(s) < EPSILON) { + break; + } + + maxZ /= s; TDistributionImpl tdist = new TDistributionImpl(N-2); - try { - double p = tdist.cumulativeProbability(t); + double t; - if (p < alpha) { - outliers.add(values.get(iv)); - values.remove(iv); - } - else { - break; - } + try { + t = tdist.inverseCumulativeProbability(alpha/(N+N)); } catch (MathException me) { log.error(me); + break; + } + + t *= t; + + double za = ((N-1)/Math.sqrt(N))*Math.sqrt(t/(N-2d+t)); + + if (debug) { + log.debug("max: " + maxZ + " crit: " + za); + } + + if (maxZ > za) { + outliers.add(values.get(iv)); + values.remove(iv); + } + else { + if (debug) { + log.debug("values left: " + N); + } + break; } }