# HG changeset patch # User Sascha L. Teichmann # Date 1338828296 0 # Node ID ab81ffd1343e1f4b391ad0af7529f5cb6eb1c320 # Parent 05a3fe8800b33840debccf6c7cc4fda9fcfb9021 FixA: Reactivated rewrite of the outlier checks. flys-artifacts/trunk@4576 c6561f87-3c4e-4783-a992-168aeb5c3f6f diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/ChangeLog --- a/flys-artifacts/ChangeLog Mon Jun 04 10:13:20 2012 +0000 +++ b/flys-artifacts/ChangeLog Mon Jun 04 16:44:56 2012 +0000 @@ -1,3 +1,30 @@ +2012-06-04 Sascha L. Teichmann + + Implemented outlier checks in fixings analysis. Expected to be + still broken, but the code is in the right place now and has the + right structure. + + * src/main/java/de/intevation/flys/artifacts/math/Outlier.java: + Fixed endless loop. + + * src/main/java/de/intevation/flys/artifacts/model/Parameters.java: + Added further set methods for indexed access. + + * src/main/java/de/intevation/flys/artifacts/model/fixings/Fitting.java: + New. Out factored fitting code from FixCalculation. Checks for outliers, too. + + * src/main/java/de/intevation/flys/artifacts/model/fixings/FixCalculation.java: + Moved fitting code out to separate class. Streamlined code a bit. + + * src/main/java/de/intevation/flys/artifacts/model/fixings/FixResult.java: + Store the outliers from fitting in separate data structure, too. + + * src/main/java/de/intevation/flys/artifacts/model/fixings/QW.java: New. + Base class for delta W/t data. Used as storage for outliers. + + * src/main/java/de/intevation/flys/artifacts/model/fixings/QWD.java: + Is a sub class of QW now. + 2012-06-04 Sascha L. Teichmann * src/main/java/de/intevation/flys/artifacts/model/fixings/FixCalculation.java: diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java --- a/flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java Mon Jun 04 10:13:20 2012 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java Mon Jun 04 16:44:56 2012 +0000 @@ -7,6 +7,7 @@ import org.apache.commons.math.distribution.TDistributionImpl; +import java.util.Collections; import java.util.List; import java.util.ArrayList; @@ -14,9 +15,13 @@ public class Outlier { + public static final double DEFAULT_ALPHA = 0.05; + private static Logger log = Logger.getLogger(Outlier.class); - public static class IndexedValue { + public static class IndexedValue + implements Comparable + { protected int index; protected double value; @@ -43,12 +48,60 @@ public void setValue(double value) { this.value = value; } + + @Override + public int compareTo(IndexedValue other) { + int diff = index - other.index; + if (index < 0) return -1; + return index > 0 ? +1 : 0; + } } // class IndexedValue + public static class Outliers { + + protected List retained; + protected List removed; + + public Outliers() { + } + + public Outliers( + List retained, + List removed + ) { + this.retained = retained; + this.removed = removed; + } + + public boolean hasOutliers() { + return !removed.isEmpty(); + } + + public List getRetained() { + return retained; + } + + public void setRetained(List retained) { + this.retained = retained; + } + + public List getRemoved() { + return removed; + } + + public void setRemoved(List removed) { + this.removed = removed; + } + } // class Outliers + public Outlier() { } - public static List findOutliers( + public static Outliers findOutliers(List inputValues) { + return findOutliers(inputValues, DEFAULT_ALPHA); + } + + public static Outliers findOutliers( List inputValues, double alpha ) { @@ -69,7 +122,7 @@ for (IndexedValue value: values) { mean.increment(value.getValue()); - std.increment(value.getValue()); + std .increment(value.getValue()); } double m = mean.getResult(); @@ -98,14 +151,18 @@ outliers.add(values.get(iv)); values.remove(iv); } + else { + break; + } } catch (MathException me) { log.error(me); } } + Collections.sort(outliers); - return outliers; + return new Outliers(values, outliers); } } // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/Parameters.java --- a/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/Parameters.java Mon Jun 04 10:13:20 2012 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/Parameters.java Mon Jun 04 16:44:56 2012 +0000 @@ -40,7 +40,6 @@ public int newRow() { int N = columns[0].size(); - log.debug("new row: " + N); for (int i = 0; i < columns.length; ++i) { columns[i].add(Double.NaN); @@ -71,6 +70,20 @@ } } + public boolean set(int row, int [] indices, double [] values) { + boolean invalid = false; + for (int i = 0; i < indices.length; ++i) { + double v = values[i]; + if (Double.isNaN(v)) { + invalid = true; + } + else { + columns[indices[i]].setQuick(row, v); + } + } + return invalid; + } + public int size() { return columns[0].size(); } diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/Fitting.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/Fitting.java Mon Jun 04 16:44:56 2012 +0000 @@ -0,0 +1,170 @@ +package de.intevation.flys.artifacts.model.fixings; + +import de.intevation.flys.artifacts.math.fitting.Function; + +import de.intevation.flys.artifacts.math.Outlier; + +import de.intevation.flys.artifacts.math.Outlier.IndexedValue; +import de.intevation.flys.artifacts.math.Outlier.Outliers; + +import org.apache.commons.math.MathException; + +import org.apache.commons.math.optimization.fitting.CurveFitter; + +import org.apache.commons.math.optimization.general.LevenbergMarquardtOptimizer; + +import gnu.trove.TDoubleArrayList; + +import org.apache.log4j.Logger; + +import java.util.ArrayList; +import java.util.List; + +public class Fitting +{ + private static Logger log = Logger.getLogger(Fitting.class); + + /** Use instance of this factory to find meta infos for outliers. */ + public interface QWFactory { + + QW create(double q, double w); + + } // interface QWFactory + + protected Function function; + protected QWFactory qwFactory; + protected double chiSqr; + protected double [] parameters; + protected ArrayList removed; + + + public Fitting() { + removed = new ArrayList(); + } + + public Fitting(Function function) { + this(); + this.function = function; + } + + public Fitting(Function function, QWFactory qwFactory) { + this(function); + this.qwFactory = qwFactory; + } + + public Function getFunction() { + return function; + } + + public void setFunction(Function function) { + this.function = function; + } + + public double getChiSquare() { + return chiSqr; + } + + public void reset() { + chiSqr = 0.0; + parameters = null; + removed.clear(); + } + + public boolean hasOutliers() { + return !removed.isEmpty(); + } + + public List getOutliers() { + return removed; + } + + public QW [] outliersToArray() { + return removed.toArray(new QW[removed.size()]); + } + + public double [] getParameters() { + return parameters; + } + + public boolean fit(double [] qs, double [] ws) { + + TDoubleArrayList xs = new TDoubleArrayList(qs.length); + TDoubleArrayList ys = new TDoubleArrayList(ws.length); + + for (int i = 0; i < qs.length; ++i) { + if (!Double.isNaN(qs[i]) && !Double.isNaN(ws[i])) { + xs.add(qs[i]); + ys.add(ws[i]); + } + else { + log.warn("remove invalid value " + qs[i] + " " + ws[i]); + } + } + + if (xs.size() < 2) { + return false; + } + + LevenbergMarquardtOptimizer lmo = new LevenbergMarquardtOptimizer(); + + double [] parameters; + + List inputs = new ArrayList(xs.size()); + + for (;;) { + CurveFitter cf = new CurveFitter(lmo); + + for (int i = 0, N = xs.size(); i < N; ++i) { + cf.addObservedPoint(xs.getQuick(i), ys.getQuick(i)); + } + + try { + parameters = cf.fit(function, function.getInitialGuess()); + } + catch (MathException me) { + log.warn(me); + return false; + } + + if (qwFactory == null) { + break; + } + + inputs.clear(); + + // This is the paraterized function for a given km. + de.intevation.flys.artifacts.math.Function instance = + function.instantiate(parameters); + + for (int i = 0, N = xs.size(); i < N; ++i) { + double y = instance.value(xs.getQuick(i)); + if (Double.isNaN(y)) { + continue; + } + inputs.add(new IndexedValue(i, ys.getQuick(i) - y)); + } + + Outliers outliers = Outlier.findOutliers(inputs); + + if (!outliers.hasOutliers()) { + break; + } + + List rem = outliers.getRemoved(); + + for (int i = rem.size()-1; i >= 0; --i) { + int idx = rem.get(i).getIndex(); + removed.add( + qwFactory.create( + xs.getQuick(idx), ys.getQuick(idx))); + xs.remove(idx); + ys.remove(idx); + } + } + + chiSqr = lmo.getChiSquare(); + + return true; + } +} +// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/FixCalculation.java --- a/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/FixCalculation.java Mon Jun 04 10:13:20 2012 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/FixCalculation.java Mon Jun 04 16:44:56 2012 +0000 @@ -25,19 +25,14 @@ import de.intevation.flys.utils.DateAverager; import de.intevation.flys.utils.DoubleUtil; -import de.intevation.flys.utils.Pair; +import de.intevation.flys.utils.EpsilonComparator; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; - -import org.apache.commons.math.MathException; - -import org.apache.commons.math.optimization.fitting.CurveFitter; - -import org.apache.commons.math.optimization.general.LevenbergMarquardtOptimizer; +import java.util.TreeMap; import org.apache.log4j.Logger; @@ -46,6 +41,8 @@ { private static Logger log = Logger.getLogger(FixCalculation.class); + public static final double EPSILON = 1e-4; + protected String river; protected double from; protected double to; @@ -71,6 +68,7 @@ DateRange [] analysisPeriods = access.getAnalysisPeriods(); Integer qSectorStart = access.getQSectorStart(); Integer qSectorEnd = access.getQSectorEnd(); + Boolean preprocessing = access.getPreprocessing(); if (river == null) { // TODO: i18n @@ -117,6 +115,11 @@ addProblem("fix.missing.qend.sector"); } + if (preprocessing == null) { + // TODO: i18n + addProblem("fix.missing.preprocessing"); + } + if (!hasProblems()) { this.river = river; this.from = from; @@ -127,6 +130,7 @@ this.analysisPeriods = analysisPeriods; this.qSectorStart = qSectorStart; this.qSectorEnd = qSectorEnd; + this.preprocessing = preprocessing; } } @@ -153,30 +157,9 @@ return new CalculationResult(this); } - FixingsColumnFactory fcf = FixingsColumnFactory.getInstance(); - - List dataColumns = - new ArrayList(events.length); - - for (int eventId: events) { - IdFilter idFilter = new IdFilter(eventId); + final List eventColumns = getEventColumns(overview); - List columns = overview.filter(null, idFilter); - if (columns.isEmpty()) { - // TODO: i18n - addProblem("fix.missing.column", eventId); - continue; - } - FixingsColumn dataColumn = fcf.getColumnData(columns.get(0)); - if (dataColumn == null) { - // TODO: i18n - addProblem("fix.cannot.load.data", eventId); - continue; - } - dataColumns.add(dataColumn); - } - - if (dataColumns.size() < 2) { + if (eventColumns.size() < 2) { // TODO: i18n addProblem("fix.too.less.data.columns"); return new CalculationResult(this); @@ -184,8 +167,33 @@ double [] kms = DoubleUtil.explode(from, to, step / 1000.0); - double [] ws = new double[dataColumns.size()]; - double [] qs = new double[ws.length]; + final double [] qs = new double[eventColumns.size()]; + final double [] ws = new double[qs.length]; + + // Depending on preprocessing we need to find the outliers. + Fitting.QWFactory qwFactory = !preprocessing + ? null // No outliers + : new Fitting.QWFactory() { + @Override + public QW create(double q, double w) { + // Check all the event columns for close match + // and take the description and the date from meta. + for (int i = 0; i < qs.length; ++i) { + if (Math.abs(qs[i]-q) < EPSILON + && Math.abs(ws[i]-w) < EPSILON) { + Column column = eventColumns.get(i); + return new QW( + q, w, + column.getDescription(), + column.getDate()); + } + } + log.warn("cannot find column for (" + q + ", " + w + ")"); + return new QW(q, w); + } + }; + + Fitting fitting = new Fitting(func, qwFactory); String [] parameterNames = func.getParameterNames(); @@ -198,43 +206,44 @@ log.debug("number of kms: " + kms.length); } - int kmIndex = results.columnIndex("km"); - int chiSqrIndex = results.columnIndex("chi_sqr"); + TreeMap outliers = + new TreeMap(EpsilonComparator.INSTANCE); + + int kmIndex = results.columnIndex("km"); + int chiSqrIndex = results.columnIndex("chi_sqr"); + int [] parameterIndices = results.columnIndices(parameterNames); int numFailed = 0; for (int i = 0; i < kms.length; ++i) { double km = kms[i]; + // Fill Qs and Ws from event columns. for (int j = 0; j < ws.length; ++j) { - FixingsColumn column = dataColumns.get(j); - qs[j] = column.getQ(km); - boolean interpolated = column.getW(km, ws, j); + boolean interpolated = + eventColumns.get(j).getQW(km, qs, ws, j); // TODO: mark as interpolated. } - Pair fitResult = fit(func, km, ws, qs); + fitting.reset(); - // TODO: Do preprocessing here! - if (fitResult == null) { // Problems are reported already. + if (!fitting.fit(qs, ws)) { ++numFailed; + // TODO: i18n + addProblem(km, "fix.fitting.failed"); continue; } + if (fitting.hasOutliers()) { + outliers.put(Double.valueOf(km), fitting.outliersToArray()); + } + int row = results.newRow(); results.set(row, kmIndex, km); - results.set(row, chiSqrIndex, fitResult.getB()); - - double [] parameters = fitResult.getA(); - for (int j = 0; j < parameters.length; ++j) { - if (Double.isNaN(parameters[j])) { - invalid = true; - } - else { - results.set(row, parameterNames[j], parameters[j]); - } - } + results.set(row, chiSqrIndex, fitting.getChiSquare()); + invalid |= results.set( + row, parameterIndices, fitting.getParameters()); } if (debug) { @@ -254,11 +263,42 @@ overview, results); - FixResult fr = new FixResult(results, deltaWTsKM); + FixResult fr = new FixResult(results, deltaWTsKM, outliers); return new CalculationResult(fr, this); } + + protected List getEventColumns(FixingsOverview overview) { + + FixingsColumnFactory fcf = FixingsColumnFactory.getInstance(); + + List columns = new ArrayList(events.length); + + for (int eventId: events) { + IdFilter idFilter = new IdFilter(eventId); + + List metas = overview.filter(null, idFilter); + + if (metas.isEmpty()) { + // TODO: i18n + addProblem("fix.missing.column", eventId); + continue; + } + + FixingsColumn data = fcf.getColumnData(metas.get(0)); + if (data == null) { + // TODO: i18n + addProblem("fix.cannot.load.data", eventId); + continue; + } + + columns.add(new Column(metas.get(0), data)); + } + + return columns; + } + public DeltaWTsKM calculateDeltaWTs( Function function, FixingsOverview overview, @@ -413,7 +453,7 @@ } Column column = cc.getColumn(meta); - if (column == null || !column.getWQ(km, wq)) { + if (column == null || !column.getQW(km, wq)) { continue; } @@ -427,7 +467,7 @@ Date date = column.getDate(); String description = column.getDescription(); - QWD qwd = new QWD(wq[1], wq[0], dw, date, description); + QWD qwd = new QWD(wq[1], wq[0], description, date, dw); qwds.add(qwd); @@ -451,7 +491,7 @@ String avgDescription = "avg.deltawt." + qSector; QWD avgQWD = new QWD( - avgQ, avgW, avgDw, avgDate, avgDescription); + avgQ, avgW, avgDescription, avgDate, avgDw); // TODO: Store average value. } @@ -490,7 +530,17 @@ return meta.getDescription(); } - public boolean getWQ(double km, double [] wq) { + public boolean getQW( + double km, + double [] qs, + double [] ws, + int index + ) { + qs[index] = data.getQ(km); + return data.getW(km, ws, index); + } + + public boolean getQW(double km, double [] wq) { data.getW(km, wq, 0); if (Double.isNaN(wq[0])) return false; wq[1] = data.getQ(km); @@ -587,35 +637,5 @@ System.arraycopy(parameters, 0, result, 2, parameters.length); return result; } - - protected Pair fit( - Function function, - double km, - double [] ws, - double [] qs - ) { - LevenbergMarquardtOptimizer lmo = new LevenbergMarquardtOptimizer(); - CurveFitter cf = new CurveFitter(lmo); - - for (int i = 0; i < ws.length; ++i) { - if (!Double.isNaN(ws[i]) && !Double.isNaN(qs[i])) { - cf.addObservedPoint(qs[i], ws[i]); - } - } - - try { - double [] parameters = - cf.fit(function, function.getInitialGuess()); - double chiSqr = lmo.getChiSquare(); - - return new Pair(parameters, chiSqr); - } - catch (MathException me) { - log.warn(me, me); - addProblem(km, "fix.fitting.failed"); - } - - return null; - } } // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/FixResult.java --- a/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/FixResult.java Mon Jun 04 10:13:20 2012 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/FixResult.java Mon Jun 04 16:44:56 2012 +0000 @@ -2,6 +2,8 @@ import de.intevation.flys.artifacts.model.Parameters; +import java.util.TreeMap; + import java.io.Serializable; public class FixResult @@ -9,13 +11,19 @@ { protected Parameters parameters; protected DeltaWTsKM deltaWTsKM; + protected TreeMap outliers; public FixResult() { } - public FixResult(Parameters parameters, DeltaWTsKM deltaWTsKM) { + public FixResult( + Parameters parameters, + DeltaWTsKM deltaWTsKM, + TreeMap outliers + ) { this.parameters = parameters; this.deltaWTsKM = deltaWTsKM; + this.outliers = outliers; } public Parameters getParameters() { @@ -33,5 +41,13 @@ public void setDeltaWTsKM(DeltaWTsKM deltaWTsKM) { this.deltaWTsKM = deltaWTsKM; } + + public TreeMap getOutliers() { + return outliers; + } + + public void setOutliers(TreeMap outliers) { + this.outliers = outliers; + } } // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/QW.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/QW.java Mon Jun 04 16:44:56 2012 +0000 @@ -0,0 +1,61 @@ +package de.intevation.flys.artifacts.model.fixings; + +import java.util.Date; + +import java.io.Serializable; + +public class QW +implements Serializable +{ + protected double q; + protected double w; + protected String description; + protected Date date; + + public QW() { + } + + public QW(double q, double w) { + this.q = q; + this.w = w; + } + + public QW(double q, double w, String description, Date date) { + this(q, w); + this.description = description; + this.date = date; + } + + public double getQ() { + return q; + } + + public void setQ(double q) { + this.q = q; + } + + public double getW() { + return w; + } + + public void setW(double w) { + this.w = w; + } + + public Date getDate() { + return date; + } + + public void setDate(Date date) { + this.date = date; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } +} +// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/QWD.java --- a/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/QWD.java Mon Jun 04 10:13:20 2012 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/artifacts/model/fixings/QWD.java Mon Jun 04 16:44:56 2012 +0000 @@ -2,16 +2,10 @@ import java.util.Date; -import java.io.Serializable; - public class QWD -implements Serializable +extends QW { - protected double q; - protected double w; protected double deltaW; - protected Date date; - protected String description; public QWD() { } @@ -19,47 +13,20 @@ public QWD( double q, double w, - double deltaW, + String description, Date date, - String description + double deltaW ) { - this.q = q; - this.w = q; - this.deltaW = deltaW; - this.date = date; - this.description = description; - } - - public double getQ() { - return q; - } - - public void setQ(double q) { - this.q = q; + super(q, w, description, date); + this.deltaW = deltaW; } - public double getW() { - return w; - } - - public void setW(double w) { - this.w = w; + public double getDeltaW() { + return deltaW; } - public Date getDate() { - return date; - } - - public void setDate(Date date) { - this.date = date; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; + public void setDeltaW(double deltaW) { + this.deltaW = deltaW; } } // vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 : diff -r 05a3fe8800b3 -r ab81ffd1343e flys-artifacts/src/main/java/de/intevation/flys/utils/EpsilonComparator.java --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/flys-artifacts/src/main/java/de/intevation/flys/utils/EpsilonComparator.java Mon Jun 04 16:44:56 2012 +0000 @@ -0,0 +1,36 @@ +package de.intevation.flys.utils; + +import java.io.Serializable; + +import java.util.Comparator; + +public class EpsilonComparator +implements Serializable, Comparator +{ + public static final double EPSILON = 1e-5; + + public static final EpsilonComparator INSTANCE = + new EpsilonComparator(EPSILON); + + protected double epsilon; + + public EpsilonComparator() { + this(EPSILON); + } + + public EpsilonComparator(double epsilon) { + this.epsilon = Math.abs(epsilon); + } + + private static final double value(Double x) { + return x != null ? x.doubleValue() : 0.0; + } + + @Override + public int compare(Double a, Double b) { + double diff = value(a) - value(b); + if (diff < epsilon) return -1; + return diff > epsilon ? +1 : 0; + } +} +// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :