view flys-artifacts/src/main/java/de/intevation/flys/artifacts/math/Outlier.java @ 3564:e01b9d1bc941

FixA: Corrected the formulas of Grubbs' test for outliers. Still a bit broken. flys-artifacts/trunk@5162 c6561f87-3c4e-4783-a992-168aeb5c3f6f
author Sascha L. Teichmann <sascha.teichmann@intevation.de>
date Tue, 31 Jul 2012 16:14:17 +0000
parents ab81ffd1343e
children b136113dad53
line wrap: on
line source
package de.intevation.flys.artifacts.math;

import org.apache.commons.math.MathException;

import org.apache.commons.math.stat.descriptive.moment.Mean;
import org.apache.commons.math.stat.descriptive.moment.StandardDeviation;

import org.apache.commons.math.distribution.TDistributionImpl;

import java.util.Collections;
import java.util.List;
import java.util.ArrayList;

import org.apache.log4j.Logger;

public class Outlier
{
    public static final double EPSILON = 1e-5;

    public static final double DEFAULT_ALPHA = 0.05;

    private static Logger log = Logger.getLogger(Outlier.class);

    public static class IndexedValue
    implements          Comparable<IndexedValue>
    {
        protected int    index;
        protected double value;

        public IndexedValue() {
        }

        public IndexedValue(int index, double value) {
            this.index = index;
            this.value = value;
        }

        public int getIndex() {
            return index;
        }

        public void setIndex(int index) {
            this.index = index;
        }

        public double getValue() {
            return value;
        }

        public void setValue(double value) {
            this.value = value;
        }

        @Override
        public int compareTo(IndexedValue other) {
            int diff = index - other.index;
            if (diff < 0) return -1;
            return diff > 0 ? +1 : 0;
        }
    } // class IndexedValue

    public static class Outliers {

        protected List<IndexedValue> retained;
        protected List<IndexedValue> removed;

        public Outliers() {
        }

        public Outliers(
            List<IndexedValue> retained,
            List<IndexedValue> removed
        ) {
            this.retained = retained;
            this.removed  = removed;
        }

        public boolean hasOutliers() {
            return !removed.isEmpty();
        }

        public List<IndexedValue> getRetained() {
            return retained;
        }

        public void setRetained(List<IndexedValue> retained) {
            this.retained = retained;
        }

        public List<IndexedValue> getRemoved() {
            return removed;
        }

        public void setRemoved(List<IndexedValue> removed) {
            this.removed = removed;
        }
    } // class Outliers

    public Outlier() {
    }

    public static Outliers findOutliers(List<IndexedValue> inputValues) {
        return findOutliers(inputValues, DEFAULT_ALPHA);
    }

    public static Outliers findOutliers(
        List<IndexedValue> inputValues,
        double             alpha
    ) {
        boolean debug = log.isDebugEnabled();

        if (debug) {
            log.debug("outliers significance: " + alpha);
        }

        alpha = 1d - alpha;

        ArrayList<IndexedValue> outliers = new ArrayList<IndexedValue>();

        ArrayList<IndexedValue> values =
            new ArrayList<IndexedValue>(inputValues);

        for (;;) {
            int N = values.size();

            if (debug) {
                log.debug("Values to check: " + N);
            }

            if (N < 4) {
                break;
            }

            Mean mean = new Mean();
            StandardDeviation std = new StandardDeviation();

            for (IndexedValue value: values) {
                mean.increment(value.getValue());
                std .increment(value.getValue());
            }

            double m = mean.getResult();
            double s = std.getResult();

            if (debug) {
                log.debug("mean: " + m);
                log.debug("std dev: " + s);
            }

            double maxZ = -Double.MAX_VALUE;
            int iv = -1;
            for (int i = N-1; i >= 0; --i) {
                IndexedValue v = values.get(i);
                double z = Math.abs(v.getValue()-m);
                if (debug) {
                    log.debug("z candidate: " + z);
                }
                if (z > maxZ) {
                    maxZ = z;
                    iv = i;
                }
            }

            if (Math.abs(s) < EPSILON) {
                break;
            }

            maxZ /= s;

            TDistributionImpl tdist = new TDistributionImpl(N-2);

            double t;

            try {
                t = tdist.inverseCumulativeProbability(alpha/(N+N));
            }
            catch (MathException me) {
                log.error(me);
                break;
            }

            t *= t;

            double za = ((N-1)/Math.sqrt(N))*Math.sqrt(t/(N-2d+t));

            if (debug) {
                log.debug("max: " + maxZ + " crit: " + za);
            }

            if (maxZ > za) {
                outliers.add(values.get(iv));
                values.remove(iv);
            }
            else {
                if (debug) {
                    log.debug("values left: " + N);
                }
                break;
            }
        }

        Collections.sort(outliers);

        return new Outliers(values, outliers);
    }
}
// vim:set ts=4 sw=4 si et sta sts=4 fenc=utf8 :

http://dive4elements.wald.intevation.org