From 5ebffd1b54b738daaf186dbdf4bb3d09811f29c0 Mon Sep 17 00:00:00 2001 From: Lutz Fischer Date: Tue, 6 Aug 2019 18:36:17 +0200 Subject: [PATCH] added a stats class that can calculate average and stdev and estimate median mode and mad for arbitrary large sets of data in a streaming manner. Median mode and MAD are estimated based on a histogram that is build internally from the streamed data. --- .gitignore | 1 + pom.xml | Bin 800 -> 1494 bytes .../rappsilber/utils/UpdateableInteger.java | 4 + .../statistic/StreamingStatsEstimator.java | 328 ++++++++++++++++++ 4 files changed, 333 insertions(+) create mode 100644 src/main/java/org/rappsilber/utils/statistic/StreamingStatsEstimator.java diff --git a/.gitignore b/.gitignore index 32858aa..945da45 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml hs_err_pid* +/nbproject/ \ No newline at end of file diff --git a/pom.xml b/pom.xml index d7c370db215f7eeb0f3b3d8d75adcc901875de3b..8ffb952754fe4959484ab1d5ebd604bd1244199b 100644 GIT binary patch literal 1494 zcmdT@!EW0y487+o1m7(s>jo5oqGE>)8_=#r(+;~THsdI>B|(y#^zTPYwwtUjb{H^V zJ{UInNb$W#@vCD~lP7Rq8NJFDbe<8=g(;P;R@vjj=ln9e5;Ja{*+UTsqP1UTyAbS> zu|__Drqaq{2h_NV-QC`>_jKNMiOA*As}X%T9O&U9;@-{Y3-;sYdX0sR%$4>*>H@IJ ztEG>@Yg5P&hx+4TQ@5 zRA(~vW@(VFf-qUyrxK%XvNK9t%Lc^#D0Z)AjHZ$jeBqSswnMB<1H{R&14pFWsNke~ zCPgj12TxiLB?&t;vD%q7kWy{8;6Mk`>){C&0^#hm=WvPzUE*ap7rTT${QTlTly2CE zZdLZJRsr8u1wa>v)F@UgIN6Vg&wG3>t$MvZT}k)wVP8Jb3%dAieG&vgzCck$03SF@ u-{aXG*Vh#IR4|@54FC7MtaP%OxESNq|1UMrcxU0Cx_07Y+|coki`g&JGu90N delta 52 zcmcb{y?||l1{1Tnjs9c-Cb7wmOiGOAlT(-$Nio>yr=%7q7iAWdWaj7DaoOk>6y;~7 ICYRU&0D!#@o&W#< diff --git a/src/main/java/org/rappsilber/utils/UpdateableInteger.java b/src/main/java/org/rappsilber/utils/UpdateableInteger.java index 4b50f3e..a5b9c28 100644 --- a/src/main/java/org/rappsilber/utils/UpdateableInteger.java +++ b/src/main/java/org/rappsilber/utils/UpdateableInteger.java @@ -32,6 +32,10 @@ 
public UpdateableInteger(int value) { this.value = value; } + public UpdateableInteger(UpdateableInteger value) { + this.value = value.value; + } + @Override public int intValue() { return value; diff --git a/src/main/java/org/rappsilber/utils/statistic/StreamingStatsEstimator.java b/src/main/java/org/rappsilber/utils/statistic/StreamingStatsEstimator.java new file mode 100644 index 0000000..72efa44 --- /dev/null +++ b/src/main/java/org/rappsilber/utils/statistic/StreamingStatsEstimator.java @@ -0,0 +1,328 @@ +/* + * Copyright 2016 Lutz Fischer . + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.rappsilber.utils.statistic; + +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.TreeMap; +import java.util.concurrent.Semaphore; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.rappsilber.utils.UpdateableInteger; + +/** + * estimates a Median by splitting the assumed observed range into windows and count the occurrences within these windows. + * Basically making a histogram and then calculating the histogramMedian for this histogram. + * Which is hopefully a close enough approximation of the real median. 
/**
 * Streaming estimator for basic statistics over arbitrarily large data sets.
 * Average, standard deviation, min and max are computed exactly (Welford's
 * online algorithm); median, mode and MAD are estimated from a histogram that
 * is built incrementally from the streamed values. When the histogram would
 * exceed a maximum number of bins, its resolution is halved by merging bins.
 *
 * NOTE(review): this class is NOT thread-safe — the original comments referred
 * to semaphores, but no synchronization is present. Guard externally if shared.
 *
 * @author Lutz Fischer
 */
public class StreamingStatsEstimator {

    /**
     * One histogram bin: number of values folded in, their sum, and the
     * smallest/largest value observed in the bin.
     */
    public class BinValue {
        int count;
        double sum;
        double min;
        double max;

        public BinValue() {
        }

        /** Initialises the bin with a single observed value. */
        public BinValue(double value) {
            this.count = 1;
            this.sum = value;
            this.min = value;
            this.max = value;
        }

        /** Copy constructor. */
        public BinValue(BinValue v) {
            this.count = v.count;
            this.sum = v.sum;
            this.min = v.min;
            this.max = v.max;
        }

        /** @return the average of all values folded into this bin */
        public double average() {
            return sum / count;
        }

        /** Folds a single value into this bin. */
        public void add(double value) {
            count++;
            sum += value;
            if (value < min) {
                min = value;
            } else if (value > max) {
                max = value;
            }
        }

        /** Merges another bin into this one. */
        public void add(BinValue v) {
            count += v.count;
            sum += v.sum;
            if (v.min < min) {
                min = v.min;
            }
            if (v.max > max) {
                max = v.max;
            }
        }
    }

    /** Histogram: bin centre (value rounded to the resolution) -> bin data. */
    protected TreeMap<Double, BinValue> m_values = new TreeMap<>();
    /** Width of one histogram bin. */
    protected double m_resolution = 0.001;
    /** Maximum number of bins before the resolution gets halved. */
    protected int m_maxWindows = 1000;

    /** Number of values seen so far. */
    private int m_count;
    /** Running mean (Welford's algorithm). */
    private double m_avg;
    // FIX: min/max previously defaulted to 0, so an all-positive stream
    // reported min == 0 (and all-negative streams max == 0). Initialising to
    // +/-infinity guarantees the first value always sets both.
    private double m_min = Double.POSITIVE_INFINITY;
    private double m_max = Double.NEGATIVE_INFINITY;
    /** Running sum of squared deviations (M2 in Welford's algorithm). */
    private double m_mean2;

    /**
     * @param resolution initial width of a histogram bin
     */
    public StreamingStatsEstimator(double resolution) {
        this.m_resolution = resolution;
    }

    public StreamingStatsEstimator() {
    }

    /**
     * @param resolution initial width of a histogram bin
     * @param maxBins    maximum number of bins before the resolution is halved
     */
    public StreamingStatsEstimator(double resolution, int maxBins) {
        this(resolution);
        m_maxWindows = maxBins;
    }

    /**
     * Halves the histogram resolution by doubling the bin width and merging
     * the existing bins into the new, coarser bins. Does nothing unless the
     * bin count has actually reached the limit (so redundant calls are safe).
     */
    protected void reduceResolution() {
        int ws = m_values.size();
        // only act if we really are at the limit; a caller may have raced us
        // here conceptually, but see the class note: no actual locking exists
        if (ws >= m_maxWindows) {
            double newResolution = m_resolution * 2;
            TreeMap<Double, BinValue> old_values = m_values;
            TreeMap<Double, BinValue> new_values = new TreeMap<>();

            // transfer every old bin into its (possibly shared) new bin
            for (Map.Entry<Double, BinValue> oldEntry : old_values.entrySet()) {
                Double newbin = Math.round(oldEntry.getKey() / newResolution) * newResolution;
                BinValue newBinValue = new_values.get(newbin);
                BinValue oldBinValue = oldEntry.getValue();
                if (newBinValue != null) {
                    newBinValue.add(oldBinValue);
                } else {
                    new_values.put(newbin, new BinValue(oldBinValue));
                }
            }
            m_values = new_values;
            m_resolution = newResolution;
        }
    }

    /**
     * Registers one value: updates the exact running statistics and folds the
     * value into the histogram, coarsening the histogram first if needed.
     *
     * @param d the observed value
     */
    public void addValue(double d) {
        double key = Math.round(d / m_resolution) * m_resolution;
        BinValue i = m_values.get(key);

        // exact streaming average/variance (Welford)
        m_count++;
        double delta = d - m_avg;
        m_avg += delta / m_count;
        m_mean2 += delta * (d - m_avg);
        if (d < m_min) {
            m_min = d;
        }
        if (d > m_max) {
            m_max = d;
        }

        if (i != null) {
            i.add(d);
        } else if (m_values.size() < m_maxWindows) {
            m_values.put(key, new BinValue(d));
        } else {
            // bin limit reached: coarsen the histogram, then re-bin this value
            // under the NEW resolution.
            // FIX: previously the value was silently dropped from the
            // histogram whenever a resolution reduction was triggered,
            // skewing all histogram-based estimates.
            reduceResolution();
            key = Math.round(d / m_resolution) * m_resolution;
            BinValue merged = m_values.get(key);
            if (merged != null) {
                merged.add(d);
            } else {
                m_values.put(key, new BinValue(d));
            }
        }
    }

    /**
     * @return the histogram-based estimate of the median of all seen values
     */
    public double getMedianEstimation() {
        return histogramMedian(m_values, m_count);
    }

    /**
     * @return the histogram-based estimate of the mode of all seen values
     */
    public double getModeEstimation() {
        return histogramMode(m_values);
    }

    /**
     * Estimates the median from a histogram by walking the sorted bins until
     * the cumulative count passes the middle, then interpolating within (or
     * between) bins.
     *
     * @param v         histogram, sorted by bin centre
     * @param allcounts total number of values represented by the histogram
     * @return the estimated median
     * @throws IndexOutOfBoundsException if the histogram is empty or the
     *         counts are inconsistent with {@code allcounts}
     */
    public static double histogramMedian(TreeMap<Double, BinValue> v, int allcounts) {
        if (v.isEmpty()) {
            throw new IndexOutOfBoundsException("Median of an empty histogram is undefined");
        }
        double dCenter = allcounts / 2.0;
        int iCenter = (int) Math.round(dCenter);
        int iCenter1 = iCenter;

        // even number of values: the median averages two neighbouring values
        if (iCenter == dCenter) {
            iCenter1++;
        }

        int count = 0;
        BinValue last = v.firstEntry().getValue();
        // walk the bins in value order until we reach the middle
        for (Map.Entry<Double, BinValue> b : v.entrySet()) {
            BinValue bv = b.getValue();
            // the median falls exactly on the boundary between the previous
            // bin and this one
            if (count == iCenter) {
                if (iCenter != iCenter1) {
                    return (last.max + bv.min) / 2;
                }
                return last.max;
            }
            int countlast = count + bv.count;

            // the median is somewhere inside this bin
            if (countlast > iCenter) {
                // assume the bin average sits in the middle of the binned values
                int bvmiddle = bv.count / 2;
                // position of the bin middle within the whole data set
                int countmiddle = count + bvmiddle;
                double average = bv.average();
                int halfBinCount = bv.count / 2;

                // FIX: a single-element bin has halfBinCount == 0, which
                // previously produced NaN/Infinity via division by zero;
                // the bin average is the best estimate in that case.
                if (halfBinCount == 0) {
                    return average;
                }

                // interpolate linearly between the bin average and its min/max
                if (countmiddle > iCenter) {
                    int offcenter = countmiddle - iCenter;
                    if (iCenter == iCenter1) {
                        return average - ((average - bv.min) / halfBinCount * offcenter);
                    }
                    return (average - ((average - bv.min) / halfBinCount * offcenter)
                            + average - ((average - bv.min) / halfBinCount * (offcenter + 1))) / 2;
                } else {
                    int offcenter = iCenter - countmiddle;
                    if (iCenter == iCenter1) {
                        return average + ((bv.max - average) / halfBinCount * offcenter);
                    }
                    return (average + ((bv.max - average) / halfBinCount * offcenter)
                            + average + ((bv.max - average) / halfBinCount * (offcenter + 1))) / 2;
                }
            }
            count = countlast;
            last = bv;
        }
        throw new IndexOutOfBoundsException("Median should not be outside the list of values");
    }

    /**
     * Estimates the mode as the average of the most populated bin.
     *
     * @param v histogram
     * @return the estimated mode, or {@code NaN} for an empty histogram
     */
    public static double histogramMode(Map<Double, BinValue> v) {
        int max = 0;
        double mode = Double.NaN;
        for (BinValue bv : v.values()) {
            if (bv.count > max) {
                max = bv.count;
                mode = bv.average();
            }
        }
        return mode;
    }

    /**
     * @return the estimated median absolute deviation around the estimated mode
     */
    public double getModeMADEstimation() {
        return getMADEstimation(histogramMode(m_values));
    }

    /**
     * @return the estimated median absolute deviation around the estimated median
     */
    public double getMADEstimation() {
        return getMADEstimation(histogramMedian(m_values, m_count));
    }

    /**
     * Estimates the median absolute deviation around an arbitrary centre by
     * building a second histogram of absolute distances and taking its median.
     *
     * @param center the value to measure deviations from
     * @return the estimated MAD
     */
    public double getMADEstimation(double center) {
        TreeMap<Double, BinValue> deviations = new TreeMap<>();

        // build a histogram of absolute distances to the centre
        for (BinValue bv : m_values.values()) {
            BinValue distBV = new BinValue(bv);
            distBV.min = Math.abs(distBV.min - center);
            distBV.max = Math.abs(distBV.max - center);
            distBV.sum = Math.abs((distBV.average() - center) * distBV.count);
            BinValue ebv = deviations.get(distBV.average());
            if (ebv == null) {
                deviations.put(distBV.average(), distBV);
            } else {
                ebv.add(distBV);
            }
        }

        return histogramMedian(deviations, m_count);
    }

    /**
     * @return the exact average over all seen values ({@code NaN} if none)
     */
    public double getAverage() {
        return m_avg;
    }

    /**
     * @return the exact population standard deviation over all seen values
     */
    public double getStdDev() {
        return Math.sqrt(m_mean2 / m_count);
    }

    /**
     * @return the smallest seen value ({@code +Infinity} if none seen yet)
     */
    public double getMin() {
        return m_min;
    }

    /**
     * @return the largest seen value ({@code -Infinity} if none seen yet)
     */
    public double getMax() {
        return m_max;
    }

    /** Small demonstration: a bimodal Gaussian mixture. */
    public static void main(String[] args) {
        StreamingStatsEstimator e = new StreamingStatsEstimator(0.0001, 1000);
        Random r = new Random(1234);
        for (int i = 1; i < 10000000; i++) {
            e.addValue(3 + r.nextGaussian());
            if (i % 100 == 0) {
                e.addValue(10 + r.nextGaussian());
            }
        }

        System.out.println("average : " + e.getAverage() + "\nmedian:" + e.getMedianEstimation());
        System.out.println("mad:" + e.getMADEstimation());
        System.out.println("StDev:" + e.getStdDev());
        System.out.println("StDev(MAD):" + e.getMADEstimation() * 1.4826);
        System.out.println("mode:" + e.getModeEstimation());
        System.out.println("MAD on mode:" + e.getMADEstimation(e.getModeEstimation()));
        System.out.println("StDev(MAD on mode):" + e.getMADEstimation(e.getModeEstimation()) * 1.4826);
        System.out.println("min:" + e.m_min);
        System.out.println("max:" + e.m_max);
    }
}