bitly · awreece · Jul 26, 2013
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# 
+#
 # Copyright 2010 bit.ly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -99,6 +99,70 @@ def test_median():
     assert 4 == median([4,5,2,1,9,10]) # even-sized int list. (4+5)/2 = 4
     assert "4.50" == "%.2f" % median([4.0,5,2,1,9,10]) #even-sized float list. (4.0+5)/2 = 4.5
 
+#
+# Inverse normal, taken lovingly from:
+#     http://home.online.no/~pjacklam/notes/invnorm/impl/field/ltqnorm.txt
+#
+def ltqnorm( p ):
+    """
+    Modified from the author's original perl code (original comments follow below)
+    by [email protected].  May 3, 2004.
+
+    Lower tail quantile for standard normal distribution function.
+
+    This function returns an approximation of the inverse cumulative
+    standard normal distribution function.  I.e., given P, it returns
+    an approximation to the X satisfying P = Pr{Z <= X} where Z is a
+    random variable from the standard normal distribution.
+
+    The algorithm uses a minimax approximation by rational functions
+    and the result has a relative error whose absolute value is less
+    than 1.15e-9.
+
+    Author:      Peter John Acklam
+    Time-stamp:  2000-07-19 18:26:14
+    E-mail:      [email protected]
+    WWW URL:     http://home.online.no/~pjacklam
+    """
+
+    if p <= 0 or p >= 1:
+        # The original perl code exits here, we'll throw an exception instead
+        raise ValueError( "Argument to ltqnorm %f must be in open interval (0,1)" % p )
+
+    # Coefficients in rational approximations.
+    a = (-3.969683028665376e+01,  2.209460984245205e+02, \
+         -2.759285104469687e+02,  1.383577518672690e+02, \
+         -3.066479806614716e+01,  2.506628277459239e+00)
+    b = (-5.447609879822406e+01,  1.615858368580409e+02, \
+         -1.556989798598866e+02,  6.680131188771972e+01, \
+         -1.328068155288572e+01 )
+    c = (-7.784894002430293e-03, -3.223964580411365e-01, \
+         -2.400758277161838e+00, -2.549732539343734e+00, \
+          4.374664141464968e+00,  2.938163982698783e+00)
+    d = ( 7.784695709041462e-03,  3.224671290700398e-01, \
+          2.445134137142996e+00,  3.754408661907416e+00)
+
+    # Define break-points.
+    plow  = 0.02425
+    phigh = 1 - plow
+
+    # Rational approximation for lower region:
+    if p < plow:
+       q  = math.sqrt(-2*math.log(p))
+       return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / \
+               ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1)
+
+    # Rational approximation for upper region:
+    if phigh < p:
+       q  = math.sqrt(-2*math.log(1-p))
+       return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / \
+                ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1)
+
+    # Rational approximation for central region:
+    q = p - 0.5
+    r = q*q
+    return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / \
+           (((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1)
 
 def histogram(stream, options):
     """
@@ -160,7 +224,19 @@ def histogram(stream, options):
     if skipped:
         print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
     if options.mvsd:
-        print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data))
+        mean_string = "%f" % mvsd.mean()
+        if options.confidence and 0 < float(options.confidence) < 1:
+            # Convert to float and adjust so it is the correct value for a half
+            # interval.
+            confidence = (float(options.confidence) + 1.0)/2
+            if len(accepted_data) >= 30:
+                interval_width = ltqnorm(confidence)
+                interval_width *= mvsd.sd() / math.sqrt(samples)
+            # TODO(awreece) Students t distribution if <30 samples.
+            interval_width = Decimal(interval_width)
+            mean_string = "%f (+/- %f)" % (mvsd.mean(), interval_width)
+
+        print "# Mean = %s; Variance = %f; SD = %f; Median %f" % (mean_string, mvsd.var(), mvsd.sd(), median(accepted_data))
     print "# each * represents a count of %d" % bucket_scale
     bucket_min = min_v
     bucket_max = min_v
@@ -172,7 +248,6 @@ def histogram(stream, options):
         if bucket_count:
             star_count = bucket_count / bucket_scale
         print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '*' * star_count)
-
 
 if __name__ == "__main__":
     parser = OptionParser()
@@ -185,6 +260,8 @@ def histogram(stream, options):
                         help="Number of buckets to use for the histogram")
     parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
                         help="Dissable the calculation of Mean, Vairance and SD. (improves performance)")
+    parser.add_option("-c", "--confidence", dest="confidence",
+                        help="Confidence interval width (set to 0 or don't specify for no interval). Requires msvd.")
 
     (options, args) = parser.parse_args()
     if sys.stdin.isatty():

diff --git a/setup.py b/setup.py
@@ -20,5 +20,5 @@
                 'data_hacks/ninety_five_percent.py',
                 'data_hacks/run_for.py',
                 'data_hacks/bar_chart.py',
-                'data_hacks/sample.py']
-     )
+                'data_hacks/sample.py'],
+     )