bitly · jkawamoto · Dec 9, 2016 · Dec 9, 2016 · Mar 23, 2017
diff --git a/data_hacks/__init__.py b/data_hacks/__init__.py
@@ -0,0 +1,47 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+import sys
+from data_hacks.bar_chart import run as bar_chart
+from data_hacks import histogram as hist
+from data_hacks.ninety_five_percent import run as ninety_five_percent
+from data_hacks.sample import run as sample
+
+
+class BarChartOpt(object):
+    """Options holder for bar_chart(); run() reads attributes such as dot and percentage."""
+    def __init__(
+            self, agg_value_key=False, agg_key_value=False, sort_keys=True,
+            sort_values=False, reverse_sort=False, numeric_sort=False,
+            percentage=False, dot="∎"):
+        self.agg_value_key = agg_value_key
+        self.agg_key_value = agg_key_value
+        self.sort_keys = sort_keys
+        self.sort_values = sort_values
+        self.reverse_sort = reverse_sort
+        self.numeric_sort = numeric_sort
+        self.percentage = percentage
+        self.dot = dot
+
+
+class HistogramOpt(object):
+    """Options holder for histogram(); it reads attributes such as mvsd, dot and percentage."""
+    def __init__(
+            self, agg_value_key=False, agg_key_value=False, min=None,
+            max=None, buckets=None, logscale=False, custbuckets=None,
+            mvsd=True, format="%10.4f", percentage=False, dot="∎"):
+        self.agg_value_key = agg_value_key
+        self.agg_key_value = agg_key_value
+        self.min = min
+        self.max = max
+        self.buckets = buckets
+        self.logscale = logscale
+        self.custbuckets = custbuckets
+        self.mvsd = mvsd
+        self.format = format
+        self.percentage = percentage
+        self.dot = dot
+
+
+def histogram(stream, options, output=sys.stdout):
+    points = hist.load_stream(stream, options.agg_value_key, options.agg_key_value)
+    hist.histogram(points, options, output)  # the report is printed to `output`
diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py
@@ -20,6 +20,7 @@
 
 https://github.com/bitly/data_hacks
 """
+from __future__ import print_function
 import sys
 import math
 from collections import defaultdict
@@ -37,7 +38,7 @@ def load_stream(input_stream):
         if clean_line:
             yield clean_line
 
-def run(input_stream, options):
+def run(input_stream, options, output=sys.stdout):
     data = defaultdict(int)
     total = 0
     for row in input_stream:
@@ -54,20 +55,22 @@ def run(input_stream, options):
         else:
             data[row] += 1
             total += 1
-    
+
     if not data:
-        print "Error: no data"
+        print("Error: no data", file=output)
         sys.exit(1)
-    
+
     max_length = max([len(key) for key in data.keys()])
     max_length = min(max_length, 50)
     value_characters = 80 - max_length
     max_value = max(data.values())
     scale = int(math.ceil(float(max_value) / value_characters))
     scale = max(1, scale)
-
-    print "# each " + options.dot + " represents a count of %d. total %d" % (scale, total)
-
+
+    print(
+        "# each " + options.dot + " represents a count of %d. total %d" % (scale, total),
+        file=output)
+
     if options.sort_values:
         data = [[value, key] for key, value in data.items()]
         data.sort(key=lambda x: x[0], reverse=options.reverse_sort)
@@ -79,13 +82,15 @@ def run(input_stream, options):
             data.sort(key=lambda x: (Decimal(x[1])), reverse=options.reverse_sort)
         else:
             data.sort(key=lambda x: x[1], reverse=options.reverse_sort)
-    
+
     str_format = "%" + str(max_length) + "s [%6d] %s%s"
     percentage = ""
     for value, key in data:
         if options.percentage:
             percentage = " (%0.2f%%)" % (100 * Decimal(value) / Decimal(total))
-        print str_format % (key[:max_length], value, (value / scale) * options.dot, percentage)
+        print(
+            str_format % (key[:max_length], value, (value // scale) * options.dot, percentage),
+            file=output)  # floor division: str repetition needs an int count on Python 3
 
 if __name__ == "__main__":
     parser = OptionParser()
@@ -107,10 +112,9 @@ def run(input_stream, options):
     parser.add_option("--dot", dest="dot", default='∎', help="Dot representation")
 
     (options, args) = parser.parse_args()
-    
+
     if sys.stdin.isatty():
         parser.print_usage()
-        print "for more help use --help"
+        print("for more help use --help")
         sys.exit(1)
     run(load_stream(sys.stdin), options)
-
diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py
@@ -24,6 +24,7 @@
 https://github.com/bitly/data_hacks
 """
 
+from __future__ import print_function
 import sys
 from decimal import Decimal
 import logging
@@ -97,7 +98,7 @@ def load_stream(input_stream, agg_value_key, agg_key_value):
                 yield DataPoint(Decimal(clean_line), 1)
         except:
             logging.exception('failed %r', line)
-            print >>sys.stderr, "invalid line %r" % line
+            print("invalid line %r" % line, file=sys.stderr)
 
 
 def median(values, key=None):
@@ -121,7 +122,7 @@ def test_median():
     assert "4.50" == "%.2f" % median([4.0, 5, 2, 1, 9, 10])
 
 
-def histogram(stream, options):
+def histogram(stream, options, output=sys.stdout):
     """
     Loop over the stream and add each entry to the dataset, printing out at the
     end.
@@ -233,15 +234,17 @@ def log_steps(k, n):
         bucket_scale = int(max(bucket_counts) / 75)
 
     print("# NumSamples = %d; Min = %0.2f; Max = %0.2f" %
-          (samples, min_v, max_v))
+          (samples, min_v, max_v), file=output)
     if skipped:
         print("# %d value%s outside of min/max" %
-              (skipped, skipped > 1 and 's' or ''))
+              (skipped, skipped > 1 and 's' or ''), file=output)
     if options.mvsd:
         print("# Mean = %f; Variance = %f; SD = %f; Median %f" %
               (mvsd.mean(), mvsd.var(), mvsd.sd(),
-               median(accepted_data, key=lambda x: x.value)))
-    print "# each " + options.dot + " represents a count of %d" % bucket_scale
+               median(accepted_data, key=lambda x: x.value)), file=output)
+    print(
+        "# each " + options.dot + " represents a count of %d" % bucket_scale,
+        file=output)
     bucket_min = min_v
     bucket_max = min_v
     percentage = ""
@@ -256,8 +259,8 @@ def log_steps(k, n):
         if options.percentage:
             percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) /
                                          Decimal(samples))
-        print format_string % (bucket_min, bucket_max, bucket_count, options.dot *
-                               star_count, percentage)
+        print(format_string % (bucket_min, bucket_max, bucket_count, options.dot *
+                               star_count, percentage), file=output)
 
 
 if __name__ == "__main__":
@@ -294,7 +297,7 @@ def log_steps(k, n):
     if sys.stdin.isatty():
         # if isatty() that means it's run without anything piped into it
         parser.print_usage()
-        print "for more help use --help"
+        print("for more help use --help")
         sys.exit(1)
     histogram(load_stream(sys.stdin, options.agg_value_key,
                           options.agg_key_value), options)
diff --git a/data_hacks/ninety_five_percent.py b/data_hacks/ninety_five_percent.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# 
+#
 # Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -20,14 +20,15 @@
 https://github.com/bitly/data_hacks
 """
 
+from __future__ import print_function
 import sys
 import os
 from decimal import Decimal
 
-def run():
+def run(stream=sys.stdin, output=sys.stdout):
     count = 0
     data = {}
-    for line in sys.stdin:
+    for line in stream:
         line = line.strip()
         if not line:
             # skip empty lines (ie: newlines)
@@ -37,9 +38,9 @@ def run():
             count +=1
             data[t] = data.get(t, 0) + 1
         except:
-            print >>sys.stderr, "invalid line %r" % line
-    print calc_95(data, count)
-        
+            print("invalid line %r" % line, file=sys.stderr)
+    print(calc_95(data, count), file=output)
+
 def calc_95(data, count):
     # find the time it took for x entry, where x is the threshold
     threshold = Decimal(count) * Decimal('.95')
@@ -54,6 +55,6 @@ def calc_95(data, count):
 
 if __name__ == "__main__":
     if sys.stdin.isatty() or '--help' in sys.argv or '-h' in sys.argv:
-        print "Usage: cat data | %s" % os.path.basename(sys.argv[0])
+        print("Usage: cat data | %s" % os.path.basename(sys.argv[0]))
         sys.exit(1)
     run()
diff --git a/data_hacks/sample.py b/data_hacks/sample.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# 
+#
 # Copyright 2010 Bitly
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may
@@ -20,16 +20,16 @@
 https://github.com/bitly/data_hacks
 """
 
+from __future__ import print_function
 import sys
 import random
 from optparse import OptionParser
 from decimal import Decimal
 
-def run(sample_rate):
-    input_stream = sys.stdin
+def run(sample_rate, input_stream=sys.stdin, output=sys.stdout):
     for line in input_stream:
         if random.randint(1,100) <= sample_rate:
-            sys.stdout.write(line)
+            output.write(line)  # line passed the randint(1,100) <= sample_rate draw
 
 def get_sample_rate(rate_string):
     """ return a rate as a percentage"""
@@ -49,17 +49,17 @@ def get_sample_rate(rate_string):
     parser = OptionParser(usage="cat data | %prog [options] [sample_rate]")
     parser.add_option("--verbose", dest="verbose", default=False, action="store_true")
     (options, args) = parser.parse_args()
-    
+
     if not args or sys.stdin.isatty():
         parser.print_usage()
         sys.exit(1)
-    
+
     try:
         sample_rate = get_sample_rate(sys.argv[-1])
     except ValueError, e:
-        print >>sys.stderr, e
+        print(e, file=sys.stderr)
         parser.print_usage()
         sys.exit(1)
     if options.verbose:
-        print >>sys.stderr, "Sample rate is %d%%" % sample_rate 
+        print("Sample rate is %d%%" % sample_rate, file=sys.stderr)
     run(sample_rate)