Add Los Angeles precinct bulletin test

t-t-t-t-t · Feb 28, 2016 · 3a0fdad · 3a0fdad
1 parent e213f0c
commit 3a0fdad
Show file tree

Hide file tree

Showing 2 changed files with 125 additions and 0 deletions.
diff --git a/tests/pdfs/la-precinct-bulletin-2014-p1.pdf b/tests/pdfs/la-precinct-bulletin-2014-p1.pdf
diff --git a/tests/test-la-precinct-bulletin-2014-p1.py b/tests/test-la-precinct-bulletin-2014-p1.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+import unittest
+import pandas as pd
+import pdfplumber
+from pdfplumber.utils import within_bbox, extract_columns, collate_chars
+import sys, os
+import re
+
+import logging
+logging.disable(logging.ERROR)  # mute log records at ERROR severity and below
+
+HERE = os.path.abspath(os.path.dirname(__file__))  # absolute directory of this file
+
+def _collate_chars(x):
+    return collate_chars(x, x_tolerance=1)
+
+def parse_results_line(chars):
+    _left = chars[chars["x0rel"] < 125]
+    left = _collate_chars(_left) if len(_left) else None
+    _right = chars[(chars["x0rel"] > 155)]
+    right = int(_collate_chars(_right)) if len(_right) else None
+    _mid = chars[(chars["x0rel"] > 125) & (chars["x0rel"] < 155)]
+    mid = _collate_chars(_mid) if len(_mid) else None
+    return { "text": left, "aff": mid, "votes": right }
+
+class PrecinctPage(object):
+    def __init__(self, pdf, pageid):
+        z = lambda objs: pd.DataFrame([ x
+            for x in objs if x["pageid"] == pageid])
+        self.chars = z(pdf.chars)
+        self.lines = z(pdf.lines)
+        self.rects = z(pdf.rects)
+        self.bboxes = self.get_bboxes()
+
+    def get_bboxes(self):
+        outer, inner = [ r for i, r in self.rects.iterrows() ]
+        col_top = inner["top"] + inner["height"]
+        col_bot = outer["top"] + outer["height"]
+        line_xs = self.lines["x0"].tolist()
+        return {
+            "h1": (outer["x0"], outer["top"], outer["x1"], inner["top"]),
+            "h2": (outer["x0"], inner["top"], outer["x1"], col_top),
+            "c1": (outer["x0"], col_top, line_xs[0], col_bot),
+            "c2": (line_xs[0], col_top, line_xs[1], col_bot),
+            "c3": (line_xs[1], col_top, line_xs[2], col_bot),
+            "c4": (line_xs[2], col_top, outer["x1"], col_bot),
+        }
+
+    @property
+    def precinct(self):
+        h1_left = list(self.bboxes["h1"])
+        h1_left[-2] = float(h1_left[-2]) / 2
+        h1_left_chars = within_bbox(self.chars, h1_left)
+        txt = h1_left_chars.groupby("top").apply(_collate_chars).iloc[-1]
+        p_id = "|".join(re.split(r"\s{2,}", txt)[1:3])
+        return p_id
+
+    @property
+    def ballots_cast(self):
+        h2_chars = within_bbox(self.chars, self.bboxes["h2"])
+        txt = h2_chars.groupby("top").apply(_collate_chars).iloc[0]
+        return int(re.match(r"(\d+) BALLOTS CAST", txt).group(1))    
+
+    @property
+    def registered_voters(self):
+        h2_chars = within_bbox(self.chars, self.bboxes["h2"])
+        txt = h2_chars.groupby("top").apply(_collate_chars).iloc[1]
+        return int(re.match(r"(\d+) REGISTERED VOTERS", txt).group(1))
+
+    def parse_col(self, col_chars):
+        c = col_chars.copy()
+        c["x0rel"] = c["x0"] - c["x0"].min()
+        results_lines = c.groupby("top").apply(parse_results_line)
+        items = []
+        item = {}
+        vote_seen = False
+        for i, r in results_lines.iteritems():
+            if r["votes"] == None:
+                if vote_seen == True:
+                    items.append(item)
+                    item = {}
+                    vote_seen = False
+                item["desc"] = item["desc"] + "|" + r["text"] if item.get("desc", False) else r["text"]
+            if type(r["votes"]) == int:
+                vote_seen = True
+                item["options"] = item.get("options", [])
+                item["options"].append(r)
+        items.append(item)
+        return items
+
+    @property
+    def results(self):
+        r = []
+        for col in [ "c1", "c2", "c3", "c4" ]:
+            b = within_bbox(self.chars, self.bboxes[col])
+            r += self.parse_col(b)
+        return r
+
+    def to_dict(self):
+        return {
+            "precinct": self.precinct,
+            "registered_voters": self.registered_voters,
+            "ballots_cast": self.ballots_cast,
+            "results": self.results
+        }
+
+class Test(unittest.TestCase):
+
+    def setUp(self):
+        path = os.path.join(HERE, "pdfs/la-precinct-bulletin-2014-p1.pdf")
+        self.pdf = pdfplumber.from_path(path)
+        self.PDF_WIDTH = self.pdf.pages[0].width
+
+    def test_plain(self):
+        pass
+
+    def test_pandas(self):
+        p1 = PrecinctPage(self.pdf, 1).to_dict()
+        assert(p1["registered_voters"] == 1100)
+        assert(p1["ballots_cast"] == 327)
+        assert(p1["precinct"] == "0050003A|ACTON")
+        last = p1["results"][-1]
+        assert(last["desc"] == "ANTELOPE VALLEY HEALTH BD")
+        assert(last["options"][-1]["text"] == "ROE LEER")
+        assert(last["options"][-1]["votes"] == 39)