ES&S CVR parsing: Protect against unknown columns in CVR file (#1960)

We added a hotfix in #1954 to allow different sets of metadata columns in the CVR file. However, we may still encounter metadata columns we haven't seen before. Since there's no good way to differentiate those columns from contest columns, our current approach may silently fail in that case. The consequence would be that metadata columns are treated as contests. This may or may not cause downstream issues. To reduce the likelihood of that happening, we change our method of searching for the dividing line between metadata columns and contest columns to look for the _last_ known metadata header. That way, we'll get it right in every case except the case where the dividing line is a header we haven't seen before, which is much less likely to occur.
votingworks · Aug 14, 2024 · 5c4bb29 · 5c4bb29
1 parent 19895b9
commit 5c4bb29
Show file tree

Hide file tree

Showing 3 changed files with 169 additions and 19 deletions.
diff --git a/server/api/cvrs.py b/server/api/cvrs.py
@@ -812,7 +812,11 @@ def parse_ballots_file(
     def parse_contest_metadata(cvr_csv: CSVIterator) -> CVR_CONTESTS_METADATA:
         headers = next(cvr_csv)
         # Based on files we've seen, the first few columns are metadata, and the
-        # rest are contest names
+        # rest are contest names. We want to figure out where the dividing line
+        # is. The challenge is that there may be metadata columns that we've
+        # never seen before. To maximize the chance of getting this right, we
+        # look for the last column that matches a known metadata header, hoping
+        # that the dividing line will be one of our known headers.
         known_metadata_headers = [
             "Election ID",
             "Audit Number",
@@ -823,11 +827,12 @@ def parse_contest_metadata(cvr_csv: CSVIterator) -> CVR_CONTESTS_METADATA:
             "Precinct",
             "Ballot Style",
         ]
-        first_contest_column = next(
+        last_header_column = next(
             index
-            for index, header in enumerate(headers)
-            if header not in known_metadata_headers
+            for index, header in reversed(list(enumerate(headers)))
+            if header in known_metadata_headers
         )
+        first_contest_column = last_header_column + 1
         contest_names = headers[first_contest_column:]
         # { contest_name: choice_names }
         contest_choices = defaultdict(set)

diff --git a/server/tests/ballot_comparison/snapshots/snap_test_cvrs.py b/server/tests/ballot_comparison/snapshots/snap_test_cvrs.py
@@ -1238,6 +1238,127 @@
     },
 }
 
+snapshots["test_ess_cvr_upload_cvr_file_with_tabulator_cvr_column 1"] = [
+    {
+        "ballot_position": 1,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0001000415",
+        "interpretations": "0,1,1,0,0",
+        "tabulator": "0001",
+    },
+    {
+        "ballot_position": 2,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0001000416",
+        "interpretations": "1,0,1,0,0",
+        "tabulator": "0001",
+    },
+    {
+        "ballot_position": 3,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0001000417",
+        "interpretations": "0,1,0,1,0",
+        "tabulator": "0001",
+    },
+    {
+        "ballot_position": 1,
+        "batch_name": "BATCH1",
+        "imprinted_id": "0001013415",
+        "interpretations": "0,1,1,0,0",
+        "tabulator": "0001",
+    },
+    {
+        "ballot_position": 2,
+        "batch_name": "BATCH1",
+        "imprinted_id": "0001013416",
+        "interpretations": "1,0,1,0,0",
+        "tabulator": "0001",
+    },
+    {
+        "ballot_position": 3,
+        "batch_name": "BATCH1",
+        "imprinted_id": "0001013417",
+        "interpretations": "u,u,1,0,0",
+        "tabulator": "0001",
+    },
+    {
+        "ballot_position": 1,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0002000171",
+        "interpretations": "1,0,0,1,0",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 2,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0002000172",
+        "interpretations": "0,1,0,1,0",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 3,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0002000173",
+        "interpretations": "1,0,0,1,0",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 4,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0002000174",
+        "interpretations": "0,1,0,0,1",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 5,
+        "batch_name": "BATCH2",
+        "imprinted_id": "0002000175",
+        "interpretations": "1,0,0,0,1",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 1,
+        "batch_name": "BATCH1",
+        "imprinted_id": "0002003171",
+        "interpretations": "o,o,1,0,0",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 2,
+        "batch_name": "BATCH1",
+        "imprinted_id": "0002003172",
+        "interpretations": "0,1,1,0,0",
+        "tabulator": "0002",
+    },
+    {
+        "ballot_position": 3,
+        "batch_name": "BATCH1",
+        "imprinted_id": "0002003173",
+        "interpretations": "1,0,1,0,0",
+        "tabulator": "0002",
+    },
+]
+
+snapshots["test_ess_cvr_upload_cvr_file_with_tabulator_cvr_column 2"] = {
+    "Contest 1": {
+        "choices": {
+            "Choice 1-1": {"column": 0, "num_votes": 6},
+            "Choice 1-2": {"column": 1, "num_votes": 6},
+        },
+        "total_ballots_cast": 14,
+        "votes_allowed": 1,
+    },
+    "Contest 2": {
+        "choices": {
+            "Choice 2-1": {"column": 2, "num_votes": 8},
+            "Choice 2-2": {"column": 3, "num_votes": 4},
+            "Choice 2-3": {"column": 4, "num_votes": 2},
+        },
+        "total_ballots_cast": 14,
+        "votes_allowed": 1,
+    },
+}
+
 snapshots["test_hart_cvr_upload 1"] = [
     {
         "ballot_position": 1,

diff --git a/server/tests/ballot_comparison/test_cvrs.py b/server/tests/ballot_comparison/test_cvrs.py
@@ -1394,21 +1394,21 @@ def test_clearballot_cvr_upload_invalid(
 15,BATCH2,Not Reviewed,,,,N,REP 405,Election Day,0002000175,7074480632,Card,Election Day,28,405,21,0002,
 """
 
-ESS_CVR_WITH_TABULATOR_CVR_COLUMN = """Cast Vote Record,Precinct,Ballot Style,Tabulator CVR,Contest 1,Contest 2
-1,p,bs,0001013415,Choice 1-2,Choice 2-1
-2,p,bs,0001013416,Choice 1-1,Choice 2-1
-3,p,bs,0001013417,undervote,Choice 2-1
-4,p,bs,0002003171,overvote,Choice 2-1
-5,p,bs,0002003172,Choice 1-2,Choice 2-1
-6,p,bs,0002003173,Choice 1-1,Choice 2-1
-7,p,bs,0001000415,Choice 1-2,Choice 2-1
-8,p,bs,0001000416,Choice 1-1,Choice 2-1
-9,p,bs,0001000417,Choice 1-2,Choice 2-2
-10,p,bs,0002000171,Choice 1-1,Choice 2-2
-11,p,bs,0002000172,Choice 1-2,Choice 2-2
-12,p,bs,0002000173,Choice 1-1,Choice 2-2
-13,p,bs,0002000174,Choice 1-2,Choice 2-3
-15,p,bs,0002000175,Choice 1-1,Choice 2-3
+ESS_CVR_WITH_TABULATOR_CVR_COLUMN = """Unknown Column,Cast Vote Record,Precinct,Ballot Style,Tabulator CVR,Contest 1,Contest 2
+x,1,p,bs,0001013415,Choice 1-2,Choice 2-1
+x,2,p,bs,0001013416,Choice 1-1,Choice 2-1
+x,3,p,bs,0001013417,undervote,Choice 2-1
+x,4,p,bs,0002003171,overvote,Choice 2-1
+x,5,p,bs,0002003172,Choice 1-2,Choice 2-1
+x,6,p,bs,0002003173,Choice 1-1,Choice 2-1
+x,7,p,bs,0001000415,Choice 1-2,Choice 2-1
+x,8,p,bs,0001000416,Choice 1-1,Choice 2-1
+x,9,p,bs,0001000417,Choice 1-2,Choice 2-2
+x,10,p,bs,0002000171,Choice 1-1,Choice 2-2
+x,11,p,bs,0002000172,Choice 1-2,Choice 2-2
+x,12,p,bs,0002000173,Choice 1-1,Choice 2-2
+x,13,p,bs,0002000174,Choice 1-2,Choice 2-3
+x,15,p,bs,0002000175,Choice 1-1,Choice 2-3
 """
 
 
@@ -1743,6 +1743,7 @@ def test_ess_cvr_upload_cvr_file_with_tabulator_cvr_column(
     election_id: str,
     jurisdiction_ids: List[str],
     ess_manifests,  # pylint: disable=unused-argument
+    snapshot,
 ):
     set_logged_in_user(client, UserType.AUDIT_ADMIN, DEFAULT_AA_EMAIL)
     rv = client.get(f"/api/election/{election_id}/jurisdiction")
@@ -1825,6 +1826,29 @@ def test_ess_cvr_upload_cvr_file_with_tabulator_cvr_column(
         },
     )
 
+    cvr_ballots = (
+        CvrBallot.query.join(Batch)
+        .filter_by(jurisdiction_id=jurisdiction_ids[0])
+        .order_by(CvrBallot.imprinted_id)
+        .all()
+    )
+    assert len(cvr_ballots) == manifest_num_ballots - 1
+    snapshot.assert_match(
+        [
+            dict(
+                batch_name=cvr.batch.name,
+                tabulator=cvr.batch.tabulator,
+                ballot_position=cvr.ballot_position,
+                imprinted_id=cvr.imprinted_id,
+                interpretations=cvr.interpretations,
+            )
+            for cvr in cvr_ballots
+        ]
+    )
+    snapshot.assert_match(
+        Jurisdiction.query.get(jurisdiction_ids[0]).cvr_contests_metadata
+    )
+
 
 def build_hart_cvr(
     batch_name: str,