From 5129092ac9b4c10b75364d7e149f0525cef9eea3 Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Tue, 24 May 2022 17:39:45 -0400
Subject: [PATCH 1/4] initial version of concordance script

---
 dcctools/concordance.py | 69 +++++++++++++++++++++++++++++++++++++++++
 requirements.txt        |  1 +
 2 files changed, 70 insertions(+)
 create mode 100644 dcctools/concordance.py

diff --git a/dcctools/concordance.py b/dcctools/concordance.py
new file mode 100644
index 0000000..114fcfa
--- /dev/null
+++ b/dcctools/concordance.py
@@ -0,0 +1,69 @@
+import argparse
+from itertools import combinations
+import pandas as pd
+from pathlib import Path
+
+from config import Configuration
+
+c = Configuration("config.json")
+link_id_csv_path = Path(c.matching_results_folder) / "link_ids.csv"
+link_ids = pd.read_csv(link_id_csv_path, dtype=str, index_col=0)
+
+parser = argparse.ArgumentParser(
+    description="Script for performing concordance analysis across site data"
+)
+parser.add_argument(
+    "system_data_folder",
+    help="Path to the folder containing exported system data",
+)
+args = parser.parse_args()
+
+systems = c.systems
+system_data = {}
+system_data_folder = Path(args.system_data_folder)
+
+for site in systems:
+    system_data[site] = pd.read_csv(system_data_folder / f"concord_{site}.csv", dtype=str, index_col=0).add_suffix(f"_{site}")
+
+for n in range(2, len(systems) + 1):
+    print(f"{n}-wise concordance")
+    for test_systems in combinations(systems, n):
+        print(test_systems)
+        # get the columns(systems) of interest, and drop rows containing any NA
+
+        test_data = link_ids[list(test_systems)].dropna()
+
+        print(f"Link IDs common to all {n} systems: {len(test_data)}")
+
+        if len(test_data) == 0:
+            continue
+
+        for s in test_systems:
+            test_data = test_data.merge(system_data[s], left_index=True, right_index=True)
+
+        # index (link_id) no longer useful past this point
+        test_data = test_data.reset_index()
+
+        for field in ['birth_date', 'sex']:
+            test_cols = [col for col in test_data.columns if col.startswith(field)]
+
+            data_to_compare = test_data[test_cols]
+
+            # count the number of unique values per row
+            concordance = data_to_compare.nunique(axis=1).value_counts().to_dict()
+            # a count of 1 means all columns had the same value == concordance
+            # the range of possible values is 1..n
+            # for this analysis there is no difference between 2, 3, ..., n
+
+            concordance_pct = 0
+            if 1 in concordance:
+                concordance_pct = concordance[1] / len(data_to_compare)
+
+            print(f"{field}: {concordance} --> {concordance_pct * 100.0}%")
+
+        print()
+
+    print()
+    print()
+
+
diff --git a/requirements.txt b/requirements.txt
index cc43774..c3c79be 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,3 +2,4 @@ pytest>=5.3.2
 requests>=2.20.0
 pymongo>=3.11.2
 tqdm>=4.36.1
+pandas>=1.2.1
\ No newline at end of file

From f22ad86376d63bc11fb45721b996be695432de4a Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Tue, 24 May 2022 17:44:36 -0400
Subject: [PATCH 2/4] code cleanup

---
 dcctools/concordance.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/dcctools/concordance.py b/dcctools/concordance.py
index 114fcfa..97533bf 100644
--- a/dcctools/concordance.py
+++ b/dcctools/concordance.py
@@ -1,8 +1,8 @@
 import argparse
 from itertools import combinations
-import pandas as pd
 from pathlib import Path
 
+import pandas as pd
 from config import Configuration
 
 c = Configuration("config.json")
@@ -23,7 +23,9 @@
 system_data_folder = Path(args.system_data_folder)
 
 for site in systems:
-    system_data[site] = pd.read_csv(system_data_folder / f"concord_{site}.csv", dtype=str, index_col=0).add_suffix(f"_{site}")
+    system_data[site] = pd.read_csv(
+        system_data_folder / f"concord_{site}.csv", dtype=str, index_col=0
+    ).add_suffix(f"_{site}")
 
 for n in range(2, len(systems) + 1):
     print(f"{n}-wise concordance")
@@ -39,12 +41,14 @@
             continue
 
         for s in test_systems:
-            test_data = test_data.merge(system_data[s], left_index=True, right_index=True)
+            test_data = test_data.merge(
+                system_data[s], left_index=True, right_index=True
+            )
 
         # index (link_id) no longer useful past this point
         test_data = test_data.reset_index()
 
-        for field in ['birth_date', 'sex']:
+        for field in ["birth_date", "sex"]:
             test_cols = [col for col in test_data.columns if col.startswith(field)]
 
             data_to_compare = test_data[test_cols]
@@ -65,5 +69,3 @@
 
     print()
     print()
-
-

From a7bdff70d0f887e8a703469244d369e1561f531c Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Wed, 22 Jun 2022 15:28:13 -0400
Subject: [PATCH 3/4] drop dups, plus some review feedback

---
 dcctools/concordance.py | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/dcctools/concordance.py b/dcctools/concordance.py
index 97533bf..9321521 100644
--- a/dcctools/concordance.py
+++ b/dcctools/concordance.py
@@ -23,9 +23,26 @@
 system_data_folder = Path(args.system_data_folder)
 
 for site in systems:
-    system_data[site] = pd.read_csv(
-        system_data_folder / f"concord_{site}.csv", dtype=str, index_col=0
-    ).add_suffix(f"_{site}")
+    site_data = pd.read_csv(
+        system_data_folder / f"concord_{site}.csv",
+        dtype=str,
+        # the file has many columns, we only care about a few:
+        usecols=["linkid", "birth_date", "sex"],
+    )
+    # concordance data comes from a use case query,
+    # which may contain data from multiple years.
+    # birth_date and sex come from DEMOGRAPHIC table
+    # which only has one row per individual total, not per year
+    # so we can drop duplicates on (linkid, birth_date, sex)
+    site_data = site_data.drop_duplicates()
+
+    # the index isn't considered in the dup check,
+    # so linkid gets set as index afterward
+    # (there may be ways to optimize this)
+    site_data = site_data.set_index("linkid")
+    site_data = site_data.add_suffix(f"_{site}")
+    system_data[site] = site_data
+
 
 for n in range(2, len(systems) + 1):
     print(f"{n}-wise concordance")
@@ -49,9 +66,7 @@
         test_data = test_data.reset_index()
 
         for field in ["birth_date", "sex"]:
-            test_cols = [col for col in test_data.columns if col.startswith(field)]
-
-            data_to_compare = test_data[test_cols]
+            data_to_compare = test_data.filter(regex=f"^{field}_", axis=1)
 
             # count the number of unique values per row
             concordance = data_to_compare.nunique(axis=1).value_counts().to_dict()
@@ -63,7 +78,7 @@
             if 1 in concordance:
                 concordance_pct = concordance[1] / len(data_to_compare)
 
-            print(f"{field}: {concordance} --> {concordance_pct * 100.0}%")
+            print(f"{field}: {concordance} --> {concordance_pct * 100: .2f}%")
 
         print()
 

From 04126883102f49e812b7b0bcd94a80c2811bfcbb Mon Sep 17 00:00:00 2001
From: Dylan Hall <dehall@mitre.org>
Date: Mon, 11 Jul 2022 18:21:11 -0400
Subject: [PATCH 4/4] strip whitespace characters in sex column

---
 dcctools/concordance.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/dcctools/concordance.py b/dcctools/concordance.py
index 9321521..920d3c8 100644
--- a/dcctools/concordance.py
+++ b/dcctools/concordance.py
@@ -36,6 +36,11 @@
     # so we can drop duplicates on (linkid, birth_date, sex)
     site_data = site_data.drop_duplicates()
 
+    # trying to get sex values to line up.
+    # prior results showed 0% concordance so we suspect maybe trailing spaces?
+    site_data["sex"] = site_data["sex"].apply(
+        lambda x: x.strip() if isinstance(x, str) else x
+    )
     # the index isn't considered in the dup check,
     # so linkid gets set as index afterward
     # (there may be ways to optimize this)