"""Concordance analysis of demographic fields across linked site data.

For every combination of two or more configured systems, report how often
``birth_date`` and ``sex`` agree among the link IDs those systems share.

Usage::

    python concordance.py <system_data_folder>

Inputs:

* ``Configuration("config.json")`` supplies ``systems`` and
  ``matching_results_folder``.
* ``<matching_results_folder>/link_ids.csv``: first column is used as the
  index (link ID); one column per system.
* ``<system_data_folder>/concord_<site>.csv`` for every system, containing at
  least the columns ``linkid``, ``birth_date`` and ``sex``.

Requires pandas (requirements.txt lists ``pandas>=1.2.1``).
"""

import argparse
from itertools import combinations
from pathlib import Path

import pandas as pd

# Fields whose values are compared across systems.
CONCORDANCE_FIELDS = ["birth_date", "sex"]


def clean_site_data(site_data: pd.DataFrame, site: str) -> pd.DataFrame:
    """Normalize one site's export and index it by link ID.

    Args:
        site_data: Frame with the columns ``linkid``, ``birth_date`` and
            ``sex``. It is not modified.
        site: System name; appended to every remaining column as ``_<site>``.

    Returns:
        A new frame indexed by ``linkid`` with the columns
        ``birth_date_<site>`` and ``sex_<site>``.
    """
    cleaned = site_data.copy()

    # Strip surrounding whitespace from sex values so that e.g. "F " and "F"
    # compare equal (earlier runs showed 0% sex concordance and trailing
    # spaces were one suspected cause). Non-string values (missing data) are
    # passed through untouched.
    # This has to happen BEFORE de-duplication: otherwise rows that differ
    # only by whitespace both survive, end up identical, and are multiplied
    # by the index merge later on.
    cleaned["sex"] = cleaned["sex"].map(
        lambda value: value.strip() if isinstance(value, str) else value
    )

    # concordance data comes from a use case query,
    # which may contain data from multiple years.
    # birth_date and sex come from DEMOGRAPHIC table
    # which only has one row per individual total, not per year
    # so we can drop duplicates on (linkid, birth_date, sex)
    cleaned = cleaned.drop_duplicates()

    # the index isn't considered in the dup check,
    # so linkid gets set as index afterward
    cleaned = cleaned.set_index("linkid")
    return cleaned.add_suffix(f"_{site}")


def load_site_data(system_data_folder, site: str) -> pd.DataFrame:
    """Read ``concord_<site>.csv`` from *system_data_folder* and clean it.

    Only ``linkid``, ``birth_date`` and ``sex`` are loaded (the file has many
    more columns) and every value is read as a string.
    """
    site_data = pd.read_csv(
        Path(system_data_folder) / f"concord_{site}.csv",
        dtype=str,
        usecols=["linkid", "birth_date", "sex"],
    )
    return clean_site_data(site_data, site)


def field_concordance(test_data: pd.DataFrame, field: str, test_systems) -> tuple:
    """Measure how often *field* agrees across *test_systems*.

    Args:
        test_data: Merged frame containing a ``<field>_<system>`` column for
            every system in *test_systems*.
        field: Base field name, e.g. ``"sex"``.
        test_systems: Iterable of system names to compare.

    Returns:
        ``(counts, fraction)`` where ``counts`` maps "number of distinct
        values in a row" to "number of such rows" and ``fraction`` is the
        share of rows with exactly one distinct value (``0`` if there are
        none).

    Raises:
        KeyError: If an expected ``<field>_<system>`` column is missing.
    """
    # Select the columns by exact name rather than by pattern. A regex such
    # as "sex*" means "se" followed by zero or more "x" and is searched
    # anywhere in the name, so it would also pick up unrelated columns
    # (for example anything belonging to a system called "base").
    columns = [f"{field}_{system}" for system in test_systems]
    data_to_compare = test_data[columns]

    # count the number of unique values per row
    # a count of 1 means all columns had the same value == concordance
    # the range of possible values is 1..n
    # for this analysis there is no difference between 2, 3, ..., n
    # NOTE: nunique() ignores missing values by default, so a row in which
    # some systems have no value still counts as concordant when the
    # remaining values agree, and a row with no values at all counts as 0.
    concordance = data_to_compare.nunique(axis=1).value_counts().to_dict()

    concordance_pct = 0
    if 1 in concordance:
        concordance_pct = concordance[1] / len(data_to_compare)

    return concordance, concordance_pct


def main():
    """Parse arguments, load all inputs and print the n-wise concordance."""
    parser = argparse.ArgumentParser(
        description="Script for performing concordance analysis across site data"
    )
    parser.add_argument(
        "system_data_folder",
        help="Path to the folder containing exported system data",
    )
    # Parse first so that --help and usage errors work even when config.json
    # or the link ID file are not available.
    args = parser.parse_args()

    # Project-local module; imported here so the helpers above can be
    # imported without it being on sys.path.
    from config import Configuration

    c = Configuration("config.json")
    link_id_csv_path = Path(c.matching_results_folder) / "link_ids.csv"
    link_ids = pd.read_csv(link_id_csv_path, dtype=str, index_col=0)

    systems = c.systems
    system_data_folder = Path(args.system_data_folder)
    system_data = {
        site: load_site_data(system_data_folder, site) for site in systems
    }

    for n in range(2, len(systems) + 1):
        print(f"{n}-wise concordance")
        for test_systems in combinations(systems, n):
            print(test_systems)

            # get the columns (systems) of interest,
            # and drop rows containing any NA
            test_data = link_ids[list(test_systems)].dropna()

            print(f"Link IDs common to all {n} systems: {len(test_data)}")

            if len(test_data) == 0:
                continue

            # Attach each system's birth_date/sex columns by link ID.
            # merge() defaults to an inner join, so link IDs missing from a
            # site's export are dropped here.
            for s in test_systems:
                test_data = test_data.merge(
                    system_data[s], left_index=True, right_index=True
                )

            # index (link_id) no longer useful past this point
            test_data = test_data.reset_index()

            for field in CONCORDANCE_FIELDS:
                concordance, concordance_pct = field_concordance(
                    test_data, field, test_systems
                )

                print(f"{field}: {concordance} --> {concordance_pct * 100: .2f}%")

                print()

            print()
            print()


if __name__ == "__main__":
    main()