Add NAN code support to claims_hosp #902

Open · wants to merge 1 commit into base: main
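
For context on the three new columns: `missing_val`, `missing_se`, and `missing_sample_size` carry integer codes from the `Nans` enum in `delphi_utils`. The sketch below is illustrative only — the member names match those used in the diff, but the integer values are placeholders, not the authoritative `delphi_utils` definitions.

```python
from enum import IntEnum

class Nans(IntEnum):
    """Illustrative missing-value codes; the real enum ships with delphi_utils."""
    NOT_MISSING = 0     # the accompanying column holds a real value
    NOT_APPLICABLE = 1  # the column never applies to this signal (sample_size here)
    CENSORED = 3        # the value exists but is withheld (privacy, insufficient data)

# Example: a row with a reported rate but a withheld standard error would carry
# missing_val=NOT_MISSING, missing_se=CENSORED, missing_sample_size=NOT_APPLICABLE.
```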
1 change: 0 additions & 1 deletion _delphi_utils_python/tests/test_export.py
@@ -9,7 +9,6 @@

from delphi_utils import create_export_csv, Nans


def _clean_directory(directory):
"""Clean files out of a directory."""
for fname in listdir(directory):
42 changes: 36 additions & 6 deletions claims_hosp/delphi_claims_hosp/update_indicator.py
@@ -13,7 +13,7 @@
# third party
import numpy as np
import pandas as pd
from delphi_utils import GeoMapper
from delphi_utils import GeoMapper, Nans

# first party
from delphi_utils import Weekday
@@ -235,7 +235,7 @@ def write_to_csv(self, output_dict, output_path="./receiving"):
all_rates = output_dict["rates"]
all_se = output_dict["se"]
all_include = output_dict["include"]
out_n = 0
out_n, out_i = 0, 0
for i, date in enumerate(dates):
filename = "%s/%s_%s_%s.csv" % (
output_path,
@@ -244,7 +244,10 @@
self.signal_name,
)
with open(filename, "w") as outfile:
outfile.write("geo_id,val,se,direction,sample_size\n")
outfile.write(
"geo_id,val,se,direction,sample_size," +
"missing_val,missing_se,missing_sample_size\n"
)
for geo_id in geo_ids:
val = all_rates[geo_id][i]
se = all_se[geo_id][i]
@@ -257,11 +260,38 @@
if self.write_se:
assert val > 0 and se > 0, "p=0, std_err=0 invalid"
outfile.write(
"%s,%f,%s,%s,%s\n" % (geo_id, val, se, "NA", "NA"))
"%s,%f,%s,%s,%s,%d,%d,%d\n" % (
geo_id, val, se, "NA", "NA",
Nans.NOT_MISSING.value,
Nans.NOT_MISSING.value,
Nans.NOT_APPLICABLE.value
)
)
else:
# for privacy reasons we will not report the standard error
outfile.write(
"%s,%f,%s,%s,%s\n" % (geo_id, val, "NA", "NA", "NA"))
"%s,%f,%s,%s,%s,%d,%d,%d\n" % (
geo_id, val, "NA", "NA", "NA",
Nans.NOT_MISSING.value,
Nans.CENSORED.value,
Nans.NOT_APPLICABLE.value
)
)
out_n += 1
else:
# Write nans out anyway for versioning
logging.warning("writing insufficient data for geo_id {0}, {1}".format(
geo_id, i
))
outfile.write(
"%s,%s,%s,%s,%s,%d,%d,%d\n" % (
geo_id, "NA", "NA", "NA", "NA",
Nans.CENSORED.value,
Nans.CENSORED.value,
Nans.NOT_APPLICABLE.value
)
)
out_i += 1

logging.debug("wrote %d rows for %d %s", out_n, len(geo_ids), geo_level)
logging.debug("wrote %d valued csvs for %d %s", out_n, len(geo_ids), geo_level)
logging.debug("wrote %d nan-valued csvs for %d %s", out_i, len(geo_ids), geo_level)
3 changes: 1 addition & 2 deletions claims_hosp/tests/test_indicator.py
@@ -57,13 +57,12 @@ def test_fit_fips(self):
date_range = pd.date_range("2020-05-01", "2020-05-20")
all_fips = self.fips_data.fips.unique()
loc_index_fips_data = self.fips_data.set_index(["fips", "timestamp"])
sample_fips = nr.choice(all_fips, 10)
sample_fips = all_fips[:50]

for fips in sample_fips:
sub_data = loc_index_fips_data.loc[fips]
sub_data = sub_data.reindex(date_range, fill_value=0)
res0 = ClaimsHospIndicator.fit(sub_data, date_range[0], fips)
# first value is burn-in
assert np.min(res0["rate"][1:]) > 0
assert np.max(res0["rate"][1:]) <= 100

14 changes: 8 additions & 6 deletions claims_hosp/tests/test_update_indicator.py
@@ -144,8 +144,9 @@ def test_write_to_csv_results(self):
expected_name = f"20200502_geography_{Config.signal_name}.csv"
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
@@ -159,10 +160,10 @@ def test_write_to_csv_results(self):
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a"]).all()
assert np.array_equal(output_data.val.values, np.array([0.5]))
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True)
assert np.isnan(output_data.se.values).all()
assert np.isnan(output_data.direction.values).all()
assert np.isnan(output_data.sample_size.values).all()
@@ -171,7 +172,7 @@ def test_write_to_csv_results(self):
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([1.5, 3]))
@@ -224,8 +225,9 @@ def test_write_to_csv_with_se_results(self):
expected_name = f"20200502_geography_{signal_name}.csv"
assert exists(join(td.name, expected_name))
output_data = pd.read_csv(join(td.name, expected_name))
expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"]
assert (
output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"]
output_data.columns == expected_columns
).all()
assert (output_data.geo_id == ["a", "b"]).all()
assert np.array_equal(output_data.val.values, np.array([0.1, 1]))
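
Downstream, the updated tests rely on pandas parsing the literal "NA" as NaN, which is why the value comparisons now use `np.array_equal(..., equal_nan=True)`. A minimal, hypothetical reading sketch follows; the file name is a placeholder (real exports follow `f"20200502_geography_{Config.signal_name}.csv"`).

```python
import pandas as pd
from delphi_utils import Nans

# Placeholder path; real exports are named like 20200502_geography_<signal_name>.csv.
df = pd.read_csv("receiving/20200502_geography_signal.csv")

# "NA" cells come back as NaN, so filter on the missing_val code rather than the value itself.
reported = df[df["missing_val"] == Nans.NOT_MISSING.value]
print(reported[["geo_id", "val", "missing_se"]])
```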