diff --git a/_delphi_utils_python/tests/test_export.py b/_delphi_utils_python/tests/test_export.py index d9906300d..7da87ef72 100644 --- a/_delphi_utils_python/tests/test_export.py +++ b/_delphi_utils_python/tests/test_export.py @@ -9,7 +9,6 @@ from delphi_utils import create_export_csv, Nans - def _clean_directory(directory): """Clean files out of a directory.""" for fname in listdir(directory): diff --git a/claims_hosp/delphi_claims_hosp/update_indicator.py b/claims_hosp/delphi_claims_hosp/update_indicator.py index b4169370d..ef416ae3e 100644 --- a/claims_hosp/delphi_claims_hosp/update_indicator.py +++ b/claims_hosp/delphi_claims_hosp/update_indicator.py @@ -13,7 +13,7 @@ # third party import numpy as np import pandas as pd -from delphi_utils import GeoMapper +from delphi_utils import GeoMapper, Nans # first party from delphi_utils import Weekday @@ -235,7 +235,7 @@ def write_to_csv(self, output_dict, output_path="./receiving"): all_rates = output_dict["rates"] all_se = output_dict["se"] all_include = output_dict["include"] - out_n = 0 + out_n, out_i = 0, 0 for i, date in enumerate(dates): filename = "%s/%s_%s_%s.csv" % ( output_path, @@ -244,7 +244,10 @@ def write_to_csv(self, output_dict, output_path="./receiving"): self.signal_name, ) with open(filename, "w") as outfile: - outfile.write("geo_id,val,se,direction,sample_size\n") + outfile.write( + "geo_id,val,se,direction,sample_size," + + "missing_val,missing_se,missing_sample_size\n" + ) for geo_id in geo_ids: val = all_rates[geo_id][i] se = all_se[geo_id][i] @@ -257,11 +260,38 @@ def write_to_csv(self, output_dict, output_path="./receiving"): if self.write_se: assert val > 0 and se > 0, "p=0, std_err=0 invalid" outfile.write( - "%s,%f,%s,%s,%s\n" % (geo_id, val, se, "NA", "NA")) + "%s,%f,%s,%s,%s,%d,%d,%d\n" % ( + geo_id, val, se, "NA", "NA", + Nans.NOT_MISSING.value, + Nans.NOT_MISSING.value, + Nans.NOT_APPLICABLE.value + ) + ) else: # for privacy reasons we will not report the standard error outfile.write( - "%s,%f,%s,%s,%s\n" % (geo_id, val, "NA", "NA", "NA")) + "%s,%f,%s,%s,%s,%d,%d,%d\n" % ( + geo_id, val, "NA", "NA", "NA", + Nans.NOT_MISSING.value, + Nans.CENSORED.value, + Nans.NOT_APPLICABLE.value + ) + ) out_n += 1 + else: + # Write nans out anyway for versioning + logging.warning("writing insufficient data for geo_id {0}, {1}".format( + geo_id, i + )) + outfile.write( + "%s,%s,%s,%s,%s,%d,%d,%d\n" % ( + geo_id, "NA", "NA", "NA", "NA", + Nans.CENSORED.value, + Nans.CENSORED.value, + Nans.NOT_APPLICABLE.value + ) + ) + out_i += 1 - logging.debug("wrote %d rows for %d %s", out_n, len(geo_ids), geo_level) + logging.debug("wrote %d valued csvs for %d %s", out_n, len(geo_ids), geo_level) + logging.debug("wrote %d nan-valued csvs for %d %s", out_i, len(geo_ids), geo_level) diff --git a/claims_hosp/tests/test_indicator.py b/claims_hosp/tests/test_indicator.py index c4a8828a6..2c80a4387 100644 --- a/claims_hosp/tests/test_indicator.py +++ b/claims_hosp/tests/test_indicator.py @@ -57,13 +57,12 @@ def test_fit_fips(self): date_range = pd.date_range("2020-05-01", "2020-05-20") all_fips = self.fips_data.fips.unique() loc_index_fips_data = self.fips_data.set_index(["fips", "timestamp"]) - sample_fips = nr.choice(all_fips, 10) + sample_fips = all_fips[:50] for fips in sample_fips: sub_data = loc_index_fips_data.loc[fips] sub_data = sub_data.reindex(date_range, fill_value=0) res0 = ClaimsHospIndicator.fit(sub_data, date_range[0], fips) - # first value is burn-in assert np.min(res0["rate"][1:]) > 0 assert np.max(res0["rate"][1:]) <= 100 diff --git a/claims_hosp/tests/test_update_indicator.py b/claims_hosp/tests/test_update_indicator.py index 23c901a49..6e7d3e763 100644 --- a/claims_hosp/tests/test_update_indicator.py +++ b/claims_hosp/tests/test_update_indicator.py @@ -144,8 +144,9 @@ def test_write_to_csv_results(self): expected_name = f"20200502_geography_{Config.signal_name}.csv" assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) + expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"] assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() assert (output_data.geo_id == ["a", "b"]).all() assert np.array_equal(output_data.val.values, np.array([0.1, 1])) @@ -159,10 +160,10 @@ def test_write_to_csv_results(self): assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() - assert (output_data.geo_id == ["a"]).all() - assert np.array_equal(output_data.val.values, np.array([0.5])) + assert (output_data.geo_id == ["a", "b"]).all() + assert np.array_equal(output_data.val.values, np.array([0.5, np.nan]), equal_nan=True) assert np.isnan(output_data.se.values).all() assert np.isnan(output_data.direction.values).all() assert np.isnan(output_data.sample_size.values).all() @@ -171,7 +172,7 @@ def test_write_to_csv_results(self): assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() assert (output_data.geo_id == ["a", "b"]).all() assert np.array_equal(output_data.val.values, np.array([1.5, 3])) @@ -224,8 +225,9 @@ def test_write_to_csv_with_se_results(self): expected_name = f"20200502_geography_{signal_name}.csv" assert exists(join(td.name, expected_name)) output_data = pd.read_csv(join(td.name, expected_name)) + expected_columns = ["geo_id", "val", "se", "direction", "sample_size", "missing_val", "missing_se", "missing_sample_size"] assert ( - output_data.columns == ["geo_id", "val", "se", "direction", "sample_size"] + output_data.columns == expected_columns ).all() assert (output_data.geo_id == ["a", "b"]).all() assert np.array_equal(output_data.val.values, np.array([0.1, 1]))