Skip to content

Commit 29ba7b9

Browse files
committed
NaNs for HHS:
* add missing columns
1 parent 0b7103a commit 29ba7b9

File tree

2 files changed

+111
-81
lines changed

2 files changed

+111
-81
lines changed

hhs_hosp/delphi_hhs/run.py

+80-65
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,13 @@
99

1010
import time
1111
from delphi_epidata import Epidata
12-
from delphi_utils.export import create_export_csv
13-
from delphi_utils.geomap import GeoMapper
14-
from delphi_utils import get_structured_logger
12+
from delphi_utils import create_export_csv, get_structured_logger, Nans, GeoMapper
1513
import numpy as np
1614
import pandas as pd
1715

1816
from .constants import SIGNALS, GEOS, SMOOTHERS, CONFIRMED, SUM_CONF_SUSP, CONFIRMED_FLU
1917

18+
2019
def _date_to_int(d):
2120
"""Return a date object as a yyyymmdd int."""
2221
return int(d.strftime("%Y%m%d"))
@@ -64,6 +63,19 @@ def generate_date_ranges(start, end):
6463
return output
6564

6665

66+
def add_nancodes(df):
67+
"""Add nancodes to a signal dataframe."""
68+
# Default missingness codes
69+
df["missing_val"] = Nans.NOT_MISSING
70+
df["missing_se"] = Nans.NOT_APPLICABLE
71+
df["missing_sample_size"] = Nans.NOT_APPLICABLE
72+
73+
# Mark any remaining nans with unknown
74+
remaining_nans_mask = df["val"].isnull()
75+
df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER
76+
return df
77+
78+
6779
def run_module(params):
6880
"""
6981
Generate ground truth HHS hospitalization data.
@@ -79,16 +91,16 @@ def run_module(params):
7991
"""
8092
start_time = time.time()
8193
logger = get_structured_logger(
82-
__name__, filename=params["common"].get("log_filename"),
83-
log_exceptions=params["common"].get("log_exceptions", True))
94+
__name__,
95+
filename=params["common"].get("log_filename"),
96+
log_exceptions=params["common"].get("log_exceptions", True),
97+
)
8498
mapper = GeoMapper()
8599
request_all_states = ",".join(mapper.get_geo_values("state_id"))
86100
end_day = date.today()
87-
if "epidata" in params["common"] and \
88-
"as_of" in params["common"]["epidata"]:
101+
if "epidata" in params["common"] and "as_of" in params["common"]["epidata"]:
89102
end_day = min(
90-
end_day,
91-
datetime.strptime(str(params["common"]["epidata"]["as_of"]), "%Y%m%d").date()
103+
end_day, datetime.strptime(str(params["common"]["epidata"]["as_of"]), "%Y%m%d").date()
92104
)
93105
past_reference_day = date(year=2020, month=1, day=1) # first available date in DB
94106
date_range = generate_date_ranges(past_reference_day, end_day)
@@ -100,33 +112,32 @@ def run_module(params):
100112
raise Exception(f"Bad result from Epidata for {r}: {response['message']}")
101113
if response["result"] == -2 and r == date_range[-1]: # -2 code means no results
102114
continue
103-
dfs.append(pd.DataFrame(response['epidata']))
115+
dfs.append(pd.DataFrame(response["epidata"]))
104116
all_columns = pd.concat(dfs)
105117
geo_mapper = GeoMapper()
106118
stats = []
107119
for sensor, smoother, geo in product(SIGNALS, SMOOTHERS, GEOS):
108-
logger.info("Generating signal and exporting to CSV",
109-
geo_res = geo,
110-
sensor = sensor,
111-
smoother = smoother)
112-
df = geo_mapper.add_geocode(make_signal(all_columns, sensor),
113-
"state_id",
114-
"state_code",
115-
from_col="state")
120+
logger.info(
121+
"Generating signal and exporting to CSV", geo_res=geo, sensor=sensor, smoother=smoother
122+
)
123+
df = geo_mapper.add_geocode(
124+
make_signal(all_columns, sensor), "state_id", "state_code", from_col="state"
125+
)
116126
if sensor.endswith("_prop"):
117-
df=pop_proportion(df, geo_mapper)
127+
df = pop_proportion(df, geo_mapper)
118128
df = make_geo(df, geo, geo_mapper)
129+
df["se"] = np.nan
130+
df["sample_size"] = np.nan
119131
df = smooth_values(df, smoother[0])
132+
df = add_nancodes(df)
120133
if df.empty:
121134
continue
122135
sensor_name = sensor + smoother[1]
123136
# don't export first 6 days for smoothed signals since they'll be nan.
124137
start_date = min(df.timestamp) + timedelta(6) if smoother[1] else min(df.timestamp)
125-
dates = create_export_csv(df,
126-
params["common"]["export_dir"],
127-
geo,
128-
sensor_name,
129-
start_date=start_date)
138+
dates = create_export_csv(
139+
df, params["common"]["export_dir"], geo, sensor_name, start_date=start_date
140+
)
130141
if len(dates) > 0:
131142
stats.append((max(dates), len(dates)))
132143

@@ -135,71 +146,75 @@ def run_module(params):
135146
csv_export_count = sum(s[-1] for s in stats)
136147
max_lag_in_days = min_max_date and (datetime.now() - min_max_date).days
137148
formatted_min_max_date = min_max_date and min_max_date.strftime("%Y-%m-%d")
138-
logger.info("Completed indicator run",
139-
elapsed_time_in_seconds = elapsed_time_in_seconds,
140-
csv_export_count = csv_export_count,
141-
max_lag_in_days = max_lag_in_days,
142-
oldest_final_export_date = formatted_min_max_date)
149+
logger.info(
150+
"Completed indicator run",
151+
elapsed_time_in_seconds=elapsed_time_in_seconds,
152+
csv_export_count=csv_export_count,
153+
max_lag_in_days=max_lag_in_days,
154+
oldest_final_export_date=formatted_min_max_date,
155+
)
143156

144157

145158
def smooth_values(df, smoother):
146159
"""Smooth the value column in the dataframe."""
147160
df["val"] = df["val"].astype(float)
148-
df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(
149-
smoother.smooth
150-
)
161+
df["val"] = df[["geo_id", "val"]].groupby("geo_id")["val"].transform(smoother.smooth)
151162
return df
152163

153-
def pop_proportion(df,geo_mapper):
164+
165+
def pop_proportion(df, geo_mapper):
154166
"""Get the population-proportionate variants as the dataframe val."""
155-
pop_val=geo_mapper.add_population_column(df, "state_code")
156-
df["val"]=round(df["val"]/pop_val["population"]*100000, 7)
167+
pop_val = geo_mapper.add_population_column(df, "state_code")
168+
df["val"] = round(df["val"] / pop_val["population"] * 100000, 7)
157169
pop_val.drop("population", axis=1, inplace=True)
158170
return df
159171

172+
160173
def make_geo(state, geo, geo_mapper):
161174
"""Transform incoming geo (state) to another geo."""
162175
if geo == "state":
163176
exported = state.rename(columns={"state": "geo_id"})
164177
else:
165-
exported = geo_mapper.replace_geocode(state, "state_code", geo, new_col="geo_id")
166-
exported["se"] = np.nan
167-
exported["sample_size"] = np.nan
178+
exported = geo_mapper.replace_geocode(
179+
state, "state_code", geo, new_col="geo_id", date_col="timestamp"
180+
)
168181
return exported
169182

170183

171184
def make_signal(all_columns, sig):
172185
"""Generate column sums according to signal name."""
173-
assert sig in SIGNALS, f"Unexpected signal name '{sig}';" + \
174-
" familiar names are '{', '.join(SIGNALS)}'"
186+
assert sig in SIGNALS, (
187+
f"Unexpected signal name '{sig}';" + " familiar names are '{', '.join(SIGNALS)}'"
188+
)
175189
if sig.startswith(CONFIRMED):
176-
df = pd.DataFrame({
177-
"state": all_columns.state.apply(str.lower),
178-
"timestamp":int_date_to_previous_day_datetime(all_columns.date),
179-
"val": \
180-
all_columns.previous_day_admission_adult_covid_confirmed + \
181-
all_columns.previous_day_admission_pediatric_covid_confirmed
182-
})
190+
df = pd.DataFrame(
191+
{
192+
"state": all_columns.state.apply(str.lower),
193+
"timestamp": int_date_to_previous_day_datetime(all_columns.date),
194+
"val": all_columns.previous_day_admission_adult_covid_confirmed
195+
+ all_columns.previous_day_admission_pediatric_covid_confirmed,
196+
}
197+
)
183198
elif sig.startswith(SUM_CONF_SUSP):
184-
df = pd.DataFrame({
185-
"state": all_columns.state.apply(str.lower),
186-
"timestamp":int_date_to_previous_day_datetime(all_columns.date),
187-
"val": \
188-
all_columns.previous_day_admission_adult_covid_confirmed + \
189-
all_columns.previous_day_admission_adult_covid_suspected + \
190-
all_columns.previous_day_admission_pediatric_covid_confirmed + \
191-
all_columns.previous_day_admission_pediatric_covid_suspected,
192-
})
199+
df = pd.DataFrame(
200+
{
201+
"state": all_columns.state.apply(str.lower),
202+
"timestamp": int_date_to_previous_day_datetime(all_columns.date),
203+
"val": all_columns.previous_day_admission_adult_covid_confirmed
204+
+ all_columns.previous_day_admission_adult_covid_suspected
205+
+ all_columns.previous_day_admission_pediatric_covid_confirmed
206+
+ all_columns.previous_day_admission_pediatric_covid_suspected,
207+
}
208+
)
193209
elif sig.startswith(CONFIRMED_FLU):
194-
df = pd.DataFrame({
195-
"state": all_columns.state.apply(str.lower),
196-
"timestamp":int_date_to_previous_day_datetime(all_columns.date),
197-
"val": \
198-
all_columns.previous_day_admission_influenza_confirmed
199-
})
200-
else:
201-
raise Exception(
202-
"Bad programmer: signal '{sig}' in SIGNALS but not handled in make_signal"
210+
df = pd.DataFrame(
211+
{
212+
"state": all_columns.state.apply(str.lower),
213+
"timestamp": int_date_to_previous_day_datetime(all_columns.date),
214+
"val": all_columns.previous_day_admission_influenza_confirmed,
215+
}
203216
)
217+
else:
218+
raise Exception("Bad programmer: signal '{sig}' in SIGNALS but not handled in make_signal")
204219
df["val"] = df.val.astype(float)
205220
return df

hhs_hosp/tests/test_run.py

+31-16
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
import tempfile
55
import os
66

7-
from delphi_hhs.run import _date_to_int, int_date_to_previous_day_datetime, generate_date_ranges, \
7+
from delphi_hhs.run import _date_to_int, add_nancodes, int_date_to_previous_day_datetime, generate_date_ranges, \
88
make_signal, make_geo, run_module, pop_proportion
99
from delphi_hhs.constants import SMOOTHERS, GEOS, SIGNALS, \
1010
CONFIRMED, SUM_CONF_SUSP, CONFIRMED_FLU, CONFIRMED_PROP, SUM_CONF_SUSP_PROP, CONFIRMED_FLU_PROP
11-
from delphi_utils.geomap import GeoMapper
11+
from delphi_utils import GeoMapper, Nans
1212
from freezegun import freeze_time
1313
import numpy as np
1414
import pandas as pd
@@ -85,15 +85,15 @@ def test_make_signal():
8585
})
8686
pd.testing.assert_frame_equal(expected_flu, make_signal(data, CONFIRMED_FLU))
8787
pd.testing.assert_frame_equal(expected_flu, make_signal(data, CONFIRMED_FLU_PROP))
88-
88+
8989
with pytest.raises(Exception):
9090
make_signal(data, "zig")
9191

9292
def test_pop_proportion():
9393
geo_mapper = GeoMapper()
9494
state_pop = geo_mapper.get_crosswalk("state_code", "pop")
9595

96-
test_df = pd.DataFrame({
96+
test_df = pd.DataFrame({
9797
'state': ['PA'],
9898
'state_code': [42],
9999
'timestamp': [datetime(year=2020, month=1, day=1)],
@@ -109,7 +109,7 @@ def test_pop_proportion():
109109
'val': [15/pa_pop*100000],})
110110
)
111111

112-
test_df= pd.DataFrame({
112+
test_df= pd.DataFrame({
113113
'state': ['WV'],
114114
'state_code': [54],
115115
'timestamp': [datetime(year=2020, month=1, day=1)],
@@ -137,30 +137,23 @@ def test_make_geo():
137137
'val': [1., 2., 4.],
138138
})
139139

140-
template = {
141-
'se': np.nan,
142-
'sample_size': np.nan,
143-
}
144140
expecteds = {
145141
"state": pd.DataFrame(
146-
dict(template,
147-
geo_id=data.state,
142+
dict(geo_id=data.state,
148143
timestamp=data.timestamp,
149144
val=data.val)),
150145
"hhs": pd.DataFrame(
151-
dict(template,
152-
geo_id=['3', '5'],
146+
dict(geo_id=['3', '5'],
153147
timestamp=[test_timestamp] * 2,
154148
val=[3., 4.])),
155149
"nation": pd.DataFrame(
156-
dict(template,
157-
geo_id=['us'],
150+
dict(geo_id=['us'],
158151
timestamp=[test_timestamp],
159152
val=[7.]))
160153
}
161154
for geo, expected in expecteds.items():
162155
result = make_geo(data, geo, geo_mapper)
163-
for series in ["geo_id", "timestamp", "val", "se", "sample_size"]:
156+
for series in ["geo_id", "timestamp", "val"]:
164157
pd.testing.assert_series_equal(expected[series], result[series], obj=f"{geo}:{series}")
165158

166159

@@ -207,3 +200,25 @@ def test_ignore_last_range_no_results(mock_covid_hosp, mock_export):
207200
}
208201
}
209202
assert not run_module(params) # function should not raise value error and has no return value
203+
204+
def test_add_nancode():
205+
data = pd.DataFrame({
206+
'state': ['PA','WV','OH'],
207+
'state_code': [42, 54, 39],
208+
'timestamp': [pd.to_datetime("20200601")]*3,
209+
'val': [1, 2, np.nan],
210+
'se': [np.nan] * 3,
211+
'sample_size': [np.nan] * 3,
212+
})
213+
expected = pd.DataFrame({
214+
'state': ['PA','WV','OH'],
215+
'state_code': [42, 54, 39],
216+
'timestamp': [pd.to_datetime("20200601")]*3,
217+
'val': [1, 2, np.nan],
218+
'se': [np.nan] * 3,
219+
'sample_size': [np.nan] * 3,
220+
'missing_val': [Nans.NOT_MISSING] * 2 + [Nans.OTHER],
221+
'missing_se': [Nans.NOT_APPLICABLE] * 3,
222+
'missing_sample_size': [Nans.NOT_APPLICABLE] * 3,
223+
})
224+
pd.testing.assert_frame_equal(expected, add_nancodes(data))

0 commit comments

Comments (0)