Skip to content

Commit d1ee4ce

Browse files
committed
added/updated tests
1 parent e07c697 commit d1ee4ce

9 files changed

+43
-41
lines changed

doctor_visits/delphi_doctor_visits/config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,17 @@ class Config:
1919
# data columns
2020
CLI_COLS = ["Covid_like", "Flu_like", "Mixed"]
2121
FLU1_COL = ["Flu1"]
22-
COUNT_COLS = ["Denominator"] + FLU1_COL + CLI_COLS
22+
COUNT_COLS = CLI_COLS + FLU1_COL + ["Denominator"]
2323
DATE_COL = "ServiceDate"
2424
GEO_COL = "PatCountyFIPS"
2525
AGE_COL = "PatAgeGroup"
2626
HRR_COLS = ["Pat HRR Name", "Pat HRR ID"]
27-
ID_COLS = [DATE_COL] + [GEO_COL] + HRR_COLS + [AGE_COL]
28-
FILT_COLS = ID_COLS + COUNT_COLS
2927
# as of 2020-05-11, input file expected to have 10 columns
3028
# id cols: ServiceDate, PatCountyFIPS, PatAgeGroup, Pat HRR ID/Pat HRR Name
3129
# value cols: Denominator, Covid_like, Flu_like, Flu1, Mixed
30+
ID_COLS = [DATE_COL] + [GEO_COL] + HRR_COLS + [AGE_COL]
31+
# drop HRR columns - unused for now since we assign HRRs by FIPS
32+
FILT_COLS = [DATE_COL] + [GEO_COL] + [AGE_COL] + COUNT_COLS
3233
DTYPES = {
3334
"ServiceDate": str,
3435
"PatCountyFIPS": str,

doctor_visits/delphi_doctor_visits/process_data.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ def write_to_csv(output_df: pd.DataFrame, geo_level: str, se:bool, out_name: str
5151
logger.debug(f"wrote {out_n} rows for {geo_level}")
5252

5353

54-
#TODO clean the date params
5554
def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: datetime, logger) -> pd.DataFrame:
5655
'''
5756
Reads csv using Dask and filters out based on date range and currently unused column,
@@ -65,8 +64,9 @@ def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: d
6564
6665
-------
6766
'''
68-
filename = Path(filepath).name
69-
logger.info(f"Processing {filename}")
67+
filepath = Path(filepath)
68+
logger.info(f"Processing {filepath}")
69+
7070
ddata = dd.read_csv(
7171
filepath,
7272
compression="gzip",
@@ -75,7 +75,9 @@ def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: d
7575
)
7676

7777
ddata = ddata.dropna()
78+
# rename inconsistent column names to match config column names
7879
ddata = ddata.rename(columns=Config.DEVIANT_COLS_MAP)
80+
7981
ddata = ddata[Config.FILT_COLS]
8082
ddata[Config.DATE_COL] = dd.to_datetime(ddata[Config.DATE_COL])
8183

@@ -89,5 +91,11 @@ def csv_to_df(filepath: str, startdate: datetime, enddate: datetime, dropdate: d
8991
date_filter = ((ddata[Config.DATE_COL] >= Config.FIRST_DATA_DATE) & (ddata[Config.DATE_COL] < dropdate))
9092

9193
df = ddata[date_filter].compute()
92-
logger.info(f"Done processing {filename}")
94+
95+
# aggregate age groups (so data is unique by service date and FIPS)
96+
df = df.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
97+
assert np.sum(df.duplicated()) == 0, "Duplicates after age group aggregation"
98+
assert (df[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"
99+
100+
logger.info(f"Done processing {filepath}")
93101
return df

doctor_visits/delphi_doctor_visits/update_sensor.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
# third party
1616
import numpy as np
1717
import pandas as pd
18-
import dask.dataframe as dd
19-
2018

2119
# first party
2220
from delphi_utils import Weekday
@@ -42,10 +40,6 @@ def update_sensor(
4240
se: boolean to write out standard errors, if true, use an obfuscated name
4341
logger: the structured logger
4442
"""
45-
# aggregate age groups (so data is unique by service date and FIPS)
46-
data = data.groupby([Config.DATE_COL, Config.GEO_COL]).sum(numeric_only=True).reset_index()
47-
assert np.sum(data.duplicated()) == 0, "Duplicates after age group aggregation"
48-
assert (data[Config.COUNT_COLS] >= 0).all().all(), "Counts must be nonnegative"
4943

5044
drange = lambda s, e: np.array([s + timedelta(days=x) for x in range((e - s).days)])
5145
fit_dates = drange(Config.FIRST_DATA_DATE, dropdate)

doctor_visits/tests/teset_process_data.py

Lines changed: 0 additions & 24 deletions
This file was deleted.
Binary file not shown.
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
"""Tests for process_data.py."""
2+
from datetime import datetime
3+
import logging
4+
import pandas as pd
5+
6+
from delphi_doctor_visits.process_data import csv_to_df
7+
8+
TEST_LOGGER = logging.getLogger()
9+
10+
class TestProcessData:
11+
def test_csv_to_df(self):
12+
actual = csv_to_df(
13+
filepath="./test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz",
14+
startdate=datetime(2020, 2, 4),
15+
enddate=datetime(2020, 2, 5),
16+
dropdate=datetime(2020, 2, 6),
17+
logger=TEST_LOGGER,
18+
)
19+
20+
comparison = pd.read_pickle("./comparison/process_data/main_after_date_SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.pkl")
21+
pd.testing.assert_frame_equal(actual.reset_index(drop=True), comparison)

doctor_visits/tests/test_update_sensor.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Tests for update_sensor.py."""
2+
from datetime import datetime
23
import logging
34
import pandas as pd
45

@@ -8,11 +9,12 @@
89

910
class TestUpdateSensor:
1011
def test_update_sensor(self):
12+
df = pd.read_pickle("./test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.pkl")
1113
actual = update_sensor(
12-
filepath="./test_data/SYNEDI_AGG_OUTPATIENT_07022020_1455CDT.csv.gz",
13-
startdate="2020-02-04",
14-
enddate="2020-02-05",
15-
dropdate="2020-02-06",
14+
data=df,
15+
startdate=datetime(2020, 2, 4),
16+
enddate=datetime(2020, 2, 5),
17+
dropdate=datetime(2020, 2, 6),
1618
geo="state",
1719
parallel=False,
1820
weekday=False,

0 commit comments

Comments
 (0)