Skip to content

Commit ecd8381

Browse files
authored
2130 deal with large number of null values in nssp data (#2141)
* delete combined signals + adjust tests * Revert "delete combined signals + adjust tests" This reverts commit 8fccf03. * no null in csv + adjust tests accordingly * simplify remove rows with missing values * add comments * lint * remove unnecessary deep=true in run.py * add test * add page_no_data.json * remove set caplog level to warning + rename test_output_files * add nation assert * simplify test_empty_data * revert conftest.py * fully revert conftest.py
1 parent 857ede4 commit ecd8381

File tree

6 files changed

+171
-29
lines changed

6 files changed

+171
-29
lines changed

nssp/delphi_nssp/run.py

+7
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,13 @@ def run_module(params, logger=None):
144144
missing_cols = set(CSV_COLS) - set(df.columns)
145145
df = add_needed_columns(df, col_names=list(missing_cols))
146146
df_csv = df[CSV_COLS + ["timestamp"]]
147+
148+
# remove rows with missing values
149+
df_csv = df_csv[df_csv["val"].notnull()]
150+
if df_csv.empty:
151+
logger.warning("No data for signal and geo combination", signal=signal, geo=geo)
152+
continue
153+
147154
# actual export
148155
dates = create_export_csv(
149156
df_csv,

nssp/tests/test_data/page.json

+34
Original file line numberDiff line numberDiff line change
@@ -196,5 +196,39 @@
196196
"fips": "8101",
197197
"trend_source": "HSA",
198198
"buildnumber": "2025-02-28"
199+
},
200+
{
201+
"week_end":"2022-10-15T00:00:00.000",
202+
"geography":"Colorado",
203+
"county":"Chaffee",
204+
"ed_trends_covid":"Data Unavailable",
205+
"ed_trends_influenza":"Data Unavailable",
206+
"ed_trends_rsv":"Data Unavailable",
207+
"hsa":"Chaffee, CO - Lake, CO",
208+
"hsa_counties":"Chaffee, Lake",
209+
"hsa_nci_id":"786",
210+
"fips":"8015",
211+
"trend_source":"HSA",
212+
"buildnumber":"2025-02-28"
213+
},
214+
{
215+
"week_end":"2022-10-15T00:00:00.000",
216+
"geography":"Colorado",
217+
"county":"Arapahoe",
218+
"percent_visits_covid": "1",
219+
"percent_visits_influenza": "1",
220+
"percent_visits_rsv": "1",
221+
"percent_visits_smoothed_covid": "1",
222+
"percent_visits_smoothed_1": "1",
223+
"percent_visits_smoothed_rsv": "1",
224+
"ed_trends_covid":"Decreasing",
225+
"ed_trends_influenza":"Decreasing",
226+
"ed_trends_rsv":"Decreasing",
227+
"hsa":"Denver (Denver), CO - Jefferson, CO",
228+
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
229+
"hsa_nci_id":"688",
230+
"fips":"8005",
231+
"trend_source":"HSA",
232+
"buildnumber":"2025-03-28"
199233
}
200234
]

nssp/tests/test_data/page_100_hrr.json

+34
Original file line numberDiff line numberDiff line change
@@ -196,5 +196,39 @@
196196
"fips": "8101",
197197
"trend_source": "HSA",
198198
"buildnumber": "2025-02-28"
199+
},
200+
{
201+
"week_end":"2022-10-15T00:00:00.000",
202+
"geography":"Colorado",
203+
"county":"Chaffee",
204+
"ed_trends_covid":"Data Unavailable",
205+
"ed_trends_influenza":"Data Unavailable",
206+
"ed_trends_rsv":"Data Unavailable",
207+
"hsa":"Chaffee, CO - Lake, CO",
208+
"hsa_counties":"Chaffee, Lake",
209+
"hsa_nci_id":"786",
210+
"fips":"8015",
211+
"trend_source":"HSA",
212+
"buildnumber":"2025-02-28"
213+
},
214+
{
215+
"week_end":"2022-10-15T00:00:00.000",
216+
"geography":"Colorado",
217+
"county":"Arapahoe",
218+
"percent_visits_covid": "100",
219+
"percent_visits_influenza": "100",
220+
"percent_visits_rsv": "100",
221+
"percent_visits_smoothed_covid": "100",
222+
"percent_visits_smoothed_1": "100",
223+
"percent_visits_smoothed_rsv": "100",
224+
"ed_trends_covid":"Decreasing",
225+
"ed_trends_influenza":"Decreasing",
226+
"ed_trends_rsv":"Decreasing",
227+
"hsa":"Denver (Denver), CO - Jefferson, CO",
228+
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
229+
"hsa_nci_id":"688",
230+
"fips":"8005",
231+
"trend_source":"HSA",
232+
"buildnumber":"2025-03-28"
199233
}
200234
]
+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
[
2+
{
3+
"week_end":"2022-10-15T00:00:00.000",
4+
"geography":"United States",
5+
"county":"All",
6+
"percent_visits_combined":"2.0",
7+
"percent_visits_covid":"1.63",
8+
"percent_visits_influenza":"0.17",
9+
"percent_visits_rsv":"0.21",
10+
"percent_visits_smoothed":"1.78",
11+
"percent_visits_smoothed_covid":"1.54",
12+
"percent_visits_smoothed_1":"0.12",
13+
"percent_visits_smoothed_rsv":"0.12",
14+
"ed_trends_covid":"Decreasing",
15+
"ed_trends_influenza":"No Change",
16+
"ed_trends_rsv":"Increasing",
17+
"hsa":"All",
18+
"hsa_counties":"All",
19+
"hsa_nci_id":"All",
20+
"fips":"0",
21+
"trend_source":"United States",
22+
"buildnumber":"2025-02-08"
23+
},
24+
{
25+
"week_end":"2022-10-15T00:00:00.000",
26+
"geography":"Colorado",
27+
"county":"Chaffee",
28+
"ed_trends_covid":"Data Unavailable",
29+
"ed_trends_influenza":"Data Unavailable",
30+
"ed_trends_rsv":"Data Unavailable",
31+
"hsa":"Chaffee, CO - Lake, CO",
32+
"hsa_counties":"Chaffee, Lake",
33+
"hsa_nci_id":"786",
34+
"fips":"8015",
35+
"trend_source":"HSA",
36+
"buildnumber":"2025-02-28"
37+
},
38+
{
39+
"week_end":"2022-10-15T00:00:00.000",
40+
"geography":"Colorado",
41+
"county":"Arapahoe",
42+
"ed_trends_covid":"Data Unavailable",
43+
"ed_trends_influenza":"Data Unavailable",
44+
"ed_trends_rsv":"Data Unavailable",
45+
"hsa":"Denver (Denver), CO - Jefferson, CO",
46+
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
47+
"hsa_nci_id":"688",
48+
"fips":"8005",
49+
"trend_source":"HSA",
50+
"buildnumber":"2025-03-28"
51+
}
52+
]

nssp/tests/test_pull.py

-4
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,5 @@ def test_normal_pull_nssp_data(self, mock_socrata, params, caplog):
9090
assert result["fips"].notnull().all(), "fips has rogue NaN"
9191
assert result["fips"].apply(lambda x: isinstance(x, str) and len(x) != 4).all(), "fips formatting should always be 5 digits; include leading zeros if applicable"
9292

93-
# Check for each signal in SIGNALS
94-
for signal in SIGNALS:
95-
assert result[signal].notnull().all(), f"{signal} has rogue NaN"
96-
9793
for file in backup_files:
9894
os.remove(file)

nssp/tests/test_run.py

+44-25
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,26 @@
11
import glob
2-
from datetime import datetime, date
3-
import json
2+
import logging
3+
import os
44
from pathlib import Path
5+
import json
56
from unittest.mock import patch
6-
import tempfile
7-
import os
8-
import time
9-
from datetime import datetime
10-
117
import numpy as np
128
import pandas as pd
9+
from delphi_nssp.constants import GEOS, SIGNALS_MAP, DATASET_ID
10+
from delphi_nssp.run import add_needed_columns, run_module
1311
from epiweeks import Week
14-
from pandas.testing import assert_frame_equal
15-
from delphi_nssp.constants import GEOS, SIGNALS, SIGNALS_MAP, DATASET_ID
16-
from delphi_nssp.run import (
17-
add_needed_columns
18-
)
1912

13+
TEST_DIR = Path(__file__).parent
14+
15+
def remove_backup_and_receiving(params):
16+
export_dir = params["common"]["export_dir"]
17+
for file in Path(export_dir).glob("*.csv"):
18+
os.remove(file)
19+
20+
today = pd.Timestamp.today().strftime("%Y%m%d")
21+
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
22+
for file in backup_dir:
23+
os.remove(file)
2024

2125
class TestRun:
2226
def test_add_needed_columns(self):
@@ -42,7 +46,7 @@ def generate_week_file_prefix(self, dates):
4246
]
4347
return date_prefix
4448

45-
def test_output_files_exist(self, params, run_as_module):
49+
def test_output_files(self, params, run_as_module):
4650
export_dir = params["common"]["export_dir"]
4751
csv_files = [f.name for f in Path(export_dir).glob("*.csv")]
4852

@@ -68,13 +72,10 @@ def test_output_files_exist(self, params, run_as_module):
6872
]
6973
assert set(expected_columns).issubset(set(df.columns.values))
7074

71-
for file in Path(export_dir).glob("*.csv"):
72-
os.remove(file)
75+
# Verify that there are no NA/empty values in the val columns
76+
assert not df["val"].isnull().any()
7377

74-
today = pd.Timestamp.today().strftime("%Y%m%d")
75-
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
76-
for file in backup_dir:
77-
os.remove(file)
78+
remove_backup_and_receiving(params)
7879

7980
def test_valid_hrr(self, run_as_module_hrr, params):
8081
export_dir = params["common"]["export_dir"]
@@ -85,10 +86,28 @@ def test_valid_hrr(self, run_as_module_hrr, params):
8586
df = pd.read_csv(f)
8687
assert (df.val == 100).all()
8788

88-
for file in Path(export_dir).glob("*.csv"):
89-
os.remove(file)
89+
remove_backup_and_receiving(params)
90+
91+
@patch("sodapy.Socrata.get")
92+
def test_empty_data(self, mock_get, params, caplog):
93+
"""
94+
Tests correct handling when there is a geo and signal combination that has no data.
95+
"""
96+
97+
with open(f"{TEST_DIR}/test_data/page_no_data.json", "r") as f:
98+
EMPTY_TEST_DATA = json.load(f)
99+
mock_get.side_effect = [EMPTY_TEST_DATA, []]
100+
run_module(params)
101+
102+
assert "No data for signal and geo combination" in caplog.text
103+
104+
export_dir = params["common"]["export_dir"]
105+
csv_files = [f for f in Path(export_dir).glob("*.csv")]
106+
107+
# Since only one national entry in page_no_data.json with numeric data,
108+
# while the two counties have no numeric fields,
109+
# there should be no county, hrr, hhs, or msa files.
110+
assert not any(geo in f.name for geo in ["county", "hrr", "hhs", "msa"] for f in csv_files)
111+
assert all("nation" in f.name for f in csv_files)
90112

91-
today = pd.Timestamp.today().strftime("%Y%m%d")
92-
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
93-
for file in backup_dir:
94-
os.remove(file)
113+
remove_backup_and_receiving(params)

0 commit comments

Comments
 (0)