Skip to content

Commit ecd8381

Browse files
authored
2130 deal with large number of null values in nssp data (#2141)
* delete combined signals + adjust tests * Revert "delete combined signals + adjust tests" This reverts commit 8fccf03. * no null in csv + adjust tests accordingly * simplify remove rows with missing values * add comments * lint * remove unnecessary deep=true in run.py * add test * add page_no_data.json * remove set caplog level to warning + rename test_output_files * add nation assert * simplify test_empty_data * revert conftest.py * fully revert conftest.py
1 parent 857ede4 commit ecd8381

File tree

6 files changed

+171
-29
lines changed

6 files changed

+171
-29
lines changed

nssp/delphi_nssp/run.py

+7
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,13 @@ def run_module(params, logger=None):
144144
missing_cols = set(CSV_COLS) - set(df.columns)
145145
df = add_needed_columns(df, col_names=list(missing_cols))
146146
df_csv = df[CSV_COLS + ["timestamp"]]
147+
148+
# remove rows with missing values
149+
df_csv = df_csv[df_csv["val"].notnull()]
150+
if df_csv.empty:
151+
logger.warning("No data for signal and geo combination", signal=signal, geo=geo)
152+
continue
153+
147154
# actual export
148155
dates = create_export_csv(
149156
df_csv,

nssp/tests/test_data/page.json

+34
Original file line numberDiff line numberDiff line change
@@ -196,5 +196,39 @@
196196
"fips": "8101",
197197
"trend_source": "HSA",
198198
"buildnumber": "2025-02-28"
199+
},
200+
{
201+
"week_end":"2022-10-15T00:00:00.000",
202+
"geography":"Colorado",
203+
"county":"Chaffee",
204+
"ed_trends_covid":"Data Unavailable",
205+
"ed_trends_influenza":"Data Unavailable",
206+
"ed_trends_rsv":"Data Unavailable",
207+
"hsa":"Chaffee, CO - Lake, CO",
208+
"hsa_counties":"Chaffee, Lake",
209+
"hsa_nci_id":"786",
210+
"fips":"8015",
211+
"trend_source":"HSA",
212+
"buildnumber":"2025-02-28"
213+
},
214+
{
215+
"week_end":"2022-10-15T00:00:00.000",
216+
"geography":"Colorado",
217+
"county":"Arapahoe",
218+
"percent_visits_covid": "1",
219+
"percent_visits_influenza": "1",
220+
"percent_visits_rsv": "1",
221+
"percent_visits_smoothed_covid": "1",
222+
"percent_visits_smoothed_1": "1",
223+
"percent_visits_smoothed_rsv": "1",
224+
"ed_trends_covid":"Decreasing",
225+
"ed_trends_influenza":"Decreasing",
226+
"ed_trends_rsv":"Decreasing",
227+
"hsa":"Denver (Denver), CO - Jefferson, CO",
228+
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
229+
"hsa_nci_id":"688",
230+
"fips":"8005",
231+
"trend_source":"HSA",
232+
"buildnumber":"2025-03-28"
199233
}
200234
]

nssp/tests/test_data/page_100_hrr.json

+34
Original file line numberDiff line numberDiff line change
@@ -196,5 +196,39 @@
196196
"fips": "8101",
197197
"trend_source": "HSA",
198198
"buildnumber": "2025-02-28"
199+
},
200+
{
201+
"week_end":"2022-10-15T00:00:00.000",
202+
"geography":"Colorado",
203+
"county":"Chaffee",
204+
"ed_trends_covid":"Data Unavailable",
205+
"ed_trends_influenza":"Data Unavailable",
206+
"ed_trends_rsv":"Data Unavailable",
207+
"hsa":"Chaffee, CO - Lake, CO",
208+
"hsa_counties":"Chaffee, Lake",
209+
"hsa_nci_id":"786",
210+
"fips":"8015",
211+
"trend_source":"HSA",
212+
"buildnumber":"2025-02-28"
213+
},
214+
{
215+
"week_end":"2022-10-15T00:00:00.000",
216+
"geography":"Colorado",
217+
"county":"Arapahoe",
218+
"percent_visits_covid": "100",
219+
"percent_visits_influenza": "100",
220+
"percent_visits_rsv": "100",
221+
"percent_visits_smoothed_covid": "100",
222+
"percent_visits_smoothed_1": "100",
223+
"percent_visits_smoothed_rsv": "100",
224+
"ed_trends_covid":"Decreasing",
225+
"ed_trends_influenza":"Decreasing",
226+
"ed_trends_rsv":"Decreasing",
227+
"hsa":"Denver (Denver), CO - Jefferson, CO",
228+
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
229+
"hsa_nci_id":"688",
230+
"fips":"8005",
231+
"trend_source":"HSA",
232+
"buildnumber":"2025-03-28"
199233
}
200234
]
+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
[
2+
{
3+
"week_end":"2022-10-15T00:00:00.000",
4+
"geography":"United States",
5+
"county":"All",
6+
"percent_visits_combined":"2.0",
7+
"percent_visits_covid":"1.63",
8+
"percent_visits_influenza":"0.17",
9+
"percent_visits_rsv":"0.21",
10+
"percent_visits_smoothed":"1.78",
11+
"percent_visits_smoothed_covid":"1.54",
12+
"percent_visits_smoothed_1":"0.12",
13+
"percent_visits_smoothed_rsv":"0.12",
14+
"ed_trends_covid":"Decreasing",
15+
"ed_trends_influenza":"No Change",
16+
"ed_trends_rsv":"Increasing",
17+
"hsa":"All",
18+
"hsa_counties":"All",
19+
"hsa_nci_id":"All",
20+
"fips":"0",
21+
"trend_source":"United States",
22+
"buildnumber":"2025-02-08"
23+
},
24+
{
25+
"week_end":"2022-10-15T00:00:00.000",
26+
"geography":"Colorado",
27+
"county":"Chaffee",
28+
"ed_trends_covid":"Data Unavailable",
29+
"ed_trends_influenza":"Data Unavailable",
30+
"ed_trends_rsv":"Data Unavailable",
31+
"hsa":"Chaffee, CO - Lake, CO",
32+
"hsa_counties":"Chaffee, Lake",
33+
"hsa_nci_id":"786",
34+
"fips":"8015",
35+
"trend_source":"HSA",
36+
"buildnumber":"2025-02-28"
37+
},
38+
{
39+
"week_end":"2022-10-15T00:00:00.000",
40+
"geography":"Colorado",
41+
"county":"Arapahoe",
42+
"ed_trends_covid":"Data Unavailable",
43+
"ed_trends_influenza":"Data Unavailable",
44+
"ed_trends_rsv":"Data Unavailable",
45+
"hsa":"Denver (Denver), CO - Jefferson, CO",
46+
"hsa_counties":"Adams, Arapahoe, Clear Creek, Denver, Douglas, Elbert, Gilpin, Grand, Jefferson, Park, Summit",
47+
"hsa_nci_id":"688",
48+
"fips":"8005",
49+
"trend_source":"HSA",
50+
"buildnumber":"2025-03-28"
51+
}
52+
]

nssp/tests/test_pull.py

-4
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,5 @@ def test_normal_pull_nssp_data(self, mock_socrata, params, caplog):
9090
assert result["fips"].notnull().all(), "fips has rogue NaN"
9191
assert result["fips"].apply(lambda x: isinstance(x, str) and len(x) != 4).all(), "fips formatting should always be 5 digits; include leading zeros if applicable"
9292

93-
# Check for each signal in SIGNALS
94-
for signal in SIGNALS:
95-
assert result[signal].notnull().all(), f"{signal} has rogue NaN"
96-
9793
for file in backup_files:
9894
os.remove(file)

nssp/tests/test_run.py

+44-25
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,26 @@
11
import glob
2-
from datetime import datetime, date
3-
import json
2+
import logging
3+
import os
44
from pathlib import Path
5+
import json
56
from unittest.mock import patch
6-
import tempfile
7-
import os
8-
import time
9-
from datetime import datetime
10-
117
import numpy as np
128
import pandas as pd
9+
from delphi_nssp.constants import GEOS, SIGNALS_MAP, DATASET_ID
10+
from delphi_nssp.run import add_needed_columns, run_module
1311
from epiweeks import Week
14-
from pandas.testing import assert_frame_equal
15-
from delphi_nssp.constants import GEOS, SIGNALS, SIGNALS_MAP, DATASET_ID
16-
from delphi_nssp.run import (
17-
add_needed_columns
18-
)
1912

13+
TEST_DIR = Path(__file__).parent
14+
15+
def remove_backup_and_receiving(params):
16+
export_dir = params["common"]["export_dir"]
17+
for file in Path(export_dir).glob("*.csv"):
18+
os.remove(file)
19+
20+
today = pd.Timestamp.today().strftime("%Y%m%d")
21+
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
22+
for file in backup_dir:
23+
os.remove(file)
2024

2125
class TestRun:
2226
def test_add_needed_columns(self):
@@ -42,7 +46,7 @@ def generate_week_file_prefix(self, dates):
4246
]
4347
return date_prefix
4448

45-
def test_output_files_exist(self, params, run_as_module):
49+
def test_output_files(self, params, run_as_module):
4650
export_dir = params["common"]["export_dir"]
4751
csv_files = [f.name for f in Path(export_dir).glob("*.csv")]
4852

@@ -68,13 +72,10 @@ def test_output_files_exist(self, params, run_as_module):
6872
]
6973
assert set(expected_columns).issubset(set(df.columns.values))
7074

71-
for file in Path(export_dir).glob("*.csv"):
72-
os.remove(file)
75+
# Verify that there are no NA/empty values in the val columns
76+
assert not df["val"].isnull().any()
7377

74-
today = pd.Timestamp.today().strftime("%Y%m%d")
75-
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
76-
for file in backup_dir:
77-
os.remove(file)
78+
remove_backup_and_receiving(params)
7879

7980
def test_valid_hrr(self, run_as_module_hrr, params):
8081
export_dir = params["common"]["export_dir"]
@@ -85,10 +86,28 @@ def test_valid_hrr(self, run_as_module_hrr, params):
8586
df = pd.read_csv(f)
8687
assert (df.val == 100).all()
8788

88-
for file in Path(export_dir).glob("*.csv"):
89-
os.remove(file)
89+
remove_backup_and_receiving(params)
90+
91+
@patch("sodapy.Socrata.get")
92+
def test_empty_data(self, mock_get, params, caplog):
93+
"""
94+
Tests correct handling when there is a geo and signal combination that has no data.
95+
"""
96+
97+
with open(f"{TEST_DIR}/test_data/page_no_data.json", "r") as f:
98+
EMPTY_TEST_DATA = json.load(f)
99+
mock_get.side_effect = [EMPTY_TEST_DATA, []]
100+
run_module(params)
101+
102+
assert "No data for signal and geo combination" in caplog.text
103+
104+
export_dir = params["common"]["export_dir"]
105+
csv_files = [f for f in Path(export_dir).glob("*.csv")]
106+
107+
# Since only one national entry in page_no_data.json with numeric data,
108+
# while the two counties have no numeric fields,
109+
# there should be no county, hrr, hhs, or msa files.
110+
assert not any(geo in f.name for geo in ["county", "hrr", "hhs", "msa"] for f in csv_files)
111+
assert all("nation" in f.name for f in csv_files)
90112

91-
today = pd.Timestamp.today().strftime("%Y%m%d")
92-
backup_dir = glob.glob(f"{Path(params['common']['backup_dir'])}/{today}*")
93-
for file in backup_dir:
94-
os.remove(file)
113+
remove_backup_and_receiving(params)

0 commit comments

Comments
 (0)