initial nssp covid forecast

dsweber2 · dsweber2 · commit 6b8a9fbe9b8d · 2025-06-24T17:50:52.000-05:00
diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R
@@ -687,10 +687,22 @@ up_to_date_nssp_state_archive <- function(disease = c("covid", "influenza")) {
     issues = "*"
   )
   nssp_state %>%
-    select(geo_value, time_value, issue, nssp = value) %>%
+    select(geo_value, time_value, version = issue, nssp = value) %>%
+    bind_rows(get_nssp_github()) %>%
     as_epi_archive(compactify = TRUE) %>%
     extract2("DT") %>%
     # End of week to midweek correction.
-    mutate(time_value = time_value + 3) %>%
+    mutate(time_value = floor_date(time_value, "week", week_start = 4)-1) %>%
     as_epi_archive(compactify = TRUE)
 }
+
+get_nssp_github <- function() {
+  raw_file <- read_csv("https://raw.githubusercontent.com/CDCgov/covid19-forecast-hub/refs/heads/main/auxiliary-data/nssp-raw-data/latest.csv")
+  state_map <- get_population_data() %>% filter(state_id !="usa")
+  raw_file %>%
+    filter(county == "All") %>%
+    left_join(state_map, by = join_by(geography == state_name)) %>%
+    select(geo_value = state_id, time_value = week_end, nssp = percent_visits_covid) %>%
+    mutate(time_value = floor_date(time_value, "week", week_start = 4)-1) %>%
+    mutate(version = Sys.Date())
+}
diff --git a/R/forecasters/forecaster_scaled_pop_seasonal.R b/R/forecasters/forecaster_scaled_pop_seasonal.R
@@ -56,6 +56,7 @@ scaled_pop_seasonal <- function(
   clip_lower = TRUE,
   ...
 ) {
+  browser()
   scale_method <- arg_match(scale_method)
   center_method <- arg_match(center_method)
   nonlin_method <- arg_match(nonlin_method)
diff --git a/scripts/covid_hosp_prod.R b/scripts/covid_hosp_prod.R
@@ -62,6 +62,10 @@ g_climate_geo_agged <- function(epi_data, ahead, extra_data, ...) {
   )
 }
 g_windowed_seasonal <- function(epi_data, ahead, extra_data, ...) {
+  epi_data %>%
+    mutate(wkday = wday(time_value)) %>%
+    distinct(wkday, .keep_all = TRUE) %>%
+    select(time_value, wkday)
   fcst <-
     epi_data %>%
     scaled_pop_seasonal(
@@ -148,6 +152,10 @@ parameters_and_date_targets <- rlang::list2(
     covid_geo_exclusions,
     command = "scripts/covid_geo_exclusions.csv"
   ),
+  tar_file(
+    covid_nssp_geo_exclusions,
+    command = "scripts/covid_nssp_geo_exclusions.csv"
+  ),
   tar_file(
     covid_data_substitutions,
     command = "scripts/covid_data_substitutions.csv"
@@ -178,7 +186,8 @@ parameters_and_date_targets <- rlang::list2(
     nssp_latest_data,
     command = {
       nssp_archive_data %>%
-        epix_as_of(min(Sys.Date(), nssp_archive_data$versions_end))
+        epix_as_of(min(Sys.Date(), nssp_archive_data$versions_end)) %>%
+        arrange(desc(time_value))
     }
   )
 )
@@ -196,7 +205,7 @@ forecast_targets <- tar_map(
   ),
   names = c("id", "forecast_date_chr"),
   tar_target(
-    name = forecast_res,
+    name = forecast_nhsn,
     command = {
       # if the forecaster is named latest, it should use the most up to date
       # version of the data
@@ -217,7 +226,7 @@ forecast_targets <- tar_map(
         add_season_info() %>%
         mutate(
           geo_value = ifelse(geo_value == "usa", "us", geo_value),
-          time_value = time_value - 3
+          time_value = floor_date(time_value, "week", week_start = 4) - 1
         ) %>%
         filter(geo_value %nin% g_insufficient_data_geos)
       if (!grepl("latest", id)) {
@@ -235,11 +244,66 @@ forecast_targets <- tar_map(
         )
     },
     pattern = map(aheads)
+  ),
+  tar_target(
+    name = forecast_nssp,
+    command = {
+      # if the forecaster is named latest, it should use the most up to date
+      # version of the data
+      if (grepl("latest", id)) {
+        nhsn_data <- nhsn_archive_data %>%
+          epix_as_of(nhsn_archive_data$versions_end) %>%
+          filter(time_value < as.Date(forecast_date_int))
+        nssp_data <- nssp_archive_data %>%
+          epix_as_of(nssp_archive_data$versions_end) %>%
+          filter(time_value < as.Date(forecast_date_int))
+      } else {
+        nhsn_data <- nhsn_archive_data %>%
+          epix_as_of(min(as.Date(forecast_date_int), nhsn_archive_data$versions_end))
+        nssp_data <- nssp_archive_data %>%
+          epix_as_of(min(as.Date(forecast_date_int), nssp_archive_data$versions_end))
+      }
+      nssp_data <- nssp_data %>%
+        rename(value = nssp) %>%
+        add_season_info() %>%
+        mutate(
+          geo_value = ifelse(geo_value == "usa", "us", geo_value),
+          time_value = floor_date(time_value, "week", week_start = 4) - 1
+        ) %>%
+        filter(geo_value %nin% g_insufficient_data_geos)
+
+      if (!grepl("latest", id)) {
+        nhsn_data %<>%
+          data_substitutions(covid_data_substitutions, as.Date(forecast_generation_date_int))
+      }
+
+      # jank renaming to avoid hard-coded variable name problems
+      nhsn_data %<>% rename(nssp = value)
+      attributes(nssp_data)$metadata$as_of <- as.Date(forecast_date_int)
+
+      forecaster_fn <- get_partially_applied_forecaster(forecaster, aheads, params, param_names)
+
+      forecaster_fn(nssp_data, extra_data = nhsn_data) %>%
+        mutate(
+          forecaster = id,
+          geo_value = as.factor(geo_value)
+        )
+    },
+    pattern = map(aheads)
   )
 )
-combined_forecasts <- tar_combine(
-  name = forecast_full,
-  forecast_targets[["forecast_res"]],
+
+combined_nhsn_forecasts <- tar_combine(
+  name = forecast_nhsn_full,
+  forecast_targets[["forecast_nhsn"]],
+  command = {
+    dplyr::bind_rows(!!!.x)
+  }
+)
+
+combined_nssp_forecasts <- tar_combine(
+  name = forecast_nssp_full,
+  forecast_targets[["forecast_nssp"]],
   command = {
     dplyr::bind_rows(!!!.x)
   }
@@ -254,9 +318,16 @@ ensemble_targets <- tar_map(
   ),
   names = "forecast_date_chr",
   tar_target(
-    name = forecast_full_filtered,
+    name = forecast_nhsn_full_filtered,
     command = {
-      forecast_full %>%
+      forecast_nhsn_full %>%
+        filter(forecast_date == as.Date(forecast_date_int))
+    }
+  ),
+  tar_target(
+    name = forecast_nssp_full_filtered,
+    command = {
+      forecast_nssp_full %>%
         filter(forecast_date == as.Date(forecast_date_int))
     }
   ),
@@ -274,6 +345,17 @@ ensemble_targets <- tar_map(
       geo_forecasters_weights
     },
   ),
+  tar_target(
+    name = geo_nssp_forecasters_weights,
+    command = {
+      geo_nssp_forecasters_weights <-
+        parse_prod_weights(covid_nssp_geo_exclusions, forecast_date_int, g_forecaster_params_grid$id)
+      if (nrow(geo_nssp_forecasters_weights %>% filter(forecast_date == as.Date(forecast_date_int))) == 0) {
+        cli_abort("there are no weights for the forecast date {forecast_date}")
+      }
+      geo_nssp_forecasters_weights
+    },
+  ),
   tar_target(
     name = geo_exclusions,
     command = {
@@ -283,7 +365,7 @@ ensemble_targets <- tar_map(
   tar_target(
     name = ensemble_clim_lin,
     command = {
-      forecast_full_filtered %>%
+      forecast_nhsn_full_filtered %>%
         ensemble_climate_linear(
           aheads,
           other_weights = geo_forecasters_weights,
@@ -299,7 +381,7 @@ ensemble_targets <- tar_map(
   tar_target(
     name = ens_ar_only,
     command = {
-      forecast_full_filtered %>%
+      forecast_nhsn_full_filtered %>%
         filter(forecaster %in% c("windowed_seasonal", "windowed_seasonal_extra_sources")) %>%
         group_by(geo_value, forecast_date, target_end_date, quantile) %>%
         summarize(value = mean(value, na.rm = TRUE), .groups = "drop") %>%
@@ -312,25 +394,47 @@ ensemble_targets <- tar_map(
     command = {
       ensemble_clim_lin %>%
         bind_rows(
-          forecast_full_filtered %>%
+          forecast_nhsn_full_filtered %>%
             filter(forecaster %in% c("windowed_seasonal", "windowed_seasonal_extra_sources")) %>%
             filter(forecast_date < target_end_date) # don't use for neg aheads
         ) %>%
         ensemble_weighted(geo_forecasters_weights) %>%
         mutate(forecaster = "ensemble_mix")
     },
   ),
+  tar_target(
+    name = ensemble_nssp_mixture_res,
+    command = {
+      ensemble_clim_lin %>%
+        bind_rows(
+          forecast_nssp_full_filtered %>%
+            filter(forecaster %in% c("windowed_seasonal", "windowed_seasonal_extra_sources")) %>%
+            filter(forecast_date < target_end_date) # don't use for neg aheads
+        ) %>%
+        ensemble_weighted(geo_nssp_forecasters_weights) %>%
+        mutate(forecaster = "ensemble_mix")
+    },
+  ),
   tar_target(
     name = forecasts_and_ensembles,
     command = {
       bind_rows(
-        forecast_full_filtered,
+        forecast_nhsn_full_filtered,
         ensemble_clim_lin,
         ensemble_mixture_res,
         ens_ar_only
       )
     }
   ),
+  tar_target(
+    name = forecasts_and_ensembles_nssp,
+    command = {
+      bind_rows(
+        forecast_nssp_full_filtered,
+        ensemble_nssp_mixture_res,
+      )
+    }
+  ),
   tar_target(
     name = make_submission_csv,
     command = {
@@ -347,11 +451,24 @@ ensemble_targets <- tar_map(
       }
     },
   ),
+  tar_target(
+    name = make_nssp_submission_csv,
+    command = {
+      if (!g_backtest_mode && g_submission_directory != "cache") {
+        forecast_reference_date <- get_forecast_reference_date(forecast_date_int)
+        ensemble_nssp_mixture_res %>%
+          format_flusight(disease = "covid") %>%
+          write_submission_file(forecast_reference_date, file.path(g_submission_directory, "model-output/CMU-TimeSeries"))
+      } else {
+        cli_alert_info("Not making submission csv because we're in backtest mode or submission directory is cache")
+      }
+    },
+  ),
   tar_target(
     name = make_climate_submission_csv,
     command = {
       if (!g_backtest_mode && g_submission_directory != "cache") {
-        forecast_full_filtered %>%
+        forecast_nhsn_full_filtered %>%
           filter(forecaster %in% c("climate_base", "climate_geo_agged")) %>%
           group_by(geo_value, target_end_date, quantile) %>%
           summarize(forecast_date = as.Date(forecast_date_int), value = mean(value, na.rm = TRUE), .groups = "drop") %>%
@@ -406,19 +523,27 @@ ensemble_targets <- tar_map(
     },
   ),
   tar_target(
-    name = truth_data,
+    name = truth_data_pre_process,
     command = {
       # Plot both as_of and latest data to compare
       nhsn_data <- nhsn_archive_data %>%
         epix_as_of(min(as.Date(forecast_generation_date_int), nhsn_archive_data$versions_end)) %>%
         mutate(source = "nhsn as_of forecast") %>%
         bind_rows(nhsn_latest_data %>% mutate(source = "nhsn")) %>%
-        select(geo_value, target_end_date = time_value, value) %>%
+        select(geo_value, target_end_date = time_value, value, source) %>%
         filter(target_end_date > g_truth_data_date, geo_value %nin% g_insufficient_data_geos)
       nssp_data <- nssp_latest_data %>%
         select(geo_value, target_end_date = time_value, value = nssp) %>%
         filter(target_end_date > g_truth_data_date, geo_value %nin% g_insufficient_data_geos) %>%
         mutate(target_end_date = target_end_date + 3, source = "nssp")
+      list(nhsn_data, nssp_data)
+    }
+  ),
+  tar_target(
+    name = truth_data_nhsn,
+    command = {
+      nhsn_data <- truth_data_pre_process[[1]]
+      nssp_data <- truth_data_pre_process[[2]]
       nssp_renormalized <-
         nssp_data %>%
         left_join(
@@ -436,11 +561,36 @@ ensemble_targets <- tar_map(
         mutate(value = value * rel_max_value) %>%
         select(-rel_max_value)
       nhsn_data %>% bind_rows(nssp_renormalized)
-    },
+    }
+  ),
+  tar_target(
+    name = truth_data_nssp,
+    command = {
+      nhsn_data <- truth_data_pre_process[[1]]
+      nssp_data <- truth_data_pre_process[[2]]
+      nhsn_renormalized <-
+        nhsn_data %>%
+        left_join(
+          nhsn_data %>%
+            rename(nssp = value) %>%
+            full_join(
+              nssp_data %>%
+                select(geo_value, target_end_date, value),
+              by = join_by(geo_value, target_end_date)
+            ) %>%
+            group_by(geo_value) %>%
+            summarise(rel_max_value = max(value, na.rm = TRUE) / max(nssp, na.rm = TRUE)),
+          by = join_by(geo_value)
+        ) %>%
+        mutate(value = value * rel_max_value) %>%
+        select(-rel_max_value)
+      nssp_data %>% bind_rows(nhsn_renormalized)
+    }
   ),
   tar_target(
     notebook,
     command = {
+      browser()
       # Only render the report if there is only one forecast date
       # i.e. we're running this in prod on schedule
       if (!g_backtest_mode) {
@@ -453,9 +603,11 @@ ensemble_targets <- tar_map(
           ),
           params = list(
             disease = "covid",
-            forecast_res = forecasts_and_ensembles %>% ungroup() %>% filter(forecaster != "climate_geo_agged"),
+            forecast_nhsn = forecasts_and_ensembles %>% ungroup() %>% filter(forecaster != "climate_geo_agged"),
+            forecast_nssp = forecasts_and_ensembles_nssp,
             forecast_date = as.Date(forecast_date_int),
-            truth_data = truth_data
+            truth_data_nhsn = truth_data_nhsn,
+            truth_data_nssp = truth_data_nssp
           )
         )
       }
@@ -506,6 +658,7 @@ list2(
   parameters_and_date_targets,
   forecast_targets,
   ensemble_targets,
-  combined_forecasts,
+  combined_nhsn_forecasts,
+  combined_nssp_forecasts,
   score_targets
 )
diff --git a/scripts/reports/forecast_report.Rmd b/scripts/reports/forecast_report.Rmd