From e2b5b51b1a6a7a25689cadc192398b941cd2f80f Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 28 Feb 2025 18:25:15 -0800 Subject: [PATCH 1/3] fix: broken epix_merge operations --- R/aux_data_utils.R | 5 +- scripts/covid_hosp_explore.R | 128 +++--- scripts/one_offs/comparison-notebook.Rmd | 489 ----------------------- scripts/targets-exploration-common.R | 2 +- 4 files changed, 81 insertions(+), 543 deletions(-) delete mode 100644 scripts/one_offs/comparison-notebook.Rmd diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R index 7823c7c7..6b1e96c4 100644 --- a/R/aux_data_utils.R +++ b/R/aux_data_utils.R @@ -274,7 +274,10 @@ daily_to_weekly_archive <- function(epi_arch, as_tibble() } ) %>% - as_epi_archive(compactify = TRUE) + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) } diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R index 959e591d..a6c45759 100644 --- a/scripts/covid_hosp_explore.R +++ b/scripts/covid_hosp_explore.R @@ -6,8 +6,9 @@ source("scripts/targets-exploration-common.R") hhs_signal <- "confirmed_admissions_covid_1d" if (!exists("ref_time_values_")) { # Alternatively you can let slide_forecaster figure out ref_time_values - start_date <- as.Date("2023-10-04") + start_date <- as.Date("2023-11-08") end_date <- as.Date("2024-04-24") + # end_date <- start_date + 7 date_step <- 7L ref_time_values_ <- seq.Date(start_date, end_date, by = date_step) } @@ -62,12 +63,7 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, scale_method = "quantile", - center_method = "median", - nonlin_method = "quart_root", - filter_source = "", - filter_agg_level = "", - n_training = Inf, - drop_non_seasons = FALSE, + n_training = Inf ), expand_grid( forecaster = "scaled_pop", @@ -93,12 +89,7 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, scale_method = "quantile", - center_method = "median", - nonlin_method = "quart_root", - filter_source = "", - filter_agg_level = "", - n_training = Inf, - drop_non_seasons = FALSE, + n_training = Inf ), expand_grid( forecaster = "scaled_pop", @@ -124,12 +115,7 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, scale_method = "quantile", - center_method = "median", - nonlin_method = "quart_root", - filter_source = "", - filter_agg_level = "", - n_training = Inf, - drop_non_seasons = FALSE, + n_training = Inf ) ), scled_pop_season = tidyr::expand_grid( @@ -141,7 +127,13 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, n_training = Inf, - seasonal_method = list(c("covid"), c("window"), c("covid", "window"), c("climatological"), c("climatological", "window")) + seasonal_method = list( + c("covid"), + c("window"), + c("covid", "window"), + c("climatological"), + c("climatological", "window") + ) ) ) %>% map(function(x) { @@ -178,9 +170,8 @@ scaled_pop_scaled <- list( smooth_scaled <- list( forecaster = "smoothed_scaled", trainer = "quantreg", - lags = - # list(smoothed, sd) - list(c(0, 7, 14, 21, 28), c(0)), + # lags = list(smoothed, sd) + lags = list(c(0, 7, 14, 21, 28), c(0)), smooth_width = as.difftime(2, units = "weeks"), sd_width = as.difftime(4, units = "weeks"), sd_mean_width = as.difftime(2, units = "weeks"), @@ -188,6 +179,7 @@ smooth_scaled <- list( n_training = Inf ) # Human-readable object to be used for inspecting the ensembles in the pipeline. 
+# fmt: skip ensemble_parameter_combinations_ <- tribble( ~ensemble, ~ensemble_args, ~forecasters, # mean forecaster @@ -240,7 +232,12 @@ ensemble_parameter_combinations_ <- tribble( ) %>% add_id(exclude = "forecasters") # spoofing ensembles for right now -ensemble_parameter_combinations_ <- tibble::tibble(id = character(), ensemble = character(), ensemble_args = character(), children_ids = character()) +ensemble_parameter_combinations_ <- tibble::tibble( + id = character(), + ensemble = character(), + ensemble_args = character(), + children_ids = character() +) # Check that every ensemble dependent is actually included. missing_forecasters <- setdiff( ensemble_parameter_combinations_ %>% pull(children_ids) %>% unlist() %>% unique(), @@ -272,7 +269,7 @@ rlang::list2( tar_target( name = hhs_archive_data_asof, command = { - get_health_data(as.Date(ref_time_values)) %>% + get_health_data(as.Date(ref_time_values), disease = "covid") %>% mutate(version = as.Date(ref_time_values)) %>% relocate(geo_value, time_value, version, hhs) }, @@ -348,6 +345,9 @@ rlang::list2( # weekly data is indexed from the start of the week mutate(time_value = time_value + 6 - time_value_adjust) %>% mutate(version = time_value) %>% + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% as_epi_archive(compactify = TRUE) nssp_archive } @@ -380,27 +380,39 @@ rlang::list2( geo_type = "hhs", geo_values = "*" ) - google_symptoms_archive_min <- - google_symptoms_state_archive %>% + google_symptoms_archive_min <- google_symptoms_state_archive %>% bind_rows(google_symptoms_hhs_archive) %>% select(geo_value, time_value, value) %>% daily_to_weekly() %>% mutate(version = time_value) %>% - as_epi_archive(compactify = TRUE) - google_symptoms_archive_min$DT %>% filter(!is.na(value)) %>% relocate(geo_value, time_value, version, value) %>% + as.data.frame() %>% as_epi_archive(compactify = TRUE) }) - all_of_them[[1]]$DT %<>% rename(google_symptoms_4_bronchitis = value) - all_of_them[[2]]$DT %<>% rename(google_symptoms_5_ageusia = value) + all_of_them[[1]] <- all_of_them[[1]]$DT %>% + rename(google_symptoms_4_bronchitis = value) %>% + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) + all_of_them[[2]] <- all_of_them[[2]]$DT %>% + rename(google_symptoms_5_ageusia = value) %>% + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) google_symptoms_archive <- epix_merge(all_of_them[[1]], all_of_them[[2]]) google_symptoms_archive <- google_symptoms_archive$DT %>% mutate(google_symptoms = google_symptoms_4_bronchitis + google_symptoms_5_ageusia) %>% + # Always convert to data.frame after dplyr operations on data.table. 
+ # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% as_epi_archive(compactify = TRUE) # not just using dplyr to allow for na.rm google_symptoms_archive$DT$google_symptoms <- - rowSums(google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")], + rowSums( + google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")], na.rm = TRUE ) pre_pipeline <- google_symptoms_archive %>% @@ -413,6 +425,9 @@ rlang::list2( } google_symptoms_archive$DT %>% select(-starts_with("source")) %>% + # Always convert to data.frame after dplyr operations on data.table + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% as_epi_archive(compactify = TRUE) } ), @@ -479,8 +494,14 @@ rlang::list2( nwss <- readr::read_csv(most_recent) %>% rename(value = state_med_conc) %>% arrange(geo_value, time_value) - state_code <- readr::read_csv(here::here("aux_data", "flusion_data", "state_codes_table.csv"), show_col_types = FALSE) - hhs_codes <- readr::read_csv(here::here("aux_data", "flusion_data", "state_code_hhs_table.csv"), show_col_types = FALSE) + state_code <- readr::read_csv( + here::here("aux_data", "flusion_data", "state_codes_table.csv"), + show_col_types = FALSE + ) + hhs_codes <- readr::read_csv( + here::here("aux_data", "flusion_data", "state_code_hhs_table.csv"), + show_col_types = FALSE + ) state_to_hhs <- hhs_codes %>% left_join(state_code, by = "state_code") %>% select(hhs_region = hhs, geo_value = state_id) @@ -489,8 +510,7 @@ rlang::list2( drop_na() %>% select(-agg_level, -year, -agg_level, -population, -density) pop_data <- gen_pop_and_density_data() - nwss_hhs_region <- - nwss %>% + nwss_hhs_region <- nwss %>% left_join(state_to_hhs, by = "geo_value") %>% mutate(year = year(time_value)) %>% left_join(pop_data, by = join_by(geo_value, year)) %>% @@ -517,8 +537,12 @@ rlang::list2( tar_target( name = hhs_region, command = { - hhs_region <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_code_hhs_table.csv") - state_id <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_codes_table.csv") + hhs_region <- readr::read_csv( + "https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_code_hhs_table.csv" + ) + state_id <- readr::read_csv( + "https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_codes_table.csv" + ) hhs_region %>% left_join(state_id, by = "state_code") %>% select(hhs_region = hhs, geo_value = state_id) %>% @@ -534,22 +558,22 @@ rlang::list2( rename("hhs" := value) %>% add_hhs_region_sum(hhs_region) %>% filter(geo_value != "us") %>% - as_epi_archive( - compactify = TRUE - ) + # Always convert to data.frame after dplyr operations on data.table + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) joined_archive_data$geo_type <- "custom" # drop aggregated geo_values - joined_archive_data <- joined_archive_data %>% - epix_merge(nwss_coarse, sync = "locf") - joined_archive_data$geo_type <- "custom" - # TODO: Maybe bring these back - # epix_merge(doctor_visits_weekly_archive, sync = "locf") %>% - joined_archive_data %<>% - epix_merge(nssp_archive, sync = "locf") + joined_archive_data <- 
joined_archive_data %>% epix_merge(nwss_coarse, sync = "locf") + joined_archive_data %<>% epix_merge(nssp_archive, sync = "locf") joined_archive_data$geo_type <- "custom" - joined_archive_data %<>% - epix_merge(google_symptoms_archive, sync = "locf") - joined_archive_data$DT %<>% filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) + joined_archive_data %<>% epix_merge(google_symptoms_archive, sync = "locf") + joined_archive_data$DT %<>% + filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) %>% + # Always convert to data.frame after dplyr operations on data.table + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) joined_archive_data$geo_type <- "state" slide_forecaster( epi_archive = joined_archive_data, @@ -591,7 +615,7 @@ rlang::list2( rename(model = forecaster) %>% rename(prediction = value) %>% filter(!is.na(geo_value)) - evaluate_predictions(predictions_cards = filtered_forecasts, truth_data = actual_eval_data) %>% + evaluate_predictions(forecasts = filtered_forecasts, truth_data = actual_eval_data) %>% rename(forecaster = model) } ), diff --git a/scripts/one_offs/comparison-notebook.Rmd b/scripts/one_offs/comparison-notebook.Rmd deleted file mode 100644 index cd858f22..00000000 --- a/scripts/one_offs/comparison-notebook.Rmd +++ /dev/null @@ -1,489 +0,0 @@ ---- -title: Evaluation of Hospitalization Forecasters 2024-2025 -date: "`r format(Sys.time(), '%d %B %Y')`" -output: - html_document: - code_folding: hide -params: - forecaster_set: 1 -editor_options: - chunk_output_type: console ---- - -$$\\[.4in]$$ - -```{r echo=FALSE} -knitr::opts_chunk$set( - fig.align = "center", - message = FALSE, - warning = FALSE, - cache = TRUE -) -ggplot2::theme_set(ggplot2::theme_bw()) -``` - -```{r} -library(aws.s3) -library(data.table) -library(dplyr) -library(DT) -library(ggplot2) -library(plotly) -library(readr) -library(stringr) -library(tidyr) -``` - -```{r} -# params <- list( -# forecaster_set = 5 -# ) -# Load the table of parameter combinations -s3load("flu_2023_forecaster_parameter_combinations.rds", bucket = "forecasting-team-data") - -# Select forecasters for this notebook -cmu_forecasters <- forecaster_parameter_combinations_[[params$forecaster_set]]$id -outside_forecasters <- c("FluSight-baseline", "FluSight-ensemble") - -# Load scores and filter them, get global variables -s3load(object = "flu_2023_joined_scores.rds", bucket = "forecasting-team-data") -scores <- flu_2023_joined_scores %>% - filter(forecaster %in% c(cmu_forecasters, outside_forecasters)) -forecast_dates <- scores %>% - pull(forecast_date) %>% - unique() -forecasters <- c(cmu_forecasters, outside_forecasters) -aheads <- scores %>% - pull(ahead) %>% - unique() -forecaster_family <- forecaster_parameter_combinations_[[params$forecaster_set]]$forecaster %>% unique() -base_forecaster_name <- "FluSight-baseline" - -# Define aggregation functions -Mean <- function(x) mean(x, na.rm = TRUE) -GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset))) -``` - -### Notebook Information - -#### Forecaster Family - -`r forecaster_family` - -#### Target Dates - -`r forecast_dates` - -#### Forecaster Parameter Mapping and Overall Scores - -The table is sorted by descending WIS and contains all the forecasters in this notebook. 
- -```{r} -# Display the table -param_table <- forecaster_parameter_combinations_[[params$forecaster_set]] %>% - select(-any_of(c("forecaster", "keys_to_ignore", "pop_scaling"))) %>% - { - if ("n_training" %in% colnames(.)) { - (.) %>% mutate(n_training = as.character(n_training)) - } else { - . - } - } %>% - full_join( - scores %>% - group_by(forecaster) %>% - summarize( - mean_wis = round(mean(wis, na.rm = TRUE), 2), - mean_ae = round(mean(ae, na.rm = TRUE), 2), - mean_coverage_80 = round(mean(coverage_80, na.rm = TRUE), 2) - ) %>% - rename( - id = forecaster - ) - ) %>% - arrange(mean_wis) -datatable(param_table) -``` - -$$\\[.07in]$$ - -### Score Plots {.tabset} - -- The WIS plots are relative to the FluSight-baseline (which isn't plotted, but is implicitly the 1 line). These plots are aggregated with the geometric mean. -- The absolute error plots are also relative to the FluSight-baseline, but are aggregated with the arithmetic mean. -- The dashed black line in all plots is the FluSight-ensemble. - -#### WIS by Forecast Date - -```{r} -var <- "wis" -group_cols <- c("forecaster", "forecast_date", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := GeoMean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if ( - df %>% - filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% - nrow() > 0 -) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -facets.label <- str_glue("{aheads} days ahead") -names(facets.label) <- aheads -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - facet_grid(rows = vars(ahead)) + - facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + - scale_y_log10() + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### WIS by Ahead - -```{r} -var <- "wis" -group_cols <- c("forecaster", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := GeoMean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if (df %>% 
filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% nrow() > 0) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - scale_y_log10() + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### Absolute Error by Forecast Date - -```{r} -var <- "ae" -group_cols <- c("forecaster", "forecast_date", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if ( - df %>% - filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% - nrow() > 0 -) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -facets.label <- str_glue("{aheads} days ahead") -names(facets.label) <- aheads -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - facet_grid(rows = vars(ahead)) + - facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + - scale_y_log10() + - scale_color_discrete() + - guides(color = 
guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### Absolute Error by Ahead - -```{r} -var <- "ae" -group_cols <- c("forecaster", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if (df %>% filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% nrow() > 0) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - scale_y_log10() + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### % Coverage by Forecast Date - -```{r} -var <- "coverage_80" -group_cols <- c("forecaster", "forecast_date", "ahead") - -# Aggregate metric across groups -df <- scores %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -facets.label <- str_glue("{aheads} days ahead") -names(facets.label) <- aheads -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - geom_hline(yintercept = .8, linetype = 1, color = "black") + - facet_grid(rows = vars(ahead)) + - facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Mean 80% Coverage") - -ggplotly(p, 
tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### % Coverage by Ahead - -```{r} -var <- "coverage_80" -id_cols <- c("forecaster", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(id_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(id_cols)) %>% - filter(ahead >= 0) - -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - geom_hline(yintercept = .8, linetype = 1, color = "black") + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Mean 80% Coverage") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -### Fan plots - -Fan plots showing the 80% prediction intervals for the forecasts made by the CMU forecasters and the outside forecasters. The black line is the truth data. - -```{r} -# Load the forecasts and the truth data -s3load(object = "flu_2023_joined_forecasts.rds", bucket = "forecasting-team-data") -s3load(object = "flu_2023_truth_data.rds", bucket = "forecasting-team-data") - -# We plot a subset of the dates and geos for the fan plot -plot_dates <- seq.Date(as.Date("2023-10-07"), by = "4 weeks", length.out = 8) -geo_vals <- c("ca", "fl", "pa", "tx") -forecast_subset <- flu_2023_joined_forecasts %>% - filter( - forecaster %in% c(cmu_forecasters, outside_forecasters), - geo_value %in% geo_vals, - forecast_date %in% plot_dates - ) %>% - mutate(quantile = as.character(quantile)) %>% - pivot_wider(names_from = "quantile", values_from = "prediction") %>% - mutate(ahead = as.numeric(target_end_date - forecast_date)) %>% - inner_join( - param_table %>% rename(forecaster = id) %>% select(forecaster, mean_wis) - ) %>% - arrange(mean_wis) - -p <- ggplot( - data = forecast_subset, - aes(x = target_end_date, group = forecast_date) -) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = forecast_date), alpha = 0.3) + - geom_line(aes(y = `0.5`, color = forecast_date), linetype = 2L) + - geom_point(aes(y = `0.5`, color = forecast_date), size = 0.75) + - geom_line( - data = flu_2023_truth_data %>% filter(geo_value %in% geo_vals), - aes(x = target_end_date, y = true_value), - inherit.aes = FALSE, na.rm = TRUE, - color = "black", linetype = 1 - ) + - facet_grid(forecaster ~ geo_value) + - labs(x = "Reference Date", y = "Forecasts") - -ggplotly(p, tooltip = "text", height = 3000, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` diff --git a/scripts/targets-exploration-common.R b/scripts/targets-exploration-common.R index 89457c1d..412b733e 100644 --- a/scripts/targets-exploration-common.R +++ b/scripts/targets-exploration-common.R @@ -219,7 +219,7 @@ make_forecasts_and_scores <- function() { rename("model" = "id") # Score - evaluate_predictions(predictions_cards = forecast_scaled, truth_data = 
actual_eval_data) %>% + evaluate_predictions(forecasts = forecast_scaled, truth_data = actual_eval_data) %>% rename("id" = "model") } ) From 6d8b49a3799760710e5ddb742c8474a292a72961 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 3 Mar 2025 12:49:04 -0800 Subject: [PATCH 2/3] fix: google symptoms, sum after whitening --- scripts/covid_hosp_explore.R | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R index a6c45759..e06203c2 100644 --- a/scripts/covid_hosp_explore.R +++ b/scripts/covid_hosp_explore.R @@ -409,21 +409,19 @@ rlang::list2( # https://github.com/cmu-delphi/epiprocess/issues/618 as.data.frame() %>% as_epi_archive(compactify = TRUE) - # not just using dplyr to allow for na.rm - google_symptoms_archive$DT$google_symptoms <- - rowSums( - google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")], - na.rm = TRUE - ) pre_pipeline <- google_symptoms_archive %>% epix_as_of(as.Date("2023-10-04")) %>% mutate(source = "none") - colnames <- c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia", "google_symptoms") + colnames <- c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia") for (colname in colnames) { learned_params <- calculate_whitening_params(pre_pipeline, colname = colname) google_symptoms_archive$DT %<>% data_whitening(colname = colname, learned_params, join_cols = "geo_value") } google_symptoms_archive$DT %>% + mutate( + google_symptoms = ifelse(is.na(google_symptoms_4_bronchitis), 0, google_symptoms_4_bronchitis) + + ifelse(is.na(google_symptoms_5_ageusia), 0, google_symptoms_5_ageusia) + ) %>% select(-starts_with("source")) %>% # Always convert to data.frame after dplyr operations on data.table # https://github.com/cmu-delphi/epiprocess/issues/618 @@ -568,7 +566,7 @@ rlang::list2( joined_archive_data %<>% epix_merge(nssp_archive, sync = "locf") joined_archive_data$geo_type <- "custom" joined_archive_data %<>% epix_merge(google_symptoms_archive, sync = "locf") - joined_archive_data$DT %<>% + joined_archive_data <- joined_archive_data$DT %>% filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) %>% # Always convert to data.frame after dplyr operations on data.table # https://github.com/cmu-delphi/epiprocess/issues/618 From 3965b84345085adb9cd5c03d8ed335292224811c Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 3 Mar 2025 12:49:15 -0800 Subject: [PATCH 3/3] fix: notebook coverage 80 -> 90 --- scripts/reports/comparison-notebook.Rmd | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/reports/comparison-notebook.Rmd b/scripts/reports/comparison-notebook.Rmd index 1c167daa..a8b4ac66 100644 --- a/scripts/reports/comparison-notebook.Rmd +++ b/scripts/reports/comparison-notebook.Rmd @@ -1,5 +1,5 @@ --- -title: "`r params$forecaster_family`: evaluation on 2023/24 in 2024/25" +title: "`r params$forecaster_family`: evaluation on 2023/24 in 2024/25" date: "compiled on `r format(Sys.time(), '%d %B %Y')`" output: html_document: @@ -116,12 +116,12 @@ param_table <- params$forecaster_parameters %>% geomean_ae = round(GeoMean(ae), 2), mean_wis = round(Mean(wis), 2), geomean_wis = round(GeoMean(wis), 2), - mean_coverage_80 = round(Mean(coverage_80), 2), + mean_coverage_90 = round(Mean(coverage_90), 2), ) %>% rename(id = forecaster) ) %>% arrange(mean_ae) %>% - relocate(id, mean_ae, geomean_ae, mean_wis, geomean_wis, mean_coverage_80) + relocate(id, mean_ae, 
geomean_ae, mean_wis, geomean_wis, mean_coverage_90) datatable(param_table) ``` @@ -502,7 +502,7 @@ ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% #### % Coverage by Forecast Date ```{r} -var <- "coverage_80" +var <- "coverage_90" group_cols <- c("forecaster", "forecast_date", "ahead") # Aggregate metric across groups @@ -539,7 +539,7 @@ p <- ggplot( facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + scale_color_discrete() + guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Arithmetic Mean 80% Coverage") + labs(title = subtitle, x = "Forecast Dates", y = "Arithmetic Mean 90% Coverage") ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% layout(hoverlabel = list(bgcolor = "white")) @@ -548,7 +548,7 @@ ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% #### % Coverage by Ahead ```{r} -var <- "coverage_80" +var <- "coverage_90" id_cols <- c("forecaster", "ahead") # Aggregate metric across groups @@ -582,7 +582,7 @@ p <- ggplot( geom_hline(yintercept = .8, linetype = 1, color = "black") + scale_color_discrete() + guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Arithmetic Mean 80% Coverage") + labs(title = subtitle, x = "Days ahead", y = "Arithmetic Mean 90% Coverage") ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% layout(hoverlabel = list(bgcolor = "white")) @@ -590,7 +590,7 @@ ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% ### Fan plots {.tabset} -Fan plots showing the 80% prediction intervals for the forecasts made by the CMU forecasters and the outside forecasters. The black line is the truth data. +Fan plots showing the 90% prediction intervals for the forecasts made by the CMU forecasters and the outside forecasters. The black line is the truth data. ```{r} if (params$disease == "flu") {
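
The recurring change in PATCH 1/3 is the data.frame roundtrip before `as_epi_archive()`: applying dplyr verbs directly to an archive's `DT` can leave a data.table that later breaks `epix_merge()` (epiprocess issue #618), so each archive is rebuilt from a plain data frame. Below is a minimal sketch of that pattern with toy data; the toy tibble and the `value_scaled` column are illustrative only, and the exact `as_epi_archive()`/`epix_merge()` signatures may differ across epiprocess versions.

```r
library(dplyr)
library(epiprocess)

# Toy versioned data: one geo, a week of values, version == time_value.
toy <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2023-10-04") + 0:6,
  version = as.Date("2023-10-04") + 0:6,
  value = rnorm(7)
)
archive <- toy %>% as_epi_archive(compactify = TRUE)

# After any dplyr operation on archive$DT, drop back to data.frame before
# re-wrapping, rather than keeping the mutated data.table.
archive <- archive$DT %>%
  mutate(value_scaled = value * 2) %>%
  # Always convert to data.frame after dplyr operations on data.table.
  # https://github.com/cmu-delphi/epiprocess/issues/618
  as.data.frame() %>%
  as_epi_archive(compactify = TRUE)

# Archives rebuilt this way can then be merged as in the patched pipeline,
# e.g. (assuming a second archive built the same way):
# merged <- epix_merge(archive, other_archive, sync = "locf")
```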
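
PATCH 2/3 moves the summing of the two Google Symptoms signals to after whitening and performs it inside `mutate()`, treating a missing signal as zero so that a location reporting only one of the two still gets a combined value. A small self-contained sketch of that NA-as-zero combination follows; `sig_a` and `sig_b` are hypothetical stand-ins for the whitened bronchitis and ageusia columns.

```r
library(dplyr)

whitened <- tibble::tibble(
  geo_value = c("ca", "fl", "pa"),
  sig_a = c(1.2, NA, 0.4), # e.g. whitened google_symptoms_4_bronchitis
  sig_b = c(0.3, 0.8, NA)  # e.g. whitened google_symptoms_5_ageusia
)

# Missing values contribute 0 to the sum instead of propagating NA.
whitened %>%
  mutate(
    combined = ifelse(is.na(sig_a), 0, sig_a) + ifelse(is.na(sig_b), 0, sig_b)
  )
#> combined: 1.5, 0.8, 0.4
```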