From e2b5b51b1a6a7a25689cadc192398b941cd2f80f Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 28 Feb 2025 18:25:15 -0800 Subject: [PATCH 1/3] fix: broken epix_merge operations --- R/aux_data_utils.R | 5 +- scripts/covid_hosp_explore.R | 128 +++--- scripts/one_offs/comparison-notebook.Rmd | 489 ----------------------- scripts/targets-exploration-common.R | 2 +- 4 files changed, 81 insertions(+), 543 deletions(-) delete mode 100644 scripts/one_offs/comparison-notebook.Rmd diff --git a/R/aux_data_utils.R b/R/aux_data_utils.R index 7823c7c7..6b1e96c4 100644 --- a/R/aux_data_utils.R +++ b/R/aux_data_utils.R @@ -274,7 +274,10 @@ daily_to_weekly_archive <- function(epi_arch, as_tibble() } ) %>% - as_epi_archive(compactify = TRUE) + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) } diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R index 959e591d..a6c45759 100644 --- a/scripts/covid_hosp_explore.R +++ b/scripts/covid_hosp_explore.R @@ -6,8 +6,9 @@ source("scripts/targets-exploration-common.R") hhs_signal <- "confirmed_admissions_covid_1d" if (!exists("ref_time_values_")) { # Alternatively you can let slide_forecaster figure out ref_time_values - start_date <- as.Date("2023-10-04") + start_date <- as.Date("2023-11-08") end_date <- as.Date("2024-04-24") + # end_date <- start_date + 7 date_step <- 7L ref_time_values_ <- seq.Date(start_date, end_date, by = date_step) } @@ -62,12 +63,7 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, scale_method = "quantile", - center_method = "median", - nonlin_method = "quart_root", - filter_source = "", - filter_agg_level = "", - n_training = Inf, - drop_non_seasons = FALSE, + n_training = Inf ), expand_grid( forecaster = "scaled_pop", @@ -93,12 +89,7 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, scale_method = "quantile", - center_method = "median", - nonlin_method = "quart_root", - filter_source = "", - filter_agg_level = "", - n_training = Inf, - drop_non_seasons = FALSE, + n_training = Inf ), expand_grid( forecaster = "scaled_pop", @@ -124,12 +115,7 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, scale_method = "quantile", - center_method = "median", - nonlin_method = "quart_root", - filter_source = "", - filter_agg_level = "", - n_training = Inf, - drop_non_seasons = FALSE, + n_training = Inf ) ), scled_pop_season = tidyr::expand_grid( @@ -141,7 +127,13 @@ forecaster_parameter_combinations_ <- rlang::list2( ), pop_scaling = FALSE, n_training = Inf, - seasonal_method = list(c("covid"), c("window"), c("covid", "window"), c("climatological"), c("climatological", "window")) + seasonal_method = list( + c("covid"), + c("window"), + c("covid", "window"), + c("climatological"), + c("climatological", "window") + ) ) ) %>% map(function(x) { @@ -178,9 +170,8 @@ scaled_pop_scaled <- list( smooth_scaled <- list( forecaster = "smoothed_scaled", trainer = "quantreg", - lags = - # list(smoothed, sd) - list(c(0, 7, 14, 21, 28), c(0)), + # lags = list(smoothed, sd) + lags = list(c(0, 7, 14, 21, 28), c(0)), smooth_width = as.difftime(2, units = "weeks"), sd_width = as.difftime(4, units = "weeks"), sd_mean_width = as.difftime(2, units = "weeks"), @@ -188,6 +179,7 @@ smooth_scaled <- list( n_training = Inf ) # Human-readable object to be used for inspecting the ensembles in the pipeline. 
+# fmt: skip ensemble_parameter_combinations_ <- tribble( ~ensemble, ~ensemble_args, ~forecasters, # mean forecaster @@ -240,7 +232,12 @@ ensemble_parameter_combinations_ <- tribble( ) %>% add_id(exclude = "forecasters") # spoofing ensembles for right now -ensemble_parameter_combinations_ <- tibble::tibble(id = character(), ensemble = character(), ensemble_args = character(), children_ids = character()) +ensemble_parameter_combinations_ <- tibble::tibble( + id = character(), + ensemble = character(), + ensemble_args = character(), + children_ids = character() +) # Check that every ensemble dependent is actually included. missing_forecasters <- setdiff( ensemble_parameter_combinations_ %>% pull(children_ids) %>% unlist() %>% unique(), @@ -272,7 +269,7 @@ rlang::list2( tar_target( name = hhs_archive_data_asof, command = { - get_health_data(as.Date(ref_time_values)) %>% + get_health_data(as.Date(ref_time_values), disease = "covid") %>% mutate(version = as.Date(ref_time_values)) %>% relocate(geo_value, time_value, version, hhs) }, @@ -348,6 +345,9 @@ rlang::list2( # weekly data is indexed from the start of the week mutate(time_value = time_value + 6 - time_value_adjust) %>% mutate(version = time_value) %>% + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% as_epi_archive(compactify = TRUE) nssp_archive } @@ -380,27 +380,39 @@ rlang::list2( geo_type = "hhs", geo_values = "*" ) - google_symptoms_archive_min <- - google_symptoms_state_archive %>% + google_symptoms_archive_min <- google_symptoms_state_archive %>% bind_rows(google_symptoms_hhs_archive) %>% select(geo_value, time_value, value) %>% daily_to_weekly() %>% mutate(version = time_value) %>% - as_epi_archive(compactify = TRUE) - google_symptoms_archive_min$DT %>% filter(!is.na(value)) %>% relocate(geo_value, time_value, version, value) %>% + as.data.frame() %>% as_epi_archive(compactify = TRUE) }) - all_of_them[[1]]$DT %<>% rename(google_symptoms_4_bronchitis = value) - all_of_them[[2]]$DT %<>% rename(google_symptoms_5_ageusia = value) + all_of_them[[1]] <- all_of_them[[1]]$DT %>% + rename(google_symptoms_4_bronchitis = value) %>% + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) + all_of_them[[2]] <- all_of_them[[2]]$DT %>% + rename(google_symptoms_5_ageusia = value) %>% + # Always convert to data.frame after dplyr operations on data.table. + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) google_symptoms_archive <- epix_merge(all_of_them[[1]], all_of_them[[2]]) google_symptoms_archive <- google_symptoms_archive$DT %>% mutate(google_symptoms = google_symptoms_4_bronchitis + google_symptoms_5_ageusia) %>% + # Always convert to data.frame after dplyr operations on data.table. 
+ # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% as_epi_archive(compactify = TRUE) # not just using dplyr to allow for na.rm google_symptoms_archive$DT$google_symptoms <- - rowSums(google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")], + rowSums( + google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")], na.rm = TRUE ) pre_pipeline <- google_symptoms_archive %>% @@ -413,6 +425,9 @@ rlang::list2( } google_symptoms_archive$DT %>% select(-starts_with("source")) %>% + # Always convert to data.frame after dplyr operations on data.table + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% as_epi_archive(compactify = TRUE) } ), @@ -479,8 +494,14 @@ rlang::list2( nwss <- readr::read_csv(most_recent) %>% rename(value = state_med_conc) %>% arrange(geo_value, time_value) - state_code <- readr::read_csv(here::here("aux_data", "flusion_data", "state_codes_table.csv"), show_col_types = FALSE) - hhs_codes <- readr::read_csv(here::here("aux_data", "flusion_data", "state_code_hhs_table.csv"), show_col_types = FALSE) + state_code <- readr::read_csv( + here::here("aux_data", "flusion_data", "state_codes_table.csv"), + show_col_types = FALSE + ) + hhs_codes <- readr::read_csv( + here::here("aux_data", "flusion_data", "state_code_hhs_table.csv"), + show_col_types = FALSE + ) state_to_hhs <- hhs_codes %>% left_join(state_code, by = "state_code") %>% select(hhs_region = hhs, geo_value = state_id) @@ -489,8 +510,7 @@ rlang::list2( drop_na() %>% select(-agg_level, -year, -agg_level, -population, -density) pop_data <- gen_pop_and_density_data() - nwss_hhs_region <- - nwss %>% + nwss_hhs_region <- nwss %>% left_join(state_to_hhs, by = "geo_value") %>% mutate(year = year(time_value)) %>% left_join(pop_data, by = join_by(geo_value, year)) %>% @@ -517,8 +537,12 @@ rlang::list2( tar_target( name = hhs_region, command = { - hhs_region <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_code_hhs_table.csv") - state_id <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_codes_table.csv") + hhs_region <- readr::read_csv( + "https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_code_hhs_table.csv" + ) + state_id <- readr::read_csv( + "https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_codes_table.csv" + ) hhs_region %>% left_join(state_id, by = "state_code") %>% select(hhs_region = hhs, geo_value = state_id) %>% @@ -534,22 +558,22 @@ rlang::list2( rename("hhs" := value) %>% add_hhs_region_sum(hhs_region) %>% filter(geo_value != "us") %>% - as_epi_archive( - compactify = TRUE - ) + # Always convert to data.frame after dplyr operations on data.table + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) joined_archive_data$geo_type <- "custom" # drop aggregated geo_values - joined_archive_data <- joined_archive_data %>% - epix_merge(nwss_coarse, sync = "locf") - joined_archive_data$geo_type <- "custom" - # TODO: Maybe bring these back - # epix_merge(doctor_visits_weekly_archive, sync = "locf") %>% - joined_archive_data %<>% - epix_merge(nssp_archive, sync = "locf") + joined_archive_data <- 
joined_archive_data %>% epix_merge(nwss_coarse, sync = "locf") + joined_archive_data %<>% epix_merge(nssp_archive, sync = "locf") joined_archive_data$geo_type <- "custom" - joined_archive_data %<>% - epix_merge(google_symptoms_archive, sync = "locf") - joined_archive_data$DT %<>% filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) + joined_archive_data %<>% epix_merge(google_symptoms_archive, sync = "locf") + joined_archive_data$DT %<>% + filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) %>% + # Always convert to data.frame after dplyr operations on data.table + # https://github.com/cmu-delphi/epiprocess/issues/618 + as.data.frame() %>% + as_epi_archive(compactify = TRUE) joined_archive_data$geo_type <- "state" slide_forecaster( epi_archive = joined_archive_data, @@ -591,7 +615,7 @@ rlang::list2( rename(model = forecaster) %>% rename(prediction = value) %>% filter(!is.na(geo_value)) - evaluate_predictions(predictions_cards = filtered_forecasts, truth_data = actual_eval_data) %>% + evaluate_predictions(forecasts = filtered_forecasts, truth_data = actual_eval_data) %>% rename(forecaster = model) } ), diff --git a/scripts/one_offs/comparison-notebook.Rmd b/scripts/one_offs/comparison-notebook.Rmd deleted file mode 100644 index cd858f22..00000000 --- a/scripts/one_offs/comparison-notebook.Rmd +++ /dev/null @@ -1,489 +0,0 @@ ---- -title: Evaluation of Hospitalization Forecasters 2024-2025 -date: "`r format(Sys.time(), '%d %B %Y')`" -output: - html_document: - code_folding: hide -params: - forecaster_set: 1 -editor_options: - chunk_output_type: console ---- - -$$\\[.4in]$$ - -```{r echo=FALSE} -knitr::opts_chunk$set( - fig.align = "center", - message = FALSE, - warning = FALSE, - cache = TRUE -) -ggplot2::theme_set(ggplot2::theme_bw()) -``` - -```{r} -library(aws.s3) -library(data.table) -library(dplyr) -library(DT) -library(ggplot2) -library(plotly) -library(readr) -library(stringr) -library(tidyr) -``` - -```{r} -# params <- list( -# forecaster_set = 5 -# ) -# Load the table of parameter combinations -s3load("flu_2023_forecaster_parameter_combinations.rds", bucket = "forecasting-team-data") - -# Select forecasters for this notebook -cmu_forecasters <- forecaster_parameter_combinations_[[params$forecaster_set]]$id -outside_forecasters <- c("FluSight-baseline", "FluSight-ensemble") - -# Load scores and filter them, get global variables -s3load(object = "flu_2023_joined_scores.rds", bucket = "forecasting-team-data") -scores <- flu_2023_joined_scores %>% - filter(forecaster %in% c(cmu_forecasters, outside_forecasters)) -forecast_dates <- scores %>% - pull(forecast_date) %>% - unique() -forecasters <- c(cmu_forecasters, outside_forecasters) -aheads <- scores %>% - pull(ahead) %>% - unique() -forecaster_family <- forecaster_parameter_combinations_[[params$forecaster_set]]$forecaster %>% unique() -base_forecaster_name <- "FluSight-baseline" - -# Define aggregation functions -Mean <- function(x) mean(x, na.rm = TRUE) -GeoMean <- function(x, offset = 0) exp(Mean(log(x + offset))) -``` - -### Notebook Information - -#### Forecaster Family - -`r forecaster_family` - -#### Target Dates - -`r forecast_dates` - -#### Forecaster Parameter Mapping and Overall Scores - -The table is sorted by descending WIS and contains all the forecasters in this notebook. 
- -```{r} -# Display the table -param_table <- forecaster_parameter_combinations_[[params$forecaster_set]] %>% - select(-any_of(c("forecaster", "keys_to_ignore", "pop_scaling"))) %>% - { - if ("n_training" %in% colnames(.)) { - (.) %>% mutate(n_training = as.character(n_training)) - } else { - . - } - } %>% - full_join( - scores %>% - group_by(forecaster) %>% - summarize( - mean_wis = round(mean(wis, na.rm = TRUE), 2), - mean_ae = round(mean(ae, na.rm = TRUE), 2), - mean_coverage_80 = round(mean(coverage_80, na.rm = TRUE), 2) - ) %>% - rename( - id = forecaster - ) - ) %>% - arrange(mean_wis) -datatable(param_table) -``` - -$$\\[.07in]$$ - -### Score Plots {.tabset} - -- The WIS plots are relative to the FluSight-baseline (which isn't plotted, but is implicitly the 1 line). These plots are aggregated with the geometric mean. -- The absolute error plots are also relative to the FluSight-baseline, but are aggregated with the arithmetic mean. -- The dashed black line in all plots is the FluSight-ensemble. - -#### WIS by Forecast Date - -```{r} -var <- "wis" -group_cols <- c("forecaster", "forecast_date", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := GeoMean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if ( - df %>% - filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% - nrow() > 0 -) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -facets.label <- str_glue("{aheads} days ahead") -names(facets.label) <- aheads -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - facet_grid(rows = vars(ahead)) + - facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + - scale_y_log10() + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### WIS by Ahead - -```{r} -var <- "wis" -group_cols <- c("forecaster", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := GeoMean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if (df %>% 
filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% nrow() > 0) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - scale_y_log10() + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### Absolute Error by Forecast Date - -```{r} -var <- "ae" -group_cols <- c("forecaster", "forecast_date", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if ( - df %>% - filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% - nrow() > 0 -) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -facets.label <- str_glue("{aheads} days ahead") -names(facets.label) <- aheads -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - facet_grid(rows = vars(ahead)) + - facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + - scale_y_log10() + - scale_color_discrete() + - guides(color = 
guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### Absolute Error by Ahead - -```{r} -var <- "ae" -group_cols <- c("forecaster", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(group_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -# Make sure we don't divide by zero -if (df %>% filter(forecaster == base_forecaster_name & near(!!sym(var), 0)) %>% nrow() > 0) { - warning("scale_by_forecaster will divide by zero in column ", var) -} - -# Normalize the metric by the baseline -normalized_df <- df %>% - pivot_wider(names_from = forecaster, names_prefix = var, values_from = !!sym(var)) %>% - mutate(across(starts_with(var), ~ .x / !!sym(paste0(var, base_forecaster_name)))) %>% - pivot_longer(cols = starts_with(var), names_to = "forecaster", values_to = var) %>% - mutate(forecaster = stringr::str_remove(forecaster, var)) %>% - filter(forecaster != base_forecaster_name) - -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - normalized_df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = normalized_df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - scale_y_log10() + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Geometric Mean WIS") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### % Coverage by Forecast Date - -```{r} -var <- "coverage_80" -group_cols <- c("forecaster", "forecast_date", "ahead") - -# Aggregate metric across groups -df <- scores %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(group_cols)) %>% - filter(ahead >= 0) - -facets.label <- str_glue("{aheads} days ahead") -names(facets.label) <- aheads -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = forecast_date, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - geom_hline(yintercept = .8, linetype = 1, color = "black") + - facet_grid(rows = vars(ahead)) + - facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Mean 80% Coverage") - -ggplotly(p, 
tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -#### % Coverage by Ahead - -```{r} -var <- "coverage_80" -id_cols <- c("forecaster", "ahead") - -# Aggregate metric across groups -df <- scores %>% - select(all_of(c(id_cols, var))) %>% - drop_na(!!sym(var)) %>% - summarize(!!var := Mean(!!sym(var)), .by = all_of(id_cols)) %>% - filter(ahead >= 0) - -subtitle <- sprintf( - "Forecasts made over %s to %s", - format(min(forecast_dates), "%B %d, %Y"), - format(max(forecast_dates), "%B %d, %Y") -) -p <- ggplot( - df %>% filter(forecaster != "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)) -) + - geom_line(aes(color = forecaster, group = forecaster)) + - geom_point(aes(color = forecaster, group = forecaster)) + - geom_line( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", linetype = 2 - ) + - geom_point( - data = df %>% filter(forecaster == "FluSight-ensemble"), - aes(x = ahead, y = !!sym(var)), - color = "black", shape = 21, fill = "white" - ) + - geom_hline(yintercept = .8, linetype = 1, color = "black") + - scale_color_discrete() + - guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Mean 80% Coverage") - -ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` - -### Fan plots - -Fan plots showing the 80% prediction intervals for the forecasts made by the CMU forecasters and the outside forecasters. The black line is the truth data. - -```{r} -# Load the forecasts and the truth data -s3load(object = "flu_2023_joined_forecasts.rds", bucket = "forecasting-team-data") -s3load(object = "flu_2023_truth_data.rds", bucket = "forecasting-team-data") - -# We plot a subset of the dates and geos for the fan plot -plot_dates <- seq.Date(as.Date("2023-10-07"), by = "4 weeks", length.out = 8) -geo_vals <- c("ca", "fl", "pa", "tx") -forecast_subset <- flu_2023_joined_forecasts %>% - filter( - forecaster %in% c(cmu_forecasters, outside_forecasters), - geo_value %in% geo_vals, - forecast_date %in% plot_dates - ) %>% - mutate(quantile = as.character(quantile)) %>% - pivot_wider(names_from = "quantile", values_from = "prediction") %>% - mutate(ahead = as.numeric(target_end_date - forecast_date)) %>% - inner_join( - param_table %>% rename(forecaster = id) %>% select(forecaster, mean_wis) - ) %>% - arrange(mean_wis) - -p <- ggplot( - data = forecast_subset, - aes(x = target_end_date, group = forecast_date) -) + - geom_ribbon(aes(ymin = `0.05`, ymax = `0.95`, fill = forecast_date), alpha = 0.3) + - geom_line(aes(y = `0.5`, color = forecast_date), linetype = 2L) + - geom_point(aes(y = `0.5`, color = forecast_date), size = 0.75) + - geom_line( - data = flu_2023_truth_data %>% filter(geo_value %in% geo_vals), - aes(x = target_end_date, y = true_value), - inherit.aes = FALSE, na.rm = TRUE, - color = "black", linetype = 1 - ) + - facet_grid(forecaster ~ geo_value) + - labs(x = "Reference Date", y = "Forecasts") - -ggplotly(p, tooltip = "text", height = 3000, width = 1000) %>% - layout(hoverlabel = list(bgcolor = "white")) -``` diff --git a/scripts/targets-exploration-common.R b/scripts/targets-exploration-common.R index 89457c1d..412b733e 100644 --- a/scripts/targets-exploration-common.R +++ b/scripts/targets-exploration-common.R @@ -219,7 +219,7 @@ make_forecasts_and_scores <- function() { rename("model" = "id") # Score - evaluate_predictions(predictions_cards = forecast_scaled, truth_data = 
actual_eval_data) %>% + evaluate_predictions(forecasts = forecast_scaled, truth_data = actual_eval_data) %>% rename("id" = "model") } ) From 6d8b49a3799760710e5ddb742c8474a292a72961 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 3 Mar 2025 12:49:04 -0800 Subject: [PATCH 2/3] fix: google symptoms, sum after whitening --- scripts/covid_hosp_explore.R | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/covid_hosp_explore.R b/scripts/covid_hosp_explore.R index a6c45759..e06203c2 100644 --- a/scripts/covid_hosp_explore.R +++ b/scripts/covid_hosp_explore.R @@ -409,21 +409,19 @@ rlang::list2( # https://github.com/cmu-delphi/epiprocess/issues/618 as.data.frame() %>% as_epi_archive(compactify = TRUE) - # not just using dplyr to allow for na.rm - google_symptoms_archive$DT$google_symptoms <- - rowSums( - google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")], - na.rm = TRUE - ) pre_pipeline <- google_symptoms_archive %>% epix_as_of(as.Date("2023-10-04")) %>% mutate(source = "none") - colnames <- c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia", "google_symptoms") + colnames <- c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia") for (colname in colnames) { learned_params <- calculate_whitening_params(pre_pipeline, colname = colname) google_symptoms_archive$DT %<>% data_whitening(colname = colname, learned_params, join_cols = "geo_value") } google_symptoms_archive$DT %>% + mutate( + google_symptoms = ifelse(is.na(google_symptoms_4_bronchitis), 0, google_symptoms_4_bronchitis) + + ifelse(is.na(google_symptoms_5_ageusia), 0, google_symptoms_5_ageusia) + ) %>% select(-starts_with("source")) %>% # Always convert to data.frame after dplyr operations on data.table # https://github.com/cmu-delphi/epiprocess/issues/618 @@ -568,7 +566,7 @@ rlang::list2( joined_archive_data %<>% epix_merge(nssp_archive, sync = "locf") joined_archive_data$geo_type <- "custom" joined_archive_data %<>% epix_merge(google_symptoms_archive, sync = "locf") - joined_archive_data$DT %<>% + joined_archive_data <- joined_archive_data$DT %>% filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) %>% # Always convert to data.frame after dplyr operations on data.table # https://github.com/cmu-delphi/epiprocess/issues/618 From 3965b84345085adb9cd5c03d8ed335292224811c Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 3 Mar 2025 12:49:15 -0800 Subject: [PATCH 3/3] fix: notebook coverage 80 -> 90 --- scripts/reports/comparison-notebook.Rmd | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scripts/reports/comparison-notebook.Rmd b/scripts/reports/comparison-notebook.Rmd index 1c167daa..a8b4ac66 100644 --- a/scripts/reports/comparison-notebook.Rmd +++ b/scripts/reports/comparison-notebook.Rmd @@ -1,5 +1,5 @@ --- -title: "`r params$forecaster_family`: evaluation on 2023/24 in 2024/25" +title: "`r params$forecaster_family`: evaluation on 2023/24 in 2024/25" date: "compiled on `r format(Sys.time(), '%d %B %Y')`" output: html_document: @@ -116,12 +116,12 @@ param_table <- params$forecaster_parameters %>% geomean_ae = round(GeoMean(ae), 2), mean_wis = round(Mean(wis), 2), geomean_wis = round(GeoMean(wis), 2), - mean_coverage_80 = round(Mean(coverage_80), 2), + mean_coverage_90 = round(Mean(coverage_90), 2), ) %>% rename(id = forecaster) ) %>% arrange(mean_ae) %>% - relocate(id, mean_ae, geomean_ae, mean_wis, geomean_wis, mean_coverage_80) + relocate(id, mean_ae, 
geomean_ae, mean_wis, geomean_wis, mean_coverage_90) datatable(param_table) ``` @@ -502,7 +502,7 @@ ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% #### % Coverage by Forecast Date ```{r} -var <- "coverage_80" +var <- "coverage_90" group_cols <- c("forecaster", "forecast_date", "ahead") # Aggregate metric across groups @@ -539,7 +539,7 @@ p <- ggplot( facet_wrap(~ahead, nrow = 4, labeller = labeller(ahead = facets.label)) + scale_color_discrete() + guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Forecast Dates", y = "Arithmetic Mean 80% Coverage") + labs(title = subtitle, x = "Forecast Dates", y = "Arithmetic Mean 90% Coverage") ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% layout(hoverlabel = list(bgcolor = "white")) @@ -548,7 +548,7 @@ ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% #### % Coverage by Ahead ```{r} -var <- "coverage_80" +var <- "coverage_90" id_cols <- c("forecaster", "ahead") # Aggregate metric across groups @@ -582,7 +582,7 @@ p <- ggplot( geom_hline(yintercept = .8, linetype = 1, color = "black") + scale_color_discrete() + guides(color = guide_legend(ncol = 2)) + - labs(title = subtitle, x = "Days ahead", y = "Arithmetic Mean 80% Coverage") + labs(title = subtitle, x = "Days ahead", y = "Arithmetic Mean 90% Coverage") ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% layout(hoverlabel = list(bgcolor = "white")) @@ -590,7 +590,7 @@ ggplotly(p, tooltip = "text", height = 800, width = 1000) %>% ### Fan plots {.tabset} -Fan plots showing the 80% prediction intervals for the forecasts made by the CMU forecasters and the outside forecasters. The black line is the truth data. +Fan plots showing the 90% prediction intervals for the forecasts made by the CMU forecasters and the outside forecasters. The black line is the truth data. ```{r} if (params$disease == "flu") {
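
The recurring change in PATCH 1/3 is the data.frame roundtrip before `as_epi_archive()`: applying dplyr verbs directly to an archive's `DT` can leave a data.table that later breaks `epix_merge()` (epiprocess issue #618), so each archive is rebuilt from a plain data frame. Below is a minimal sketch of that pattern with toy data; the toy tibble and the `value_scaled` column are illustrative only, and the exact `as_epi_archive()`/`epix_merge()` signatures may differ across epiprocess versions.

```r
library(dplyr)
library(epiprocess)

# Toy versioned data: one geo, a week of values, version == time_value.
toy <- tibble::tibble(
  geo_value = "ca",
  time_value = as.Date("2023-10-04") + 0:6,
  version = as.Date("2023-10-04") + 0:6,
  value = rnorm(7)
)
archive <- toy %>% as_epi_archive(compactify = TRUE)

# After any dplyr operation on archive$DT, drop back to data.frame before
# re-wrapping, rather than keeping the mutated data.table.
archive <- archive$DT %>%
  mutate(value_scaled = value * 2) %>%
  # Always convert to data.frame after dplyr operations on data.table.
  # https://github.com/cmu-delphi/epiprocess/issues/618
  as.data.frame() %>%
  as_epi_archive(compactify = TRUE)

# Archives rebuilt this way can then be merged as in the patched pipeline,
# e.g. (assuming a second archive built the same way):
# merged <- epix_merge(archive, other_archive, sync = "locf")
```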
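
PATCH 2/3 moves the summing of the two Google Symptoms signals to after whitening and performs it inside `mutate()`, treating a missing signal as zero so that a location reporting only one of the two still gets a combined value. A small self-contained sketch of that NA-as-zero combination follows; `sig_a` and `sig_b` are hypothetical stand-ins for the whitened bronchitis and ageusia columns.

```r
library(dplyr)

whitened <- tibble::tibble(
  geo_value = c("ca", "fl", "pa"),
  sig_a = c(1.2, NA, 0.4), # e.g. whitened google_symptoms_4_bronchitis
  sig_b = c(0.3, 0.8, NA)  # e.g. whitened google_symptoms_5_ageusia
)

# Missing values contribute 0 to the sum instead of propagating NA.
whitened %>%
  mutate(
    combined = ifelse(is.na(sig_a), 0, sig_a) + ifelse(is.na(sig_b), 0, sig_b)
  )
#> combined: 1.5, 0.8, 0.4
```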