Skip to content

Commit 8ab8e89

Browse files
authored
hotfix: covid explore (#174)
* fix: dplyr on data.table broke epix_merge
* fix: google symptoms, sum after whitening
* fix: notebook coverage 80 -> 90
1 parent baf07a2 commit 8ab8e89

5 files changed

+92
-556
lines changed

R/aux_data_utils.R

+4-1
Original file line numberDiff line numberDiff line change
@@ -274,7 +274,10 @@ daily_to_weekly_archive <- function(epi_arch,
274274
as_tibble()
275275
}
276276
) %>%
277-
as_epi_archive(compactify = TRUE)
277+
# Always convert to data.frame after dplyr operations on data.table.
278+
# https://github.com/cmu-delphi/epiprocess/issues/618
279+
as.data.frame() %>%
280+
as_epi_archive(compactify = TRUE)
278281
}
279282

280283

scripts/covid_hosp_explore.R

+79-57
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ source("scripts/targets-exploration-common.R")
66
hhs_signal <- "confirmed_admissions_covid_1d"
77
if (!exists("ref_time_values_")) {
88
# Alternatively you can let slide_forecaster figure out ref_time_values
9-
start_date <- as.Date("2023-10-04")
9+
start_date <- as.Date("2023-11-08")
1010
end_date <- as.Date("2024-04-24")
11+
# end_date <- start_date + 7
1112
date_step <- 7L
1213
ref_time_values_ <- seq.Date(start_date, end_date, by = date_step)
1314
}
@@ -62,12 +63,7 @@ forecaster_parameter_combinations_ <- rlang::list2(
6263
),
6364
pop_scaling = FALSE,
6465
scale_method = "quantile",
65-
center_method = "median",
66-
nonlin_method = "quart_root",
67-
filter_source = "",
68-
filter_agg_level = "",
69-
n_training = Inf,
70-
drop_non_seasons = FALSE,
66+
n_training = Inf
7167
),
7268
expand_grid(
7369
forecaster = "scaled_pop",
@@ -93,12 +89,7 @@ forecaster_parameter_combinations_ <- rlang::list2(
9389
),
9490
pop_scaling = FALSE,
9591
scale_method = "quantile",
96-
center_method = "median",
97-
nonlin_method = "quart_root",
98-
filter_source = "",
99-
filter_agg_level = "",
100-
n_training = Inf,
101-
drop_non_seasons = FALSE,
92+
n_training = Inf
10293
),
10394
expand_grid(
10495
forecaster = "scaled_pop",
@@ -124,12 +115,7 @@ forecaster_parameter_combinations_ <- rlang::list2(
124115
),
125116
pop_scaling = FALSE,
126117
scale_method = "quantile",
127-
center_method = "median",
128-
nonlin_method = "quart_root",
129-
filter_source = "",
130-
filter_agg_level = "",
131-
n_training = Inf,
132-
drop_non_seasons = FALSE,
118+
n_training = Inf
133119
)
134120
),
135121
scled_pop_season = tidyr::expand_grid(
@@ -141,7 +127,13 @@ forecaster_parameter_combinations_ <- rlang::list2(
141127
),
142128
pop_scaling = FALSE,
143129
n_training = Inf,
144-
seasonal_method = list(c("covid"), c("window"), c("covid", "window"), c("climatological"), c("climatological", "window"))
130+
seasonal_method = list(
131+
c("covid"),
132+
c("window"),
133+
c("covid", "window"),
134+
c("climatological"),
135+
c("climatological", "window")
136+
)
145137
)
146138
) %>%
147139
map(function(x) {
@@ -178,16 +170,16 @@ scaled_pop_scaled <- list(
178170
smooth_scaled <- list(
179171
forecaster = "smoothed_scaled",
180172
trainer = "quantreg",
181-
lags =
182-
# list(smoothed, sd)
183-
list(c(0, 7, 14, 21, 28), c(0)),
173+
# lags = list(smoothed, sd)
174+
lags = list(c(0, 7, 14, 21, 28), c(0)),
184175
smooth_width = as.difftime(2, units = "weeks"),
185176
sd_width = as.difftime(4, units = "weeks"),
186177
sd_mean_width = as.difftime(2, units = "weeks"),
187178
pop_scaling = TRUE,
188179
n_training = Inf
189180
)
190181
# Human-readable object to be used for inspecting the ensembles in the pipeline.
182+
# fmt: skip
191183
ensemble_parameter_combinations_ <- tribble(
192184
~ensemble, ~ensemble_args, ~forecasters,
193185
# mean forecaster
@@ -240,7 +232,12 @@ ensemble_parameter_combinations_ <- tribble(
240232
) %>%
241233
add_id(exclude = "forecasters")
242234
# spoofing ensembles for right now
243-
ensemble_parameter_combinations_ <- tibble::tibble(id = character(), ensemble = character(), ensemble_args = character(), children_ids = character())
235+
ensemble_parameter_combinations_ <- tibble::tibble(
236+
id = character(),
237+
ensemble = character(),
238+
ensemble_args = character(),
239+
children_ids = character()
240+
)
244241
# Check that every ensemble dependent is actually included.
245242
missing_forecasters <- setdiff(
246243
ensemble_parameter_combinations_ %>% pull(children_ids) %>% unlist() %>% unique(),
@@ -272,7 +269,7 @@ rlang::list2(
272269
tar_target(
273270
name = hhs_archive_data_asof,
274271
command = {
275-
get_health_data(as.Date(ref_time_values)) %>%
272+
get_health_data(as.Date(ref_time_values), disease = "covid") %>%
276273
mutate(version = as.Date(ref_time_values)) %>%
277274
relocate(geo_value, time_value, version, hhs)
278275
},
@@ -348,6 +345,9 @@ rlang::list2(
348345
# weekly data is indexed from the start of the week
349346
mutate(time_value = time_value + 6 - time_value_adjust) %>%
350347
mutate(version = time_value) %>%
348+
# Always convert to data.frame after dplyr operations on data.table.
349+
# https://github.com/cmu-delphi/epiprocess/issues/618
350+
as.data.frame() %>%
351351
as_epi_archive(compactify = TRUE)
352352
nssp_archive
353353
}
@@ -380,39 +380,52 @@ rlang::list2(
380380
geo_type = "hhs",
381381
geo_values = "*"
382382
)
383-
google_symptoms_archive_min <-
384-
google_symptoms_state_archive %>%
383+
google_symptoms_archive_min <- google_symptoms_state_archive %>%
385384
bind_rows(google_symptoms_hhs_archive) %>%
386385
select(geo_value, time_value, value) %>%
387386
daily_to_weekly() %>%
388387
mutate(version = time_value) %>%
389-
as_epi_archive(compactify = TRUE)
390-
google_symptoms_archive_min$DT %>%
391388
filter(!is.na(value)) %>%
392389
relocate(geo_value, time_value, version, value) %>%
390+
as.data.frame() %>%
393391
as_epi_archive(compactify = TRUE)
394392
})
395-
all_of_them[[1]]$DT %<>% rename(google_symptoms_4_bronchitis = value)
396-
all_of_them[[2]]$DT %<>% rename(google_symptoms_5_ageusia = value)
393+
all_of_them[[1]] <- all_of_them[[1]]$DT %>%
394+
rename(google_symptoms_4_bronchitis = value) %>%
395+
# Always convert to data.frame after dplyr operations on data.table.
396+
# https://github.com/cmu-delphi/epiprocess/issues/618
397+
as.data.frame() %>%
398+
as_epi_archive(compactify = TRUE)
399+
all_of_them[[2]] <- all_of_them[[2]]$DT %>%
400+
rename(google_symptoms_5_ageusia = value) %>%
401+
# Always convert to data.frame after dplyr operations on data.table.
402+
# https://github.com/cmu-delphi/epiprocess/issues/618
403+
as.data.frame() %>%
404+
as_epi_archive(compactify = TRUE)
397405
google_symptoms_archive <- epix_merge(all_of_them[[1]], all_of_them[[2]])
398406
google_symptoms_archive <- google_symptoms_archive$DT %>%
399407
mutate(google_symptoms = google_symptoms_4_bronchitis + google_symptoms_5_ageusia) %>%
408+
# Always convert to data.frame after dplyr operations on data.table.
409+
# https://github.com/cmu-delphi/epiprocess/issues/618
410+
as.data.frame() %>%
400411
as_epi_archive(compactify = TRUE)
401-
# not just using dplyr to allow for na.rm
402-
google_symptoms_archive$DT$google_symptoms <-
403-
rowSums(google_symptoms_archive$DT[, c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")],
404-
na.rm = TRUE
405-
)
406412
pre_pipeline <- google_symptoms_archive %>%
407413
epix_as_of(as.Date("2023-10-04")) %>%
408414
mutate(source = "none")
409-
colnames <- c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia", "google_symptoms")
415+
colnames <- c("google_symptoms_4_bronchitis", "google_symptoms_5_ageusia")
410416
for (colname in colnames) {
411417
learned_params <- calculate_whitening_params(pre_pipeline, colname = colname)
412418
google_symptoms_archive$DT %<>% data_whitening(colname = colname, learned_params, join_cols = "geo_value")
413419
}
414420
google_symptoms_archive$DT %>%
421+
mutate(
422+
google_symptoms = ifelse(is.na(google_symptoms_4_bronchitis), 0, google_symptoms_4_bronchitis) +
423+
ifelse(is.na(google_symptoms_5_ageusia), 0, google_symptoms_5_ageusia)
424+
) %>%
415425
select(-starts_with("source")) %>%
426+
# Always convert to data.frame after dplyr operations on data.table
427+
# https://github.com/cmu-delphi/epiprocess/issues/618
428+
as.data.frame() %>%
416429
as_epi_archive(compactify = TRUE)
417430
}
418431
),
@@ -479,8 +492,14 @@ rlang::list2(
479492
nwss <- readr::read_csv(most_recent) %>%
480493
rename(value = state_med_conc) %>%
481494
arrange(geo_value, time_value)
482-
state_code <- readr::read_csv(here::here("aux_data", "flusion_data", "state_codes_table.csv"), show_col_types = FALSE)
483-
hhs_codes <- readr::read_csv(here::here("aux_data", "flusion_data", "state_code_hhs_table.csv"), show_col_types = FALSE)
495+
state_code <- readr::read_csv(
496+
here::here("aux_data", "flusion_data", "state_codes_table.csv"),
497+
show_col_types = FALSE
498+
)
499+
hhs_codes <- readr::read_csv(
500+
here::here("aux_data", "flusion_data", "state_code_hhs_table.csv"),
501+
show_col_types = FALSE
502+
)
484503
state_to_hhs <- hhs_codes %>%
485504
left_join(state_code, by = "state_code") %>%
486505
select(hhs_region = hhs, geo_value = state_id)
@@ -489,8 +508,7 @@ rlang::list2(
489508
drop_na() %>%
490509
select(-agg_level, -year, -agg_level, -population, -density)
491510
pop_data <- gen_pop_and_density_data()
492-
nwss_hhs_region <-
493-
nwss %>%
511+
nwss_hhs_region <- nwss %>%
494512
left_join(state_to_hhs, by = "geo_value") %>%
495513
mutate(year = year(time_value)) %>%
496514
left_join(pop_data, by = join_by(geo_value, year)) %>%
@@ -517,8 +535,12 @@ rlang::list2(
517535
tar_target(
518536
name = hhs_region,
519537
command = {
520-
hhs_region <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_code_hhs_table.csv")
521-
state_id <- readr::read_csv("https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_codes_table.csv")
538+
hhs_region <- readr::read_csv(
539+
"https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_code_hhs_table.csv"
540+
)
541+
state_id <- readr::read_csv(
542+
"https://raw.githubusercontent.com/cmu-delphi/covidcast-indicators/refs/heads/main/_delphi_utils_python/delphi_utils/data/2020/state_codes_table.csv"
543+
)
522544
hhs_region %>%
523545
left_join(state_id, by = "state_code") %>%
524546
select(hhs_region = hhs, geo_value = state_id) %>%
@@ -534,22 +556,22 @@ rlang::list2(
534556
rename("hhs" := value) %>%
535557
add_hhs_region_sum(hhs_region) %>%
536558
filter(geo_value != "us") %>%
537-
as_epi_archive(
538-
compactify = TRUE
539-
)
559+
# Always convert to data.frame after dplyr operations on data.table
560+
# https://github.com/cmu-delphi/epiprocess/issues/618
561+
as.data.frame() %>%
562+
as_epi_archive(compactify = TRUE)
540563
joined_archive_data$geo_type <- "custom"
541564
# drop aggregated geo_values
542-
joined_archive_data <- joined_archive_data %>%
543-
epix_merge(nwss_coarse, sync = "locf")
544-
joined_archive_data$geo_type <- "custom"
545-
# TODO: Maybe bring these back
546-
# epix_merge(doctor_visits_weekly_archive, sync = "locf") %>%
547-
joined_archive_data %<>%
548-
epix_merge(nssp_archive, sync = "locf")
565+
joined_archive_data <- joined_archive_data %>% epix_merge(nwss_coarse, sync = "locf")
566+
joined_archive_data %<>% epix_merge(nssp_archive, sync = "locf")
549567
joined_archive_data$geo_type <- "custom"
550-
joined_archive_data %<>%
551-
epix_merge(google_symptoms_archive, sync = "locf")
552-
joined_archive_data$DT %<>% filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp")))
568+
joined_archive_data %<>% epix_merge(google_symptoms_archive, sync = "locf")
569+
joined_archive_data <- joined_archive_data$DT %>%
570+
filter(grepl("[a-z]{2}", geo_value), !(geo_value %in% c("as", "pr", "vi", "gu", "mp"))) %>%
571+
# Always convert to data.frame after dplyr operations on data.table
572+
# https://github.com/cmu-delphi/epiprocess/issues/618
573+
as.data.frame() %>%
574+
as_epi_archive(compactify = TRUE)
553575
joined_archive_data$geo_type <- "state"
554576
slide_forecaster(
555577
epi_archive = joined_archive_data,
@@ -591,7 +613,7 @@ rlang::list2(
591613
rename(model = forecaster) %>%
592614
rename(prediction = value) %>%
593615
filter(!is.na(geo_value))
594-
evaluate_predictions(predictions_cards = filtered_forecasts, truth_data = actual_eval_data) %>%
616+
evaluate_predictions(forecasts = filtered_forecasts, truth_data = actual_eval_data) %>%
595617
rename(forecaster = model)
596618
}
597619
),

0 commit comments

Comments
 (0)