From 249b465ee2d78c7fb453225c1660ae81c10b6959 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 15 Mar 2024 18:09:26 -0700 Subject: [PATCH 01/92] fix warnings and empty tests This is purely before any work on latency adjusting, and is just about getting the tests to run without warnings or skipped tests. Getting this involved: 1. renaming some lingering examples of p to probs and `q` to `quantile_levels` (and some `quantile_level`s to the plural) 2. Adding snapshots so the tests for printing in population_scaling work as intended (should probably be converted to cli_informs at some point) 3. removing the nearly empty `test-propagate_samples` which seems like something intended that was never finished. Probably want to add an issue if we actually want it done. 4. added a bunch of `edf`'s in unhappy `prep` steps --- tests/testthat/_snaps/population_scaling.md | 20 +++++++++ tests/testthat/test-dist_quantiles.R | 1 + tests/testthat/test-population_scaling.R | 47 +++++++++++++++++++++ tests/testthat/test-replace_Inf.R | 2 +- tests/testthat/test-step_lag_difference.R | 2 +- 5 files changed, 70 insertions(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md index 9263e8e1e..149cd38ac 100644 --- a/tests/testthat/_snaps/population_scaling.md +++ b/tests/testthat/_snaps/population_scaling.md @@ -1,3 +1,23 @@ +# test joining by default columns + + Code + prep <- prep(r, jhu) + +--- + + Code + b <- bake(prep, jhu) + +--- + + Code + wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) + +--- + + Code + p <- predict(wf, latest) + # expect error if `by` selector does not match Code diff --git a/tests/testthat/test-dist_quantiles.R b/tests/testthat/test-dist_quantiles.R index 66f229956..9975213c6 100644 --- a/tests/testthat/test-dist_quantiles.R +++ b/tests/testthat/test-dist_quantiles.R @@ -41,6 +41,7 @@ test_that("quantile extrapolator works", { 
expect_s3_class(vctrs::vec_data(qq[1])[[1]], "dist_quantiles") expect_length(parameters(qq[1])$quantile_levels[[1]], 3L) + dstn <- dist_quantiles(list(1:4, 8:11), list(c(.2, .4, .6, .8))) qq <- extrapolate_quantiles(dstn, probs = c(.25, 0.5, .75)) expect_s3_class(qq, "distribution") diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index 6337a2ea8..f5cedd9b7 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -193,6 +193,53 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { + + jhu <- case_death_rate_subset %>% + dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% + dplyr::select(geo_value, time_value, case_rate) + + reverse_pop_data = data.frame(geo_value = c("ca", "ny"), + values = c(1/20000, 1/30000)) + + r <- epi_recipe(jhu) %>% + step_population_scaling(case_rate, + df = reverse_pop_data, + df_pop_col = "values", + by = NULL, + suffix = "_scaled") %>% + step_epi_lag(case_rate_scaled, lag = c(0, 7, 14)) %>% # cases + step_epi_ahead(case_rate_scaled, ahead = 7, role = "outcome") %>% # cases + recipes::step_naomit(recipes::all_predictors()) %>% + recipes::step_naomit(recipes::all_outcomes(), skip = TRUE) + + expect_snapshot(prep <- prep(r, jhu)) + + expect_snapshot(b <- bake(prep, jhu)) + + f <- frosting() %>% + layer_predict() %>% + layer_threshold(.pred) %>% + layer_naomit(.pred) %>% + layer_population_scaling(.pred, df = reverse_pop_data, + by = NULL, + df_pop_col = "values") + + expect_snapshot(wf <- epi_workflow(r, + parsnip::linear_reg()) %>% + fit(jhu) %>% + add_frosting(f)) + + latest <- get_test_data(recipe = r, + x = case_death_rate_subset %>% + dplyr::filter(time_value > "2021-11-01", + geo_value %in% c("ca", "ny")) %>% + dplyr::select(geo_value, time_value, case_rate)) + + + expect_snapshot(p <- predict(wf, latest)) + + + jhu <- case_death_rate_subset %>% 
dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) diff --git a/tests/testthat/test-replace_Inf.R b/tests/testthat/test-replace_Inf.R index f9993ca13..c99a045b2 100644 --- a/tests/testthat/test-replace_Inf.R +++ b/tests/testthat/test-replace_Inf.R @@ -7,7 +7,7 @@ test_that("replace_inf works", { v1 = 1:5, v2 = c(1, 2, Inf, -Inf, NA) ) - library(dplyr) + suppressPackageStartupMessages(library(dplyr)) ok <- c("geo_value", "time_value") df2 <- df %>% mutate(across(!all_of(ok), ~ vec_replace_inf(.x, NA))) expect_identical(df[, 1:3], df2[, 1:3]) diff --git a/tests/testthat/test-step_lag_difference.R b/tests/testthat/test-step_lag_difference.R index 6ff9884a7..d23a3b4fa 100644 --- a/tests/testthat/test-step_lag_difference.R +++ b/tests/testthat/test-step_lag_difference.R @@ -52,7 +52,7 @@ test_that("step_lag_difference works for a single signal", { }) -test_that("step_lag_difference works for a two signals", { +test_that("step_lag_difference works for a two signal epi_df", { df <- data.frame( time_value = 1:5, geo_value = rep("a", 5), From c622d7d4865ed0a2a48ad1f7fde31617af8dc2f3 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 18 Mar 2024 11:15:08 -0700 Subject: [PATCH 02/92] first draft of extend_ahead --- R/step_epi_shift.R | 46 +++++++++++++++---- R/utils-shift.R | 69 ++++++++++++++++++++++++++++ man/adjust_latency.Rd | 14 ++++++ man/step_epi_shift.Rd | 10 ++++ tests/testthat/_snaps/utils-shift.md | 13 ++++++ tests/testthat/test-utils-shift.R | 30 ++++++++++++ 6 files changed, 174 insertions(+), 8 deletions(-) create mode 100644 R/utils-shift.R create mode 100644 man/adjust_latency.Rd create mode 100644 tests/testthat/_snaps/utils-shift.md create mode 100644 tests/testthat/test-utils-shift.R diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 465d64e7f..62553a3f3 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -19,6 +19,10 @@ #' be the lag or lead for each value in the 
vector. Lag integers must be #' nonnegative, while ahead integers must be positive. #' @param prefix A character string that will be prefixed to the new column. +#' @param latency_adjustment a character. Determines the method by which the forecast handles data that doesn't extend to the day the forecast is made. The options are: +#' - `"extend_ahead"`: actually forecasts from the last date. E.g. if there are 3 days of latency for a 4 day ahead forecast, the ahead used in practice is actually 7. +#' - `"locf"`: carries forward the last observed value up to the forecast date. +#' - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that the shortest lag at predict time is #' @param default Determines what fills empty rows #' left by leading/lagging (defaults to NA). #' @param skip A logical. Should the step be skipped when the @@ -54,6 +58,12 @@ step_epi_lag <- prefix = "lag_", default = NA, skip = FALSE, + latency_adjustment = c( + "None", + "extend_ahead", + "locf", + "extend_lags" + ), id = rand_id("epi_lag")) { if (!is_epi_recipe(recipe)) { cli_abort("This step can only operate on an `epi_recipe`.") @@ -65,9 +75,15 @@ step_epi_lag <- i = "Did you perhaps pass an integer in `...` accidentally?" )) } + latency_adjustment <- rlang::arg_match(latency_adjustment) arg_is_nonneg_int(lag) - arg_is_chr_scalar(prefix, id) - + arg_is_chr_scalar(prefix, id, latency_adjustment) + if (!is.null(columns)) { + cli::cli_abort(c( + "The `columns` argument must be `NULL.", + i = "Use `tidyselect` methods to choose columns to lag." 
+ )) + } recipes::add_step( recipe, step_epi_lag_new( @@ -79,6 +95,7 @@ step_epi_lag <- default = default, keys = key_colnames(recipe), columns = NULL, + latency_adjustment = latency_adjustment, skip = skip, id = id ) @@ -97,6 +114,13 @@ step_epi_ahead <- role = "outcome", prefix = "ahead_", default = NA, + latency_adjustment = c( + "None", + "extend_ahead", + "locf", + "extend_lags" + ), + columns = NULL, skip = FALSE, id = rand_id("epi_ahead")) { if (!is_epi_recipe(recipe)) { @@ -109,9 +133,9 @@ step_epi_ahead <- i = "Did you perhaps pass an integer in `...` accidentally?" )) } + latency_adjustment <- rlang::arg_match(latency_adjustment) arg_is_nonneg_int(ahead) - arg_is_chr_scalar(prefix, id) - + arg_is_chr_scalar(prefix, id, latency_adjustment) recipes::add_step( recipe, step_epi_ahead_new( @@ -122,7 +146,8 @@ step_epi_ahead <- prefix = prefix, default = default, keys = key_colnames(recipe), - columns = NULL, + latency_adjustment = latency_adjustment, + columns = columns, skip = skip, id = id ) @@ -132,7 +157,7 @@ step_epi_ahead <- step_epi_lag_new <- function(terms, role, trained, lag, prefix, default, keys, - columns, skip, id) { + latency_adjustment, columns, skip, id) { recipes::step( subclass = "epi_lag", terms = terms, @@ -142,6 +167,7 @@ step_epi_lag_new <- prefix = prefix, default = default, keys = keys, + latency_adjustment = latency_adjustment, columns = columns, skip = skip, id = id @@ -149,7 +175,7 @@ step_epi_lag_new <- } step_epi_ahead_new <- - function(terms, role, trained, ahead, prefix, default, keys, + function(terms, role, trained, ahead, prefix, default, keys, latency_adjustment, columns, skip, id) { recipes::step( subclass = "epi_ahead", @@ -159,6 +185,7 @@ step_epi_ahead_new <- ahead = ahead, prefix = prefix, default = default, + latency_adjustment = latency_adjustment, keys = keys, columns = columns, skip = skip, @@ -178,6 +205,7 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) 
{ prefix = x$prefix, default = x$default, keys = x$keys, + latency_adjustment = x$latency_adjustment, columns = recipes::recipes_eval_select(x$terms, training, info), skip = x$skip, id = x$id @@ -194,6 +222,7 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { prefix = x$prefix, default = x$default, keys = x$keys, + latency_adjustment = x$latency_adjustment, columns = recipes::recipes_eval_select(x$terms, training, info), skip = x$skip, id = x$id @@ -235,8 +264,9 @@ bake.step_epi_lag <- function(object, new_data, ...) { #' @export bake.step_epi_ahead <- function(object, new_data, ...) { + ahead <- adjust_latency(object, new_data) grid <- tidyr::expand_grid(col = object$columns, ahead = object$ahead) %>% - mutate( + dplyr::mutate( newname = glue::glue("{object$prefix}{ahead}_{col}"), shift_val = -ahead, ahead = NULL diff --git a/R/utils-shift.R b/R/utils-shift.R new file mode 100644 index 000000000..e1cf3a3dd --- /dev/null +++ b/R/utils-shift.R @@ -0,0 +1,69 @@ +#' various ways of handling differences between the `as_of` date and the maximum +#' time value +#' @description +#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` +#' date, rather than relative to the last day of data +#' @keywords internal +adjust_latency <- function(object, new_data) { + method <- object$latency_adjustment + ahead <- object$ahead + if (is.na(method) || is.null(method) || method == "None") { + return(object$ahead) + } else if (method == "extend_ahead") { + as_of <- attributes(new_data)$metadata$as_of + if (FALSE && (typeof(as_of) != typeof(new_data$time_value))) { + rlang::abort(glue::glue( + "the data matrix `as_of` value is {as_of}, ", + "and not a valid `time_type` with type ", + "matching `time_value`'s type of ", + "{typeof(new_data$time_value)}." 
+ )) + } + # adjust the ahead so that we're predicting relative to the as_of date, + # rather + # than the last day of data + time_values <- new_data$time_value + if (length(time_values) > 0) { + max_time <- max(time_values) + shift_amount <- as.Date(as_of) - max_time + if (is.null(as_of) || is.na(as_of)) { + cli::cli_warn(glue::glue( + "epi_data's `as_of` was {as_of}, setting to ", + "the latest time value, {max_time}." + )) + as_of <- max_time + } else if (as_of < max_time) { + cli::cli_abort(glue::glue( + "`as_of` ({(as_of)}) is before the most ", + "recent data ({max_time}). Remove before ", + "predicting." + )) + } + effective_ahead <- as.integer(shift_amount + ahead) + time_type <- attributes(new_data)$metadata$time_type + + if ((grepl("day", time_type) && (shift_amount >= 10)) || + (grepl("week", time_type) && (shift_amount >= 4))|| + ((time_type == "yearmonth") && (shift_amount >=2)) || + ((time_type == "yearquarter") && (shift_amount >= 1)) || + ((time_type == "year") && (shift_amount >= 1))) { + cli::cli_warn(c( + "!" = glue::glue("The ahead has been adjusted by {shift_amount}, ", + "which is questionable for it's `time_type` of ", + "{time_type}"), + "i" = "input ahead: {ahead}", + "i" = "shifted ahead: {effective_ahead}", + "i" = "max_time = {max_time} -> as_of = {as_of}" + )) + } + return(effective_ahead) + } else { + rlang::abort("the `time_value` column of `new_data` is empty") + } + } else { + rlang::abort(glue::glue( + "Latency adjustment method {method} has not yet ", + "been implemented for `step_epi_ahead`." 
+ )) + } +} diff --git a/man/adjust_latency.Rd b/man/adjust_latency.Rd new file mode 100644 index 000000000..eaebd5c29 --- /dev/null +++ b/man/adjust_latency.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-shift.R +\name{adjust_latency} +\alias{adjust_latency} +\title{various ways of handling differences between the \code{as_of} date and the maximum +time value} +\usage{ +adjust_latency(object, new_data) +} +\description{ +adjust the ahead so that we will be predicting \code{ahead} days after the \code{as_of} +date, rather than relative to the last day of data +} +\keyword{internal} diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 2bf22c15d..4a1a00104 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -13,6 +13,7 @@ step_epi_lag( prefix = "lag_", default = NA, skip = FALSE, + latency_adjustment = c("None", "extend_ahead", "locf", "extend_lags"), id = rand_id("epi_lag") ) @@ -23,6 +24,8 @@ step_epi_ahead( role = "outcome", prefix = "ahead_", default = NA, + latency_adjustment = c("None", "extend_ahead", "locf", "extend_lags"), + columns = NULL, skip = FALSE, id = rand_id("epi_ahead") ) @@ -53,6 +56,13 @@ conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} +\item{latency_adjustment}{a character. Determines the method by which the forecast handles data that doesn't extend to the day the forecast is made. The options are: +\itemize{ +\item \code{"extend_ahead"}: actually forecasts from the last date. E.g. if there are 3 days of latency for a 4 day ahead forecast, the ahead used in practice is actually 7. +\item \code{"locf"}: carries forward the last observed value up to the forecast date. 
+\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is +}} + \item{id}{A unique identifier for the step} } \value{ diff --git a/tests/testthat/_snaps/utils-shift.md b/tests/testthat/_snaps/utils-shift.md new file mode 100644 index 000000000..b7c5f064f --- /dev/null +++ b/tests/testthat/_snaps/utils-shift.md @@ -0,0 +1,13 @@ +# extend_ahead warns in case of extreme adjustment + + Code + adjust_latency(object, x_adjust_ahead) + Condition + Warning: + ! The ahead has been adjusted by 100, which is questionable for it's `time_type` of day + i input ahead: 7 + i shifted ahead: 107 + i max_time = 2021-07-19 -> as_of = 2021-10-27 + Output + [1] 107 + diff --git a/tests/testthat/test-utils-shift.R b/tests/testthat/test-utils-shift.R new file mode 100644 index 000000000..53788f8b8 --- /dev/null +++ b/tests/testthat/test-utils-shift.R @@ -0,0 +1,30 @@ +time_range <- as.Date("2021-01-01") + 0:199 +x_adjust_ahead <- tibble( + geo_value = rep("place", 200), + time_value = time_range, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = max(time_range) + 3) +# confirm the delay is right + +test_that("adjust_latency extend_ahead works", { + # testing that POSIXct converts correctly (as well as basic types) + expect_equal( + attributes(x_adjust_ahead)$metadata$as_of - max(x_adjust_ahead$time_value), + as.difftime(3, units = "days") + ) + object <- list(latency_adjustment = "extend_ahead", ahead = 7) + expect_no_error(adjusted_ahead <- adjust_latency(object, x_adjust_ahead)) + expect_type(adjusted_ahead, "integer") + expect_equal(adjusted_ahead, 3 + 7) +}) + +test_that("extend_ahead warns in case of extreme adjustment", { + # warns if the ahead is relatively small + attributes(x_adjust_ahead)$metadata$as_of <- + max(x_adjust_ahead$time_value) + 100 + object <- list(latency_adjustment = "extend_ahead", ahead = 7) + 
attributes(x_adjust_ahead)$metadata$time_type + expect_snapshot(adjust_latency(object, x_adjust_ahead)) +}) From e6c19e92443a5df81d8154afe52e2d28e5001d45 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 3 May 2024 15:23:16 -0500 Subject: [PATCH 03/92] extend_ahead version bump and news --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 8edddae92..3c3a0f942 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,9 @@ # epipredict (development) Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicate PR's. +# epipredict 0.2 + +- add `latency_adjustment` as an option for `add_epi_ahead`, which adjusts the `ahead` so that the prediction is `ahead` relative to the `as_of` date for the `epi_data`, rather than relative to the last day of data. # epipredict 0.1 From 2b68062a24af33eebc42e37ac98a8c2fb9324c70 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 18 Mar 2024 11:31:16 -0700 Subject: [PATCH 04/92] styler has opinions --- R/utils-shift.R | 16 +++++---- tests/testthat/test-population_scaling.R | 44 +++++++++++++++--------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/R/utils-shift.R b/R/utils-shift.R index e1cf3a3dd..a8eccf6fe 100644 --- a/R/utils-shift.R +++ b/R/utils-shift.R @@ -43,14 +43,16 @@ adjust_latency <- function(object, new_data) { time_type <- attributes(new_data)$metadata$time_type if ((grepl("day", time_type) && (shift_amount >= 10)) || - (grepl("week", time_type) && (shift_amount >= 4))|| - ((time_type == "yearmonth") && (shift_amount >=2)) || - ((time_type == "yearquarter") && (shift_amount >= 1)) || - ((time_type == "year") && (shift_amount >= 1))) { + (grepl("week", time_type) && (shift_amount >= 4)) || + ((time_type == "yearmonth") && (shift_amount >= 2)) || + ((time_type == "yearquarter") && (shift_amount >= 1)) || + ((time_type == "year") && (shift_amount >= 1))) { cli::cli_warn(c( - "!" 
= glue::glue("The ahead has been adjusted by {shift_amount}, ", - "which is questionable for it's `time_type` of ", - "{time_type}"), + "!" = glue::glue( + "The ahead has been adjusted by {shift_amount}, ", + "which is questionable for it's `time_type` of ", + "{time_type}" + ), "i" = "input ahead: {ahead}", "i" = "shifted ahead: {effective_ahead}", "i" = "max_time = {max_time} -> as_of = {as_of}" diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index f5cedd9b7..d18be65f5 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -193,20 +193,22 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { - jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) - reverse_pop_data = data.frame(geo_value = c("ca", "ny"), - values = c(1/20000, 1/30000)) + reverse_pop_data <- data.frame( + geo_value = c("ca", "ny"), + values = c(1 / 20000, 1 / 30000) + ) r <- epi_recipe(jhu) %>% step_population_scaling(case_rate, - df = reverse_pop_data, - df_pop_col = "values", - by = NULL, - suffix = "_scaled") %>% + df = reverse_pop_data, + df_pop_col = "values", + by = NULL, + suffix = "_scaled" + ) %>% step_epi_lag(case_rate_scaled, lag = c(0, 7, 14)) %>% # cases step_epi_ahead(case_rate_scaled, ahead = 7, role = "outcome") %>% # cases recipes::step_naomit(recipes::all_predictors()) %>% @@ -220,20 +222,28 @@ test_that("test joining by default columns", { layer_predict() %>% layer_threshold(.pred) %>% layer_naomit(.pred) %>% - layer_population_scaling(.pred, df = reverse_pop_data, - by = NULL, - df_pop_col = "values") + layer_population_scaling(.pred, + df = reverse_pop_data, + by = NULL, + df_pop_col = "values" + ) - expect_snapshot(wf <- epi_workflow(r, - parsnip::linear_reg()) %>% + expect_snapshot(wf <- epi_workflow( + r, + 
parsnip::linear_reg() + ) %>% fit(jhu) %>% add_frosting(f)) - latest <- get_test_data(recipe = r, - x = case_death_rate_subset %>% - dplyr::filter(time_value > "2021-11-01", - geo_value %in% c("ca", "ny")) %>% - dplyr::select(geo_value, time_value, case_rate)) + latest <- get_test_data( + recipe = r, + x = case_death_rate_subset %>% + dplyr::filter( + time_value > "2021-11-01", + geo_value %in% c("ca", "ny") + ) %>% + dplyr::select(geo_value, time_value, case_rate) + ) expect_snapshot(p <- predict(wf, latest)) From 2f1ab397353f45b09bbce08e9e755ef44cfb88d1 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 29 Mar 2024 00:29:17 -0700 Subject: [PATCH 05/92] separate step version --- R/epi_shift.R | 78 +++++++++------ R/step_adjust_latency.R | 211 ++++++++++++++++++++++++++++++++++++++++ R/step_epi_shift.R | 59 +---------- R/utils-latency.R | 158 ++++++++++++++++++++++++++++++ 4 files changed, 419 insertions(+), 87 deletions(-) create mode 100644 R/step_adjust_latency.R create mode 100644 R/utils-latency.R diff --git a/R/epi_shift.R b/R/epi_shift.R index eb534f1ea..81dfcddc1 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -2,43 +2,61 @@ #' #' This is a lower-level function. As such it performs no error checking. #' -#' @param x Data frame. Variables to shift -#' @param shifts List. Each list element is a vector of shifts. -#' Negative values produce leads. The list should have the same -#' length as the number of columns in `x`. -#' @param time_value Vector. Same length as `x` giving time stamps. -#' @param keys Data frame, vector, or `NULL`. Additional grouping vars. -#' @param out_name Chr. The output list will use this as a prefix. +#' @param x Data frame. +#' @param shift_val a single integer. Negative values produce leads. +#' @param newname the name for the newly shifted column +#' @param key_cols vector, or `NULL`. Additional grouping vars. 
#' #' @keywords internal #' #' @return a list of tibbles -epi_shift <- function(x, shifts, time_value, keys = NULL, out_name = "x") { - if (!is.data.frame(x)) x <- data.frame(x) - if (is.null(keys)) keys <- rep("empty", nrow(x)) - p_in <- ncol(x) - out_list <- tibble(i = 1:p_in, shift = shifts) %>% - tidyr::unchop(shift) %>% # what is chop - mutate(name = paste0(out_name, 1:nrow(.))) %>% - # One list element for each shifted feature - pmap(function(i, shift, name) { - tibble(keys, - time_value = time_value + shift, # Shift back - !!name := x[[i]] - ) - }) - if (is.data.frame(keys)) { - common_names <- c(names(keys), "time_value") - } else { - common_names <- c("keys", "time_value") - } - - reduce(out_list, dplyr::full_join, by = common_names) -} - epi_shift_single <- function(x, col, shift_val, newname, key_cols) { x %>% select(all_of(c(key_cols, col))) %>% mutate(time_value = time_value + shift_val) %>% rename(!!newname := {{ col }}) } + +#' lags move columns forward to bring the past up to today, while aheads drag +#' the future back to today +get_sign <- function(object) { + if (object$prefix == "lag_") { + return(1) + } else { + return(-1) + } +} + +#' backend for both `bake.step_epi_ahead` and `bake.step_epi_lag`, performs the +#' checks missing in `epi_shift_single` +#' @keywords internal +#' @importFrom cli cli_abort +add_shifted_columns <- function(new_data, object, amount) { + sign_shift <- get_sign(object) + grid <- tidyr::expand_grid(col = object$columns, amount = amount) %>% + dplyr::mutate( + newname = glue::glue("{object$prefix}{amount}_{col}"), + shift_val = sign_shift * amount, + amount = NULL + ) + + ## ensure no name clashes + new_data_names <- colnames(new_data) + intersection <- new_data_names %in% grid$newname + if (any(intersection)) { + cli_abort(c( + "Name collision occured in {.cls {class(object)[1]}}", + "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." 
+ )) + } + ok <- object$keys + shifted <- reduce( + pmap(grid, epi_shift_single, x = new_data, key_cols = ok), + dplyr::full_join, + by = ok + ) + dplyr::full_join(new_data, shifted, by = ok) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(kill_time_value(ok)))) %>% + dplyr::arrange(time_value) %>% + dplyr::ungroup() +} diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R new file mode 100644 index 000000000..682d80b97 --- /dev/null +++ b/R/step_adjust_latency.R @@ -0,0 +1,211 @@ +#' adapt the pipeline to latency in the data +#' +#' In the standard case, the pipeline assumes that the last observation is also +#' the day from which the forecast is being made. `step_adjust_latency` uses the +#' `as_of` date of the `epi_df` as the `forecast_date`. This is most useful in +#' realtime and pseudo-prospective forecasting for data where there is some +#' delay between the day recorded and when that data is available. +#' +#' @param recipe A recipe object. The step will be added to the +#' sequence of operations for this recipe. +#' @param ... One or more selector functions to choose variables for this step. +#' See [recipes::selections()] for more details. Typically you will not need +#' to set this manually, as the necessary adjustments will be done for the +#' predictors and outcome. +#' @param method a character. Determines the method by which the +#' forecast handles latency. All of these assume the forecast date is the +#' `as_of` of the `epi_df`. The options are: +#' - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last +#' observation results in a forecast `ahead` after the `as_of` date. E.g. if +#' there are 3 days of latency between the last observation and the `as_of` +#' date for a 4 day ahead forecast, the ahead used in practice is actually 7. +#' - `"locf"`: carries forward the last observed value(s) up to the forecast +#' date. 
See the Vignette TODO for equivalents using other steps and more +#' sophisticated methods of extrapolation. +#' - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that +#' the shortest lag at predict time is at the last observation. E.g. if the +#' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used +#' become `c(3,10,17)` +#' @param default Determines what fills empty rows +#' left by leading/lagging (defaults to NA). +#' @param prefix a character. The prefix matching the one used in either +#' `step_epi_ahead` if `method="extend_ahead"` or `step_epi_lag` +#' if `method="extend_lags"` or "locf". +#' @param skip A logical. Should the step be skipped when the +#' recipe is baked by [bake()]? While all operations are baked +#' when [prep()] is run, some operations may not be able to be +#' conducted on new data (e.g. processing the outcome variable(s)). +#' Care should be taken when using `skip = TRUE` as it may affect +#' the computations for subsequent operations. +#' @param id A unique identifier for the step +#' @template step-return +#' +#' @details The step assumes that the pipeline has already applied either +#' `step_epi_ahead` or `step_epi_lag` depending on the value of +#' `"method"`, and that `step_epi_naomit` has NOT been run. +#' +#' The `prefix` and `id` arguments are unchangeable to ensure that the code runs +#' properly and to avoid inconsistency with naming. For `step_epi_ahead`, they +#' are always set to `"ahead_"` and `"epi_ahead"` respectively, while for +#' `step_epi_lag`, they are set to `"lag_"` and `"epi_lag`, respectively. 
+#' +#' @family row operation steps +#' @rdname step_adjust_latency +#' @export +#' @examples +#' r <- epi_recipe(case_death_rate_subset) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' # step_adjust_latency(method = "extend_ahead") %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) +#' r +step_adjust_latency <- + function(recipe, + ..., + role = NA, + trained = FALSE, + method = c( + "extend_ahead", + "locf", + "extend_lags" + ), + default = NA, + skip = FALSE, + prefix = NULL, + columns = NULL, + id = recipes::rand_id("epi_lag")) { + if (!is_epi_recipe(recipe)) { + rlang::abort("This recipe step can only operate on an `epi_recipe`.") + } + if (!is.null(columns)) { + rlang::abort(c("The `columns` argument must be `NULL.", + i = "Use `tidyselect` methods to choose columns to lag." + )) + } + + method <- rlang::arg_match(method) + if (method == "extend_ahead") { + prefix <- "ahead_" + } else { + prefix <- "lag_" + } + + arg_is_chr_scalar(prefix, id, method) + recipes::add_step( + recipe, + step_adjust_latency_new( + terms = dplyr::enquos(...), + role = role, + method = method, + info = NULL, + trained = trained, + prefix = prefix, + default = default, + keys = epi_keys(recipe), + columns = columns, + skip = skip, + id = id + ) + ) + } + +step_adjust_latency_new <- + function(terms, role, trained, prefix, default, keys, method, info, + columns, skip, id) { + step( + subclass = "adjust_latency", + terms = terms, + role = role, + method = method, + info = info, + trained = trained, + prefix = prefix, + default = default, + keys = keys, + columns = columns, + skip = skip, + id = id + ) + } + +#' @export +prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ + if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { + cli::cli_abort(glue::glue(c('If `method` is `"extend_ahead"`, then a step ", + "must have already added an outcome .'))) + } else if (!("predictor" %in% info$role)) { + cli::cli_abort('If `method` is `"extend_lags"` or `"locf"`, then a step ", +"must have already added a predictor.') + } + # TODO info here is probably not the best way to handle this, hypothetically I + # get an info object during baking + step_adjust_latency_new( + terms = x$terms, + role = x$role, + trained = TRUE, + prefix = x$prefix, + default = x$default, + keys = x$keys, + method = x$method, + info = info, + columns = recipes::recipes_eval_select(x$terms, training, info), + skip = x$skip, + id = x$id + ) +} + +#' various ways of handling differences between the `as_of` date and the maximum +#' time value +#' @description +#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` +#' date, rather than relative to the last day of data +#' @param new_data assumes that this already has lag/ahead columns that we need +#' to adjust +#' @importFrom dplyr %>% +#' @keywords internal +#' @importFrom dplyr %>% pull +bake.step_adjust_latency <- function(object, new_data, ...) 
{ + sign_shift <- get_sign(object) + # get the columns used, even if it's all of them + terms_used <- object$columns + if (length(terms_used) == 0) { + terms_used <- object$info %>% + filter(role == "raw") %>% + pull(variable) + } + # get and check the max_time and as_of are the right kinds of dates + as_of <- get_asof(object, new_data) + + # infer the correct columns to be working with from the previous + # transformations + shift_cols <- get_shifted_column_tibble(object, new_data, terms_used, as_of, + sign_shift) + + if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { + # check that the shift amount isn't too extreme + latency <- max(shift_cols$latency) + i_latency <- which.max(shift_cols$latency) + time_type <- attributes(new_data)$metadata$time_type + if ( + (grepl("day", time_type) && (latency >= 10)) || + (grepl("week", time_type) && (latency >= 4)) || + ((time_type == "yearmonth") && (latency >= 2)) || + ((time_type == "yearquarter") && (latency >= 1)) || + ((time_type == "year") && (latency >= 1)) + ) { + cli::cli_warn(c( + "!" = glue::glue( + "The shift has been adjusted by {latency}, ", + "which is questionable for it's `time_type` of ", + "{time_type}" + ), + "i" = "input ahead: {shift_cols$shifts[[i_latency]]}", + "i" = "shifted ahead: {shift_cols$effective_shift[[i_latency]]}", + "i" = "max_time = {max_time} -> as_of = {as_of}" + )) + } + keys <- object$keys + return( + extend_either(new_data, shift_cols, keys) + ) + } +} diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 62553a3f3..3a65dd05b 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -233,68 +233,13 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { #' @export bake.step_epi_lag <- function(object, new_data, ...) 
{ - grid <- tidyr::expand_grid(col = object$columns, lag = object$lag) %>% - mutate( - newname = glue::glue("{object$prefix}{lag}_{col}"), - shift_val = lag, - lag = NULL - ) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - cli_abort(c( - "Name collision occured in {.cls {class(object)[1]}}", - "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." - )) - } - ok <- object$keys - shifted <- reduce( - pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - full_join, - by = ok - ) - - full_join(new_data, shifted, by = ok) %>% - group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() + add_shifted_columns(new_data, object, object$lag) } - #' @export bake.step_epi_ahead <- function(object, new_data, ...) { - ahead <- adjust_latency(object, new_data) - grid <- tidyr::expand_grid(col = object$columns, ahead = object$ahead) %>% - dplyr::mutate( - newname = glue::glue("{object$prefix}{ahead}_{col}"), - shift_val = -ahead, - ahead = NULL - ) - - ## ensure no name clashes - new_data_names <- colnames(new_data) - intersection <- new_data_names %in% grid$newname - if (any(intersection)) { - cli_abort(c( - "Name collision occured in {.cls {class(object)[1]}}", - "The following variable name{?s} already exist{?s/}: {.val {new_data_names[intersection]}}." - )) - } - ok <- object$keys - shifted <- reduce( - pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - full_join, - by = ok - ) - - full_join(new_data, shifted, by = ok) %>% - group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() + add_shifted_columns(new_data, object, object$ahead) } - #' @export print.step_epi_lag <- function(x, width = max(20, options()$width - 30), ...) 
{ print_epi_step(x$columns, x$terms, x$trained, "Lagging", diff --git a/R/utils-latency.R b/R/utils-latency.R new file mode 100644 index 000000000..d89675bde --- /dev/null +++ b/R/utils-latency.R @@ -0,0 +1,158 @@ +#' offset each relevant column by its appropriate latency +#' works for either adjusting aheads or lags +#' @param shift_cols a tibble which must have the columns `original_name`, the +#' name of the column to adjust, `latency`, the latency of the original column +#' relative to the `as_of` date, and `new_name`, the names in `original_name` +#' adjusted by the latencies `latency` +#' @param new_data the data containing the columns to be shifted +#' @param keys the variables which are used as keys +#' @keywords internal +extend_either <- function(new_data, shift_cols, keys) { + shifted <- + shift_cols %>% + select(-shifts, -effective_shift) %>% + pmap(\(original_name, latency, new_name) { + epi_shift_single( + x = new_data, + col = original_name, + shift_val = latency, + newname = new_name, + key_cols = keys + ) + }) %>% + reduce( + dplyr::full_join, + by = keys + ) + return(new_data %>% + select(-shift_cols$original_name) %>% + dplyr::full_join(shifted, by = keys) %>% + dplyr::group_by(dplyr::across(dplyr::all_of(keys[-1]))) %>% + dplyr::arrange(time_value) %>% + dplyr::ungroup()) +} + +#' find the columns added with the lags or aheads, and the amounts they have +#' been changed +#' @param object the step and its parameters +#' @param new_data the data transformed so far +#' @return a tibble with columns `column` (relevant shifted names), `shift` (the +#' amount that one is shifted), `latency` (original columns difference between +#' max_time_value and as_of (on a per-initial column basis)), +#' `effective_shift` (shifts+latency), and `new_name` (adjusted names with the +#' effective_shift) +#' @keywords internal +get_shifted_column_tibble <- function( + object, new_data, terms_used, as_of, sign_shift) { + prefix <- object$prefix + relevant_columns <- names(new_data)[grepl(prefix, names(new_data))] 
+ to_keep <- rep(FALSE, length(relevant_columns)) + for (col_name in terms_used) { + to_keep <- to_keep | grepl(col_name, relevant_columns) + } + relevant_columns <- relevant_columns[to_keep] + # TODO ask about a less jank way to do this + shift_amounts <- as.integer(str_match( + relevant_columns, + "_\\d+_" + ) %>% + `[`(, 1) %>% + str_match("\\d+") %>% + `[`(, 1)) + shift_cols <- dplyr::tibble( + original_name = relevant_columns, + shifts = shift_amounts + ) + shift_cols %>% + rowwise() %>% + # add the latencies to shift_cols + mutate(latency = get_latency( + new_data, as_of, original_name, shifts, sign_shift + )) %>% + ungroup() %>% + # add the updated names to shift_cols + mutate( + effective_shift = shifts + latency, + new_name = adjust_name(prefix, shifts, original_name, latency) + ) + return(shift_cols) +} + + +#' extract the as_of, and make sure there's nothing very off about it +get_asof <- function(object, new_data) { + original_columns <- object$info %>% + filter(source == "original") %>% + pull(variable) + # make sure that there's enough column names + if (length(original_columns) < 3) { + cli::cli_abort(glue::glue( + "The original columns of `time_value`, ", + "`geo_value` and at least one signal. The current colums are \n", + paste(capture.output(object$info), collapse = "\n\n") + )) + } + # the source data determines the actual time_values + # these are the non-na time_values; + time_values <- new_data %>% + select(original_columns) %>% + drop_na() %>% + pull(time_value) + if (length(time_values) <= 0) { + rlang::abort("the `time_value` column of `new_data` is empty") + } + as_of <- attributes(new_data)$metadata$as_of + max_time <- max(time_values) + # make sure the as_of is sane + # TODO decide on these checks + if (!inherits(as_of, class(time_values))) { + rlang::abort(glue::glue( + "the data matrix `as_of` value is {as_of}, ", + "and not a valid `time_type` with type ", + "matching `time_value`'s type of ", + "{typeof(new_data$time_value)}." 
+ )) + } + if (is.null(as_of) || is.na(as_of)) { + cli::cli_warn(glue::glue( + "epi_data's `as_of` was {as_of}, setting to ", + "the latest time value, {max_time}." + )) + as_of <- max_time + } else if (as_of < max_time) { + cli::cli_abort(glue::glue( + "`as_of` ({(as_of)}) is before the most ", + "recent data ({max_time}). Remove before ", + "predicting." + )) + } + # TODO cover the rest of the possible types for as_of and max_time... + if (class(time_values) == "Date") { + as_of <- as.Date(as_of) + } + return(as_of) +} + +#' adjust the shifts by latency for the names in column assumes e.g. +#' `"lag_6_case_rate"` and returns something like `"lag_10_case_rate"` +#' @keywords internal +adjust_name <- function(prefix, shifts, column, latency) { + pattern <- paste0(prefix, "\\d+", "_") + adjusted_shifts <- paste0(prefix, shifts + latency, "_") + stringi::stri_replace_all_regex( + column, + pattern, adjusted_shifts + ) +} + +#' the latency is also the amount the shift is off by +#' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you +#' need to shift the data to bring the 3 day lagged value to today. +#' @keywords internal +get_latency <- function(new_data, as_of, column, shift_amount, sign_shift) { + shift_max_date <- new_data %>% + drop_na(column) %>% + pull(time_value) %>% + max() + return(as.integer(as_of - (shift_max_date - sign_shift * shift_amount))) +} From 07a9e28c21c5d68e59332d5099fc82997e54010e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 29 Mar 2024 01:19:00 -0700 Subject: [PATCH 06/92] styler --- R/step_adjust_latency.R | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 682d80b97..04989e56d 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -177,8 +177,10 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ # infer the correct columns to be working with from the previous # transformations - shift_cols <- get_shifted_column_tibble(object, new_data, terms_used, as_of, - sign_shift) + shift_cols <- get_shifted_column_tibble( + object, new_data, terms_used, as_of, + sign_shift + ) if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { # check that the shift amount isn't too extreme From d4d617fb7d349912f0f6d0e6bed49d96f04acecd Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 13:03:16 -0700 Subject: [PATCH 07/92] tests for utils-latency and accompanying fixes --- NAMESPACE | 6 ++ R/step_adjust_latency.R | 5 +- R/utils-latency.R | 36 ++++--- man/epi_shift.Rd | 28 ------ man/step_epi_shift.Rd | 2 + man/step_growth_rate.Rd | 1 + man/step_lag_difference.Rd | 1 + tests/testthat/test-utils_latency.R | 139 ++++++++++++++++++++++++++++ 8 files changed, 175 insertions(+), 43 deletions(-) delete mode 100644 man/epi_shift.Rd create mode 100644 tests/testthat/test-utils_latency.R diff --git a/NAMESPACE b/NAMESPACE index e815203eb..0cf66300f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -58,6 +58,7 @@ S3method(predict,epi_workflow) S3method(predict,flatline) S3method(prep,check_enough_train_data) S3method(prep,epi_recipe) +S3method(prep,step_adjust_latency) S3method(prep,step_epi_ahead) S3method(prep,step_epi_lag) S3method(prep,step_epi_slide) @@ -195,6 +196,7 @@ export(remove_frosting) export(remove_model) export(slather) export(smooth_quantile_reg) +export(step_adjust_latency) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) @@ -225,6 +227,7 @@ importFrom(checkmate,test_numeric) importFrom(checkmate,test_scalar) importFrom(cli,cli_abort) importFrom(cli,cli_warn) +importFrom(dplyr,"%>%") importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,any_of) @@ -237,8 +240,10 @@ importFrom(dplyr,full_join) importFrom(dplyr,group_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,pull) 
importFrom(dplyr,relocate) importFrom(dplyr,rename) +importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) @@ -291,6 +296,7 @@ importFrom(stats,predict) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) +importFrom(stringr,str_match) importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 04989e56d..092f54458 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -160,9 +160,8 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { #' date, rather than relative to the last day of data #' @param new_data assumes that this already has lag/ahead columns that we need #' to adjust -#' @importFrom dplyr %>% -#' @keywords internal #' @importFrom dplyr %>% pull +#' @keywords internal bake.step_adjust_latency <- function(object, new_data, ...) { sign_shift <- get_sign(object) # get the columns used, even if it's all of them @@ -178,7 +177,7 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ # infer the correct columns to be working with from the previous # transformations shift_cols <- get_shifted_column_tibble( - object, new_data, terms_used, as_of, + object$prefix, new_data, terms_used, as_of, sign_shift ) diff --git a/R/utils-latency.R b/R/utils-latency.R index d89675bde..e69c6ed34 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -20,12 +20,14 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% + map(\(x) na.trim(x)) %>% # TODO need to talk about this reduce( dplyr::full_join, by = keys ) + return(new_data %>% - select(-shift_cols$original_name) %>% + select(-shift_cols$original_name) %>% # drop the original versions dplyr::full_join(shifted, by = keys) %>% dplyr::group_by(dplyr::across(dplyr::all_of(keys[-1]))) %>% dplyr::arrange(time_value) %>% @@ -34,7 +36,7 @@ extend_either <- function(new_data, shift_cols, keys) { #' find the columns added with the lags or aheads, and the amounts they have #' been changed -#' @param object the step and its parameters +#' @param prefix the prefix indicating if we are adjusting lags or aheads #' @param new_data the data transformed so far #' @return a tibble with columns `column` (relevant shifted names), `shift` (the #' amount that one is shifted), `latency` (original columns difference between @@ -42,28 +44,36 @@ extend_either <- function(new_data, shift_cols, keys) { #' `effective_shift` (shifts+latency), and `new_name` (adjusted names with the #' effective_shift) #' @keywords internal +#' @importFrom stringr str_match +#' @importFrom dplyr rowwise %>% get_shifted_column_tibble <- function( - object, new_data, terms_used, as_of, sign_shift) { - prefix <- object$prefix + prefix, new_data, terms_used, as_of, sign_shift, call = caller_env()) { relevant_columns <- names(new_data)[grepl(prefix, names(new_data))] to_keep <- rep(FALSE, length(relevant_columns)) for (col_name in terms_used) { to_keep <- to_keep | grepl(col_name, relevant_columns) } relevant_columns <- 
relevant_columns[to_keep] + if (length(relevant_columns) == 0) { + cli::cli_abort("There is no column(s) {terms_used}.", + current_column_names = names(new_data), + class = "epipredict_adjust_latency_nonexistent_column_used", + call = call + ) + } # TODO ask about a less jank way to do this - shift_amounts <- as.integer(str_match( + shift_amounts <- as.integer(stringr::str_match( relevant_columns, "_\\d+_" ) %>% `[`(, 1) %>% - str_match("\\d+") %>% + stringr::str_match("\\d+") %>% `[`(, 1)) shift_cols <- dplyr::tibble( original_name = relevant_columns, shifts = shift_amounts ) - shift_cols %>% + shift_cols %<>% rowwise() %>% # add the latencies to shift_cols mutate(latency = get_latency( @@ -72,8 +82,10 @@ get_shifted_column_tibble <- function( ungroup() %>% # add the updated names to shift_cols mutate( - effective_shift = shifts + latency, - new_name = adjust_name(prefix, shifts, original_name, latency) + effective_shift = shifts + abs(latency) + ) %>% + mutate( + new_name = adjust_name(prefix, original_name, effective_shift) ) return(shift_cols) } @@ -136,9 +148,9 @@ get_asof <- function(object, new_data) { #' adjust the shifts by latency for the names in column assumes e.g. 
#' `"lag_6_case_rate"` and returns something like `"lag_10_case_rate"` #' @keywords internal -adjust_name <- function(prefix, shifts, column, latency) { +adjust_name <- function(prefix, column, effective_shift) { pattern <- paste0(prefix, "\\d+", "_") - adjusted_shifts <- paste0(prefix, shifts + latency, "_") + adjusted_shifts <- paste0(prefix, effective_shift, "_") stringi::stri_replace_all_regex( column, pattern, adjusted_shifts @@ -154,5 +166,5 @@ get_latency <- function(new_data, as_of, column, shift_amount, sign_shift) { drop_na(column) %>% pull(time_value) %>% max() - return(as.integer(as_of - (shift_max_date - sign_shift * shift_amount))) + return(as.integer(sign_shift * (as_of - shift_max_date) + shift_amount)) } diff --git a/man/epi_shift.Rd b/man/epi_shift.Rd deleted file mode 100644 index 14316a8db..000000000 --- a/man/epi_shift.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_shift.R -\name{epi_shift} -\alias{epi_shift} -\title{Shift predictors while maintaining grouping and time_value ordering} -\usage{ -epi_shift(x, shifts, time_value, keys = NULL, out_name = "x") -} -\arguments{ -\item{x}{Data frame. Variables to shift} - -\item{shifts}{List. Each list element is a vector of shifts. -Negative values produce leads. The list should have the same -length as the number of columns in \code{x}.} - -\item{time_value}{Vector. Same length as \code{x} giving time stamps.} - -\item{keys}{Data frame, vector, or \code{NULL}. Additional grouping vars.} - -\item{out_name}{Chr. The output list will use this as a prefix.} -} -\value{ -a list of tibbles -} -\description{ -This is a lower-level function. As such it performs no error checking. 
-} -\keyword{internal} diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 4a1a00104..4d7cc0a0e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -95,10 +95,12 @@ r } \seealso{ Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_growth_rate}()}, \code{\link{step_lag_difference}()} Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_growth_rate}()}, \code{\link{step_lag_difference}()} } diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index bc6da0bef..752b38dbe 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -83,6 +83,7 @@ r \%>\% } \seealso{ Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_epi_lag}()}, \code{\link{step_lag_difference}()} } diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index 7969ea3a7..e8ec2101a 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -58,6 +58,7 @@ r \%>\% } \seealso{ Other row operation steps: +\code{\link{step_adjust_latency}()}, \code{\link{step_epi_lag}()}, \code{\link{step_growth_rate}()} } diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R new file mode 100644 index 000000000..3873c5e6a --- /dev/null +++ b/tests/testthat/test-utils_latency.R @@ -0,0 +1,139 @@ +time_values <- as.Date("2021-01-01") + 0:199 +as_of <- max(time_values) + 5 +max_time <- max(time_values) +old_data <- tibble( + geo_value = rep("place", 200), + time_value = as.Date("2021-01-01") + 0:199, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = as_of) +old_data %>% tail() +keys <- c("time_value", "geo_value") +old_data %<>% full_join(epi_shift_single( + old_data, "tmp_death_rate", 1, "death_rate", keys +), by = keys) %>% + select(-tmp_death_rate) +# old data is created so that death rate has a latency of 4, while case_rate has 
+# a latency of 5 +modified_data <- + old_data %>% + dplyr::full_join( + epi_shift_single(old_data, "case_rate", -4, "ahead_4_case_rate", keys), + by = keys + ) %>% + dplyr::full_join( + epi_shift_single(old_data, "case_rate", 3, "lag_3_case_rate", keys), + by = keys + ) %>% + dplyr::full_join( + epi_shift_single(old_data, "death_rate", 7, "lag_7_death_rate", keys), + by = keys + ) %>% + arrange(time_value) +modified_data %>% tail() +as_of - (modified_data %>% filter(!is.na(ahead_4_case_rate)) %>% pull(time_value) %>% max()) +all_shift_cols <- tibble::tribble( + ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, + "lag_3_case_rate", 3, 5, 8, "lag_8_case_rate", + "lag_7_death_rate", 7, 4, 11, "lag_11_death_rate", + "ahead_4_case_rate", 4, -5, 9, "ahead_9_case_rate" +) + +test_that("get_latency works", { + expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1), 4) + expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1), 5) + # get_latency doesn't check the shift_amount + expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 4, 1), 6) + # ahead works correctly + expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, -1), -5) + # setting the wrong sign doubles the shift and gets the sign wrong + expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1), 5 + 4 * 2) +}) + +test_that("adjust_name works", { + expect_equal( + adjust_name("lag_", "lag_5_case_rate_13", 10), + "lag_10_case_rate_13" + ) + # it won't change a column with the wrong prefix + expect_equal( + adjust_name("lag_", "ahead_5_case_rate", 10), + "ahead_5_case_rate" + ) + # it works on vectors of names + expect_equal( + adjust_name("lag_", c("lag_5_floop_35", "lag_2342352_case"), c(10, 7)), + c("lag_10_floop_35", "lag_7_case") + ) +}) + +test_that("get_asof works", { + object <- list(info = tribble( + ~variable, ~type, ~role, ~source, + "time_value", "date", "time_value", "original", + "geo_value", "nominal", 
"geo_value", "original", + "case_rate", "numeric", "raw", "original", + "death_rate", "numeric", "raw", "original", + "not_real", "numeric", "predictor", "derived" + )) + expect_equal(get_asof(object, modified_data), as_of) +}) + +test_that("get_shifted_column_tibble works", { + case_lag <- get_shifted_column_tibble( + "lag_", modified_data, + "case_rate", as_of, 1 + ) + expect_equal(case_lag, all_shift_cols[1, ]) + + death_lag <- get_shifted_column_tibble( + "lag_", modified_data, + "death_rate", as_of, 1 + ) + expect_equal(death_lag, all_shift_cols[2, ]) + + both_lag <- get_shifted_column_tibble( + "lag_", modified_data, + c("case_rate", "death_rate"), as_of, 1 + ) + expect_equal(both_lag, all_shift_cols[1:2, ]) + + case_ahead <- get_shifted_column_tibble( + "ahead_", modified_data, + "case_rate", as_of, -1 + ) + expect_equal(case_ahead, all_shift_cols[3, ]) +}) +test_that("get_shifted_column_tibble objects to non-columns", { + expect_error( + get_shifted_column_tibble( + "lag_", modified_data, "not_present", as_of, 1 + ), + class = "epipredict_adjust_latency_nonexistent_column_used" + ) +}) +test_that("extend_either works", { + keys <- c("geo_value", "time_value") + # extend_either doesn't differentiate between the directions, it just moves + # things + expected_post_shift <- + old_data %>% + dplyr::full_join( + epi_shift_single(old_data, "case_rate", 8, "lag_8_case_rate", keys), + by = keys + ) %>% + dplyr::full_join( + epi_shift_single(old_data, "death_rate", 11, "lag_11_death_rate", keys), + by = keys + ) %>% + dplyr::full_join( + epi_shift_single(old_data, "case_rate", -9, "ahead_9_case_rate", keys), + by = keys + ) %>% + arrange(time_value) + expect_equal( + extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value), + expected_post_shift + ) +}) From 80e64b526691ac0aa47cf8ab773e1c9458394738 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 13:59:18 -0700 Subject: [PATCH 08/92] adding stringr --- DESCRIPTION | 1 + 1 file changed, 
1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 5cd468fb9..eb01cbecd 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,6 +41,7 @@ Imports: recipes (>= 1.0.4), rlang (>= 1.1.0), stats, + stringr, tibble, tidyr, tidyselect, From aa87607deca2459f83f4b683bf46869887e37037 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 14:33:57 -0700 Subject: [PATCH 09/92] old snapshots, select prefers `all_of` for vectors --- R/utils-latency.R | 6 +- man/epi_shift_single.Rd | 24 ++++++ man/extend_either.Rd | 24 ++++++ man/get_asof.Rd | 11 +++ man/get_latency.Rd | 16 ++++ man/get_shifted_column_tibble.Rd | 33 +++++++ man/get_sign.Rd | 13 +++ man/step_adjust_latency.Rd | 96 +++++++++++++++++++++ tests/testthat/_snaps/population_scaling.md | 30 +++---- tests/testthat/_snaps/utils-shift.md | 3 +- tests/testthat/test-epi_shift.R | 20 ----- 11 files changed, 235 insertions(+), 41 deletions(-) create mode 100644 man/epi_shift_single.Rd create mode 100644 man/extend_either.Rd create mode 100644 man/get_asof.Rd create mode 100644 man/get_latency.Rd create mode 100644 man/get_shifted_column_tibble.Rd create mode 100644 man/get_sign.Rd create mode 100644 man/step_adjust_latency.Rd diff --git a/R/utils-latency.R b/R/utils-latency.R index e69c6ed34..4322e36cb 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -27,7 +27,7 @@ extend_either <- function(new_data, shift_cols, keys) { ) return(new_data %>% - select(-shift_cols$original_name) %>% # drop the original versions + select(-all_of(shift_cols$original_name)) %>% # drop the original versions dplyr::full_join(shifted, by = keys) %>% dplyr::group_by(dplyr::across(dplyr::all_of(keys[-1]))) %>% dplyr::arrange(time_value) %>% @@ -107,7 +107,7 @@ get_asof <- function(object, new_data) { # the source data determines the actual time_values # these are the non-na time_values; time_values <- new_data %>% - select(original_columns) %>% + select(all_of(original_columns)) %>% drop_na() %>% pull(time_value) if 
(length(time_values) <= 0) { @@ -163,7 +163,7 @@ adjust_name <- function(prefix, column, effective_shift) { #' @keywords internal get_latency <- function(new_data, as_of, column, shift_amount, sign_shift) { shift_max_date <- new_data %>% - drop_na(column) %>% + drop_na(all_of(column)) %>% pull(time_value) %>% max() return(as.integer(sign_shift * (as_of - shift_max_date) + shift_amount)) diff --git a/man/epi_shift_single.Rd b/man/epi_shift_single.Rd new file mode 100644 index 000000000..871879004 --- /dev/null +++ b/man/epi_shift_single.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{epi_shift_single} +\alias{epi_shift_single} +\title{Shift predictors while maintaining grouping and time_value ordering} +\usage{ +epi_shift_single(x, col, shift_val, newname, key_cols) +} +\arguments{ +\item{x}{Data frame.} + +\item{shift_val}{a single integer. Negative values produce leads.} + +\item{newname}{the name for the newly shifted column} + +\item{key_cols}{vector, or \code{NULL}. Additional grouping vars.} +} +\value{ +a list of tibbles +} +\description{ +This is a lower-level function. As such it performs no error checking. 
+} +\keyword{internal} diff --git a/man/extend_either.Rd b/man/extend_either.Rd new file mode 100644 index 000000000..8ec5ca38e --- /dev/null +++ b/man/extend_either.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{extend_either} +\alias{extend_either} +\title{offset each relevant column by it's appropriate latency +works for either adjusting aheads or lags} +\usage{ +extend_either(new_data, shift_cols, keys) +} +\arguments{ +\item{new_data}{just what is says} + +\item{shift_cols}{a tibble which must have the columns \code{column}, the name of +the column to adjust, \code{latency} the latency of the original column relative +to the \code{as_of} date, \code{new_name}, the names in \code{column} adjusted by the +latencies \code{latency}} + +\item{keys}{the variables which are used as keys} +} +\description{ +offset each relevant column by it's appropriate latency +works for either adjusting aheads or lags +} +\keyword{internal} diff --git a/man/get_asof.Rd b/man/get_asof.Rd new file mode 100644 index 000000000..44b286ffb --- /dev/null +++ b/man/get_asof.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_asof} +\alias{get_asof} +\title{extract the as_of, and make sure there's nothing very off about it} +\usage{ +get_asof(object, new_data) +} +\description{ +extract the as_of, and make sure there's nothing very off about it +} diff --git a/man/get_latency.Rd b/man/get_latency.Rd new file mode 100644 index 000000000..d9098b456 --- /dev/null +++ b/man/get_latency.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_latency} +\alias{get_latency} +\title{the latency is also the amount the shift is off by} +\usage{ +get_latency(new_data, as_of, column, shift_amount, sign_shift) +} +\arguments{ +\item{sign_shift}{integer. 1 if lag and -1 if ahead. 
These represent how you +need to shift the data to bring the 3 day lagged value to today.} +} +\description{ +the latency is also the amount the shift is off by +} +\keyword{internal} diff --git a/man/get_shifted_column_tibble.Rd b/man/get_shifted_column_tibble.Rd new file mode 100644 index 000000000..635cab427 --- /dev/null +++ b/man/get_shifted_column_tibble.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_shifted_column_tibble} +\alias{get_shifted_column_tibble} +\title{find the columns added with the lags or aheads, and the amounts they have +been changed} +\usage{ +get_shifted_column_tibble( + prefix, + new_data, + terms_used, + as_of, + sign_shift, + call = caller_env() +) +} +\arguments{ +\item{prefix}{the prefix indicating if we are adjusting lags or aheads} + +\item{new_data}{the data transformed so far} +} +\value{ +a tibble with columns \code{column} (relevant shifted names), \code{shift} (the +amount that one is shifted), \code{latency} (original columns difference between +max_time_value and as_of (on a per-initial column basis)), +\code{effective_shift} (shifts+latency), and \code{new_name} (adjusted names with the +effective_shift) +} +\description{ +find the columns added with the lags or aheads, and the amounts they have +been changed +} +\keyword{internal} diff --git a/man/get_sign.Rd b/man/get_sign.Rd new file mode 100644 index 000000000..0fbe9bb92 --- /dev/null +++ b/man/get_sign.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{get_sign} +\alias{get_sign} +\title{lags move columns forward to bring the past up to today, while aheads drag +the future back to today} +\usage{ +get_sign(object) +} +\description{ +lags move columns forward to bring the past up to today, while aheads drag +the future back to today +} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd new file mode 100644 
index 000000000..c60495c84 --- /dev/null +++ b/man/step_adjust_latency.Rd @@ -0,0 +1,96 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_adjust_latency.R +\name{step_adjust_latency} +\alias{step_adjust_latency} +\title{adapt the pipeline to latency in the data} +\usage{ +step_adjust_latency( + recipe, + ..., + role = NA, + trained = FALSE, + method = c("extend_ahead", "locf", "extend_lags"), + default = NA, + skip = FALSE, + prefix = NULL, + columns = NULL, + id = recipes::rand_id("epi_lag") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables for this step. +See \code{\link[recipes:selections]{recipes::selections()}} for more details. Typically you will not need +to set this manually, as the necessary adjustments will be done for the +predictors and outcome.} + +\item{method}{a character. Determines the method by which the +forecast handles latency. All of these assume the forecast date is the +\code{as_of} of the \code{epi_df}. The options are: +\itemize{ +\item \code{"extend_ahead"}: Lengthen the ahead so that forecasting from the last +observation results in a forecast \code{ahead} after the \code{as_of} date. E.g. if +there are 3 days of latency between the last observation and the \code{as_of} +date for a 4 day ahead forecast, the ahead used in practice is actually 7. +\item \code{"locf"}: carries forward the last observed value(s) up to the forecast +date. See the Vignette TODO for equivalents using other steps and more +sophisticated methods of extrapolation. +\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that +the shortest lag at predict time is at the last observation. E.g. 
if the +lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used +become \code{c(3,10,17)} +}} + +\item{default}{Determines what fills empty rows +left by leading/lagging (defaults to NA).} + +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). +Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} + +\item{prefix}{a character. The prefix matching the one used in either +\code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} +if \code{method="extend_lags"} or "locf".} + +\item{id}{A unique identifier for the step} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +In the standard case, the pipeline assumes that the last observation is also +the day from which the forecast is being made. \code{step_adjust_latency} uses the +\code{as_of} date of the \code{epi_df} as the \code{forecast_date}. This is most useful in +realtime and pseudo-prospective forecasting for data where there is some +delay between the day recorded and when that data is available. +} +\details{ +The step assumes that the pipeline has already applied either +\code{step_epi_ahead} or \code{step_epi_lag} depending on the value of +\code{"method"}, and that \code{step_epi_naomit} has NOT been run. + +The \code{prefix} and \code{id} arguments are unchangeable to ensure that the code runs +properly and to avoid inconsistency with naming. For \code{step_epi_ahead}, they +are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for +\code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. 
+} +\examples{ +r <- epi_recipe(case_death_rate_subset) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + # step_adjust_latency(method = "extend_ahead") \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) +r +} +\seealso{ +Other row operation steps: +\code{\link{step_epi_lag}()}, +\code{\link{step_growth_rate}()}, +\code{\link{step_lag_difference}()} +} +\concept{row operation steps} diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md index 149cd38ac..fd0f2c609 100644 --- a/tests/testthat/_snaps/population_scaling.md +++ b/tests/testthat/_snaps/population_scaling.md @@ -2,35 +2,33 @@ Code prep <- prep(r, jhu) + Message + Joining with `by = join_by(geo_value)` + Joining with `by = join_by(geo_value)` --- Code b <- bake(prep, jhu) + Message + Joining with `by = join_by(geo_value)` + Joining with `by = join_by(geo_value)` --- Code wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) + Message + Joining with `by = join_by(geo_value)` + Joining with `by = join_by(geo_value)` --- Code p <- predict(wf, latest) - -# expect error if `by` selector does not match - - Code - wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) - Condition - Error in `hardhat::validate_column_names()`: - ! The following required columns are missing: 'a'. - ---- - - Code - forecast(wf) - Condition - Error in `hardhat::validate_column_names()`: - ! The following required columns are missing: 'nothere'. + Message + Joining with `by = join_by(geo_value)` + Joining with `by = join_by(geo_value)` + Joining with `by = join_by(geo_value)` + Joining with `by = join_by(geo_value)` diff --git a/tests/testthat/_snaps/utils-shift.md b/tests/testthat/_snaps/utils-shift.md index b7c5f064f..66a18fe79 100644 --- a/tests/testthat/_snaps/utils-shift.md +++ b/tests/testthat/_snaps/utils-shift.md @@ -2,8 +2,7 @@ Code adjust_latency(object, x_adjust_ahead) - Condition - Warning: + Warning ! 
The ahead has been adjusted by 100, which is questionable for it's `time_type` of day i input ahead: 7 i shifted ahead: 107 diff --git a/tests/testthat/test-epi_shift.R b/tests/testthat/test-epi_shift.R index 78c9384f1..e8e843f9c 100644 --- a/tests/testthat/test-epi_shift.R +++ b/tests/testthat/test-epi_shift.R @@ -1,23 +1,3 @@ -x <- data.frame(x1 = 1:10, x2 = -10:-1) -lags <- list(c(0, 4), 1:3) - -test_that("epi shift works with NULL keys", { - time_value <- 1:10 - out <- epi_shift(x, lags, time_value) - expect_length(out, 7L) - expect_equal(nrow(out), 14L) - expect_equal(sum(complete.cases(out)), 6L) -}) - -test_that("epi shift works with groups", { - keys <- data.frame(a = rep(letters[1:2], each = 5), b = "z") - time_value <- 1:10 - out <- epi_shift(x, lags, time_value, keys) - expect_length(out, 8L) - expect_equal(nrow(out), 18L) - expect_equal(sum(complete.cases(out)), 2L) -}) - test_that("epi shift single works, renames", { tib <- tibble( x = 1:5, y = 1:5, From 375af6dcefed6eff1275af0ed38fe54b62b7aa4d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 15:33:55 -0700 Subject: [PATCH 10/92] local renv way out of date --- R/utils-latency.R | 2 +- tests/testthat/_snaps/population_scaling.md | 8 ++++---- tests/testthat/_snaps/utils-shift.md | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/R/utils-latency.R b/R/utils-latency.R index 4322e36cb..b97268a24 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -20,7 +20,7 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% - map(\(x) na.trim(x)) %>% # TODO need to talk about this + map(\(x) zoo::na.trim(x)) %>% # TODO need to talk about this reduce( dplyr::full_join, by = keys diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md index fd0f2c609..2152c2e78 100644 --- a/tests/testthat/_snaps/population_scaling.md +++ b/tests/testthat/_snaps/population_scaling.md @@ -2,7 +2,7 @@ Code prep <- prep(r, jhu) - 
Message + Message Joining with `by = join_by(geo_value)` Joining with `by = join_by(geo_value)` @@ -10,7 +10,7 @@ Code b <- bake(prep, jhu) - Message + Message Joining with `by = join_by(geo_value)` Joining with `by = join_by(geo_value)` @@ -18,7 +18,7 @@ Code wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) - Message + Message Joining with `by = join_by(geo_value)` Joining with `by = join_by(geo_value)` @@ -26,7 +26,7 @@ Code p <- predict(wf, latest) - Message + Message Joining with `by = join_by(geo_value)` Joining with `by = join_by(geo_value)` Joining with `by = join_by(geo_value)` diff --git a/tests/testthat/_snaps/utils-shift.md b/tests/testthat/_snaps/utils-shift.md index 66a18fe79..b7c5f064f 100644 --- a/tests/testthat/_snaps/utils-shift.md +++ b/tests/testthat/_snaps/utils-shift.md @@ -2,7 +2,8 @@ Code adjust_latency(object, x_adjust_ahead) - Warning + Condition + Warning: ! The ahead has been adjusted by 100, which is questionable for it's `time_type` of day i input ahead: 7 i shifted ahead: 107 From 24eca507251d7d86c279aceb86018f90350584df Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 15:38:07 -0700 Subject: [PATCH 11/92] pkgdown needs @keywords internal --- R/epi_shift.R | 1 + R/utils-latency.R | 1 + 2 files changed, 2 insertions(+) diff --git a/R/epi_shift.R b/R/epi_shift.R index 81dfcddc1..c91856237 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -19,6 +19,7 @@ epi_shift_single <- function(x, col, shift_val, newname, key_cols) { #' lags move columns forward to bring the past up to today, while aheads drag #' the future back to today +#' @keywords internal get_sign <- function(object) { if (object$prefix == "lag_") { return(1) diff --git a/R/utils-latency.R b/R/utils-latency.R index b97268a24..163643929 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -92,6 +92,7 @@ get_shifted_column_tibble <- function( #' extract the as_of, and make sure there's nothing very off about it +#' @keywords internal 
get_asof <- function(object, new_data) { original_columns <- object$info %>% filter(source == "original") %>% From 9e9b1b48301fc261e3146c7935e691eae1a1860e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 17:13:27 -0700 Subject: [PATCH 12/92] passes local tests after updating --- DESCRIPTION | 3 ++- man/bake.step_adjust_latency.Rd | 18 ++++++++++++++++++ man/get_asof.Rd | 1 + man/get_sign.Rd | 1 + 4 files changed, 22 insertions(+), 1 deletion(-) create mode 100644 man/bake.step_adjust_latency.Rd diff --git a/DESCRIPTION b/DESCRIPTION index eb01cbecd..16b9e07de 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -47,7 +47,8 @@ Imports: tidyselect, tsibble, vctrs, - workflows (>= 1.0.0) + workflows (>= 1.0.0), + zoo Suggests: covidcast, data.table, diff --git a/man/bake.step_adjust_latency.Rd b/man/bake.step_adjust_latency.Rd new file mode 100644 index 000000000..edb4f1f6e --- /dev/null +++ b/man/bake.step_adjust_latency.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_adjust_latency.R +\name{bake.step_adjust_latency} +\alias{bake.step_adjust_latency} +\title{various ways of handling differences between the \code{as_of} date and the maximum +time value} +\usage{ +\method{bake}{step_adjust_latency}(object, new_data, ...) 
+} +\arguments{ +\item{new_data}{assumes that this already has lag/ahead columns that we need +to adjust} +} +\description{ +adjust the ahead so that we will be predicting \code{ahead} days after the \code{as_of} +date, rather than relative to the last day of data +} +\keyword{internal} diff --git a/man/get_asof.Rd b/man/get_asof.Rd index 44b286ffb..4649df7cc 100644 --- a/man/get_asof.Rd +++ b/man/get_asof.Rd @@ -9,3 +9,4 @@ get_asof(object, new_data) \description{ extract the as_of, and make sure there's nothing very off about it } +\keyword{internal} diff --git a/man/get_sign.Rd b/man/get_sign.Rd index 0fbe9bb92..0be3e6306 100644 --- a/man/get_sign.Rd +++ b/man/get_sign.Rd @@ -11,3 +11,4 @@ get_sign(object) lags move columns forward to bring the past up to today, while aheads drag the future back to today } +\keyword{internal} From 7ae26d27962bf67b8eaba8d443f123426c5bad7e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 2 Apr 2024 11:18:50 -0700 Subject: [PATCH 13/92] back to skipping some population_scaling tests --- tests/testthat/_snaps/population_scaling.md | 2 -- tests/testthat/test-population_scaling.R | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md index 2152c2e78..6d75fcc22 100644 --- a/tests/testthat/_snaps/population_scaling.md +++ b/tests/testthat/_snaps/population_scaling.md @@ -29,6 +29,4 @@ Message Joining with `by = join_by(geo_value)` Joining with `by = join_by(geo_value)` - Joining with `by = join_by(geo_value)` - Joining with `by = join_by(geo_value)` diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index d18be65f5..fbae404ad 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -193,6 +193,7 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { + skip() jhu <- case_death_rate_subset %>% 
dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) From e346f7197bb18163b97aa44d9224ba15ca7c820d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 25 Apr 2024 15:33:28 -0500 Subject: [PATCH 14/92] step_adjust_latency works on tests --- DESCRIPTION | 1 + NAMESPACE | 4 + R/arx_forecaster.R | 1 + R/layer_add_forecast_date.R | 1 + R/layer_add_target_date.R | 1 + R/step_adjust_latency.R | 151 +++++++++++++++------- R/utils-latency.R | 62 ++++++--- man/add_shifted_columns.Rd | 14 ++ man/adjust_name.Rd | 14 ++ man/get_shifted_column_tibble.Rd | 2 + man/{get_asof.Rd => set_asof.Rd} | 6 +- man/step_adjust_latency.Rd | 16 ++- tests/testthat/test-step_adjust_latency.R | 126 ++++++++++++++++++ tests/testthat/test-utils_latency.R | 65 +++++++--- 14 files changed, 379 insertions(+), 85 deletions(-) create mode 100644 man/add_shifted_columns.Rd create mode 100644 man/adjust_name.Rd rename man/{get_asof.Rd => set_asof.Rd} (82%) create mode 100644 tests/testthat/test-step_adjust_latency.R diff --git a/DESCRIPTION b/DESCRIPTION index 16b9e07de..e3aa871df 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,6 +42,7 @@ Imports: rlang (>= 1.1.0), stats, stringr, + stringi, tibble, tidyr, tidyselect, diff --git a/NAMESPACE b/NAMESPACE index 0cf66300f..53c49aac1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -88,6 +88,7 @@ S3method(print,layer_quantile_distn) S3method(print,layer_residual_quantiles) S3method(print,layer_threshold) S3method(print,layer_unnest) +S3method(print,step_adjust_latency) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) S3method(print,step_epi_slide) @@ -262,6 +263,8 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) +importFrom(lifecycle,deprecated) +importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(recipes,bake) importFrom(recipes,prep) @@ -296,6 +299,7 @@ importFrom(stats,predict) 
importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) +importFrom(stringi,stri_replace_all_regex) importFrom(stringr,str_match) importFrom(tibble,as_tibble) importFrom(tibble,tibble) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 37c9aae86..7b0d84f24 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -1,3 +1,4 @@ +# TODO add latency to default forecaster #' Direct autoregressive forecaster with covariates #' #' This is an autoregressive forecasting model for diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 3d5ea010b..3ebc18cb1 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -1,3 +1,4 @@ +# TODO adapt this to latency #' Postprocessing step to add the forecast date #' #' @param frosting a `frosting` postprocessor diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 094ec8501..35bc84339 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -1,3 +1,4 @@ +# TODO adapt this to latency #' Postprocessing step to add the target date #' #' @param frosting a `frosting` postprocessor diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 092f54458..d6c6bfa23 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -25,7 +25,20 @@ #' - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that #' the shortest lag at predict time is at the last observation. E.g. if the #' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used -#' become `c(3,10,17)` +#' become `c(3,10,17)`. +#' @param fixed_latency either a positive integer, or a labeled positive integer +#' vector. Cannot be set at the same time as `fixed_asof`. If non-`NULL`, +#' the amount to offset the ahead or lag by. If a single integer, this is used +#' for all columns; if a labeled vector, the labels must correspond to the +#' base column names. 
If `NULL`, the latency is the distance between the +#' `epi_df`'s `max_time_value` and either the `fixed_asof` or the `epi_df`'s +#' `as_of` field. +#' @param fixed_asof either a date of the same kind used in the `epi_df`, or +#' NULL. Cannot be set at the same time as `fixed_latency`. If a date, it +#' gives the date from which the forecast is actually occurring. If `NULL`, +#' the `as_of` is determined either from `fixed_latency` or automatically. +#' @param columns A character string of column names to be adjusted; these +#' should be the original columns, and not the derived ones #' @param default Determines what fills empty rows #' left by leading/lagging (defaults to NA). #' @param prefix a character. The prefix matching the one used in either @@ -68,6 +81,8 @@ step_adjust_latency <- "locf", "extend_lags" ), + fixed_latency = NULL, + fixed_asof = NULL, default = NA, skip = FALSE, prefix = NULL, @@ -85,10 +100,21 @@ step_adjust_latency <- method <- rlang::arg_match(method) if (method == "extend_ahead") { prefix <- "ahead_" - } else { + if (!any(map_lgl( + recipe$steps, + \(recipe_step) inherits(recipe_step, "step_epi_ahead") + ))) { + cli:cli_abort("There is no `step_epi_ahead` defined before this. For the method `extend_ahead` of `step_adjust_latency`, at least one ahead must be previously defined.") + } + } else if (method == "extend_lags") { prefix <- "lag_" + if (!any(map_lgl( + recipe$steps, + \(recipe_step) inherits(recipe_step, "step_epi_lag") + ))) { + cli:cli_abort("There is no `step_epi_lag` defined before this. 
For the method `extend_lags` of `step_adjust_latency`, at least one lag must be previously defined.") + } } - arg_is_chr_scalar(prefix, id, method) recipes::add_step( recipe, @@ -96,12 +122,13 @@ step_adjust_latency <- terms = dplyr::enquos(...), role = role, method = method, - info = NULL, trained = trained, + as_of = fixed_asof, + latency = fixed_latency, + shift_cols = NULL, prefix = prefix, default = default, keys = epi_keys(recipe), - columns = columns, skip = skip, id = id ) @@ -109,24 +136,27 @@ step_adjust_latency <- } step_adjust_latency_new <- - function(terms, role, trained, prefix, default, keys, method, info, - columns, skip, id) { + function(terms, role, trained, as_of, latency, shift_cols, prefix, time_type, default, + keys, method, skip, id) { step( subclass = "adjust_latency", terms = terms, role = role, method = method, - info = info, trained = trained, + as_of = as_of, + latency = latency, + shift_cols = shift_cols, prefix = prefix, default = default, keys = keys, - columns = columns, skip = skip, id = id ) } +# lags introduces max(lags) NA's after the max_time_value. +# TODO all of the shifting happens before NA removal, which saves all the data I might possibly want; I should probably add a bit that makes sure this operation is happening before NA removal so data doesn't get dropped #' @export prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { @@ -136,56 +166,33 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ cli::cli_abort('If `method` is `"extend_lags"` or `"locf"`, then a step ", "must have already added a predictor.') } - # TODO info here is probably not the best way to handle this, hypothetically I - # get an info object during baking - step_adjust_latency_new( - terms = x$terms, - role = x$role, - trained = TRUE, - prefix = x$prefix, - default = x$default, - keys = x$keys, - method = x$method, - info = info, - columns = recipes::recipes_eval_select(x$terms, training, info), - skip = x$skip, - id = x$id - ) -} -#' various ways of handling differences between the `as_of` date and the maximum -#' time value -#' @description -#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` -#' date, rather than relative to the last day of data -#' @param new_data assumes that this already has lag/ahead columns that we need -#' to adjust -#' @importFrom dplyr %>% pull -#' @keywords internal -bake.step_adjust_latency <- function(object, new_data, ...) { - sign_shift <- get_sign(object) + sign_shift <- get_sign(x) # get the columns used, even if it's all of them - terms_used <- object$columns + terms_used <- x$columns if (length(terms_used) == 0) { - terms_used <- object$info %>% + terms_used <- info %>% filter(role == "raw") %>% pull(variable) } # get and check the max_time and as_of are the right kinds of dates - as_of <- get_asof(object, new_data) + if (is.null(x$as_of)) { + as_of <- set_asof(training, info) + } else { + as_of <- x$as_of + } # infer the correct columns to be working with from the previous # transformations shift_cols <- get_shifted_column_tibble( - object$prefix, new_data, terms_used, as_of, - sign_shift + x$prefix, training, terms_used, + as_of, x$latency, sign_shift, info ) - if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { + if ((x$method == "extend_ahead") || (x$method == "extend_lags")) { # check that the shift amount isn't too extreme latency <- max(shift_cols$latency) - i_latency <- 
which.max(shift_cols$latency) - time_type <- attributes(new_data)$metadata$time_type + time_type <- attributes(training)$metadata$time_type if ( (grepl("day", time_type) && (latency >= 10)) || (grepl("week", time_type) && (latency >= 4)) || @@ -204,9 +211,63 @@ bake.step_adjust_latency <- function(object, new_data, ...) { "i" = "max_time = {max_time} -> as_of = {as_of}" )) } + } + + step_adjust_latency_new( + terms = shift_cols$original_name, + role = shift_cols$role[[1]], + trained = TRUE, + prefix = x$prefix, + shift_cols = shift_cols, + as_of = as_of, + latency = unique(shift_cols$latency), + default = x$default, + keys = x$keys, + method = x$method, + skip = x$skip, + id = x$id + ) +} + +#' various ways of handling differences between the `as_of` date and the maximum +#' time value +#' @description +#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` +#' date, rather than relative to the last day of data +#' @param new_data assumes that this already has lag/ahead columns that we need +#' to adjust +#' @importFrom dplyr %>% pull +#' @keywords internal +bake.step_adjust_latency <- function(object, new_data, ...) { + if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { keys <- object$keys return( - extend_either(new_data, shift_cols, keys) + extend_either(new_data, object$shift_cols, keys) ) } } + +#' @export +print.step_adjust_latency <- + function(x, width = max(20, options$width - 35), ...) 
{ + if (length(x$terms) == 0) { + terms <- "all previous predictors" + } else { + terms <- x$terms + } + if (!is.null(x$as_of)) { + conj <- "with forecast date" + extra_text <- x$as_of + } else if (!is.null(x$shift_cols)) { + conj <- "with latencies" + extra_text <- x$shift_cols + } else { + conj <- "" + extra_text <- "set at train time" + } + print_epi_step(terms, NULL, x$trained, x$method, + conjunction = conj, + extra_text = extra_text + ) + invisible(x) + } diff --git a/R/utils-latency.R b/R/utils-latency.R index 163643929..0e11627dd 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -10,7 +10,7 @@ extend_either <- function(new_data, shift_cols, keys) { shifted <- shift_cols %>% - select(-shifts, -effective_shift) %>% + select(-any_of(c("shifts", "effective_shift", "type", "role", "source"))) %>% pmap(\(original_name, latency, new_name) { epi_shift_single( x = new_data, @@ -20,7 +20,7 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% - map(\(x) zoo::na.trim(x)) %>% # TODO need to talk about this + map(\(x) zoo::na.trim(x)) %>% reduce( dplyr::full_join, by = keys @@ -46,8 +46,10 @@ extend_either <- function(new_data, shift_cols, keys) { #' @keywords internal #' @importFrom stringr str_match #' @importFrom dplyr rowwise %>% +#' @importFrom magrittr %<>% get_shifted_column_tibble <- function( - prefix, new_data, terms_used, as_of, sign_shift, call = caller_env()) { + prefix, new_data, terms_used, as_of, latency, + sign_shift, info, call = caller_env()) { relevant_columns <- names(new_data)[grepl(prefix, names(new_data))] to_keep <- rep(FALSE, length(relevant_columns)) for (col_name in terms_used) { @@ -61,40 +63,58 @@ get_shifted_column_tibble <- function( call = call ) } - # TODO ask about a less jank way to do this - shift_amounts <- as.integer(stringr::str_match( - relevant_columns, - "_\\d+_" - ) %>% - `[`(, 1) %>% - stringr::str_match("\\d+") %>% - `[`(, 1)) + # this pulls text that is any number of digits between two 
_, e.g. _3557_, and + # converts them to an integer + shift_amounts <- stringr::str_match(relevant_columns, "_(\\d+)_") %>% + `[`(, 2) %>% + as.integer() + shift_cols <- dplyr::tibble( original_name = relevant_columns, shifts = shift_amounts ) + if (is.null(latency)) { + shift_cols %<>% + rowwise() %>% + # add the latencies to shift_cols + mutate(latency = get_latency( + new_data, as_of, original_name, shifts, sign_shift + )) %>% + ungroup() + } else if (length(latency) > 1) { + shift_cols %<>% rowwise() %>% + mutate(latency = unname(latency[purrr::map_lgl( + names(latency), + \(x) grepl(x, original_name) + )])) %>% + ungroup() + } else { + shift_cols %<>% mutate(latency = latency) + } + + # add the updated names to shift_cols shift_cols %<>% - rowwise() %>% - # add the latencies to shift_cols - mutate(latency = get_latency( - new_data, as_of, original_name, shifts, sign_shift - )) %>% - ungroup() %>% - # add the updated names to shift_cols mutate( effective_shift = shifts + abs(latency) ) %>% mutate( new_name = adjust_name(prefix, original_name, effective_shift) ) + info %<>% select(variable, type, role) + shift_cols <- left_join(shift_cols, info, by = join_by(original_name == variable)) + if (length(unique(shift_cols$role)) != 1) { + cli::cli_error("not all roles are the same!", + shift_cols = shift_cols + ) + } return(shift_cols) } #' extract the as_of, and make sure there's nothing very off about it #' @keywords internal -get_asof <- function(object, new_data) { - original_columns <- object$info %>% +set_asof <- function(new_data, info) { + original_columns <- info %>% filter(source == "original") %>% pull(variable) # make sure that there's enough column names @@ -117,7 +137,6 @@ get_asof <- function(object, new_data) { as_of <- attributes(new_data)$metadata$as_of max_time <- max(time_values) # make sure the as_of is sane - # TODO decide on these checks if (!inherits(as_of, class(time_values))) { rlang::abort(glue::glue( "the data matrix `as_of` value is 
{as_of}, ", @@ -149,6 +168,7 @@ get_asof <- function(object, new_data) { #' adjust the shifts by latency for the names in column assumes e.g. #' `"lag_6_case_rate"` and returns something like `"lag_10_case_rate"` #' @keywords internal +#' @importFrom stringi stri_replace_all_regex adjust_name <- function(prefix, column, effective_shift) { pattern <- paste0(prefix, "\\d+", "_") adjusted_shifts <- paste0(prefix, effective_shift, "_") diff --git a/man/add_shifted_columns.Rd b/man/add_shifted_columns.Rd new file mode 100644 index 000000000..d7aba745b --- /dev/null +++ b/man/add_shifted_columns.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{add_shifted_columns} +\alias{add_shifted_columns} +\title{backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the +checks missing in \code{epi_shift_single}} +\usage{ +add_shifted_columns(new_data, object, amount) +} +\description{ +backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the +checks missing in \code{epi_shift_single} +} +\keyword{internal} diff --git a/man/adjust_name.Rd b/man/adjust_name.Rd new file mode 100644 index 000000000..8cb6e5106 --- /dev/null +++ b/man/adjust_name.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{adjust_name} +\alias{adjust_name} +\title{adjust the shifts by latency for the names in column assumes e.g. +\code{"lag_6_case_rate"} and returns something like \code{"lag_10_case_rate"}} +\usage{ +adjust_name(prefix, column, effective_shift) +} +\description{ +adjust the shifts by latency for the names in column assumes e.g. 
+\code{"lag_6_case_rate"} and returns something like \code{"lag_10_case_rate"} +} +\keyword{internal} diff --git a/man/get_shifted_column_tibble.Rd b/man/get_shifted_column_tibble.Rd index 635cab427..c66941e0e 100644 --- a/man/get_shifted_column_tibble.Rd +++ b/man/get_shifted_column_tibble.Rd @@ -10,7 +10,9 @@ get_shifted_column_tibble( new_data, terms_used, as_of, + latency, sign_shift, + info, call = caller_env() ) } diff --git a/man/get_asof.Rd b/man/set_asof.Rd similarity index 82% rename from man/get_asof.Rd rename to man/set_asof.Rd index 4649df7cc..fabf97c8a 100644 --- a/man/get_asof.Rd +++ b/man/set_asof.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-latency.R -\name{get_asof} -\alias{get_asof} +\name{set_asof} +\alias{set_asof} \title{extract the as_of, and make sure there's nothing very off about it} \usage{ -get_asof(object, new_data) +set_asof(new_data, info) } \description{ extract the as_of, and make sure there's nothing very off about it diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index c60495c84..ab2975fef 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -10,6 +10,8 @@ step_adjust_latency( role = NA, trained = FALSE, method = c("extend_ahead", "locf", "extend_lags"), + fixed_latency = NULL, + fixed_asof = NULL, default = NA, skip = FALSE, prefix = NULL, @@ -40,9 +42,18 @@ sophisticated methods of extrapolation. \item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is at the last observation. E.g. if the lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used -become \code{c(3,10,17)} }} +\item{fixed_latency}{either a positive integer, or a labeled positive integer +vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, the +amount to offset the ahead or lag by. 
If a single integer, this is used for +all columns; if a labeled vector, the labels must correspond to the base +column names. If \code{NULL}, the latency is the distance between the \code{epi_df}'s +\code{max_time_value} and either the \code{fixed_asof} or the \code{epi_df}'s \code{as_of} +field.} + +\item{fixed_asof}{either a date of the same kind used in the \code{epi_df}, or NULL. Cannot be set at the same time as \code{fixed_latency}. If a date, it gives the date from which the forecast is actually occurring. If \code{NULL}, the \code{as_of} is determined either from \code{fixed_latency} or automatically.} + \item{default}{Determines what fills empty rows left by leading/lagging (defaults to NA).} @@ -57,6 +68,9 @@ the computations for subsequent operations.} \code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} if \code{method="extend_lags"} or "locf".} +\item{columns}{A character string of column names to be adjusted; these +should be the original columns, and not the derived ones} + \item{id}{A unique identifier for the step} } \value{ diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R new file mode 100644 index 000000000..16612ee3b --- /dev/null +++ b/tests/testthat/test-step_adjust_latency.R @@ -0,0 +1,126 @@ +library(dplyr) + +x <- tibble( + geo_value = rep("place", 200), + time_value = as.Date("2021-01-01") + 0:199, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df() +max_time <- max(x$time_value) +class(attributes(x)$metadata$as_of) +as_of <- attributes(x)$metadata$as_of +ahead <- 7 +latency <- 5 + +testing_as_of <- max_time + latency +# create x with a plausible as_of date +real_x <- x +attributes(real_x)$metadata$as_of <- testing_as_of + +slm_fit <- function(recipe, data = x) { + epi_workflow() %>% + add_epi_recipe(recipe) %>% + add_model(linear_reg()) %>% + fit(data = data) +} + +test_that("epi_adjust_latency correctly
extends the lags", { + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(method = "extend_lags") + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error(expect_warning(fit5 <- slm_fit(r5))) + + # now trying with the as_of a reasonable distance in the future + fit5 <- slm_fit(r5, data = real_x) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", + "lag_6_case_rate", "lag_10_case_rate" + ) + ) + latest <- get_test_data(r5, x) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(testing_as_of)) + + expect_equal(names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate")) + latest <- get_test_data(r5, x) + pred <- predict(fit5, latest) + actual_solutions <- pred %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, testing_as_of) + + # should have six coefficients: the five lag predictors plus the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(5, 11, 16)) %>% + step_epi_lag(case_rate, lag = c(6, 10)) %>% + step_epi_ahead(death_rate, ahead = ahead) + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal(fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients) +}) + +test_that("epi_adjust_latency correctly extends the ahead", { + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(method
= "extend_ahead") + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error(expect_warning(fit5 <- slm_fit(r5))) + # real date example + fit5 <- slm_fit(r5, data = real_x) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r5, x) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + # max time is still the forecast date + expect_equal(point_pred$time_value, as.Date(max_time)) + # target column renamed + expect_equal(names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead + latency}_death_rate")) + # fit an equivalent forecaster + equivalent <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + latency) + equiv_fit <- slm_fit(equivalent, data = real_x) + # adjusting the ahead should do the same thing as directly adjusting the ahead + expect_equal(fit5$fit$fit$fit$coefficients, + equiv_fit$fit$fit$fit$coefficients) + + # should have six coefficients: the five lag predictors plus the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) +}) + +test_that("epi_adjust_latency fixed_* work", {}) +# todo test variants on the columns for which this is applied +# todo need to have both on columns 1, and 2 + +test_that("epi_adjust_latency works correctly when there's gaps in the timeseries", {}) + +test_that("epi_adjust_latency extend_ahead uses the same adjustment when predicting on new data after being baked", {}) + +test_that("epi_adjust_latency works for other time types", {}) + +# todo check that epi_adjust_latency errors for nonsense `as_of`'s diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 3873c5e6a..524d98b4d 100644 ---
a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -34,12 +34,13 @@ modified_data <- modified_data %>% tail() as_of - (modified_data %>% filter(!is.na(ahead_4_case_rate)) %>% pull(time_value) %>% max()) all_shift_cols <- tibble::tribble( - ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, - "lag_3_case_rate", 3, 5, 8, "lag_8_case_rate", - "lag_7_death_rate", 7, 4, 11, "lag_11_death_rate", - "ahead_4_case_rate", 4, -5, 9, "ahead_9_case_rate" + ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "lag_3_case_rate", 3, 5, 8, "lag_8_case_rate", "numeric", "predictor", + "lag_7_death_rate", 7, 4, 11, "lag_11_death_rate", "numeric", "predictor", + "ahead_4_case_rate", 4, -5, 9, "ahead_9_case_rate", "numeric", "outcome" ) + test_that("get_latency works", { expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1), 4) expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1), 5) @@ -51,6 +52,8 @@ test_that("get_latency works", { expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1), 5 + 4 * 2) }) +test_that("get_latency infers max_time to be the minimum `max time` across the columns", {}) + test_that("adjust_name works", { expect_equal( adjust_name("lag_", "lag_5_case_rate_13", 10), @@ -69,50 +72,80 @@ test_that("adjust_name works", { }) test_that("get_asof works", { - object <- list(info = tribble( + info <- tribble( ~variable, ~type, ~role, ~source, "time_value", "date", "time_value", "original", "geo_value", "nominal", "geo_value", "original", "case_rate", "numeric", "raw", "original", "death_rate", "numeric", "raw", "original", "not_real", "numeric", "predictor", "derived" - )) - expect_equal(get_asof(object, modified_data), as_of) + ) + expect_equal(set_asof(modified_data, info), as_of) }) -test_that("get_shifted_column_tibble works", { +test_that("get_shifted_column_tibble infers latency and works correctly", { + info <- tibble(variable = 
c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") case_lag <- get_shifted_column_tibble( - "lag_", modified_data, - "case_rate", as_of, 1 + "lag_", modified_data, "case_rate", + as_of, NULL, 1, info ) expect_equal(case_lag, all_shift_cols[1, ]) death_lag <- get_shifted_column_tibble( "lag_", modified_data, - "death_rate", as_of, 1 + "death_rate", as_of, NULL, 1, info ) expect_equal(death_lag, all_shift_cols[2, ]) both_lag <- get_shifted_column_tibble( - "lag_", modified_data, - c("case_rate", "death_rate"), as_of, 1 + "lag_", modified_data, c("case_rate", "death_rate"), + as_of, NULL, 1, info ) expect_equal(both_lag, all_shift_cols[1:2, ]) +}) + +test_that("get_shifted_column_tibble assigns given latencies", { + # non-null latency + info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") + both_lag <- get_shifted_column_tibble( + "lag_", modified_data, + c("case_rate", "death_rate"), as_of, 50, 1, info + ) + weird_latencies <- tibble::tribble( + ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "lag_3_case_rate", 3, 50, 53, "lag_53_case_rate", "numeric", "predictor", + "lag_7_death_rate", 7, 50, 57, "lag_57_death_rate", "numeric", "predictor", + ) + expect_equal(both_lag, weird_latencies) + + # supposing we add the latencies by hand, and they're different, and in a different order + weird_latencies <- tibble::tribble( + ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "lag_3_case_rate", 3, 70, 73, "lag_73_case_rate", "numeric", "predictor", + "lag_7_death_rate", 7, 30, 37, "lag_37_death_rate", "numeric", "predictor", + ) + both_lag <- get_shifted_column_tibble( + "lag_", modified_data, + c("case_rate", "death_rate"), as_of, c(death_rate = 30, case_rate = 70), 1, info + ) + expect_equal(both_lag, 
weird_latencies[1:2, ]) case_ahead <- get_shifted_column_tibble( "ahead_", modified_data, - "case_rate", as_of, -1 + "case_rate", as_of, NULL, -1, info ) expect_equal(case_ahead, all_shift_cols[3, ]) }) + test_that("get_shifted_column_tibble objects to non-columns", { expect_error( get_shifted_column_tibble( - "lag_", modified_data, "not_present", as_of, 1 + "lag_", modified_data, "not_present", as_of, NULL, 1, info ), class = "epipredict_adjust_latency_nonexistent_column_used" ) }) + test_that("extend_either works", { keys <- c("geo_value", "time_value") # extend_either doesn't differentiate between the directions, it just moves @@ -137,3 +170,5 @@ test_that("extend_either works", { expected_post_shift ) }) + +# todo case where somehow columns of different roles are selected From 05b5cbf2e4ebe26d6f1a0aec03abf63ad236615e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 3 May 2024 15:59:14 -0500 Subject: [PATCH 15/92] spurious lifecycle addition removed --- NAMESPACE | 1 - man/step_adjust_latency.Rd | 22 +++++++++++++--------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 53c49aac1..96dd0e67f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -263,7 +263,6 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) -importFrom(lifecycle,deprecated) importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") importFrom(recipes,bake) diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index ab2975fef..76f1a2d51 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -42,17 +42,21 @@ sophisticated methods of extrapolation. \item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is at the last observation. E.g. if the lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used +become \code{c(3,10,17)}. 
}} \item{fixed_latency}{either a positive integer, or a labeled positive integer -vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, the -amount to offset the ahead or lag by. If a single integer, this is used for -all columns; if a labeled vector, the labels must correspond to the base -column names. If \code{NULL}, the latency is the distance between the \code{epi_df}'s -\code{max_time_value} and either the \code{fixed_asof} or the \code{epi_df}'s \code{as_of} -field.} +vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, +the amount to offset the ahead or lag by. If a single integer, this is used +for all columns; if a labeled vector, the labels must correspond to the +base column names. If \code{NULL}, the latency is the distance between the +\code{epi_df}'s \code{max_time_value} and either the \code{fixed_asof} or the \code{epi_df}'s +\code{as_of} field.} -\item{fixed_asof}{either a date of the same kind used in the \code{epi_df}, or NULL. Cannot be set at the same time as \code{fixed_latency}. If a date, it gives the date from which the forecast is actually occurring. If \code{NULL}, the \code{as_of} is determined either from \code{fixed_latency} or automatically.} +\item{fixed_asof}{either a date of the same kind used in the \code{epi_df}, or +NULL. Cannot be set at the same time as \code{fixed_latency}. If a date, it +gives the date from which the forecast is actually occurring. 
If \code{NULL}, +the \code{as_of} is determined either from \code{fixed_latency} or automatically.} \item{default}{Determines what fills empty rows left by leading/lagging (defaults to NA).} @@ -68,8 +72,8 @@ the computations for subsequent operations.} \code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} if \code{method="extend_lags"} or "locf".} -\item{columns}{A character string of variable names that will -btency' become \code{c(3,10,17)}} +\item{columns}{A character string of column names to be adjusted; these +should be the original columns, and not the derived ones} \item{id}{A unique identifier for the step} } From 47cb5b7c0c4caca248ffe1b6b105db474437bde9 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 3 May 2024 16:32:52 -0500 Subject: [PATCH 16/92] fixing RMDcheck remote --- man/step_adjust_latency.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 76f1a2d51..e4088ba7b 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_adjust_latency.R \name{step_adjust_latency} \alias{step_adjust_latency} -\title{adapt the pipeline to latency in the data} +\title{Adapt the pipeline to latency in the data} \usage{ step_adjust_latency( recipe, From 4b0b668295d1e816b2ee6fb79f46d334b9413cbf Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 3 May 2024 17:00:00 -0500 Subject: [PATCH 17/92] nothing but `rlang::abort` -> `cli::cli_abort`s --- R/compat-recipes.R | 2 +- R/epi_recipe.R | 13 ++++++------- R/frosting.R | 3 +-- R/step_adjust_latency.R | 4 ++-- R/utils-latency.R | 4 ++-- R/utils-shift.R | 6 +++--- 6 files changed, 15 insertions(+), 17 deletions(-) diff --git a/R/compat-recipes.R b/R/compat-recipes.R index 12d11049a..0e4a557cb 100644 --- a/R/compat-recipes.R +++ b/R/compat-recipes.R @@ -18,7 +18,7 @@ inline_check <- function(x) { funs <- fun_calls(x) funs <- funs[!(funs %in% c("~", "+", 
"-"))] if (length(funs) > 0) { - rlang::abort(paste0( + cli::cli_abort(paste0( "No in-line functions should be used here; ", "use steps to define baking actions." )) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index f8216c2af..5c4de0c78 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -61,7 +61,7 @@ epi_recipe.epi_df <- attr(x, "decay_to_tibble") <- FALSE if (!is.null(formula)) { if (!is.null(vars)) { - rlang::abort( + cli::cli_abort( paste0( "This `vars` specification will be ignored ", "when a formula is used" @@ -69,7 +69,7 @@ epi_recipe.epi_df <- ) } if (!is.null(roles)) { - rlang::abort( + cli::cli_abort( paste0( "This `roles` specification will be ignored ", "when a formula is used" @@ -82,10 +82,10 @@ epi_recipe.epi_df <- } if (is.null(vars)) vars <- colnames(x) if (any(table(vars) > 1)) { - rlang::abort("`vars` should have unique members") + cli::cli_abort("`vars` should have unique members") } if (any(!(vars %in% colnames(x)))) { - rlang::abort("1 or more elements of `vars` are not in the data") + cli::cli_abort("1 or more elements of `vars` are not in the data") } keys <- key_colnames(x) # we know x is an epi_df @@ -96,7 +96,7 @@ epi_recipe.epi_df <- ## Check and add roles when available if (!is.null(roles)) { if (length(roles) != length(vars)) { - rlang::abort(c( + cli::cli_abort(c( "The number of roles should be the same as the number of ", "variables." )) @@ -142,7 +142,6 @@ epi_recipe.epi_df <- #' @rdname epi_recipe -#' @importFrom rlang abort #' @export epi_recipe.formula <- function(formula, data, ...) { # we ensure that there's only 1 row in the template @@ -475,7 +474,7 @@ prep.epi_recipe <- function( "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ", arg, ". Do you want to use a tuning function such as `tune_grid()`?" 
) - rlang::abort(msg) + cli::cli_abort(msg) } note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) if (!x$steps[[i]]$trained | fresh) { diff --git a/R/frosting.R b/R/frosting.R index 8474edbdf..d7ba22902 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -89,7 +89,7 @@ validate_has_postprocessor <- function(x, ..., call = caller_env()) { "The workflow must have a frosting postprocessor.", i = "Provide one with `add_frosting()`." ) - rlang::abort(message, call = call) + cli::cli_abort(message, call = call) } invisible(x) } @@ -356,7 +356,6 @@ apply_frosting.default <- function(workflow, components, ...) { #' @rdname apply_frosting #' @importFrom rlang is_null -#' @importFrom rlang abort #' @param type,opts forwarded (along with `...`) to [`predict.model_fit()`] and #' [`slather()`] for supported layers #' @export diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index d6c6bfa23..a53dadffc 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -89,10 +89,10 @@ step_adjust_latency <- columns = NULL, id = recipes::rand_id("epi_lag")) { if (!is_epi_recipe(recipe)) { - rlang::abort("This recipe step can only operate on an `epi_recipe`.") + cli::cli_abort("This recipe step can only operate on an `epi_recipe`.") } if (!is.null(columns)) { - rlang::abort(c("The `columns` argument must be `NULL.", + cli::cli_abort(c("The `columns` argument must be `NULL.", i = "Use `tidyselect` methods to choose columns to lag." 
)) } diff --git a/R/utils-latency.R b/R/utils-latency.R index 0e11627dd..decaade1e 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -132,13 +132,13 @@ set_asof <- function(new_data, info) { drop_na() %>% pull(time_value) if (length(time_values) <= 0) { - rlang::abort("the `time_value` column of `new_data` is empty") + cli::cli_abort("the `time_value` column of `new_data` is empty") } as_of <- attributes(new_data)$metadata$as_of max_time <- max(time_values) # make sure the as_of is sane if (!inherits(as_of, class(time_values))) { - rlang::abort(glue::glue( + cli::cli_abort(glue::glue( "the data matrix `as_of` value is {as_of}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", diff --git a/R/utils-shift.R b/R/utils-shift.R index a8eccf6fe..702a52308 100644 --- a/R/utils-shift.R +++ b/R/utils-shift.R @@ -12,7 +12,7 @@ adjust_latency <- function(object, new_data) { } else if (method == "extend_ahead") { as_of <- attributes(new_data)$metadata$as_of if (FALSE && (typeof(as_of) != typeof(new_data$time_value))) { - rlang::abort(glue::glue( + cli::cli_abort(glue::glue( "the data matrix `as_of` value is {as_of}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", @@ -60,10 +60,10 @@ adjust_latency <- function(object, new_data) { } return(effective_ahead) } else { - rlang::abort("the `time_value` column of `new_data` is empty") + cli::cli_abort("the `time_value` column of `new_data` is empty") } } else { - rlang::abort(glue::glue( + cli::cli_abort(glue::glue( "Latency adjustment method {method} has not yet ", "been implemented for `step_epi_ahead`." 
)) From 2731160bec866bfa0abfdd9d16ac9bf2e00ee857 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 3 May 2024 17:58:40 -0500 Subject: [PATCH 18/92] smaller suggestions and styling --- R/step_adjust_latency.R | 23 +++++++++++----------- R/step_epi_shift.R | 8 +------- R/utils-latency.R | 13 ++++++------ man/step_adjust_latency.Rd | 7 +++++++ tests/testthat/test-step_adjust_latency.R | 24 +++++++++++++++-------- tests/testthat/test-utils_latency.R | 7 ++++--- 6 files changed, 47 insertions(+), 35 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index a53dadffc..d43950b44 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -1,4 +1,4 @@ -#' adapt the pipeline to latency in the data +#' Adapt the pipeline to latency in the data #' #' In the standard case, the pipeline assumes that the last observation is also #' the day from which the forecast is being made. `step_adjust_latency` uses the @@ -37,6 +37,11 @@ #' NULL. Cannot be set at the same time as `fixed_latency`. If a date, it #' gives the date from which the forecast is actually occurring. If `NULL`, #' the `as_of` is determined either from `fixed_latency` or automatically. +#' @param role For model terms created by this step, what analysis role should +#' they be assigned? `lag` is default a predictor while `ahead` is an outcome. +#' It should be correctly inferred and not need setting +#' @param trained A logical to indicate if the quantities for preprocessing have +#' been estimated. 
#' @param columns A character string of column names to be adjusted; these #' should be the original columns, and not the derived ones #' @param default Determines what fills empty rows @@ -92,7 +97,7 @@ step_adjust_latency <- cli::cli_abort("This recipe step can only operate on an `epi_recipe`.") } if (!is.null(columns)) { - cli::cli_abort(c("The `columns` argument must be `NULL.", + cli::cli_abort(c("The `columns` argument must be `NULL`.", i = "Use `tidyselect` methods to choose columns to lag." )) } @@ -102,7 +107,7 @@ step_adjust_latency <- prefix <- "ahead_" if (!any(map_lgl( recipe$steps, - \(recipe_step) inherits(recipe_step, "step_epi_ahead") + function(recipe_step) inherits(recipe_step, "step_epi_ahead") ))) { cli:cli_abort("There is no `step_epi_ahead` defined before this. For the method `extend_ahead` of `step_adjust_latency`, at least one ahead must be previously defined.") } @@ -110,7 +115,7 @@ step_adjust_latency <- prefix <- "lag_" if (!any(map_lgl( recipe$steps, - \(recipe_step) inherits(recipe_step, "step_epi_lag") + function(recipe_step) inherits(recipe_step, "step_epi_lag") ))) { cli:cli_abort("There is no `step_epi_lag` defined before this. For the method `extend_lags` of `step_adjust_latency`, at least one lag must be previously defined.") } @@ -160,8 +165,8 @@ step_adjust_latency_new <- #' @export prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { - cli::cli_abort(glue::glue(c('If `method` is `"extend_ahead"`, then a step ", - "must have already added an outcome .'))) + cli::cli_abort('If `method` is `"extend_ahead"`, then a step ", + "must have already added an outcome .') } else if (!("predictor" %in% info$role)) { cli::cli_abort('If `method` is `"extend_lags"` or `"locf"`, then a step ", "must have already added a predictor.') @@ -176,11 +181,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ pull(variable) } # get and check the max_time and as_of are the right kinds of dates - if (is.null(x$as_of)) { - as_of <- set_asof(training, info) - } else { - as_of <- x$as_of - } + as_of <- x$as_of %||% set_asof(training, info) # infer the correct columns to be working with from the previous # transformations diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 3a65dd05b..7cdf0680d 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -58,12 +58,6 @@ step_epi_lag <- prefix = "lag_", default = NA, skip = FALSE, - latency_adjustment = c( - "None", - "extend_ahead", - "locf", - "extend_lags" - ), id = rand_id("epi_lag")) { if (!is_epi_recipe(recipe)) { cli_abort("This step can only operate on an `epi_recipe`.") @@ -80,7 +74,7 @@ step_epi_lag <- arg_is_chr_scalar(prefix, id, latency_adjustment) if (!is.null(columns)) { cli::cli_abort(c( - "The `columns` argument must be `NULL.", + "The `columns` argument must be `NULL`.", i = "Use `tidyselect` methods to choose columns to lag." 
)) } diff --git a/R/utils-latency.R b/R/utils-latency.R index decaade1e..54a2ddb08 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -74,7 +74,7 @@ get_shifted_column_tibble <- function( shifts = shift_amounts ) if (is.null(latency)) { - shift_cols %<>% + shift_cols <- shift_cols %>% rowwise() %>% # add the latencies to shift_cols mutate(latency = get_latency( @@ -82,28 +82,29 @@ get_shifted_column_tibble <- function( )) %>% ungroup() } else if (length(latency) > 1) { - shift_cols %<>% rowwise() %>% + shift_cols <- shift_cols %>% + rowwise() %>% mutate(latency = unname(latency[purrr::map_lgl( names(latency), \(x) grepl(x, original_name) )])) %>% ungroup() } else { - shift_cols %<>% mutate(latency = latency) + shift_cols <- shift_cols %>% mutate(latency = latency) } # add the updated names to shift_cols - shift_cols %<>% + shift_cols <- shift_cols %>% mutate( effective_shift = shifts + abs(latency) ) %>% mutate( new_name = adjust_name(prefix, original_name, effective_shift) ) - info %<>% select(variable, type, role) + info <- shift_cols %>% select(variable, type, role) shift_cols <- left_join(shift_cols, info, by = join_by(original_name == variable)) if (length(unique(shift_cols$role)) != 1) { - cli::cli_error("not all roles are the same!", + cli::cli_abort("not all roles are the same!", shift_cols = shift_cols ) } diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index e4088ba7b..9e8efc377 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -28,6 +28,13 @@ See \code{\link[recipes:selections]{recipes::selections()}} for more details. Ty to set this manually, as the necessary adjustments will be done for the predictors and outcome.} +\item{role}{For model terms created by this step, what analysis role should +they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome. 
+It should be correctly inferred and not need setting} + +\item{trained}{A logical to indicate if the quantities for preprocessing have +been estimated.} + \item{method}{a character. Determines the method by which the forecast handles latency. All of these assume the forecast date is the \code{as_of} of the \code{epi_df}. The options are: diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 16612ee3b..28608de6d 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -51,8 +51,10 @@ test_that("epi_adjust_latency correctly extends the lags", { expect_equal(nrow(point_pred), 1) expect_equal(point_pred$time_value, as.Date(testing_as_of)) - expect_equal(names(fit5$pre$mold$outcomes), - glue::glue("ahead_{ahead}_death_rate")) + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) latest <- get_test_data(r5, x) pred <- predict(fit5, latest) actual_solutions <- pred %>% filter(!is.na(.pred)) @@ -68,8 +70,10 @@ test_that("epi_adjust_latency correctly extends the lags", { step_epi_lag(case_rate, lag = c(6, 10)) %>% step_epi_ahead(death_rate, ahead = ahead) fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) - expect_equal(fit5$fit$fit$fit$coefficients, - fit_hand_adj$fit$fit$fit$coefficients) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) }) test_that("epi_adjust_latency correctly extends the ahead", { @@ -97,8 +101,10 @@ test_that("epi_adjust_latency correctly extends the ahead", { # max time is still the forecast date expect_equal(point_pred$time_value, as.Date(max_time)) # target column renamed - expect_equal(names(fit5$pre$mold$outcomes), - glue::glue("ahead_{ahead + latency}_death_rate")) + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead + latency}_death_rate") + ) # fit an equivalent forecaster equivalent <- epi_recipe(x) %>% step_epi_lag(death_rate, lag = 
c(0, 6, 11)) %>% @@ -106,8 +112,10 @@ test_that("epi_adjust_latency correctly extends the ahead", { step_epi_ahead(death_rate, ahead = ahead + latency) equiv_fit <- slm_fit(equivalent, data = real_x) # adjusting the ahead should do the same thing as directly adjusting the ahead - expect_equal(fit5$fit$fit$fit$coefficients, - equiv_fit$fit$fit$fit$coefficients) + expect_equal( + fit5$fit$fit$fit$coefficients, + equiv_fit$fit$fit$fit$coefficients + ) # should have four predictors, including the intercept expect_equal(length(fit5$fit$fit$fit$coefficients), 6) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 524d98b4d..a9692b1e9 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -10,9 +10,10 @@ old_data <- tibble( as_epi_df(as_of = as_of) old_data %>% tail() keys <- c("time_value", "geo_value") -old_data %<>% full_join(epi_shift_single( - old_data, "tmp_death_rate", 1, "death_rate", keys -), by = keys) %>% +old_data <- shift_cols %>% + full_join(epi_shift_single( + old_data, "tmp_death_rate", 1, "death_rate", keys + ), by = keys) %>% select(-tmp_death_rate) # old data is created so that death rate has a latency of 4, while case_rate has # a latency of 5 From 5c1e15ef8d6d68c2feb52c8747703194c4d1822d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 6 May 2024 16:34:01 -0500 Subject: [PATCH 19/92] smaller suggestions: local tests passing again --- R/step_adjust_latency.R | 5 +++-- R/step_epi_shift.R | 16 +++------------- R/utils-latency.R | 4 ++-- man/step_epi_shift.Rd | 9 --------- tests/testthat/test-step_adjust_latency.R | 3 ++- tests/testthat/test-utils_latency.R | 3 +-- 6 files changed, 11 insertions(+), 29 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index d43950b44..d104b5aeb 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -194,6 +194,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ # check that the shift amount isn't too extreme latency <- max(shift_cols$latency) time_type <- attributes(training)$metadata$time_type + i_latency <- which.max(shift_cols$latency) if ( (grepl("day", time_type) && (latency >= 10)) || (grepl("week", time_type) && (latency >= 4)) || @@ -207,8 +208,8 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { "which is questionable for it's `time_type` of ", "{time_type}" ), - "i" = "input ahead: {shift_cols$shifts[[i_latency]]}", - "i" = "shifted ahead: {shift_cols$effective_shift[[i_latency]]}", + "i" = "input shift: {shift_cols$shifts[[i_latency]]}", + "i" = "latency adjusted shift: {shift_cols$effective_shift[[i_latency]]}", "i" = "max_time = {max_time} -> as_of = {as_of}" )) } diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 7cdf0680d..bbb0fc93d 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -69,9 +69,8 @@ step_epi_lag <- i = "Did you perhaps pass an integer in `...` accidentally?" )) } - latency_adjustment <- rlang::arg_match(latency_adjustment) arg_is_nonneg_int(lag) - arg_is_chr_scalar(prefix, id, latency_adjustment) + arg_is_chr_scalar(prefix, id) if (!is.null(columns)) { cli::cli_abort(c( "The `columns` argument must be `NULL`.", @@ -88,7 +87,7 @@ step_epi_lag <- prefix = prefix, default = default, keys = key_colnames(recipe), - columns = NULL, + columns = columns, latency_adjustment = latency_adjustment, skip = skip, id = id @@ -108,12 +107,6 @@ step_epi_ahead <- role = "outcome", prefix = "ahead_", default = NA, - latency_adjustment = c( - "None", - "extend_ahead", - "locf", - "extend_lags" - ), columns = NULL, skip = FALSE, id = rand_id("epi_ahead")) { @@ -127,7 +120,6 @@ step_epi_ahead <- i = "Did you perhaps pass an integer in `...` accidentally?" 
)) } - latency_adjustment <- rlang::arg_match(latency_adjustment) arg_is_nonneg_int(ahead) arg_is_chr_scalar(prefix, id, latency_adjustment) recipes::add_step( @@ -161,7 +153,6 @@ step_epi_lag_new <- prefix = prefix, default = default, keys = keys, - latency_adjustment = latency_adjustment, columns = columns, skip = skip, id = id @@ -169,7 +160,7 @@ step_epi_lag_new <- } step_epi_ahead_new <- - function(terms, role, trained, ahead, prefix, default, keys, latency_adjustment, + function(terms, role, trained, ahead, prefix, default, keys, columns, skip, id) { recipes::step( subclass = "epi_ahead", @@ -179,7 +170,6 @@ step_epi_ahead_new <- ahead = ahead, prefix = prefix, default = default, - latency_adjustment = latency_adjustment, keys = keys, columns = columns, skip = skip, diff --git a/R/utils-latency.R b/R/utils-latency.R index 54a2ddb08..837397ddb 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -101,7 +101,7 @@ get_shifted_column_tibble <- function( mutate( new_name = adjust_name(prefix, original_name, effective_shift) ) - info <- shift_cols %>% select(variable, type, role) + info <- info %>% select(variable, type, role) shift_cols <- left_join(shift_cols, info, by = join_by(original_name == variable)) if (length(unique(shift_cols$role)) != 1) { cli::cli_abort("not all roles are the same!", @@ -138,7 +138,7 @@ set_asof <- function(new_data, info) { as_of <- attributes(new_data)$metadata$as_of max_time <- max(time_values) # make sure the as_of is sane - if (!inherits(as_of, class(time_values))) { + if (!inherits(as_of, class(time_values)) & !inherits(as_of, "POSIXt")) { cli::cli_abort(glue::glue( "the data matrix `as_of` value is {as_of}, ", "and not a valid `time_type` with type ", diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 4d7cc0a0e..f0f7f2a2f 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -13,7 +13,6 @@ step_epi_lag( prefix = "lag_", default = NA, skip = FALSE, - latency_adjustment = c("None", "extend_ahead", 
"locf", "extend_lags"), id = rand_id("epi_lag") ) @@ -24,7 +23,6 @@ step_epi_ahead( role = "outcome", prefix = "ahead_", default = NA, - latency_adjustment = c("None", "extend_ahead", "locf", "extend_lags"), columns = NULL, skip = FALSE, id = rand_id("epi_ahead") @@ -56,13 +54,6 @@ conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} -\item{latency_adjustment}{a character. Determines the method by which the forecast handles data that doesn't extend to the day the forecast is made. The options are: -\itemize{ -\item \code{"extend_ahead"}: actually forecasts from the last date. E.g. if there are 3 days of latency for a 4 day ahead forecast, the ahead used in practice is actually 7. -\item \code{"locf"}: carries forward the last observed value up to the forecast date. -\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is -}} - \item{id}{A unique identifier for the step} } \value{ diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 28608de6d..08673a34a 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -34,10 +34,11 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5))) + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1022"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = real_x) + expect_equal( names(fit5$pre$mold$predictors), c( diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 
a9692b1e9..f364f4c68 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -8,9 +8,8 @@ old_data <- tibble( tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% as_epi_df(as_of = as_of) -old_data %>% tail() keys <- c("time_value", "geo_value") -old_data <- shift_cols %>% +old_data <- old_data %>% full_join(epi_shift_single( old_data, "tmp_death_rate", 1, "death_rate", keys ), by = keys) %>% From c48e81a851777cfe2b4bbc16aa5f6c040406af36 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 8 May 2024 16:27:48 -0500 Subject: [PATCH 20/92] moving shift detection earlier,dropping string*dep --- DESCRIPTION | 2 - NAMESPACE | 8 +- R/step_adjust_latency.R | 71 +++++++------- R/utils-latency.R | 95 +++++++++---------- man/adjust_name.Rd | 14 --- man/construct_shift_tibble.Rd | 12 +++ ..._tibble.Rd => get_latent_column_tibble.Rd} | 15 ++- man/step_adjust_latency.Rd | 9 +- tests/testthat/test-step_adjust_latency.R | 2 +- tests/testthat/test-utils_latency.R | 94 +++++++++--------- 10 files changed, 152 insertions(+), 170 deletions(-) delete mode 100644 man/adjust_name.Rd create mode 100644 man/construct_shift_tibble.Rd rename man/{get_shifted_column_tibble.Rd => get_latent_column_tibble.Rd} (79%) diff --git a/DESCRIPTION b/DESCRIPTION index e3aa871df..f7886f314 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,8 +41,6 @@ Imports: recipes (>= 1.0.4), rlang (>= 1.1.0), stats, - stringr, - stringi, tibble, tidyr, tidyselect, diff --git a/NAMESPACE b/NAMESPACE index 96dd0e67f..037b64ed3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -248,6 +248,7 @@ importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) +importFrom(dplyr,tibble) importFrom(dplyr,ungroup) importFrom(epiprocess,epi_slide) importFrom(epiprocess,growth_rate) @@ -261,10 +262,11 @@ importFrom(ggplot2,geom_line) importFrom(ggplot2,geom_linerange) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) 
+importFrom(glue,glue) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) -importFrom(magrittr,"%<>%") importFrom(magrittr,"%>%") +importFrom(purrr,map_lgl) importFrom(recipes,bake) importFrom(recipes,prep) importFrom(recipes,rand_id) @@ -273,7 +275,6 @@ importFrom(rlang,"!!") importFrom(rlang,"%@%") importFrom(rlang,"%||%") importFrom(rlang,":=") -importFrom(rlang,abort) importFrom(rlang,arg_match) importFrom(rlang,as_function) importFrom(rlang,caller_arg) @@ -298,11 +299,10 @@ importFrom(stats,predict) importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) -importFrom(stringi,stri_replace_all_regex) -importFrom(stringr,str_match) importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) +importFrom(tidyr,unnest) importFrom(vctrs,as_list_of) importFrom(vctrs,field) importFrom(vctrs,new_rcrd) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index d104b5aeb..844db4871 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -90,9 +90,9 @@ step_adjust_latency <- fixed_asof = NULL, default = NA, skip = FALSE, - prefix = NULL, columns = NULL, id = recipes::rand_id("epi_lag")) { + arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { cli::cli_abort("This recipe step can only operate on an `epi_recipe`.") } @@ -103,24 +103,28 @@ step_adjust_latency <- } method <- rlang::arg_match(method) + terms_used <- recipes_eval_select(enquos(...), recipe$template, recipe$term_info) + if (length(terms_used) == 0) { + terms_used <- recipe$term_info %>% + filter(role == "raw") %>% + pull(variable) + } if (method == "extend_ahead") { - prefix <- "ahead_" - if (!any(map_lgl( - recipe$steps, - function(recipe_step) inherits(recipe_step, "step_epi_ahead") - ))) { - cli:cli_abort("There is no `step_epi_ahead` defined before this. 
For the method `extend_ahead` of `step_adjust_latency`, at least one ahead must be previously defined.") - } + rel_step_type <- "step_epi_ahead" + shift_name <- "ahead" } else if (method == "extend_lags") { - prefix <- "lag_" - if (!any(map_lgl( - recipe$steps, - function(recipe_step) inherits(recipe_step, "step_epi_lag") - ))) { - cli:cli_abort("There is no `step_epi_lag` defined before this. For the method `extend_lags` of `step_adjust_latency`, at least one lag must be previously defined.") - } + rel_step_type <- "step_epi_lag" + shift_name <- "lag" + } + relevant_shifts <- construct_shift_tibble(terms_used, recipe, rel_step_type, shift_name) + + if (!any(map_lgl( + recipe$steps, + function(recipe_step) inherits(recipe_step, rel_step_type) + ))) { + cli:cli_abort("there is no `{rel_step_type}` defined before this. for the method `extend_{shift_name}` of `step_adjust_latency`, at least one {shift_name} must be previously defined.") } - arg_is_chr_scalar(prefix, id, method) + recipes::add_step( recipe, step_adjust_latency_new( @@ -130,8 +134,7 @@ step_adjust_latency <- trained = trained, as_of = fixed_asof, latency = fixed_latency, - shift_cols = NULL, - prefix = prefix, + shift_cols = relevant_shifts, default = default, keys = epi_keys(recipe), skip = skip, @@ -141,7 +144,7 @@ step_adjust_latency <- } step_adjust_latency_new <- - function(terms, role, trained, as_of, latency, shift_cols, prefix, time_type, default, + function(terms, role, trained, as_of, latency, shift_cols, time_type, default, keys, method, skip, id) { step( subclass = "adjust_latency", @@ -152,7 +155,6 @@ step_adjust_latency_new <- as_of = as_of, latency = latency, shift_cols = shift_cols, - prefix = prefix, default = default, keys = keys, skip = skip, @@ -163,6 +165,7 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. 
# TODO all of the shifting happens before NA removal, which saves all the data I might possibly want; I should probably add a bit that makes sure this operation is happening before NA removal so data doesn't get dropped #' @export +#' @importFrom glue glue prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { cli::cli_abort('If `method` is `"extend_ahead"`, then a step ", @@ -172,7 +175,6 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { "must have already added a predictor.') } - sign_shift <- get_sign(x) # get the columns used, even if it's all of them terms_used <- x$columns if (length(terms_used) == 0) { @@ -185,16 +187,18 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { # infer the correct columns to be working with from the previous # transformations - shift_cols <- get_shifted_column_tibble( - x$prefix, training, terms_used, - as_of, x$latency, sign_shift, info + x$prefix <- x$shift_cols$prefix[[1]] + sign_shift <- get_sign(x) + latency_cols <- get_latent_column_tibble( + x$shift_cols, training, as_of, + x$latency, sign_shift, info ) if ((x$method == "extend_ahead") || (x$method == "extend_lags")) { # check that the shift amount isn't too extreme - latency <- max(shift_cols$latency) + latency <- max(latency_cols$latency) time_type <- attributes(training)$metadata$time_type - i_latency <- which.max(shift_cols$latency) + i_latency <- which.max(latency_cols$latency) if ( (grepl("day", time_type) && (latency >= 10)) || (grepl("week", time_type) && (latency >= 4)) || @@ -203,26 +207,25 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ((time_type == "year") && (latency >= 1)) ) { cli::cli_warn(c( - "!" = glue::glue( + "!" 
= glue( "The shift has been adjusted by {latency}, ", "which is questionable for it's `time_type` of ", "{time_type}" ), - "i" = "input shift: {shift_cols$shifts[[i_latency]]}", - "i" = "latency adjusted shift: {shift_cols$effective_shift[[i_latency]]}", + "i" = "input shift: {latency_cols$shift[[i_latency]]}", + "i" = "latency adjusted shift: {latency_cols$effective_shift[[i_latency]]}", "i" = "max_time = {max_time} -> as_of = {as_of}" )) } } step_adjust_latency_new( - terms = shift_cols$original_name, - role = shift_cols$role[[1]], + terms = latency_cols$original_name, + role = latency_cols$role[[1]], trained = TRUE, - prefix = x$prefix, - shift_cols = shift_cols, + shift_cols = latency_cols, as_of = as_of, - latency = unique(shift_cols$latency), + latency = unique(latency_cols$latency), default = x$default, keys = x$keys, method = x$method, diff --git a/R/utils-latency.R b/R/utils-latency.R index 837397ddb..9a49a1b57 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -10,8 +10,8 @@ extend_either <- function(new_data, shift_cols, keys) { shifted <- shift_cols %>% - select(-any_of(c("shifts", "effective_shift", "type", "role", "source"))) %>% - pmap(\(original_name, latency, new_name) { + select(original_name, latency, new_name) %>% + pmap(function(original_name, latency, new_name) { epi_shift_single( x = new_data, col = original_name, @@ -20,7 +20,7 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% - map(\(x) zoo::na.trim(x)) %>% + map(function(x) zoo::na.trim(x)) %>% reduce( dplyr::full_join, by = keys @@ -34,6 +34,36 @@ extend_either <- function(new_data, shift_cols, keys) { dplyr::ungroup()) } +#' create a table of the columns to modify, their shifts, and their prefixes +#' @keywords internal +#' @importFrom dplyr tibble +#' @importFrom tidyr unnest +construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name) { + # for the right step types (either "step_epi_lag" or "step_epi_shift"), grab + # the 
useful parameters, including the evaluated column names + extract_named_rates <- function(recipe_step) { + if (inherits(recipe_step, rel_step_type)) { + recipe_columns <- recipes_eval_select(recipe_step$terms, recipe$template, recipe$term_info) + if (any(recipe_columns %in% terms_used)) { + return(list(term = recipe_columns, shift = recipe_step[shift_name], prefix = recipe_step$prefix)) + } + } + return(NULL) + } + rel_list <- recipe$steps %>% + purrr::map(extract_named_rates) %>% + unlist(recursive = FALSE) %>% + split(c("term", "shift", "prefix")) + relevant_shifts <- tibble( + terms = lapply(rel_list$term, unname), + shift = lapply(rel_list$shift, unname), + prefix = unname(unlist(rel_list$prefix)) + ) %>% + unnest(c(terms, shift)) %>% + unnest(shift) + return(relevant_shifts) +} + #' find the columns added with the lags or aheads, and the amounts they have #' been changed #' @param prefix the prefix indicating if we are adjusting lags or aheads @@ -41,53 +71,29 @@ extend_either <- function(new_data, shift_cols, keys) { #' @return a tibble with columns `column` (relevant shifted names), `shift` (the #' amount that one is shifted), `latency` (original columns difference between #' max_time_value and as_of (on a per-initial column basis)), -#' `effective_shift` (shifts+latency), and `new_name` (adjusted names with the +#' `effective_shift` (shift+latency), and `new_name` (adjusted names with the #' effective_shift) #' @keywords internal -#' @importFrom stringr str_match #' @importFrom dplyr rowwise %>% -#' @importFrom magrittr %<>% -get_shifted_column_tibble <- function( - prefix, new_data, terms_used, as_of, latency, +#' @importFrom purrr map_lgl +#' @importFrom glue glue +get_latent_column_tibble <- function( + shift_cols, new_data, as_of, latency, sign_shift, info, call = caller_env()) { - relevant_columns <- names(new_data)[grepl(prefix, names(new_data))] - to_keep <- rep(FALSE, length(relevant_columns)) - for (col_name in terms_used) { - to_keep <- to_keep | 
grepl(col_name, relevant_columns) - } - relevant_columns <- relevant_columns[to_keep] - if (length(relevant_columns) == 0) { - cli::cli_abort("There is no column(s) {terms_used}.", - current_column_names = names(new_data), - class = "epipredict_adjust_latency_nonexistent_column_used", - call = call - ) - } - # this pulls text that is any number of digits between two _, e.g. _3557_, and - # converts them to an integer - shift_amounts <- stringr::str_match(relevant_columns, "_(\\d+)_") %>% - `[`(, 2) %>% - as.integer() - - shift_cols <- dplyr::tibble( - original_name = relevant_columns, - shifts = shift_amounts - ) + shift_cols <- shift_cols %>% mutate(original_name = glue("{prefix}{shift}_{terms}")) if (is.null(latency)) { shift_cols <- shift_cols %>% rowwise() %>% # add the latencies to shift_cols mutate(latency = get_latency( - new_data, as_of, original_name, shifts, sign_shift + new_data, as_of, original_name, shift, sign_shift )) %>% ungroup() } else if (length(latency) > 1) { + # if latency has a length, we assign based on comparing the name in the list with the `terms` column shift_cols <- shift_cols %>% rowwise() %>% - mutate(latency = unname(latency[purrr::map_lgl( - names(latency), - \(x) grepl(x, original_name) - )])) %>% + mutate(latency = unname(latency[names(latency) == terms])) %>% ungroup() } else { shift_cols <- shift_cols %>% mutate(latency = latency) @@ -96,10 +102,10 @@ get_shifted_column_tibble <- function( # add the updated names to shift_cols shift_cols <- shift_cols %>% mutate( - effective_shift = shifts + abs(latency) + effective_shift = shift + abs(latency) ) %>% mutate( - new_name = adjust_name(prefix, original_name, effective_shift) + new_name = glue("{prefix}{effective_shift}_{terms}") ) info <- info %>% select(variable, type, role) shift_cols <- left_join(shift_cols, info, by = join_by(original_name == variable)) @@ -166,19 +172,6 @@ set_asof <- function(new_data, info) { return(as_of) } -#' adjust the shifts by latency for the names in 
column assumes e.g. -#' `"lag_6_case_rate"` and returns something like `"lag_10_case_rate"` -#' @keywords internal -#' @importFrom stringi stri_replace_all_regex -adjust_name <- function(prefix, column, effective_shift) { - pattern <- paste0(prefix, "\\d+", "_") - adjusted_shifts <- paste0(prefix, effective_shift, "_") - stringi::stri_replace_all_regex( - column, - pattern, adjusted_shifts - ) -} - #' the latency is also the amount the shift is off by #' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you #' need to shift the data to bring the 3 day lagged value to today. diff --git a/man/adjust_name.Rd b/man/adjust_name.Rd deleted file mode 100644 index 8cb6e5106..000000000 --- a/man/adjust_name.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-latency.R -\name{adjust_name} -\alias{adjust_name} -\title{adjust the shifts by latency for the names in column assumes e.g. -\code{"lag_6_case_rate"} and returns something like \code{"lag_10_case_rate"}} -\usage{ -adjust_name(prefix, column, effective_shift) -} -\description{ -adjust the shifts by latency for the names in column assumes e.g. 
-\code{"lag_6_case_rate"} and returns something like \code{"lag_10_case_rate"} -} -\keyword{internal} diff --git a/man/construct_shift_tibble.Rd b/man/construct_shift_tibble.Rd new file mode 100644 index 000000000..619583f1d --- /dev/null +++ b/man/construct_shift_tibble.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{construct_shift_tibble} +\alias{construct_shift_tibble} +\title{create a table of the columns to modify, their shifts, and their prefixes} +\usage{ +construct_shift_tibble(terms_used, recipe, rel_step_type, shift_name) +} +\description{ +create a table of the columns to modify, their shifts, and their prefixes +} +\keyword{internal} diff --git a/man/get_shifted_column_tibble.Rd b/man/get_latent_column_tibble.Rd similarity index 79% rename from man/get_shifted_column_tibble.Rd rename to man/get_latent_column_tibble.Rd index c66941e0e..cbf2c7fe1 100644 --- a/man/get_shifted_column_tibble.Rd +++ b/man/get_latent_column_tibble.Rd @@ -1,14 +1,13 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-latency.R -\name{get_shifted_column_tibble} -\alias{get_shifted_column_tibble} +\name{get_latent_column_tibble} +\alias{get_latent_column_tibble} \title{find the columns added with the lags or aheads, and the amounts they have been changed} \usage{ -get_shifted_column_tibble( - prefix, +get_latent_column_tibble( + shift_cols, new_data, - terms_used, as_of, latency, sign_shift, @@ -17,15 +16,15 @@ get_shifted_column_tibble( ) } \arguments{ -\item{prefix}{the prefix indicating if we are adjusting lags or aheads} - \item{new_data}{the data transformed so far} + +\item{prefix}{the prefix indicating if we are adjusting lags or aheads} } \value{ a tibble with columns \code{column} (relevant shifted names), \code{shift} (the amount that one is shifted), \code{latency} (original columns difference between max_time_value and as_of (on a per-initial column basis)), 
-\code{effective_shift} (shifts+latency), and \code{new_name} (adjusted names with the +\code{effective_shift} (shift+latency), and \code{new_name} (adjusted names with the effective_shift) } \description{ diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9e8efc377..4df945204 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -14,7 +14,6 @@ step_adjust_latency( fixed_asof = NULL, default = NA, skip = FALSE, - prefix = NULL, columns = NULL, id = recipes::rand_id("epi_lag") ) @@ -75,14 +74,14 @@ conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} -\item{prefix}{a character. The prefix matching the one used in either -\code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} -if \code{method="extend_lags"} or "locf".} - \item{columns}{A character string of column names to be adjusted; these should be the original columns, and not the derived ones} \item{id}{A unique identifier for the step} + +\item{prefix}{a character. 
The prefix matching the one used in either +\code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} +if \code{method="extend_lags"} or "locf".} } \value{ An updated version of \code{recipe} with the new step added to the diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 08673a34a..ecd4fa888 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -34,7 +34,7 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1022"), class = "simpleError") + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1024"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = real_x) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index f364f4c68..b96768b50 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -34,12 +34,24 @@ modified_data <- modified_data %>% tail() as_of - (modified_data %>% filter(!is.na(ahead_4_case_rate)) %>% pull(time_value) %>% max()) all_shift_cols <- tibble::tribble( - ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "lag_3_case_rate", 3, 5, 8, "lag_8_case_rate", "numeric", "predictor", - "lag_7_death_rate", 7, 4, 11, "lag_11_death_rate", "numeric", "predictor", - "ahead_4_case_rate", 4, -5, 9, "ahead_9_case_rate", "numeric", "outcome" + ~terms , ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate" , 3, "lag_", "lag_3_case_rate", 5, 8, "lag_8_case_rate", "numeric", "predictor", + "death_rate" , 7, "lag_", "lag_7_death_rate", 4, 
11, "lag_11_death_rate", "numeric", "predictor", + "case_rate" , 4, "ahead_", "ahead_4_case_rate", -5, 9, "ahead_9_case_rate", "numeric", "outcome" ) - +test_recipe <- epi_recipe(modified_data) %>% + step_epi_lag(case_rate, lag = c(3)) %>% + step_epi_lag(death_rate, lag = 7) %>% + step_epi_ahead(case_rate, ahead = 4) +shift_cols <- construct_shift_tibble(c("case_rate", "death_rate"), test_recipe, "step_epi_lag", "lag") +test_that("construct_shift_tibble constructs the right tibble", { + expected_shift_cols <- tibble::tribble( + ~terms, ~shift, ~prefix, + "case_rate", 3, "lag_", + "death_rate", 7, "lag_" + ) + expect_equal(shift_cols, expected_shift_cols) +}) test_that("get_latency works", { expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1), 4) @@ -54,23 +66,6 @@ test_that("get_latency works", { test_that("get_latency infers max_time to be the minimum `max time` across the columns", {}) -test_that("adjust_name works", { - expect_equal( - adjust_name("lag_", "lag_5_case_rate_13", 10), - "lag_10_case_rate_13" - ) - # it won't change a column with the wrong prefix - expect_equal( - adjust_name("lag_", "ahead_5_case_rate", 10), - "ahead_5_case_rate" - ) - # it works on vectors of names - expect_equal( - adjust_name("lag_", c("lag_5_floop_35", "lag_2342352_case"), c(10, 7)), - c("lag_10_floop_35", "lag_7_case") - ) -}) - test_that("get_asof works", { info <- tribble( ~variable, ~type, ~role, ~source, @@ -83,66 +78,63 @@ test_that("get_asof works", { expect_equal(set_asof(modified_data, info), as_of) }) -test_that("get_shifted_column_tibble infers latency and works correctly", { +test_that("get_latent_column_tibble infers latency and works correctly", { info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") - case_lag <- get_shifted_column_tibble( - "lag_", modified_data, "case_rate", - as_of, NULL, 1, info + + case_lag <- 
get_latent_column_tibble( + shift_cols[1,], modified_data, as_of, NULL, 1, info ) expect_equal(case_lag, all_shift_cols[1, ]) - death_lag <- get_shifted_column_tibble( - "lag_", modified_data, - "death_rate", as_of, NULL, 1, info + death_lag <- get_latent_column_tibble( + shift_cols[2,], modified_data, as_of, NULL, 1, info ) expect_equal(death_lag, all_shift_cols[2, ]) - both_lag <- get_shifted_column_tibble( - "lag_", modified_data, c("case_rate", "death_rate"), - as_of, NULL, 1, info + both_lag <- get_latent_column_tibble( + shift_cols, modified_data, as_of, NULL, 1, info ) expect_equal(both_lag, all_shift_cols[1:2, ]) }) -test_that("get_shifted_column_tibble assigns given latencies", { +test_that("get_latent_column_tibble assigns given latencies", { # non-null latency info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") - both_lag <- get_shifted_column_tibble( - "lag_", modified_data, - c("case_rate", "death_rate"), as_of, 50, 1, info + both_lag <- get_latent_column_tibble( + shift_cols, modified_data, as_of, 50, 1, info ) weird_latencies <- tibble::tribble( - ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "lag_3_case_rate", 3, 50, 53, "lag_53_case_rate", "numeric", "predictor", - "lag_7_death_rate", 7, 50, 57, "lag_57_death_rate", "numeric", "predictor", + ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", 50, 53, "lag_53_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", 50, 57, "lag_57_death_rate", "numeric", "predictor", ) expect_equal(both_lag, weird_latencies) # supposing we add the latencies by hand, and they're different, and in a different order weird_latencies <- tibble::tribble( - ~original_name, ~shifts, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "lag_3_case_rate", 3, 70, 
73, "lag_73_case_rate", "numeric", "predictor", - "lag_7_death_rate", 7, 30, 37, "lag_37_death_rate", "numeric", "predictor", + ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", 70, 73, "lag_73_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", 30, 37, "lag_37_death_rate", "numeric", "predictor", ) - both_lag <- get_shifted_column_tibble( - "lag_", modified_data, - c("case_rate", "death_rate"), as_of, c(death_rate = 30, case_rate = 70), 1, info + both_lag <- get_latent_column_tibble( + shift_cols, modified_data, as_of, c(death_rate = 30, case_rate = 70), 1, info ) expect_equal(both_lag, weird_latencies[1:2, ]) - case_ahead <- get_shifted_column_tibble( - "ahead_", modified_data, - "case_rate", as_of, NULL, -1, info + ahead_shift_cols <- construct_shift_tibble(c("case_rate"), test_recipe, "step_epi_ahead", "ahead") + case_ahead <- get_latent_column_tibble( + ahead_shift_cols, modified_data, as_of, NULL, -1, info ) expect_equal(case_ahead, all_shift_cols[3, ]) }) test_that("get_shifted_column_tibble objects to non-columns", { + non_shift_cols <- tibble(terms = "not_present", shift = 99, prefix = "lag_") expect_error( - get_shifted_column_tibble( - "lag_", modified_data, "not_present", as_of, NULL, 1, info + get_latent_column_tibble( + non_shift_cols, modified_data, as_of, NULL, 1, info ), - class = "epipredict_adjust_latency_nonexistent_column_used" + regexp = "Can't subset elements that don't exist" ) }) From 802837447d8a29349d58d734c048f7e2697add55 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 8 May 2024 16:33:08 -0500 Subject: [PATCH 21/92] +purrr, styling --- DESCRIPTION | 2 ++ tests/testthat/test-utils_latency.R | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index f7886f314..4f3b5597a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -40,6 +40,8 @@ Imports: magrittr, 
recipes (>= 1.0.4), rlang (>= 1.1.0), + purrr, + smoothqr, stats, tibble, tidyr, diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index b96768b50..1b8f93b9a 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -34,10 +34,10 @@ modified_data <- modified_data %>% tail() as_of - (modified_data %>% filter(!is.na(ahead_4_case_rate)) %>% pull(time_value) %>% max()) all_shift_cols <- tibble::tribble( - ~terms , ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate" , 3, "lag_", "lag_3_case_rate", 5, 8, "lag_8_case_rate", "numeric", "predictor", - "death_rate" , 7, "lag_", "lag_7_death_rate", 4, 11, "lag_11_death_rate", "numeric", "predictor", - "case_rate" , 4, "ahead_", "ahead_4_case_rate", -5, 9, "ahead_9_case_rate", "numeric", "outcome" + ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", 5, 8, "lag_8_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", 4, 11, "lag_11_death_rate", "numeric", "predictor", + "case_rate", 4, "ahead_", "ahead_4_case_rate", -5, 9, "ahead_9_case_rate", "numeric", "outcome" ) test_recipe <- epi_recipe(modified_data) %>% step_epi_lag(case_rate, lag = c(3)) %>% @@ -82,12 +82,12 @@ test_that("get_latent_column_tibble infers latency and works correctly", { info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") case_lag <- get_latent_column_tibble( - shift_cols[1,], modified_data, as_of, NULL, 1, info + shift_cols[1, ], modified_data, as_of, NULL, 1, info ) expect_equal(case_lag, all_shift_cols[1, ]) death_lag <- get_latent_column_tibble( - shift_cols[2,], modified_data, as_of, NULL, 1, info + shift_cols[2, ], modified_data, as_of, NULL, 1, info ) expect_equal(death_lag, all_shift_cols[2, ]) @@ 
-104,17 +104,17 @@ test_that("get_latent_column_tibble assigns given latencies", { shift_cols, modified_data, as_of, 50, 1, info ) weird_latencies <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", 50, 53, "lag_53_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", 50, 57, "lag_57_death_rate", "numeric", "predictor", + ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", 50, 53, "lag_53_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", 50, 57, "lag_57_death_rate", "numeric", "predictor", ) expect_equal(both_lag, weird_latencies) # supposing we add the latencies by hand, and they're different, and in a different order weird_latencies <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", 70, 73, "lag_73_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", 30, 37, "lag_37_death_rate", "numeric", "predictor", + ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", 70, 73, "lag_73_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", 30, 37, "lag_37_death_rate", "numeric", "predictor", ) both_lag <- get_latent_column_tibble( shift_cols, modified_data, as_of, c(death_rate = 30, case_rate = 70), 1, info From 4a0ed4815bd3ef17fcc9bc31249104a940431584 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 8 May 2024 16:41:07 -0500 Subject: [PATCH 22/92] glue -> glue::glue --- R/step_adjust_latency.R | 3 +-- R/utils-latency.R | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 844db4871..6467eb067 100644 
--- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -165,7 +165,6 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. # TODO all of the shifting happens before NA removal, which saves all the data I might possibly want; I should probably add a bit that makes sure this operation is happening before NA removal so data doesn't get dropped #' @export -#' @importFrom glue glue prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { cli::cli_abort('If `method` is `"extend_ahead"`, then a step ", @@ -207,7 +206,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ((time_type == "year") && (latency >= 1)) ) { cli::cli_warn(c( - "!" = glue( + "!" = glue::glue( "The shift has been adjusted by {latency}, ", "which is questionable for it's `time_type` of ", "{time_type}" diff --git a/R/utils-latency.R b/R/utils-latency.R index 9a49a1b57..5acf4644f 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -76,11 +76,10 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' @keywords internal #' @importFrom dplyr rowwise %>% #' @importFrom purrr map_lgl -#' @importFrom glue glue get_latent_column_tibble <- function( shift_cols, new_data, as_of, latency, sign_shift, info, call = caller_env()) { - shift_cols <- shift_cols %>% mutate(original_name = glue("{prefix}{shift}_{terms}")) + shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}")) if (is.null(latency)) { shift_cols <- shift_cols %>% rowwise() %>% @@ -105,7 +104,7 @@ get_latent_column_tibble <- function( effective_shift = shift + abs(latency) ) %>% mutate( - new_name = glue("{prefix}{effective_shift}_{terms}") + new_name = glue::glue("{prefix}{effective_shift}_{terms}") ) info <- info %>% select(variable, type, role) shift_cols <- left_join(shift_cols, info, by = join_by(original_name == variable)) From 
909e47ce2c25d126a68581936f175390493d386a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 8 May 2024 16:51:12 -0500 Subject: [PATCH 23/92] fix get_latent_column_tibble docs --- R/utils-latency.R | 5 ++++- man/get_latent_column_tibble.Rd | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/R/utils-latency.R b/R/utils-latency.R index 5acf4644f..21d81a217 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -66,8 +66,11 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' find the columns added with the lags or aheads, and the amounts they have #' been changed -#' @param prefix the prefix indicating if we are adjusting lags or aheads +#' @param shift_cols a list of columns to operate on, as created by `construct_shift_tibble` #' @param new_data the data transformed so far +#' @param as_of +#' @param latency `NULL`, int, or vector, as described in `step_eip_latency` +#' @param sign_shift -1 if ahead, 1 if lag #' @return a tibble with columns `column` (relevant shifted names), `shift` (the #' amount that one is shifted), `latency` (original columns difference between #' max_time_value and as_of (on a per-initial column basis)), diff --git a/man/get_latent_column_tibble.Rd b/man/get_latent_column_tibble.Rd index cbf2c7fe1..9c91182a1 100644 --- a/man/get_latent_column_tibble.Rd +++ b/man/get_latent_column_tibble.Rd @@ -16,9 +16,13 @@ get_latent_column_tibble( ) } \arguments{ +\item{shift_cols}{a list of columns to operate on, as created by \code{construct_shift_tibble}} + \item{new_data}{the data transformed so far} -\item{prefix}{the prefix indicating if we are adjusting lags or aheads} +\item{latency}{\code{NULL}, int, or vector, as described in \code{step_eip_latency}} + +\item{sign_shift}{-1 if ahead, 1 if lag} } \value{ a tibble with columns \code{column} (relevant shifted names), \code{shift} (the From 8639ebd06edee383cf284305ac7ae343cb3295ba Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 8 May 2024 
17:13:27 -0500 Subject: [PATCH 24/92] step_adjust_latency arg docs --- R/step_adjust_latency.R | 3 --- man/step_adjust_latency.Rd | 4 ---- 2 files changed, 7 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 6467eb067..c1af1f5cb 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -46,9 +46,6 @@ #' should be the original columns, and not the derived ones #' @param default Determines what fills empty rows #' left by leading/lagging (defaults to NA). -#' @param prefix a character. The prefix matching the one used in either -#' `step_epi_ahead` if `method="extend_ahead"` or `step_epi_lag` -#' if `method="extend_lags"` or "locf". #' @param skip A logical. Should the step be skipped when the #' recipe is baked by [bake()]? While all operations are baked #' when [prep()] is run, some operations may not be able to be diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 4df945204..e123769f0 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -78,10 +78,6 @@ the computations for subsequent operations.} should be the original columns, and not the derived ones} \item{id}{A unique identifier for the step} - -\item{prefix}{a character. 
The prefix matching the one used in either -\code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} -if \code{method="extend_lags"} or "locf".} } \value{ An updated version of \code{recipe} with the new step added to the From 55314a8cca2fd2276c2f2fac62aeab57a99b4867 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 13 May 2024 12:00:29 -0500 Subject: [PATCH 25/92] rec formatting things, dropping `purrr` --- NAMESPACE | 1 + R/epi_workflow.R | 10 ++++++---- R/step_adjust_latency.R | 20 ++++++++++++++------ R/step_growth_rate.R | 2 +- R/step_lag_difference.R | 2 +- R/utils-latency.R | 3 +-- man/get_latent_column_tibble.Rd | 2 ++ 7 files changed, 26 insertions(+), 14 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 037b64ed3..a93df107a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -267,6 +267,7 @@ importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") importFrom(purrr,map_lgl) +importFrom(quantreg,rq) importFrom(recipes,bake) importFrom(recipes,prep) importFrom(recipes,rand_id) diff --git a/R/epi_workflow.R b/R/epi_workflow.R index af4555303..fe9b656ba 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -187,10 +187,12 @@ augment.epi_workflow <- function(x, new_data, ...) { if (is_epi_df(predictions)) { join_by <- key_colnames(predictions) } else { - cli_abort(c( - "Cannot determine how to join new_data with the predictions.", - "Try converting new_data to an epi_df with `as_epi_df(new_data)`." - )) + cli::cli_abort( + c( + "Cannot determine how to join `new_data` with the `predictions`.", + "Try converting `new_data` to an {.cls epi_df} with `as_epi_df(new_data)`." 
+ ) + ) } complete_overlap <- intersect(names(new_data), join_by) if (length(complete_overlap) < length(join_by)) { diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index c1af1f5cb..0ed7d77bd 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -91,7 +91,7 @@ step_adjust_latency <- id = recipes::rand_id("epi_lag")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { - cli::cli_abort("This recipe step can only operate on an `epi_recipe`.") + cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") } if (!is.null(columns)) { cli::cli_abort(c("The `columns` argument must be `NULL`.", @@ -119,7 +119,11 @@ step_adjust_latency <- recipe$steps, function(recipe_step) inherits(recipe_step, rel_step_type) ))) { - cli:cli_abort("there is no `{rel_step_type}` defined before this. for the method `extend_{shift_name}` of `step_adjust_latency`, at least one {shift_name} must be previously defined.") + cli::cli_abort(glue::glue( + "There is no `{rel_step_type}` defined before this.", + " For the method `extend_{shift_name}` of `step_adjust_latency`,", + " at least one {shift_name} must be previously defined." + )) } recipes::add_step( @@ -164,11 +168,15 @@ step_adjust_latency_new <- #' @export prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { - cli::cli_abort('If `method` is `"extend_ahead"`, then a step ", - "must have already added an outcome .') + cli::cli_abort(glue::glue( + "If `method` is `\"extend_ahead\"`, then a step ", + "must have already added an outcome." + )) } else if (!("predictor" %in% info$role)) { - cli::cli_abort('If `method` is `"extend_lags"` or `"locf"`, then a step ", -"must have already added a predictor.') + cli::cli_abort(glue::glue( + "If `method` is `\"extend_lags\"` or `\"locf\"`, then a step ", + "must have already added a predictor." 
+ )) } # get the columns used, even if it's all of them diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index 06f8da4cf..70d0ac2ab 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -58,7 +58,7 @@ step_growth_rate <- arg_is_pos_int(horizon) arg_is_scalar(horizon) if (!is.null(replace_Inf)) { - if (length(replace_Inf) != 1L) cli_abort("replace_Inf must be a scalar.") + if (length(replace_Inf) != 1L) cli::cli_abort("`replace_Inf` must be a scalar.") if (!is.na(replace_Inf)) arg_is_numeric(replace_Inf) } arg_is_chr(role) diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 39ae1ba59..e8c7c36f7 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -32,7 +32,7 @@ step_lag_difference <- skip = FALSE, id = rand_id("lag_diff")) { if (!is_epi_recipe(recipe)) { - cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") + cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") } arg_is_pos_int(horizon) arg_is_chr(role) diff --git a/R/utils-latency.R b/R/utils-latency.R index 21d81a217..03d8c575f 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -68,7 +68,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' been changed #' @param shift_cols a list of columns to operate on, as created by `construct_shift_tibble` #' @param new_data the data transformed so far -#' @param as_of +#' @param as_of the forecast date #' @param latency `NULL`, int, or vector, as described in `step_eip_latency` #' @param sign_shift -1 if ahead, 1 if lag #' @return a tibble with columns `column` (relevant shifted names), `shift` (the @@ -78,7 +78,6 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' effective_shift) #' @keywords internal #' @importFrom dplyr rowwise %>% -#' @importFrom purrr map_lgl get_latent_column_tibble <- function( shift_cols, new_data, as_of, latency, sign_shift, info, call = caller_env()) { diff --git 
a/man/get_latent_column_tibble.Rd b/man/get_latent_column_tibble.Rd index 9c91182a1..168145ed8 100644 --- a/man/get_latent_column_tibble.Rd +++ b/man/get_latent_column_tibble.Rd @@ -20,6 +20,8 @@ get_latent_column_tibble( \item{new_data}{the data transformed so far} +\item{as_of}{the forecast date} + \item{latency}{\code{NULL}, int, or vector, as described in \code{step_eip_latency}} \item{sign_shift}{-1 if ahead, 1 if lag} From ce230ac021d3589efeffce16d5648846e4c039af Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 14 May 2024 16:59:47 -0500 Subject: [PATCH 26/92] glue->paste, dropping zoo --- DESCRIPTION | 3 +-- R/epi_recipe.R | 4 ++-- R/frosting.R | 2 +- R/layer_cdc_flatline_quantiles.R | 6 +++--- R/layer_residual_quantiles.R | 6 +++--- R/pivot_quantiles.R | 2 +- R/step_adjust_latency.R | 14 +++++++------- R/utils-latency.R | 8 ++++---- R/utils-misc.R | 2 +- R/utils-shift.R | 10 +++++----- man/extend_either.Rd | 4 +++- 11 files changed, 31 insertions(+), 30 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4f3b5597a..552857a36 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -48,8 +48,7 @@ Imports: tidyselect, tsibble, vctrs, - workflows (>= 1.0.0), - zoo + workflows (>= 1.0.0) Suggests: covidcast, data.table, diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 5c4de0c78..684642075 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -96,7 +96,7 @@ epi_recipe.epi_df <- ## Check and add roles when available if (!is.null(roles)) { if (length(roles) != length(vars)) { - cli::cli_abort(c( + cli::cli_abort(paste( "The number of roles should be the same as the number of ", "variables." )) @@ -456,7 +456,7 @@ prep.epi_recipe <- function( } skippers <- map_lgl(x$steps, recipes:::is_skipable) if (any(skippers) & !retain) { - cli::cli_warn(c( + cli::cli_warn(paste( "Since some operations have `skip = TRUE`, using ", "`retain = TRUE` will allow those steps results to ", "be accessible." 
diff --git a/R/frosting.R b/R/frosting.R index d7ba22902..13d2293a6 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -374,7 +374,7 @@ apply_frosting.epi_workflow <- } if (!has_postprocessor_frosting(workflow)) { - cli_warn(c( + cli::cli_warn(paste( "Only postprocessors of class {.cls frosting} are allowed.", "Returning unpostprocessed predictions." )) diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 8d16ba32f..daeaa1a3e 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -176,7 +176,7 @@ slather.layer_cdc_flatline_quantiles <- if (length(object$by_key) > 0L) { cols_in_preds <- hardhat::check_column_names(p, object$by_key) if (!cols_in_preds$ok) { - cli::cli_warn(c( + cli::cli_warn(paste( "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", "Ignoring these." )) @@ -184,7 +184,7 @@ slather.layer_cdc_flatline_quantiles <- if (inherits(the_fit, "_flatline")) { cols_in_resids <- hardhat::check_column_names(r, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(c( + cli::cli_warn(paste( "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) @@ -201,7 +201,7 @@ slather.layer_cdc_flatline_quantiles <- ) cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(c( + cli::cli_warn(paste( "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." 
)) diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index eae151905..257a951a9 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -102,7 +102,7 @@ slather.layer_residual_quantiles <- common <- intersect(object$by_key, names(key_cols)) excess <- setdiff(object$by_key, names(key_cols)) if (length(excess) > 0L) { - cli::cli_warn(c( + cli::cli_warn(paste( "Requested residual grouping key(s) {.val {excess}} are unavailable ", "in the original data. Grouping by the remainder: {.val {common}}." )) @@ -113,7 +113,7 @@ slather.layer_residual_quantiles <- if (length(common_in_r) == length(common)) { r <- left_join(key_cols, r, by = common_in_r) } else { - cli::cli_warn(c( + cli::cli_warn(paste( "Some grouping keys are not in data.frame returned by the", "`residuals()` method. Groupings may not be correct." )) @@ -168,7 +168,7 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - cli::cli_warn(c( + cli::cli_warn(paste( "The `residuals()` method for objects of class {.cls {cl}} results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index f014961e6..4abf9d257 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -71,7 +71,7 @@ pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) { .data <- .data %>% tidyr::unnest(all_of(col), names_sep = "_") } } else { - cli::cli_abort(c( + cli::cli_abort(paste( "Some selected columns contain different numbers of quantiles.", "The result would be a {.emph very} long {.cls tibble}.", "To do this anyway, rerun with `.ignore_length_check = TRUE`." 
diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 0ed7d77bd..14b646431 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -119,7 +119,7 @@ step_adjust_latency <- recipe$steps, function(recipe_step) inherits(recipe_step, rel_step_type) ))) { - cli::cli_abort(glue::glue( + cli::cli_abort(paste( "There is no `{rel_step_type}` defined before this.", " For the method `extend_{shift_name}` of `step_adjust_latency`,", " at least one {shift_name} must be previously defined." @@ -168,13 +168,13 @@ step_adjust_latency_new <- #' @export prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { - cli::cli_abort(glue::glue( - "If `method` is `\"extend_ahead\"`, then a step ", + cli::cli_abort(paste( + "If `method` is {.val extend_ahead}, then a step ", "must have already added an outcome." )) } else if (!("predictor" %in% info$role)) { - cli::cli_abort(glue::glue( - "If `method` is `\"extend_lags\"` or `\"locf\"`, then a step ", + cli::cli_abort(paste( + "If `method` is {.val extend_lags} or {.val locf}, then a step ", "must have already added a predictor." )) } @@ -210,8 +210,8 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ((time_type == "yearquarter") && (latency >= 1)) || ((time_type == "year") && (latency >= 1)) ) { - cli::cli_warn(c( - "!" = glue::glue( + cli::cli_warn(paste( + "!" 
= paste( "The shift has been adjusted by {latency}, ", "which is questionable for it's `time_type` of ", "{time_type}" diff --git a/R/utils-latency.R b/R/utils-latency.R index 03d8c575f..5fa4ce3fc 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -1,5 +1,6 @@ #' offset each relevant column by it's appropriate latency #' works for either adjusting aheads or lags +#' note that this may introduce new NA values when one column is shifted farther than another #' @param shift_cols a tibble which must have the columns `column`, the name of #' the column to adjust, `latency` the latency of the original column relative #' to the `as_of` date, `new_name`, the names in `column` adjusted by the @@ -20,7 +21,6 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% - map(function(x) zoo::na.trim(x)) %>% reduce( dplyr::full_join, by = keys @@ -146,7 +146,7 @@ set_asof <- function(new_data, info) { max_time <- max(time_values) # make sure the as_of is sane if (!inherits(as_of, class(time_values)) & !inherits(as_of, "POSIXt")) { - cli::cli_abort(glue::glue( + cli::cli_abort(paste( "the data matrix `as_of` value is {as_of}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", @@ -154,13 +154,13 @@ set_asof <- function(new_data, info) { )) } if (is.null(as_of) || is.na(as_of)) { - cli::cli_warn(glue::glue( + cli::cli_warn(paste( "epi_data's `as_of` was {as_of}, setting to ", "the latest time value, {max_time}." )) as_of <- max_time } else if (as_of < max_time) { - cli::cli_abort(glue::glue( + cli::cli_abort(paste( "`as_of` ({(as_of)}) is before the most ", "recent data ({max_time}). Remove before ", "predicting." diff --git a/R/utils-misc.R b/R/utils-misc.R index b4d1c28b7..8fc16a968 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -42,7 +42,7 @@ grab_forged_keys <- function(forged, workflow, new_data) { # 3. 
these are the keys in the test data as input new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value"))) if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) { - cli::cli_warn(c( + cli::cli_warn(paste( "Not all epi keys that were present in the training data are available", "in `new_data`. Predictions will have only the available keys." )) diff --git a/R/utils-shift.R b/R/utils-shift.R index 702a52308..75a9a6bd1 100644 --- a/R/utils-shift.R +++ b/R/utils-shift.R @@ -12,7 +12,7 @@ adjust_latency <- function(object, new_data) { } else if (method == "extend_ahead") { as_of <- attributes(new_data)$metadata$as_of if (FALSE && (typeof(as_of) != typeof(new_data$time_value))) { - cli::cli_abort(glue::glue( + cli::cli_abort(paste( "the data matrix `as_of` value is {as_of}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", @@ -27,13 +27,13 @@ adjust_latency <- function(object, new_data) { max_time <- max(time_values) shift_amount <- as.Date(as_of) - max_time if (is.null(as_of) || is.na(as_of)) { - cli::cli_warn(glue::glue( + cli::cli_warn(paste( "epi_data's `as_of` was {as_of}, setting to ", "the latest time value, {max_time}." )) as_of <- max_time } else if (as_of < max_time) { - cli::cli_abort(glue::glue( + cli::cli_abort(paste( "`as_of` ({(as_of)}) is before the most ", "recent data ({max_time}). Remove before ", "predicting." @@ -47,7 +47,7 @@ adjust_latency <- function(object, new_data) { ((time_type == "yearmonth") && (shift_amount >= 2)) || ((time_type == "yearquarter") && (shift_amount >= 1)) || ((time_type == "year") && (shift_amount >= 1))) { - cli::cli_warn(c( + cli::cli_warn(paste( "!" 
= glue::glue( "The ahead has been adjusted by {shift_amount}, ", "which is questionable for it's `time_type` of ", @@ -63,7 +63,7 @@ adjust_latency <- function(object, new_data) { cli::cli_abort("the `time_value` column of `new_data` is empty") } } else { - cli::cli_abort(glue::glue( + cli::cli_abort(paste( "Latency adjustment method {method} has not yet ", "been implemented for `step_epi_ahead`." )) diff --git a/man/extend_either.Rd b/man/extend_either.Rd index 8ec5ca38e..b7e306944 100644 --- a/man/extend_either.Rd +++ b/man/extend_either.Rd @@ -3,7 +3,8 @@ \name{extend_either} \alias{extend_either} \title{offset each relevant column by it's appropriate latency -works for either adjusting aheads or lags} +works for either adjusting aheads or lags +note that this may introduce new NA values when one column is shifted farther than another} \usage{ extend_either(new_data, shift_cols, keys) } @@ -20,5 +21,6 @@ latencies \code{latency}} \description{ offset each relevant column by it's appropriate latency works for either adjusting aheads or lags +note that this may introduce new NA values when one column is shifted farther than another } \keyword{internal} From c8f6b85170bdf783e9dc603d0c2c80e9135b61e2 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 16 May 2024 15:43:13 -0500 Subject: [PATCH 27/92] Detecting required/forbidden steps beforehand --- R/step_adjust_latency.R | 28 +++++++++++++---------- tests/testthat/test-step_adjust_latency.R | 23 +++++++++++++++++++ 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 14b646431..4d4c0ad0c 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -73,6 +73,7 @@ #' # step_adjust_latency(method = "extend_ahead") %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r +#' @importFrom recipes detect_step step_adjust_latency <- function(recipe, ..., @@ -98,6 +99,21 @@ step_adjust_latency <- i = "Use `tidyselect` methods to choose columns to 
lag." )) } + if ((method == "extend_ahead") && (!detect_step(recipe, "epi_ahead"))) { + cli::cli_abort( + "If `method` is {.val extend_ahead}, then a step + must have already added an outcome." + ) + } else if ((method == "extend_lags") && (!detect_step(recipe, "epi_lag"))) { + cli::cli_abort( + "If `method` is {.val extend_lags} or {.val locf}, then a step + must have already added a predictor." + ) + } + if (detect_step(recipe, "naomit")) { + cli::cli_abort("adjust_latency needs to occur before any `NA` removal, + as columns may be moved around") + } method <- rlang::arg_match(method) terms_used <- recipes_eval_select(enquos(...), recipe$template, recipe$term_info) @@ -164,20 +180,8 @@ step_adjust_latency_new <- } # lags introduces max(lags) NA's after the max_time_value. -# TODO all of the shifting happens before NA removal, which saves all the data I might possibly want; I should probably add a bit that makes sure this operation is happening before NA removal so data doesn't get dropped #' @export prep.step_adjust_latency <- function(x, training, info = NULL, ...) { - if ((x$method == "extend_ahead") && (!("outcome" %in% info$role))) { - cli::cli_abort(paste( - "If `method` is {.val extend_ahead}, then a step ", - "must have already added an outcome." - )) - } else if (!("predictor" %in% info$role)) { - cli::cli_abort(paste( - "If `method` is {.val extend_lags} or {.val locf}, then a step ", - "must have already added a predictor." 
- )) - } # get the columns used, even if it's all of them terms_used <- x$columns diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index ecd4fa888..afb6cedd9 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -132,4 +132,27 @@ test_that("epi_adjust_latency extend_ahead uses the same adjustment when predict test_that("epi_adjust_latency works for other time types", {}) +test_that("epi_adjust_latency insist there's steps before it", { + expect_error( + r5 <- epi_recipe(x) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(method = "extend_lags"), + regexp = "extend_lags" + ) + expect_error( + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(method = "extend_ahead"), + regexp = "extend_ahead" + ) +}) + +test_that("epi_adjust_latency warns against removing NA's beforehand", { + expect_error(r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_naomit() %>% + step_adjust_latency(method = "extend_lags"), + regexp = "adjust_latency needs to occur before any `NA` removal") +}) # todo check that epi_adjust_latency errors for nonsense `as_of`'s From 4927f0e9635f2574322ea73daf5c7594e228843f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 17 May 2024 11:41:45 -0500 Subject: [PATCH 28/92] minor rebase woes --- NAMESPACE | 1 + R/step_adjust_latency.R | 1 - tests/testthat/test-step_adjust_latency.R | 16 +++++++++------- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index a93df107a..0f8da385b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -269,6 +269,7 @@ importFrom(magrittr,"%>%") importFrom(purrr,map_lgl) importFrom(quantreg,rq) importFrom(recipes,bake) +importFrom(recipes,detect_step) importFrom(recipes,prep) importFrom(recipes,rand_id) importFrom(rlang,"!!!") diff --git 
a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 4d4c0ad0c..227b0ee74 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -182,7 +182,6 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. #' @export prep.step_adjust_latency <- function(x, training, info = NULL, ...) { - # get the columns used, even if it's all of them terms_used <- x$columns if (length(terms_used) == 0) { diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index afb6cedd9..f52888a68 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -141,18 +141,20 @@ test_that("epi_adjust_latency insist there's steps before it", { ) expect_error( r5 <- epi_recipe(x) %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_adjust_latency(method = "extend_ahead"), regexp = "extend_ahead" ) }) test_that("epi_adjust_latency warns against removing NA's beforehand", { - expect_error(r5 <- epi_recipe(x) %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% - step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_naomit() %>% - step_adjust_latency(method = "extend_lags"), - regexp = "adjust_latency needs to occur before any `NA` removal") + expect_error( + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_naomit() %>% + step_adjust_latency(method = "extend_lags"), + regexp = "adjust_latency needs to occur before any `NA` removal" + ) }) # todo check that epi_adjust_latency errors for nonsense `as_of`'s From 7752b17fd5605aa9421a4082c0bcc2aaba74e375 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 13:03:16 -0700 Subject: [PATCH 29/92] tests for utils-latency and accompanying fixes --- NAMESPACE | 1 + R/utils-latency.R | 1 + tests/testthat/test-utils_latency.R | 42 ++++++++++++++++++++++++++++- 3 files changed, 43 
insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 0f8da385b..1c8585356 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -302,6 +302,7 @@ importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) importFrom(tibble,as_tibble) +importFrom(tibble,is_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) importFrom(tidyr,unnest) diff --git a/R/utils-latency.R b/R/utils-latency.R index 5fa4ce3fc..6ab093fda 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -21,6 +21,7 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% + map(\(x) na.trim(x)) %>% # TODO need to talk about this reduce( dplyr::full_join, by = keys diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 1b8f93b9a..6fd3672aa 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -75,7 +75,7 @@ test_that("get_asof works", { "death_rate", "numeric", "raw", "original", "not_real", "numeric", "predictor", "derived" ) - expect_equal(set_asof(modified_data, info), as_of) + expect_equal(set_forecast_date(modified_data, info), as_of) }) test_that("get_latent_column_tibble infers latency and works correctly", { @@ -156,6 +156,11 @@ test_that("extend_either works", { epi_shift_single(old_data, "case_rate", -9, "ahead_9_case_rate", keys), by = keys ) %>% + dplyr::add_row(tibble( + geo_value = "place", + time_value = as.Date("2021-08-01"), case_rate = NA, death_rate = NA, + lag_8_case_rate = NA, lag_11_death_rate = NA, ahead_9_case_rate = NA + )) %>% arrange(time_value) expect_equal( extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value), @@ -163,4 +168,39 @@ test_that("extend_either works", { ) }) + + + + +time_range <- as.Date("2021-01-01") + 0:199 +x_adjust_ahead <- tibble( + geo_value = rep("place", 200), + time_value = time_range, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 
1:200) + 1 +) %>% + as_epi_df(as_of = max(time_range) + 3) +# confirm the delay is right + +test_that("adjust_latency extend_ahead works", { + # testing that POSIXct converts correctly (as well as basic types) + expect_equal( + attributes(x_adjust_ahead)$metadata$as_of - max(x_adjust_ahead$time_value), + as.difftime(3, units = "days") + ) + object <- list(latency_adjustment = "extend_ahead", ahead = 7) + expect_no_error(adjusted_ahead <- adjust_latency(object, x_adjust_ahead)) + expect_type(adjusted_ahead, "integer") + expect_equal(adjusted_ahead, 3 + 7) +}) + +test_that("extend_ahead warns in case of extreme adjustment", { + # warns if the ahead is relatively small + attributes(x_adjust_ahead)$metadata$as_of <- + max(x_adjust_ahead$time_value) + 100 + object <- list(latency_adjustment = "extend_ahead", ahead = 7) + attributes(x_adjust_ahead)$metadata$time_type + testthat::expect_warning(adjust_latency(object, x_adjust_ahead), regexp = "The ahead has been adjusted by 100") +}) + # todo case where somehow columns of different roles are selected From 8f3641b132f6971dabc62f0a529e1efc2d838a59 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 1 Apr 2024 13:59:18 -0700 Subject: [PATCH 30/92] adding stringr --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 552857a36..1c690792a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,6 +43,7 @@ Imports: purrr, smoothqr, stats, + stringr, tibble, tidyr, tidyselect, From ba0c4b8f83f63fb14ce6edd3cda3fd3d67989fed Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 3 May 2024 17:00:00 -0500 Subject: [PATCH 31/92] nothing but `rlang::abort` -> `cli::cli_abort`s --- R/epi_recipe.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 684642075..e4bcaaf91 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -431,7 +431,7 @@ prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, 
strings_as_factors = TRUE, ...) { if (is.null(training)) { - cli::cli_warn(c( + cli::cli_warn(paste( "!" = "No training data was supplied to {.fn prep}.", "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", "!" = "store the full template data in the object.", From 27694ef1a5da681734e6df47ea34410efc8e35f4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 8 May 2024 16:27:48 -0500 Subject: [PATCH 32/92] moving shift detection earlier,dropping string*dep --- DESCRIPTION | 1 - R/step_adjust_latency.R | 7 ++----- R/utils-latency.R | 1 - man/step_adjust_latency.Rd | 4 ++++ 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1c690792a..552857a36 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,6 @@ Imports: purrr, smoothqr, stats, - stringr, tibble, tidyr, tidyselect, diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 227b0ee74..6d8c3d3d3 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -135,11 +135,7 @@ step_adjust_latency <- recipe$steps, function(recipe_step) inherits(recipe_step, rel_step_type) ))) { - cli::cli_abort(paste( - "There is no `{rel_step_type}` defined before this.", - " For the method `extend_{shift_name}` of `step_adjust_latency`,", - " at least one {shift_name} must be previously defined." - )) + cli:cli_abort("there is no `{rel_step_type}` defined before this. for the method `extend_{shift_name}` of `step_adjust_latency`, at least one {shift_name} must be previously defined.") } recipes::add_step( @@ -181,6 +177,7 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. #' @export +#' @importFrom glue glue prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ # get the columns used, even if it's all of them terms_used <- x$columns diff --git a/R/utils-latency.R b/R/utils-latency.R index 6ab093fda..5fa4ce3fc 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -21,7 +21,6 @@ extend_either <- function(new_data, shift_cols, keys) { key_cols = keys ) }) %>% - map(\(x) na.trim(x)) %>% # TODO need to talk about this reduce( dplyr::full_join, by = keys diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index e123769f0..4df945204 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -78,6 +78,10 @@ the computations for subsequent operations.} should be the original columns, and not the derived ones} \item{id}{A unique identifier for the step} + +\item{prefix}{a character. The prefix matching the one used in either +\code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} +if \code{method="extend_lags"} or "locf".} } \value{ An updated version of \code{recipe} with the new step added to the From 3eab9c21bc44b5c2cd1af5f20b65008e5a915a06 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 13 May 2024 12:00:29 -0500 Subject: [PATCH 33/92] rec formatting things, dropping `purrr` --- NAMESPACE | 1 - R/step_adjust_latency.R | 6 +++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 1c8585356..2914e52a4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -266,7 +266,6 @@ importFrom(glue,glue) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") -importFrom(purrr,map_lgl) importFrom(quantreg,rq) importFrom(recipes,bake) importFrom(recipes,detect_step) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 6d8c3d3d3..7480ac96d 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -135,7 +135,11 @@ step_adjust_latency <- recipe$steps, function(recipe_step) inherits(recipe_step, rel_step_type) ))) { - cli:cli_abort("there is no `{rel_step_type}` defined before this. 
for the method `extend_{shift_name}` of `step_adjust_latency`, at least one {shift_name} must be previously defined.") + cli::cli_abort(glue::glue( + "There is no `{rel_step_type}` defined before this.", + " For the method `extend_{shift_name}` of `step_adjust_latency`,", + " at least one {shift_name} must be previously defined." + )) } recipes::add_step( From 7aa06e713f10d1f5a6cc12fb337712d74efd5174 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 15 May 2024 18:16:00 -0500 Subject: [PATCH 34/92] initial layer adjustments --- NAMESPACE | 1 + R/layer_add_forecast_date.R | 19 ++--- R/layer_add_target_date.R | 63 +++++++++++------ R/step_adjust_latency.R | 68 +++++++++++------- R/utils-latency.R | 70 +++++++++++++------ man/extend_either.Rd | 2 +- man/get_forecast_date_in_layer.Rd | 11 +++ man/get_latency.Rd | 2 +- man/get_latent_column_tibble.Rd | 8 +-- man/layer_add_forecast_date.Rd | 9 +-- man/layer_add_target_date.Rd | 55 ++++++++++----- man/set_asof.Rd | 12 ---- man/set_forecast_date.Rd | 12 ++++ man/step_adjust_latency.Rd | 48 ++++++++----- tests/testthat/_snaps/utils-shift.md | 13 ---- tests/testthat/test-layer_add_forecast_date.R | 28 ++++++++ tests/testthat/test-layer_add_target_date.R | 22 ++++++ tests/testthat/test-step_adjust_latency.R | 2 +- tests/testthat/test-utils-shift.R | 30 -------- 19 files changed, 299 insertions(+), 176 deletions(-) create mode 100644 man/get_forecast_date_in_layer.Rd delete mode 100644 man/set_asof.Rd create mode 100644 man/set_forecast_date.Rd delete mode 100644 tests/testthat/_snaps/utils-shift.md diff --git a/NAMESPACE b/NAMESPACE index 2914e52a4..6b316bca4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -314,3 +314,4 @@ importFrom(vctrs,vec_data) importFrom(vctrs,vec_ptype_abbr) importFrom(vctrs,vec_ptype_full) importFrom(vctrs,vec_recycle_common) +importFrom(workflows,extract_preprocessor) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 3ebc18cb1..41cda35a9 100644 --- a/R/layer_add_forecast_date.R 
+++ b/R/layer_add_forecast_date.R @@ -3,10 +3,11 @@ #' #' @param frosting a `frosting` postprocessor #' @param forecast_date The forecast date to add as a column to the `epi_df`. -#' For most cases, this should be specified in the form "yyyy-mm-dd". Note that -#' when the forecast date is left unspecified, it is set to the maximum time -#' value from the data used in pre-processing, fitting the model, and -#' postprocessing. +#' For most cases, this should be specified in the form "yyyy-mm-dd". Note +#' that when the forecast date is left unspecified, it is set to one of two +#' values. If there is a `step_adjust_latency` step present, it uses that +#' step's forecast date (by default the `as_of` of the data). Otherwise, it is +#' the maximum time value from the data used in pre-processing, fitting the model, and postprocessing. #' @param id a random id string #' #' @return an updated `frosting` postprocessor @@ -87,16 +88,16 @@ layer_add_forecast_date_new <- function(forecast_date, id) { } #' @export +#' @importFrom workflows extract_preprocessor slather.layer_add_forecast_date <- function(object, components, workflow, new_data, ...) { rlang::check_dots_empty() if (is.null(object$forecast_date)) { - max_time_value <- as.Date(max( - workflows::extract_preprocessor(workflow)$max_time_value, + forecast_date <- get_forecast_date_in_layer( + extract_preprocessor(workflow), workflow$fit$meta$max_time_value, - max(new_data$time_value) - )) - forecast_date <- max_time_value + new_data + ) } else { forecast_date <- object$forecast_date } diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 35bc84339..a1bdc3e4a 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -2,22 +2,26 @@ #' Postprocessing step to add the target date #' #' @param frosting a `frosting` postprocessor -#' @param target_date The target date to add as a column to the -#' `epi_df`. If there's a forecast date specified in a layer, then -#' it is the forecast date plus `ahead` (from `step_epi_ahead` in -#' the `epi_recipe`). 
Otherwise, it is the maximum `time_value` -#' (from the data used in pre-processing, fitting the model, and -#' postprocessing) plus `ahead`, where `ahead` has been specified in -#' preprocessing. The user may override these by specifying a -#' target date of their own (of the form "yyyy-mm-dd"). +#' @param target_date The target date to add as a column to the `epi_df`. If +#' there's a forecast date specified upstream (either in a +#' `step_adjust_latency` or in a `layer_add_forecast_date`), then it is the +#' forecast date plus `ahead` (from `step_epi_ahead` in the `epi_recipe`). +#' Otherwise, it is the maximum `time_value` (from the data used in +#' pre-processing, fitting the model, and postprocessing) plus `ahead`, where +#' `ahead` has been specified in preprocessing. The user may override these by +#' specifying a target date of their own (of the form "yyyy-mm-dd"). #' @param id a random id string #' #' @return an updated `frosting` postprocessor #' #' @details By default, this function assumes that a value for `ahead` #' has been specified in a preprocessing step (most likely in -#' `step_epi_ahead`). Then, `ahead` is added to the maximum `time_value` -#' in the test data to get the target date. +#' `step_epi_ahead`). Then, `ahead` is added to the `forecast_date` +#' in the test data to get the target date. `forecast_date` can be set in 3 ways: +#' 1. `step_adjust_latency`, which typically uses the training `epi_df`'s `as_of` +#' 2. `layer_add_forecast_date`, which inherits from 1 if not manually specified +#' 3. if none of those are the case, it is simply the maximum `time_value` over +#' every dataset used (prep, training, and prediction). 
#' #' @export #' @examples @@ -42,8 +46,14 @@ #' p <- forecast(wf1) #' p #' -#' # Use ahead + max time value from pre, fit, post -#' # which is the same if include `layer_add_forecast_date()` +#' # Use ahead + forecast_date from adjust_latency +#' # setting the `as_of` to something realistic +#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +#' r <- epi_recipe(jhu) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_adjust_latency(method = "extend_ahead") %>% +#' step_epi_naomit() #' f2 <- frosting() %>% #' layer_predict() %>% #' layer_add_target_date() %>% @@ -53,15 +63,26 @@ #' p2 <- forecast(wf2) #' p2 #' -#' # Specify own target date +#' # Use ahead + max time value from pre, fit, post +#' # which is the same if include `layer_add_forecast_date()` #' f3 <- frosting() %>% #' layer_predict() %>% -#' layer_add_target_date(target_date = "2022-01-08") %>% +#' layer_add_target_date() %>% #' layer_naomit(.pred) #' wf3 <- wf %>% add_frosting(f3) #' -#' p3 <- forecast(wf3) -#' p3 +#' p3 <- forecast(wf3) +#' p3 +#' +#' # Specify own target date +#' f4 <- frosting() %>% +#' layer_predict() %>% +#' layer_add_target_date(target_date = "2022-01-08") %>% +#' layer_naomit(.pred) +#' wf4 <- wf %>% add_frosting(f4) +#' +#' p4 <- forecast(wf4) +#' p4 layer_add_target_date <- function(frosting, target_date = NULL, id = rand_id("add_target_date")) { arg_is_chr_scalar(id) @@ -113,13 +134,13 @@ slather.layer_add_target_date <- function(object, components, workflow, ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") target_date <- forecast_date + ahead } else { - max_time_value <- as.Date(max( - workflows::extract_preprocessor(workflow)$max_time_value, + forecast_date <- get_forecast_date_in_layer( + extract_preprocessor(workflow), workflow$fit$meta$max_time_value, - max(new_data$time_value) - )) + new_data + ) ahead <- extract_argument(the_recipe, "step_epi_ahead", "ahead") - target_date <- 
max_time_value + ahead + target_date <- forecast_date + ahead } object$target_date <- target_date diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 7480ac96d..7de07a995 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -13,12 +13,12 @@ #' to set this manually, as the necessary adjustments will be done for the #' predictors and outcome. #' @param method a character. Determines the method by which the -#' forecast handles latency. All of these assume the forecast date is the -#' `as_of` of the `epi_df`. The options are: +#' forecast handles latency. The options are: #' - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last -#' observation results in a forecast `ahead` after the `as_of` date. E.g. if -#' there are 3 days of latency between the last observation and the `as_of` -#' date for a 4 day ahead forecast, the ahead used in practice is actually 7. +#' observation results in a forecast `ahead` after the `forecast_date` date. +#' E.g. if there are 3 days of latency between the last observation and the +#' `forecast_date` date for a 4 day ahead forecast, the ahead used in practice +#' is actually 7. #' - `"locf"`: carries forward the last observed value(s) up to the forecast #' date. See the Vignette TODO for equivalents using other steps and more #' sophisticated methods of extrapolation. @@ -27,16 +27,18 @@ #' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used #' become `c(3,10,17)`. #' @param fixed_latency either a positive integer, or a labeled positive integer -#' vector. Cannot be set at the same time as `fixed_asof`. If non-`NULL`, -#' the amount to offset the ahead or lag by. If a single integer, this is used -#' for all columns; if a labeled vector, the labels must correspond to the -#' base column names. If `NULL`, the latency is the distance between the -#' `epi_df`'s `max_time_value` and either the `fixed_asof` or the `epi_df`'s -#' `as_of` field. 
-#' @param fixed_asof either a date of the same kind used in the `epi_df`, or -#' NULL. Cannot be set at the same time as `fixed_latency`. If a date, it -#' gives the date from which the forecast is actually occurring. If `NULL`, -#' the `as_of` is determined either from `fixed_latency` or automatically. +#' vector. Cannot be set at the same time as `fixed_forecast_date`. If non-`NULL`, the +#' amount to offset the ahead or lag by. If a single integer, this is used for +#' all columns; if a labeled vector, the labels must correspond to the base +#' column names (before lags/aheads). If `NULL`, the latency is the distance +#' between the `epi_df`'s `max_time_value` and either the +#' `fixed_forecast_date` or the `epi_df`'s `as_of` field (the default for +#' `forecast_date`). +#' @param fixed_forecast_date either a date of the same kind used in the +#' `epi_df`, or `NULL`. Exclusive with `fixed_latency`. If a date, it gives +#' the date from which the forecast is actually occurring. If `NULL`, the +#' `forecast_date` is determined either via the `fixed_latency`, or is set to +#' the `epi_df`'s `as_of` value if `fixed_latency` is also `NULL`. #' @param role For model terms created by this step, what analysis role should #' they be assigned? `lag` is default a predictor while `ahead` is an outcome. 
#' It should be correctly inferred and not need setting @@ -68,11 +70,23 @@ #' @rdname step_adjust_latency #' @export #' @examples +#' jhu <- case_death_rate_subset %>% +#' dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +#' # setting the `as_of` to something realistic +#' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +#' #' r <- epi_recipe(case_death_rate_subset) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% -#' # step_adjust_latency(method = "extend_ahead") %>% +#' step_adjust_latency(method = "extend_ahead") %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r +#' +#' jhu_fit <- epi_workflow() %>% +#' add_epi_recipe(r) %>% +#' add_model(linear_reg()) %>% +#' fit(data = jhu) +#' jhu_fit +#' #' @importFrom recipes detect_step step_adjust_latency <- function(recipe, @@ -85,7 +99,7 @@ step_adjust_latency <- "extend_lags" ), fixed_latency = NULL, - fixed_asof = NULL, + fixed_forecast_date = NULL, default = NA, skip = FALSE, columns = NULL, @@ -149,7 +163,7 @@ step_adjust_latency <- role = role, method = method, trained = trained, - as_of = fixed_asof, + forecast_date = fixed_forecast_date, latency = fixed_latency, shift_cols = relevant_shifts, default = default, @@ -161,7 +175,7 @@ step_adjust_latency <- } step_adjust_latency_new <- - function(terms, role, trained, as_of, latency, shift_cols, time_type, default, + function(terms, role, trained, forecast_date, latency, shift_cols, time_type, default, keys, method, skip, id) { step( subclass = "adjust_latency", @@ -169,7 +183,7 @@ step_adjust_latency_new <- role = role, method = method, trained = trained, - as_of = as_of, + forecast_date = forecast_date, latency = latency, shift_cols = shift_cols, default = default, @@ -190,15 +204,15 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ filter(role == "raw") %>% pull(variable) } - # get and check the max_time and as_of are the right kinds of dates - as_of <- x$as_of %||% set_asof(training, info) + # get and check the max_time and forecast_date are the right kinds of dates + forecast_date <- x$forecast_date %||% set_forecast_date(training, info) # infer the correct columns to be working with from the previous # transformations x$prefix <- x$shift_cols$prefix[[1]] sign_shift <- get_sign(x) latency_cols <- get_latent_column_tibble( - x$shift_cols, training, as_of, + x$shift_cols, training, forecast_date, x$latency, sign_shift, info ) @@ -222,7 +236,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ), "i" = "input shift: {latency_cols$shift[[i_latency]]}", "i" = "latency adjusted shift: {latency_cols$effective_shift[[i_latency]]}", - "i" = "max_time = {max_time} -> as_of = {as_of}" + "i" = "`max_time` = {max_time} -> `forecast_date` = {forecast_date}" )) } } @@ -232,7 +246,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ role = latency_cols$role[[1]], trained = TRUE, shift_cols = latency_cols, - as_of = as_of, + forecast_date = forecast_date, latency = unique(latency_cols$latency), default = x$default, keys = x$keys, @@ -268,9 +282,9 @@ print.step_adjust_latency <- } else { terms <- x$terms } - if (!is.null(x$as_of)) { + if (!is.null(x$forecast_date)) { conj <- "with forecast date" - extra_text <- x$as_of + extra_text <- x$forecast_date } else if (!is.null(x$shift_cols)) { conj <- "with latencies" extra_text <- x$shift_cols diff --git a/R/utils-latency.R b/R/utils-latency.R index 5fa4ce3fc..9e0ef8db1 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -3,7 +3,7 @@ #' note that this may introduce new NA values when one column is shifted farther than another #' @param shift_cols a tibble which must have the columns `column`, the name of #' the column to adjust, `latency` the latency of the original column relative -#' to the `as_of` date, `new_name`, the names in `column` adjusted by the +#' to the `forecast_date`, `new_name`, the names in `column` adjusted by the #' latencies `latency` #' @param new_data just what is says #' @param keys the variables which are used as keys @@ -68,18 +68,18 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' been changed #' @param shift_cols a list of columns to operate on, as created by `construct_shift_tibble` #' @param new_data the data transformed so far -#' @param as_of the forecast date -#' @param latency `NULL`, int, or vector, as described in `step_eip_latency` +#' @param forecast_date the forecast date +#' @param latency `NULL`, int, or vector, as described in `step_epi_latency` #' @param sign_shift -1 if ahead, 1 if lag #' @return a tibble with columns `column` (relevant shifted names), `shift` (the #' amount that one is shifted), `latency` (original columns difference between -#' max_time_value and as_of (on a per-initial column basis)), +#' the max `time_value` and `forecast_date` (on a 
per-initial column basis)), #' `effective_shift` (shift+latency), and `new_name` (adjusted names with the #' effective_shift) #' @keywords internal #' @importFrom dplyr rowwise %>% get_latent_column_tibble <- function( - shift_cols, new_data, as_of, latency, + shift_cols, new_data, forecast_date, latency, sign_shift, info, call = caller_env()) { shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}")) if (is.null(latency)) { @@ -87,7 +87,7 @@ get_latent_column_tibble <- function( rowwise() %>% # add the latencies to shift_cols mutate(latency = get_latency( - new_data, as_of, original_name, shift, sign_shift + new_data, forecast_date, original_name, shift, sign_shift )) %>% ungroup() } else if (length(latency) > 1) { @@ -119,9 +119,9 @@ get_latent_column_tibble <- function( } -#' extract the as_of, and make sure there's nothing very off about it +#' Extract the as_of for the forecast date, and make sure there's nothing very off about it. #' @keywords internal -set_asof <- function(new_data, info) { +set_forecast_date <- function(new_data, info) { original_columns <- info %>% filter(source == "original") %>% pull(variable) @@ -142,45 +142,75 @@ set_asof <- function(new_data, info) { if (length(time_values) <= 0) { cli::cli_abort("the `time_value` column of `new_data` is empty") } - as_of <- attributes(new_data)$metadata$as_of + forecast_date <- attributes(new_data)$metadata$as_of max_time <- max(time_values) # make sure the as_of is sane - if (!inherits(as_of, class(time_values)) & !inherits(as_of, "POSIXt")) { + if (!inherits(forecast_date, class(time_values)) & !inherits(forecast_date, "POSIXt")) { cli::cli_abort(paste( - "the data matrix `as_of` value is {as_of}, ", + "the data matrix `forecast_date` value is {forecast_date}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", "{typeof(new_data$time_value)}." 
)) } - if (is.null(as_of) || is.na(as_of)) { + if (is.null(forecast_date) || is.na(forecast_date)) { cli::cli_warn(paste( - "epi_data's `as_of` was {as_of}, setting to ", + "epi_data's `forecast_date` was {forecast_date}, setting to ", "the latest time value, {max_time}." )) - as_of <- max_time - } else if (as_of < max_time) { + forecast_date <- max_time + } else if (forecast_date < max_time) { cli::cli_abort(paste( - "`as_of` ({(as_of)}) is before the most ", + "`forecast_date` ({(forecast_date)}) is before the most ", "recent data ({max_time}). Remove before ", "predicting." )) } # TODO cover the rest of the possible types for as_of and max_time... if (class(time_values) == "Date") { - as_of <- as.Date(as_of) + forecast_date <- as.Date(forecast_date) } - return(as_of) + return(forecast_date) } #' the latency is also the amount the shift is off by #' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you #' need to shift the data to bring the 3 day lagged value to today. #' @keywords internal -get_latency <- function(new_data, as_of, column, shift_amount, sign_shift) { +get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shift) { shift_max_date <- new_data %>% drop_na(all_of(column)) %>% pull(time_value) %>% max() - return(as.integer(sign_shift * (as_of - shift_max_date) + shift_amount)) + return(as.integer(sign_shift * (forecast_date - shift_max_date) + shift_amount)) +} + + + +#' get the forecast date while in a layer +get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) { + max_time_value <- max( + workflow_max_time_value, + this_recipe$max_time_value, + max(new_data$time_value) + ) + if (this_recipe %>% recipes::detect_step("adjust_latency")) { + # get the user-fixed `forecast_date` of any `adjust_latency` step, wherever it sits + handpicked_as_of <- map( + this_recipe$steps, + function(x) { + if (inherits(x, "step_adjust_latency")) x$forecast_date + } + ) %>% Filter(Negate(is.null), .) 
+ if (length(handpicked_as_of) > 0) { + max_time_value <- handpicked_as_of[[1]] + } else { + # if we haven't chosen one, use either the max_time_value or the as_of + max_time_value <- max( + max_time_value, + attributes(new_data)$metadata$as_of + ) + } + } + max_time_value } diff --git a/man/extend_either.Rd b/man/extend_either.Rd index b7e306944..ae55fa46a 100644 --- a/man/extend_either.Rd +++ b/man/extend_either.Rd @@ -13,7 +13,7 @@ extend_either(new_data, shift_cols, keys) \item{shift_cols}{a tibble which must have the columns \code{column}, the name of the column to adjust, \code{latency} the latency of the original column relative -to the \code{as_of} date, \code{new_name}, the names in \code{column} adjusted by the +to the \code{forecast_date}, \code{new_name}, the names in \code{column} adjusted by the latencies \code{latency}} \item{keys}{the variables which are used as keys} diff --git a/man/get_forecast_date_in_layer.Rd b/man/get_forecast_date_in_layer.Rd new file mode 100644 index 000000000..2f9e03548 --- /dev/null +++ b/man/get_forecast_date_in_layer.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_forecast_date_in_layer} +\alias{get_forecast_date_in_layer} +\title{get the forecast date while in a layer} +\usage{ +get_forecast_date_in_layer(this_recipe, workflow_max_time_value, new_data) +} +\description{ +get the forecast date while in a layer +} diff --git a/man/get_latency.Rd b/man/get_latency.Rd index d9098b456..4cde42ab6 100644 --- a/man/get_latency.Rd +++ b/man/get_latency.Rd @@ -4,7 +4,7 @@ \alias{get_latency} \title{the latency is also the amount the shift is off by} \usage{ -get_latency(new_data, as_of, column, shift_amount, sign_shift) +get_latency(new_data, forecast_date, column, shift_amount, sign_shift) } \arguments{ \item{sign_shift}{integer. 1 if lag and -1 if ahead. 
These represent how you diff --git a/man/get_latent_column_tibble.Rd b/man/get_latent_column_tibble.Rd index 168145ed8..3134a42ae 100644 --- a/man/get_latent_column_tibble.Rd +++ b/man/get_latent_column_tibble.Rd @@ -8,7 +8,7 @@ been changed} get_latent_column_tibble( shift_cols, new_data, - as_of, + forecast_date, latency, sign_shift, info, @@ -20,16 +20,16 @@ get_latent_column_tibble( \item{new_data}{the data transformed so far} -\item{as_of}{the forecast date} +\item{forecast_date}{the forecast date} -\item{latency}{\code{NULL}, int, or vector, as described in \code{step_eip_latency}} +\item{latency}{\code{NULL}, int, or vector, as described in \code{step_epi_latency}} \item{sign_shift}{-1 if ahead, 1 if lag} } \value{ a tibble with columns \code{column} (relevant shifted names), \code{shift} (the amount that one is shifted), \code{latency} (original columns difference between -max_time_value and as_of (on a per-initial column basis)), +the max \code{time_value} and \code{forecast_date} (on a per-initial column basis)), \code{effective_shift} (shift+latency), and \code{new_name} (adjusted names with the effective_shift) } diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index e27f2bacd..4c3336536 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -14,10 +14,11 @@ layer_add_forecast_date( \item{frosting}{a \code{frosting} postprocessor} \item{forecast_date}{The forecast date to add as a column to the \code{epi_df}. -For most cases, this should be specified in the form "yyyy-mm-dd". Note that -when the forecast date is left unspecified, it is set to the maximum time -value from the data used in pre-processing, fitting the model, and -postprocessing.} +For most cases, this should be specified in the form "yyyy-mm-dd". Note +that when the forecast date is left unspecified, it is set to one of two +values. 
If there is a \code{step_adjust_latency} step present, it uses the +\code{as_of} the maximum time value from the data used in pre-processing, +fitting the model, and postprocessing.} \item{id}{a random id string} } diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index dc0d2f190..e522cd6da 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -13,14 +13,14 @@ layer_add_target_date( \arguments{ \item{frosting}{a \code{frosting} postprocessor} -\item{target_date}{The target date to add as a column to the -\code{epi_df}. If there's a forecast date specified in a layer, then -it is the forecast date plus \code{ahead} (from \code{step_epi_ahead} in -the \code{epi_recipe}). Otherwise, it is the maximum \code{time_value} -(from the data used in pre-processing, fitting the model, and -postprocessing) plus \code{ahead}, where \code{ahead} has been specified in -preprocessing. The user may override these by specifying a -target date of their own (of the form "yyyy-mm-dd").} +\item{target_date}{The target date to add as a column to the \code{epi_df}. If +there's a forecast date specified upstream (either in a +\code{step_adjust_latency} or in a \code{layer_forecast_date}), then it is the +forecast date plus \code{ahead} (from \code{step_epi_ahead} in the \code{epi_recipe}). +Otherwise, it is the maximum \code{time_value} (from the data used in +pre-processing, fitting the model, and postprocessing) plus \code{ahead}, where +\code{ahead} has been specified in preprocessing. The user may override these by +specifying a target date of their own (of the form "yyyy-mm-dd").} \item{id}{a random id string} } @@ -33,8 +33,14 @@ Postprocessing step to add the target date \details{ By default, this function assumes that a value for \code{ahead} has been specified in a preprocessing step (most likely in -\code{step_epi_ahead}). Then, \code{ahead} is added to the maximum \code{time_value} -in the test data to get the target date. 
+\code{step_epi_ahead}). Then, \code{ahead} is added to the \code{forecast_date} +in the test data to get the target date. \code{forecast_date} can be set in 3 ways: +\enumerate{ +\item \code{step_adjust_latency}, which typically uses the training \code{epi_df}'s \code{as_of} +\item \code{layer_add_forecast_date}, which inherits from 1 if not manually specifed +\item if none of those are the case, it is simply the maximum \code{time_value} over +every dataset used (prep, training, and prediction). +} } \examples{ library(dplyr) @@ -58,8 +64,14 @@ wf1 <- wf \%>\% add_frosting(f) p <- forecast(wf1) p -# Use ahead + max time value from pre, fit, post -# which is the same if include `layer_add_forecast_date()` +# Use ahead + forecast_date from adjust_latency +# setting the `as_of` to something realistic +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 +r <- epi_recipe(jhu) \%>\% + step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% + step_adjust_latency(method = "extend_ahead") \%>\% + step_epi_naomit() f2 <- frosting() \%>\% layer_predict() \%>\% layer_add_target_date() \%>\% @@ -69,13 +81,24 @@ wf2 <- wf \%>\% add_frosting(f2) p2 <- forecast(wf2) p2 -# Specify own target date +# Use ahead + max time value from pre, fit, post +# which is the same if include `layer_add_forecast_date()` f3 <- frosting() \%>\% layer_predict() \%>\% - layer_add_target_date(target_date = "2022-01-08") \%>\% + layer_add_target_date() \%>\% layer_naomit(.pred) wf3 <- wf \%>\% add_frosting(f3) -p3 <- forecast(wf3) -p3 +p3 <- forecast(wf2) +p2 + +# Specify own target date +f4 <- frosting() \%>\% + layer_predict() \%>\% + layer_add_target_date(target_date = "2022-01-08") \%>\% + layer_naomit(.pred) +wf4 <- wf \%>\% add_frosting(f4) + +p4 <- forecast(wf4) +p4 } diff --git a/man/set_asof.Rd b/man/set_asof.Rd deleted file mode 100644 index fabf97c8a..000000000 --- a/man/set_asof.Rd +++ /dev/null @@ -1,12 +0,0 @@ -% Generated by roxygen2: do not 
edit by hand -% Please edit documentation in R/utils-latency.R -\name{set_asof} -\alias{set_asof} -\title{extract the as_of, and make sure there's nothing very off about it} -\usage{ -set_asof(new_data, info) -} -\description{ -extract the as_of, and make sure there's nothing very off about it -} -\keyword{internal} diff --git a/man/set_forecast_date.Rd b/man/set_forecast_date.Rd new file mode 100644 index 000000000..c238758c9 --- /dev/null +++ b/man/set_forecast_date.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{set_forecast_date} +\alias{set_forecast_date} +\title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.} +\usage{ +set_forecast_date(new_data, info) +} +\description{ +Extract the as_of for the forecast date, and make sure there's nothing very off about it. +} +\keyword{internal} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 4df945204..30832a17e 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -11,7 +11,7 @@ step_adjust_latency( trained = FALSE, method = c("extend_ahead", "locf", "extend_lags"), fixed_latency = NULL, - fixed_asof = NULL, + fixed_forecast_date = NULL, default = NA, skip = FALSE, columns = NULL, @@ -35,13 +35,13 @@ It should be correctly inferred and not need setting} been estimated.} \item{method}{a character. Determines the method by which the -forecast handles latency. All of these assume the forecast date is the -\code{as_of} of the \code{epi_df}. The options are: +forecast handles latency. The options are: \itemize{ \item \code{"extend_ahead"}: Lengthen the ahead so that forecasting from the last -observation results in a forecast \code{ahead} after the \code{as_of} date. E.g. if -there are 3 days of latency between the last observation and the \code{as_of} -date for a 4 day ahead forecast, the ahead used in practice is actually 7. 
+observation results in a forecast \code{ahead} after the \code{forecast_date} date. +E.g. if there are 3 days of latency between the last observation and the +\code{forecast_date} date for a 4 day ahead forecast, the ahead used in practice +is actually 7. \item \code{"locf"}: carries forward the last observed value(s) up to the forecast date. See the Vignette TODO for equivalents using other steps and more sophisticated methods of extrapolation. @@ -52,17 +52,19 @@ become \code{c(3,10,17)}. }} \item{fixed_latency}{either a positive integer, or a labeled positive integer -vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, -the amount to offset the ahead or lag by. If a single integer, this is used -for all columns; if a labeled vector, the labels must correspond to the -base column names. If \code{NULL}, the latency is the distance between the -\code{epi_df}'s \code{max_time_value} and either the \code{fixed_asof} or the \code{epi_df}'s -\code{as_of} field.} +vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, the +amount to offset the ahead or lag by. If a single integer, this is used for +all columns; if a labeled vector, the labels must correspond to the base +column names (before lags/aheads). If \code{NULL}, the latency is the distance +between the \code{epi_df}'s \code{max_time_value} and either the +\code{fixed_forecast_date} or the \code{epi_df}'s \code{as_of} field (the default for +\code{forecast_date}).} -\item{fixed_asof}{either a date of the same kind used in the \code{epi_df}, or -NULL. Cannot be set at the same time as \code{fixed_latency}. If a date, it -gives the date from which the forecast is actually occurring. If \code{NULL}, -the \code{as_of} is determined either from \code{fixed_latency} or automatically.} +\item{fixed_forecast_date}{either a date of the same kind used in the +\code{epi_df}, or \code{NULL}. Exclusive with \code{fixed_latency}. 
If a date, it gives +the date from which the forecast is actually occurring. If \code{NULL}, the +\code{forecast_date} is determined either via the \code{fixed_latency}, or is set to +the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NULL}.} \item{default}{Determines what fills empty rows left by leading/lagging (defaults to NA).} @@ -105,11 +107,23 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. } \examples{ +jhu <- case_death_rate_subset \%>\% + dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) +# setting the `as_of` to something realistic +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 + r <- epi_recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% - # step_adjust_latency(method = "extend_ahead") \%>\% + step_adjust_latency(method = "extend_ahead") \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) r + +jhu_fit <- epi_workflow() \%>\% + add_epi_recipe(r) \%>\% + add_model(linear_reg()) \%>\% + fit(data = jhu) +jhu_fit + } \seealso{ Other row operation steps: diff --git a/tests/testthat/_snaps/utils-shift.md b/tests/testthat/_snaps/utils-shift.md deleted file mode 100644 index b7c5f064f..000000000 --- a/tests/testthat/_snaps/utils-shift.md +++ /dev/null @@ -1,13 +0,0 @@ -# extend_ahead warns in case of extreme adjustment - - Code - adjust_latency(object, x_adjust_ahead) - Condition - Warning: - ! 
The ahead has been adjusted by 100, which is questionable for it's `time_type` of day - i input ahead: 7 - i shifted ahead: 107 - i max_time = 2021-07-19 -> as_of = 2021-10-27 - Output - [1] 107 - diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 428922f46..cad6e79bf 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -1,5 +1,7 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 + r <- epi_recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% @@ -67,6 +69,8 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { # p3 <- predict(wf3, latest), # "forecast_date is less than the most recent update date of the data." # ) + p3 <- predict(wf3, latest) + p3 expect_silent(p3 <- predict(wf3, latest)) expect_equal(ncol(p3), 4L) expect_s3_class(p3, "epi_df") @@ -75,6 +79,30 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { expect_named(p3, c("geo_value", "time_value", ".pred", "forecast_date")) }) +test_that("`layer_add_forecast_date()` infers correct date when using `adjust_latency`", { + r_latent <- epi_recipe(jhu) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + frost_latent <- frosting() %>% + layer_predict() %>% + layer_add_forecast_date() %>% + layer_naomit(.pred) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% + fit(jhu) %>% + add_frosting(frost_latent) + p_latent <- predict(wf_latent, latest) + expect_equal( + p_latent$forecast_date, + rep(as.Date("2022-01-03"), times = 3) + ) + expect_equal( + p_latent$forecast_date - 
p_latent$time_value, + as.difftime(rep(3, times = 3), units = "days") + ) +}) test_that("forecast date works for daily", { f <- frosting() %>% diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 53506ad07..72f3c1d7f 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -6,6 +6,7 @@ r <- epi_recipe(jhu) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 latest <- jhu %>% dplyr::filter(time_value >= max(time_value) - 14) @@ -39,6 +40,27 @@ test_that("Use ahead + max time value from pre, fit, post", { expect_equal(p2$target_date, rep(as.Date("2022-01-07"), times = 3)) expect_named(p2, c("geo_value", "time_value", ".pred", "forecast_date", "target_date")) }) +test_that("latency adjust doesn't interfere with correct target date", { + r_latent <- epi_recipe(jhu) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_naomit(all_predictors()) %>% + step_naomit(all_outcomes(), skip = TRUE) + wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% fit(jhu) + f_latent <- frosting() %>% + layer_predict() %>% + layer_add_target_date() %>% + layer_naomit(.pred) + wf_latent <- wf_latent %>% add_frosting(f_latent) + + expect_silent(p_latent <- predict(wf_latent, latest)) + expect_equal(ncol(p_latent), 4L) + expect_s3_class(p_latent, "epi_df") + expect_equal(nrow(p_latent), 3L) + expect_equal(p_latent$target_date, rep(as.Date("2022-01-10"), times = 3)) + expect_named(p_latent, c("geo_value", "time_value", ".pred", "target_date")) +}) test_that("Use ahead + specified forecast date", { f <- frosting() %>% diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 
f52888a68..61fe249c1 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -34,7 +34,7 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1024"), class = "simpleError") + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1031"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = real_x) diff --git a/tests/testthat/test-utils-shift.R b/tests/testthat/test-utils-shift.R index 53788f8b8..e69de29bb 100644 --- a/tests/testthat/test-utils-shift.R +++ b/tests/testthat/test-utils-shift.R @@ -1,30 +0,0 @@ -time_range <- as.Date("2021-01-01") + 0:199 -x_adjust_ahead <- tibble( - geo_value = rep("place", 200), - time_value = time_range, - case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, - death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 -) %>% - as_epi_df(as_of = max(time_range) + 3) -# confirm the delay is right - -test_that("adjust_latency extend_ahead works", { - # testing that POSIXct converts correctly (as well as basic types) - expect_equal( - attributes(x_adjust_ahead)$metadata$as_of - max(x_adjust_ahead$time_value), - as.difftime(3, units = "days") - ) - object <- list(latency_adjustment = "extend_ahead", ahead = 7) - expect_no_error(adjusted_ahead <- adjust_latency(object, x_adjust_ahead)) - expect_type(adjusted_ahead, "integer") - expect_equal(adjusted_ahead, 3 + 7) -}) - -test_that("extend_ahead warns in case of extreme adjustment", { - # warns if the ahead is relatively small - attributes(x_adjust_ahead)$metadata$as_of <- - max(x_adjust_ahead$time_value) + 100 - object <- list(latency_adjustment = "extend_ahead", ahead = 7) - 
attributes(x_adjust_ahead)$metadata$time_type - expect_snapshot(adjust_latency(object, x_adjust_ahead)) -}) From be3474c56eefd07007f2328aae10214451048f18 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 17 May 2024 12:32:26 -0500 Subject: [PATCH 35/92] namespace and doc fixes --- NAMESPACE | 2 -- man/step_adjust_latency.Rd | 4 ---- 2 files changed, 6 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6b316bca4..bcf310888 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -266,7 +266,6 @@ importFrom(glue,glue) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") -importFrom(quantreg,rq) importFrom(recipes,bake) importFrom(recipes,detect_step) importFrom(recipes,prep) @@ -301,7 +300,6 @@ importFrom(stats,qnorm) importFrom(stats,quantile) importFrom(stats,residuals) importFrom(tibble,as_tibble) -importFrom(tibble,is_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) importFrom(tidyr,unnest) diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 30832a17e..b503f3eeb 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -80,10 +80,6 @@ the computations for subsequent operations.} should be the original columns, and not the derived ones} \item{id}{A unique identifier for the step} - -\item{prefix}{a character. 
The prefix matching the one used in either -\code{step_epi_ahead} if \code{method="extend_ahead"} or \code{step_epi_lag} -if \code{method="extend_lags"} or "locf".} } \value{ An updated version of \code{recipe} with the new step added to the From 6c158ce6c60fbe89f3443da62eb9db48e7ebef8c Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 17 May 2024 13:20:22 -0500 Subject: [PATCH 36/92] full rebase fixes --- NAMESPACE | 1 + R/epi_recipe.R | 1 - R/step_adjust_latency.R | 2 +- man/bake.step_adjust_latency.Rd | 1 - tests/testthat/test-step_adjust_latency.R | 4 ++-- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index bcf310888..29b1d9591 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,7 @@ S3method(autoplot,canned_epipred) S3method(autoplot,epi_workflow) S3method(bake,check_enough_train_data) S3method(bake,epi_recipe) +S3method(bake,step_adjust_latency) S3method(bake,step_epi_ahead) S3method(bake,step_epi_lag) S3method(bake,step_epi_slide) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index e4bcaaf91..edb7f352b 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -583,7 +583,6 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { new_data } - kill_levels <- function(x, keys) { for (i in which(names(x) %in% keys)) x[[i]] <- list(values = NA, ordered = NA) x diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 7de07a995..763c893a9 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -264,7 +264,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { #' @param new_data assumes that this already has lag/ahead columns that we need #' to adjust #' @importFrom dplyr %>% pull -#' @keywords internal +#' @export bake.step_adjust_latency <- function(object, new_data, ...) 
{ if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { keys <- object$keys diff --git a/man/bake.step_adjust_latency.Rd b/man/bake.step_adjust_latency.Rd index edb4f1f6e..dac3f5509 100644 --- a/man/bake.step_adjust_latency.Rd +++ b/man/bake.step_adjust_latency.Rd @@ -15,4 +15,3 @@ to adjust} adjust the ahead so that we will be predicting \code{ahead} days after the \code{as_of} date, rather than relative to the last day of data } -\keyword{internal} diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 61fe249c1..f08652ca5 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -6,7 +6,7 @@ x <- tibble( case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% - as_epi_df() + as_epi_df(as_of = as.POSIXct("2024-05-17")) max_time <- max(x$time_value) class(attributes(x)$metadata$as_of) as_of <- attributes(x)$metadata$as_of @@ -34,7 +34,7 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1031"), class = "simpleError") + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1033"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = real_x) From 4f717155c69dd0896ef56e9efb0751d1b9c48068 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 17 May 2024 17:26:35 -0500 Subject: [PATCH 37/92] adding latency adjusting to arx_forecaster --- R/arx_forecaster.R | 48 +++++++++++++++++++++++++++------ R/layer_add_forecast_date.R | 7 ++--- R/utils-latency.R | 2 +- tests/testthat/test-snapshots.R | 20 ++++++++++++++ 4 files changed, 63 
insertions(+), 14 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 7b0d84f24..3bcf02586 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -54,7 +54,7 @@ arx_forecaster <- function( preds <- forecast( wf, - fill_locf = TRUE, + fill_locf = is.null(args_list$adjust_latency), n_recent = args_list$nafill_buffer, forecast_date = args_list$forecast_date %||% max(epi_data$time_value) ) %>% @@ -120,6 +120,13 @@ arx_fcast_epi_workflow <- function( if (!(is.null(trainer) || is_regression(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'regression'.") } + # forecast_date is first what they set; + # if they don't and they're not adjusting latency, it defaults to the max time_value + # if they are adjusting latency, it defaults to the `as_of` of `epi_data` + forecast_date <- args_list$forecast_date %||% + if (is.null(args_list$adjust_latency)) max(epi_data$time_value) else attributes(epi_data)$metadata$as_of + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor @@ -130,6 +137,26 @@ arx_fcast_epi_workflow <- function( } r <- r %>% step_epi_ahead(!!outcome, ahead = args_list$ahead) %>% + { + method <- args_list$adjust_latency + if (!is.null(method)) { + if (method == "extend_ahead") { + step_adjust_latency(., + all_outcomes(), + fixed_forecast_date = forecast_date, + method = method + ) + } else if (method == "extend_lags") { + step_adjust_latency(., + all_predictors(), + fixed_forecast_date = forecast_date, + method = method + ) + } + } else { + .
+ } + } %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) @@ -145,8 +172,6 @@ arx_fcast_epi_workflow <- function( } - forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) - target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) # --- postprocessor f <- frosting() %>% layer_predict() # %>% layer_naomit() @@ -188,10 +213,15 @@ arx_fcast_epi_workflow <- function( #' @param n_training Integer. An upper limit for the number of rows per #' key that are used for training #' (in the time unit of the `epi_df`). -#' @param forecast_date Date. The date on which the forecast is created. -#' The default `NULL` will attempt to determine this automatically. -#' @param target_date Date. The date for which the forecast is intended. -#' The default `NULL` will attempt to determine this automatically. +#' @param forecast_date Date. The date on which the forecast is created. The +#' default `NULL` will attempt to determine this automatically either as the +#' max time value if there is no latency adjustment, or as the `as_of` of +#' `epi_data` if `adjust_latency` is non-`NULL`. +#' @param target_date Date. The date for which the forecast is intended. The +#' default `NULL` will attempt to determine this automatically as +#' `forecast_date + ahead`. +#' @param adjust_latency Character or `NULL`. one of the `method`s of +#' `step_adjust_latency`, or `NULL` (in which case there is no adjustment). #' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce #' prediction intervals. These are created by computing the quantiles of #' training residuals. A `NULL` value will result in point forecasts only. 
@@ -237,6 +267,7 @@ arx_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = NULL, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -252,7 +283,7 @@ arx_args_list <- function( arg_is_scalar(ahead, n_training, symmetrize, nonneg) arg_is_chr(quantile_by_key, allow_empty = TRUE) - arg_is_scalar(forecast_date, target_date, allow_null = TRUE) + arg_is_scalar(forecast_date, target_date, adjust_latency, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags) arg_is_lgl(symmetrize, nonneg) @@ -281,6 +312,7 @@ arx_args_list <- function( quantile_levels, forecast_date, target_date, + adjust_latency, symmetrize, nonneg, max_lags, diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 41cda35a9..43b7d7bd4 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -92,15 +92,12 @@ layer_add_forecast_date_new <- function(forecast_date, id) { slather.layer_add_forecast_date <- function(object, components, workflow, new_data, ...) 
{ rlang::check_dots_empty() - if (is.null(object$forecast_date)) { - forecast_date <- get_forecast_date_in_layer( + forecast_date <- object$forecast_date %||% + get_forecast_date_in_layer( extract_preprocessor(workflow), workflow$fit$meta$max_time_value, new_data ) - } else { - forecast_date <- object$forecast_date - } expected_time_type <- attr( workflows::extract_preprocessor(workflow)$template, "metadata" diff --git a/R/utils-latency.R b/R/utils-latency.R index 9e0ef8db1..7f87bece6 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -182,7 +182,7 @@ get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shif drop_na(all_of(column)) %>% pull(time_value) %>% max() - return(as.integer(sign_shift * (forecast_date - shift_max_date) + shift_amount)) + return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date) + shift_amount)) } diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index d624a4c21..ebfb38ca2 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -68,6 +68,26 @@ test_that("arx_forecaster snapshots", { ) ) expect_snapshot_tibble(arx2$predictions) + attributes(train_data)$metadata$as_of <- max(train_data$time_value) + 5 + arx3 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "extend_ahead" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) + expect_equal(arx3$predictions$target_date, + rep(attributes(train_data)$metadata$as_of + 1, times = 6)) + expect_equal(arx3$predictions$target_date, + arx2$predictions$target_date + 5) + expect_equal(arx3$predictions$forecast_date, + arx2$predictions$forecast_date + 5) + # not the same predictions + expect_false(all(arx2$predictions == arx3$predictions)) }) test_that("arx_classifier snapshots", { From e102d413938ab607a633797fabf13f42b43b1920 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 
17 May 2024 17:34:47 -0500 Subject: [PATCH 38/92] arx_classifier more or less free --- R/arx_classifier.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index ca6a3537b..97c4deed1 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -57,7 +57,7 @@ arx_classifier <- function( preds <- forecast( wf, - fill_locf = TRUE, + fill_locf = is.null(args_list$adjust_latency), n_recent = args_list$nafill_buffer, forecast_date = args_list$forecast_date %||% max(epi_data$time_value) ) %>% From 65535c53cf182448916e28500123b8bdc5cae3a4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 17 May 2024 17:43:11 -0500 Subject: [PATCH 39/92] formatting and snapshots --- tests/testthat/_snaps/snapshots.md | 28 ++++++++++++++++++++++++++++ tests/testthat/test-snapshots.R | 18 ++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 84abf57d2..c103ffe0e 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1031,6 +1031,34 @@ 18993, 18993, 18993, 18993, 18993), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, + 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( + structure(list(values = c("5%" = 0.136509784083987, "95%" = 0.469979623951498 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.364597933377326, "95%" = 0.698067773244837 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.422093024752224, "95%" = 0.755562864619735 + ), quantile_levels = c(0.05, 0.95)), class = 
c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.821955329282474, "95%" = 1.15542516914998 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.628067077067883, "95%" = 0.961536916935394 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c("5%" = 0.140160537291566, "95%" = 0.473630377159077 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr"))), class = c("distribution", + "vctrs_vctr", "list")), forecast_date = structure(c(18997, 18997, + 18997, 18997, 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + # arx_classifier snapshots structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index ebfb38ca2..25bc13bde 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -80,12 +80,18 @@ test_that("arx_forecaster snapshots", { ) # consistency check expect_snapshot_tibble(arx3$predictions) - expect_equal(arx3$predictions$target_date, - rep(attributes(train_data)$metadata$as_of + 1, times = 6)) - expect_equal(arx3$predictions$target_date, - arx2$predictions$target_date + 5) - expect_equal(arx3$predictions$forecast_date, - arx2$predictions$forecast_date + 5) + expect_equal( + arx3$predictions$target_date, + rep(attributes(train_data)$metadata$as_of + 1, times = 6) + ) + expect_equal( + arx3$predictions$target_date, + arx2$predictions$target_date + 5 + ) + expect_equal( + arx3$predictions$forecast_date, + arx2$predictions$forecast_date + 5 + ) # not the same predictions expect_false(all(arx2$predictions == 
arx3$predictions)) }) From 5d5cfbb1e5963c6d7ed2e5a3710182de4a3fe6aa Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 22 May 2024 18:05:36 -0500 Subject: [PATCH 40/92] updated man pages --- man/arx_args_list.Rd | 15 +++++++++++---- man/arx_class_args_list.Rd | 11 +++++++---- man/cdc_baseline_args_list.Rd | 6 ++++-- man/flatline_args_list.Rd | 11 +++++++---- 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index c9ae4e733..68d445872 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -10,6 +10,7 @@ arx_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = NULL, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -32,11 +33,17 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date on which the forecast is created. The +default \code{NULL} will attempt to determine this automatically either as the +max time value if there is no latency adjustment, or as the \code{as_of} of +\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date for which the forecast is intended. The +default \code{NULL} will attempt to determine this automatically as +\code{forecast_date + ahead}.} + +\item{adjust_latency}{Character or \code{NULL}. one of the \code{method}s of +\code{step_adjust_latency}, or \code{NULL} (in which case there is no adjustment).} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. 
These are created by computing the quantiles of diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 311950d62..2c780e822 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -34,11 +34,14 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date on which the forecast is created. The +default \code{NULL} will attempt to determine this automatically either as the +max time value if there is no latency adjustment, or as the \code{as_of} of +\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date for which the forecast is intended. The +default \code{NULL} will attempt to determine this automatically as +\code{forecast_date + ahead}.} \item{outcome_transform}{Scalar character. Whether the outcome should be created using growth rates (as the predictors are) or lagged diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 2f9300572..981c3c7e5 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -34,8 +34,10 @@ set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_ key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date on which the forecast is created. 
The +default \code{NULL} will attempt to determine this automatically either as the +max time value if there is no latency adjustment, or as the \code{as_of} of +\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 059dfa038..633d45020 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -29,11 +29,14 @@ So for example, \code{ahead = 7} will create residuals by comparing values key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. -The default \code{NULL} will attempt to determine this automatically.} +\item{forecast_date}{Date. The date on which the forecast is created. The +default \code{NULL} will attempt to determine this automatically either as the +max time value if there is no latency adjustment, or as the \code{as_of} of +\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} -\item{target_date}{Date. The date for which the forecast is intended. -The default \code{NULL} will attempt to determine this automatically.} +\item{target_date}{Date. The date for which the forecast is intended. The +default \code{NULL} will attempt to determine this automatically as +\code{forecast_date + ahead}.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. 
These are created by computing the quantiles of From 084acb685c68a492c9e83425a42d2c3bc1c034a5 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 23 May 2024 20:57:35 -0500 Subject: [PATCH 41/92] group_by options to get the max_time_value --- R/step_adjust_latency.R | 20 +++++-- R/utils-latency.R | 47 +++++++++++------ tests/testthat/test-step_adjust_latency.R | 64 +++++++++++++++++++++++ tests/testthat/test-utils_latency.R | 51 +++++++++++------- 4 files changed, 144 insertions(+), 38 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 763c893a9..38af9477b 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -26,6 +26,16 @@ #' the shortest lag at predict time is at the last observation. E.g. if the #' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used #' become `c(3,10,17)`. +#' @param epi_keys_checked a character vector. A list of keys to group by before +#' finding the `max_time_value`; only used if both `fixed_latency` and +#' `fixed_forecast_date` are `NULL`. The default value of this is +#' `c("geo_value")`, but it can be any collection of `epi_keys`. Different +#' locations may have different latencies; to produce a forecast at every +#' location, we need to use the largest latency across every location; this +#' means taking `max_time_value` to be the minimum of the `max_time_value`s +#' for each `geo_value` (or whichever collection of keys are specified). If +#' `NULL` or an empty character vector, it will take the maximum across all +#' values, irrespective of any keys. #' @param fixed_latency either a positive integer, or a labeled positive integer #' vector. Cannot be set at the same time as `fixed_asof`. If non-`NULL`, the #' amount to offset the ahead or lag by. 
If a single integer, this is used for @@ -98,6 +108,7 @@ step_adjust_latency <- "locf", "extend_lags" ), + epi_keys_checked = c("geo_value"), fixed_latency = NULL, fixed_forecast_date = NULL, default = NA, @@ -162,6 +173,7 @@ step_adjust_latency <- terms = dplyr::enquos(...), role = role, method = method, + epi_keys_checked = epi_keys_checked, trained = trained, forecast_date = fixed_forecast_date, latency = fixed_latency, @@ -176,12 +188,13 @@ step_adjust_latency <- step_adjust_latency_new <- function(terms, role, trained, forecast_date, latency, shift_cols, time_type, default, - keys, method, skip, id) { + keys, method, epi_keys_checked, skip, id) { step( subclass = "adjust_latency", terms = terms, role = role, method = method, + epi_keys_checked = epi_keys_checked, trained = trained, forecast_date = forecast_date, latency = latency, @@ -205,7 +218,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { pull(variable) } # get and check the max_time and forecast_date are the right kinds of dates - forecast_date <- x$forecast_date %||% set_forecast_date(training, info) + forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked) # infer the correct columns to be working with from the previous # transformations @@ -213,7 +226,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { sign_shift <- get_sign(x) latency_cols <- get_latent_column_tibble( x$shift_cols, training, forecast_date, - x$latency, sign_shift, info + x$latency, sign_shift, info, x$epi_keys_checked ) if ((x$method == "extend_ahead") || (x$method == "extend_lags")) { @@ -251,6 +264,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ default = x$default, keys = x$keys, method = x$method, + epi_keys_checked = x$epi_keys_checked, skip = x$skip, id = x$id ) diff --git a/R/utils-latency.R b/R/utils-latency.R index 7f87bece6..00407898c 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -7,8 +7,9 @@ #' latencies `latency` #' @param new_data just what is says #' @param keys the variables which are used as keys +#' @param epi_keys_checked the keys used to group_by to find max_time_values #' @keywords internal -extend_either <- function(new_data, shift_cols, keys) { +extend_either <- function(new_data, shift_cols, keys, epi_keys_checked) { shifted <- shift_cols %>% select(original_name, latency, new_name) %>% @@ -80,14 +81,14 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' @importFrom dplyr rowwise %>% get_latent_column_tibble <- function( shift_cols, new_data, forecast_date, latency, - sign_shift, info, call = caller_env()) { + sign_shift, info, epi_keys_checked, call = caller_env()) { shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}")) if (is.null(latency)) { shift_cols <- shift_cols %>% rowwise() %>% # add the latencies to shift_cols mutate(latency = get_latency( - new_data, forecast_date, original_name, shift, sign_shift + new_data, forecast_date, original_name, shift, sign_shift, epi_keys_checked )) %>% ungroup() } else if (length(latency) > 1) { @@ -121,7 +122,7 @@ get_latent_column_tibble <- function( #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
#' @keywords internal -set_forecast_date <- function(new_data, info) { +set_forecast_date <- function(new_data, info, epi_keys_checked) { original_columns <- info %>% filter(source == "original") %>% pull(variable) @@ -135,22 +136,29 @@ set_forecast_date <- function(new_data, info) { } # the source data determines the actual time_values # these are the non-na time_values; - time_values <- new_data %>% + # get the minimum value across the checked epi_keys' maximum time values + max_time <- new_data %>% select(all_of(original_columns)) %>% drop_na() %>% - pull(time_value) - if (length(time_values) <= 0) { - cli::cli_abort("the `time_value` column of `new_data` is empty") - } + { + # null and "" don't work in `group_by` + if (!is.null(epi_keys_checked) && epi_keys_checked != "") { + group_by(., get(epi_keys_checked)) + } else { + . + } + } %>% + summarize(time_value = max(time_value)) %>% + pull(time_value) %>% + min() forecast_date <- attributes(new_data)$metadata$as_of - max_time <- max(time_values) # make sure the as_of is sane - if (!inherits(forecast_date, class(time_values)) & !inherits(forecast_date, "POSIXt")) { + if (!inherits(forecast_date, class(max_time)) & !inherits(forecast_date, "POSIXt")) { cli::cli_abort(paste( "the data matrix `forecast_date` value is {forecast_date}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", - "{typeof(new_data$time_value)}." + "{class(max_time)}." )) } if (is.null(forecast_date) || is.na(forecast_date)) { @@ -167,7 +175,7 @@ set_forecast_date <- function(new_data, info) { )) } # TODO cover the rest of the possible types for as_of and max_time... - if (class(time_values) == "Date") { + if (class(max_time) == "Date") { forecast_date <- as.Date(forecast_date) } return(forecast_date) @@ -177,11 +185,20 @@ set_forecast_date <- function(new_data, info) { #' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you #' need to shift the data to bring the 3 day lagged value to today. 
#' @keywords internal -get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shift) { +get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shift, epi_keys_checked) { shift_max_date <- new_data %>% drop_na(all_of(column)) %>% + { + # null and "" don't work in `group_by` + if (!is.null(epi_keys_checked) && epi_keys_checked != "") { + group_by(., get(epi_keys_checked)) + } else { + . + } + } %>% + summarize(time_value = max(time_value)) %>% pull(time_value) %>% - max() + min() return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date) + shift_amount)) } diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index f08652ca5..70d127327 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -25,6 +25,15 @@ slm_fit <- function(recipe, data = x) { fit(data = data) } + +# making a toy dataset with lag between geo_values +x_lagged <- x +x_lagged$time_value <- x$time_value - 1 +x_lagged$geo_value <- "other" +x_lagged <- add_row(x, x_lagged) +x_lagged +attributes(x_lagged)$metadata$as_of <- testing_as_of + test_that("epi_adjust_latency correctly extends the lags", { r5 <- epi_recipe(x) %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% @@ -158,3 +167,58 @@ test_that("epi_adjust_latency warns against removing NA's beforehand", { ) }) # todo check that epi_adjust_latency errors for nonsense `as_of`'s + + + +# todo make sure that `epi_keys_checked` works correctly for extra epi_keys +test_that("epi_adjust_latency correctly extends the lags", { + r5 <- epi_recipe(x_lagged) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(method = "extend_lags", epi_keys_checked = NULL) + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error 
trying to + # fit on no data + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1033"), class = "simpleError") + + # now trying with the as_of a reasonable distance in the future + fit5 <- slm_fit(r5, data = x_lagged) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", + "lag_6_case_rate", "lag_10_case_rate" + ) + ) + latest <- get_test_data(r5, x_lagged) + latest$time_value %>% unique() + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(testing_as_of)) + + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) + latest <- get_test_data(r5, x) + pred <- predict(fit5, latest) + actual_solutions <- pred %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, testing_as_of) + + # should have four predictors, including the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(5, 11, 16)) %>% + step_epi_lag(case_rate, lag = c(6, 10)) %>% + step_epi_ahead(death_rate, ahead = ahead) + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 6fd3672aa..b8ec678cd 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -1,13 +1,16 @@ -time_values <- as.Date("2021-01-01") + 0:199 +time_values <- as.Date("2021-01-01") + +floor(seq(0, 100, by = .5))[1:200] as_of <- max(time_values) + 5 max_time <- max(time_values) old_data <- tibble( - geo_value = rep("place", 200), - time_value = as.Date("2021-01-01") + 0:199, + 
geo_value = rep(c("place1", "place2"), 100), + time_value = as.Date("2021-01-01") + +floor(seq(0, 100, by = .5))[1:200], case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, tmp_death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% + # place2 is slightly more recent than place1 + mutate(time_value = as.Date(ifelse(geo_value == "place2", time_value + 1, time_value))) %>% as_epi_df(as_of = as_of) +old_data keys <- c("time_value", "geo_value") old_data <- old_data %>% full_join(epi_shift_single( @@ -54,17 +57,19 @@ test_that("construct_shift_tibble constructs the right tibble", { }) test_that("get_latency works", { - expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1), 4) - expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1), 5) + expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1, "geo_value"), 4) + expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1, "geo_value"), 5) # get_latency does't check the shift_amount - expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 4, 1), 6) + expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 4, 1, "geo_value"), 6) # ahead works correctly - expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, -1), -5) + expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, -1, "geo_value"), -5) # setting the wrong sign doubles the shift and gets the sign wrong - expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1), 5 + 4 * 2) + expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1, "geo_value"), 5 + 4 * 2) + # minimizing over everything decreases the latency + expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1, NULL), 3) }) -test_that("get_latency infers max_time to be the minimum `max time` across the columns", {}) +test_that("get_latency infers max_time to be the minimum `max time` across the epi_keys", {}) test_that("get_asof 
works", { info <- tribble( @@ -75,24 +80,29 @@ test_that("get_asof works", { "death_rate", "numeric", "raw", "original", "not_real", "numeric", "predictor", "derived" ) - expect_equal(set_forecast_date(modified_data, info), as_of) + expect_equal(set_forecast_date(modified_data, info, "geo_value"), as_of) + expect_equal(set_forecast_date(modified_data, info, ""), as_of) + expect_equal(set_forecast_date(modified_data, info, NULL), as_of) }) test_that("get_latent_column_tibble infers latency and works correctly", { info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") case_lag <- get_latent_column_tibble( - shift_cols[1, ], modified_data, as_of, NULL, 1, info + shift_cols[1, ], modified_data, as_of, NULL, 1, info, + epi_keys_checked = "geo_value" ) expect_equal(case_lag, all_shift_cols[1, ]) death_lag <- get_latent_column_tibble( - shift_cols[2, ], modified_data, as_of, NULL, 1, info + shift_cols[2, ], modified_data, as_of, NULL, 1, info, + epi_keys_checked = "geo_value" ) expect_equal(death_lag, all_shift_cols[2, ]) both_lag <- get_latent_column_tibble( - shift_cols, modified_data, as_of, NULL, 1, info + shift_cols, modified_data, as_of, NULL, 1, info, + epi_keys_checked = "geo_value" ) expect_equal(both_lag, all_shift_cols[1:2, ]) }) @@ -123,7 +133,7 @@ test_that("get_latent_column_tibble assigns given latencies", { ahead_shift_cols <- construct_shift_tibble(c("case_rate"), test_recipe, "step_epi_ahead", "ahead") case_ahead <- get_latent_column_tibble( - ahead_shift_cols, modified_data, as_of, NULL, -1, info + ahead_shift_cols, modified_data, as_of, NULL, -1, info, "geo_value" ) expect_equal(case_ahead, all_shift_cols[3, ]) }) @@ -156,16 +166,17 @@ test_that("extend_either works", { epi_shift_single(old_data, "case_rate", -9, "ahead_9_case_rate", keys), by = keys ) %>% - dplyr::add_row(tibble( - geo_value = "place", - time_value = as.Date("2021-08-01"), 
case_rate = NA, death_rate = NA, - lag_8_case_rate = NA, lag_11_death_rate = NA, ahead_9_case_rate = NA + dplyr::bind_rows(tibble( + geo_value = c("place1", "place2"), + time_value = as.Date(c("2021-04-23", "2021-04-24")), case_rate = c(NA, NA), death_rate = c(NA, NA), + lag_8_case_rate = c(NA, NA), lag_11_death_rate = c(NA, NA), ahead_9_case_rate = c(NA, NA) )) %>% - arrange(time_value) + arrange(time_value, geo_value) expect_equal( - extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value), + extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value, geo_value), expected_post_shift ) + extended <- extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value, geo_value) }) From d2e2f95051a848b8d92b88012d5ff6787d4860bd Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 29 May 2024 16:10:10 -0500 Subject: [PATCH 42/92] PR review recs - drop multiline pipes - better docs - check exclusive parameters aren't used simultaneously - inherit typo - additional placeholders for future tests --- R/arx_forecaster.R | 58 ++++++++++------------- R/layer_add_forecast_date.R | 6 +-- R/layer_add_target_date.R | 1 - R/step_adjust_latency.R | 19 ++++---- R/utils-latency.R | 20 ++++---- man/get_latency.Rd | 9 +++- man/get_latent_column_tibble.Rd | 1 + man/layer_add_forecast_date.Rd | 5 +- man/set_forecast_date.Rd | 2 +- man/step_adjust_latency.Rd | 23 ++++++--- tests/testthat/test-step_adjust_latency.R | 4 ++ 11 files changed, 82 insertions(+), 66 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 3bcf02586..3a2df792f 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -1,4 +1,3 @@ -# TODO add latency to default forecaster #' Direct autoregressive forecaster with covariates #' #' This is an autoregressive forecasting model for @@ -136,41 +135,36 @@ arx_fcast_epi_workflow <- function( r <- step_epi_lag(r, !!p, lag = lags[[l]]) } r <- r %>% - step_epi_ahead(!!outcome, ahead = args_list$ahead) %>% + 
step_epi_ahead(!!outcome, ahead = args_list$ahead) + method <- args_list$adjust_latency + if (!is.null(method)) { + if (method == "extend_ahead") { + r <- r %>% step_adjust_latency(all_outcomes(), + fixed_forecast_date = forecast_date, + method = method + ) + } else if (method == "extend_lags") { + r <- r %>% step_adjust_latency(all_predictors(), + fixed_forecast_date = forecast_date, + method = method + ) + } + r <- r %>% step_epi_naomit() %>% + step_training_window(n_recent = args_list$n_training) %>% { - method <- args_list$adjust_latency - if (!is.null(method)) { - if (method == "extend_ahead") { - step_adjust_latency(., - all_outcomes(), - fixed_forecast_date = forecast_date, - method = method - ) - } else if (method == "extend_lags") { - step_adjust_latency(., - all_predictors(), - fixed_forecast_date = forecast_date, - method = method - ) - } + if (!is.null(args_list$check_enough_data_n)) { + check_enough_train_data( + ., + all_predictors(), + !!outcome, + n = args_list$check_enough_data_n, + epi_keys = args_list$check_enough_data_epi_keys, + drop_na = FALSE + ) } else { . } - } %>% - step_epi_naomit() %>% - step_training_window(n_recent = args_list$n_training) - - if (!is.null(args_list$check_enough_data_n)) { - r <- check_enough_train_data( - r, - all_predictors(), - !!outcome, - n = args_list$check_enough_data_n, - epi_keys = args_list$check_enough_data_epi_keys, - drop_na = FALSE - ) - } - + } # --- postprocessor diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 43b7d7bd4..3c81ded57 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -1,4 +1,3 @@ -# TODO adapt this to latency #' Postprocessing step to add the forecast date #' #' @param frosting a `frosting` postprocessor @@ -6,8 +5,9 @@ #' For most cases, this should be specified in the form "yyyy-mm-dd". Note #' that when the forecast date is left unspecified, it is set to one of two #' values. 
If there is a `step_adjust_latency` step present, it uses the -#' `as_of` the maximum time value from the data used in pre-processing, -#' fitting the model, and postprocessing. +#' `forecast_date` as set in that function. Otherwise, it uses the maximum +#' `time_value` across the data used for pre-processing, fitting the model, +#' and postprocessing. #' @param id a random id string #' #' @return an updated `frosting` postprocessor diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index a1bdc3e4a..991ec2140 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -1,4 +1,3 @@ -# TODO adapt this to latency #' Postprocessing step to add the target date #' #' @param frosting a `frosting` postprocessor diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 38af9477b..3b8405944 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -27,8 +27,7 @@ #' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used #' become `c(3,10,17)`. #' @param epi_keys_checked a character vector. A list of keys to group by before -#' finding the `max_time_value`; only used if both `fixed_latency` and -#' `fixed_forecast_date` are `NULL`. The default value of this is +#' finding the `max_time_value`. The default value of this is #' `c("geo_value")`, but it can be any collection of `epi_keys`. Different #' locations may have different latencies; to produce a forecast at every #' location, we need to use the largest latency across every location; this @@ -37,12 +36,12 @@ #' `NULL` or an empty character vector, it will take the maximum across all #' values, irrespective of any keys. #' @param fixed_latency either a positive integer, or a labeled positive integer -#' vector. Cannot be set at the same time as `fixed_asof`. If non-`NULL`, the -#' amount to offset the ahead or lag by. 
If a single integer, this is used for -#' all columns; if a labeled vector, the labels must correspond to the base -#' column names (before lags/aheads). If `NULL`, the latency is the distance -#' between the `epi_df`'s `max_time_value` and either the -#' `fixed_forecast_date` or the `epi_df`'s `as_of` field (the default for +#' vector. Cannot be set at the same time as `fixed_forecast_date`. If +#' non-`NULL`, the amount to offset the ahead or lag by. If a single integer, +#' this is used for all columns; if a labeled vector, the labels must +#' correspond to the base column names (before lags/aheads). If `NULL`, the +#' latency is the distance between the `epi_df`'s `max_time_value` and either +#' the `fixed_forecast_date` or the `epi_df`'s `as_of` field (the default for #' `forecast_date`). #' @param fixed_forecast_date either a date of the same kind used in the #' `epi_df`, or `NULL`. Exclusive with `fixed_latency`. If a date, it gives @@ -139,6 +138,10 @@ step_adjust_latency <- cli::cli_abort("adjust_latency needs to occur before any `NA` removal, as columns may be moved around") } + if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { + cli::cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` + can be non-`NULL` at a time!") + } method <- rlang::arg_match(method) terms_used <- recipes_eval_select(enquos(...), recipe$template, recipe$term_info) diff --git a/R/utils-latency.R b/R/utils-latency.R index 00407898c..4ddd0dc08 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -7,9 +7,8 @@ #' latencies `latency` #' @param new_data just what is says #' @param keys the variables which are used as keys -#' @param epi_keys_checked the keys used to group_by to find max_time_values #' @keywords internal -extend_either <- function(new_data, shift_cols, keys, epi_keys_checked) { +extend_either <- function(new_data, shift_cols, keys) { shifted <- shift_cols %>% select(original_name, latency, new_name) %>% @@ -187,15 +186,12 @@ set_forecast_date 
<- function(new_data, info, epi_keys_checked) { #' @keywords internal get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shift, epi_keys_checked) { shift_max_date <- new_data %>% - drop_na(all_of(column)) %>% - { - # null and "" don't work in `group_by` - if (!is.null(epi_keys_checked) && epi_keys_checked != "") { - group_by(., get(epi_keys_checked)) - } else { - . - } - } %>% + drop_na(all_of(column)) + # null and "" don't work in `group_by` + if (!is.null(epi_keys_checked) && epi_keys_checked != "") { + shift_max_date <- shift_max_date %>% group_by(get(epi_keys_checked)) + } + shift_max_date <- shift_max_date %>% summarize(time_value = max(time_value)) %>% pull(time_value) %>% min() @@ -216,7 +212,7 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new handpicked_as_of <- map( this_recipe$steps, function(x) { - if (inherits(this_recipe$steps[[3]], "step_adjust_latency")) x$as_of + if (inherits(x, "step_adjust_latency")) x$as_of } ) %>% Filter(Negate(is.null), .) if (length(handpicked_as_of) > 0) { diff --git a/man/get_latency.Rd b/man/get_latency.Rd index 4cde42ab6..f5bf7f0c2 100644 --- a/man/get_latency.Rd +++ b/man/get_latency.Rd @@ -4,7 +4,14 @@ \alias{get_latency} \title{the latency is also the amount the shift is off by} \usage{ -get_latency(new_data, forecast_date, column, shift_amount, sign_shift) +get_latency( + new_data, + forecast_date, + column, + shift_amount, + sign_shift, + epi_keys_checked +) } \arguments{ \item{sign_shift}{integer. 1 if lag and -1 if ahead. 
These represent how you diff --git a/man/get_latent_column_tibble.Rd b/man/get_latent_column_tibble.Rd index 3134a42ae..0a0298625 100644 --- a/man/get_latent_column_tibble.Rd +++ b/man/get_latent_column_tibble.Rd @@ -12,6 +12,7 @@ get_latent_column_tibble( latency, sign_shift, info, + epi_keys_checked, call = caller_env() ) } diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index 4c3336536..aa224013f 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -17,8 +17,9 @@ layer_add_forecast_date( For most cases, this should be specified in the form "yyyy-mm-dd". Note that when the forecast date is left unspecified, it is set to one of two values. If there is a \code{step_adjust_latency} step present, it uses the -\code{as_of} the maximum time value from the data used in pre-processing, -fitting the model, and postprocessing.} +\code{forecast_date} as set in that function. Otherwise, it uses the maximum +\code{time_value} across the data used for pre-processing, fitting the model, +and postprocessing.} \item{id}{a random id string} } diff --git a/man/set_forecast_date.Rd b/man/set_forecast_date.Rd index c238758c9..58682bd2b 100644 --- a/man/set_forecast_date.Rd +++ b/man/set_forecast_date.Rd @@ -4,7 +4,7 @@ \alias{set_forecast_date} \title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.} \usage{ -set_forecast_date(new_data, info) +set_forecast_date(new_data, info, epi_keys_checked) } \description{ Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index b503f3eeb..970b9075c 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -10,6 +10,7 @@ step_adjust_latency( role = NA, trained = FALSE, method = c("extend_ahead", "locf", "extend_lags"), + epi_keys_checked = c("geo_value"), fixed_latency = NULL, fixed_forecast_date = NULL, default = NA, @@ -51,13 +52,23 @@ lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used become \code{c(3,10,17)}. }} +\item{epi_keys_checked}{a character vector. A list of keys to group by before +finding the \code{max_time_value}. The default value of this is +\code{c("geo_value")}, but it can be any collection of \code{epi_keys}. Different +locations may have different latencies; to produce a forecast at every +location, we need to use the largest latency across every location; this +means taking \code{max_time_value} to be the minimum of the \code{max_time_value}s +for each \code{geo_value} (or whichever collection of keys are specified). If +\code{NULL} or an empty character vector, it will take the maximum across all +values, irrespective of any keys.} + \item{fixed_latency}{either a positive integer, or a labeled positive integer -vector. Cannot be set at the same time as \code{fixed_asof}. If non-\code{NULL}, the -amount to offset the ahead or lag by. If a single integer, this is used for -all columns; if a labeled vector, the labels must correspond to the base -column names (before lags/aheads). If \code{NULL}, the latency is the distance -between the \code{epi_df}'s \code{max_time_value} and either the -\code{fixed_forecast_date} or the \code{epi_df}'s \code{as_of} field (the default for +vector. Cannot be set at the same time as \code{fixed_forecast_date}. If +non-\code{NULL}, the amount to offset the ahead or lag by. If a single integer, +this is used for all columns; if a labeled vector, the labels must +correspond to the base column names (before lags/aheads). 
If \code{NULL}, the +latency is the distance between the \code{epi_df}'s \code{max_time_value} and either +the \code{fixed_forecast_date} or the \code{epi_df}'s \code{as_of} field (the default for \code{forecast_date}).} \item{fixed_forecast_date}{either a date of the same kind used in the diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 70d127327..ab7817799 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -222,3 +222,7 @@ test_that("epi_adjust_latency correctly extends the lags", { fit_hand_adj$fit$fit$fit$coefficients ) }) + +test_that("`step_adjust_latency` only allows one instance of itself", {}) + +test_that("setting fixed_* works for `step_adjust_latency`", {}) From 99d8099117b37dc0bbeb4edf4bd379585b8a0c21 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 29 May 2024 16:56:58 -0500 Subject: [PATCH 43/92] typo in multiline pipe replacement --- R/arx_forecaster.R | 48 +++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 3a2df792f..df59a8177 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -140,31 +140,28 @@ arx_fcast_epi_workflow <- function( if (!is.null(method)) { if (method == "extend_ahead") { r <- r %>% step_adjust_latency(all_outcomes(), - fixed_forecast_date = forecast_date, - method = method - ) + fixed_forecast_date = forecast_date, + method = method + ) } else if (method == "extend_lags") { r <- r %>% step_adjust_latency(all_predictors(), - fixed_forecast_date = forecast_date, - method = method - ) - } - r <- r %>% step_epi_naomit() %>% - step_training_window(n_recent = args_list$n_training) %>% - { - if (!is.null(args_list$check_enough_data_n)) { - check_enough_train_data( - ., - all_predictors(), - !!outcome, - n = args_list$check_enough_data_n, - epi_keys = args_list$check_enough_data_epi_keys, - drop_na = FALSE - ) - } else 
{ - . - } + fixed_forecast_date = forecast_date, + method = method + ) } + } + r <- r %>% + step_epi_naomit() %>% + step_training_window(n_recent = args_list$n_training) + if (!is.null(args_list$check_enough_data_n)) { + r <- r %>% check_enough_train_data( + all_predictors(), + !!outcome, + n = args_list$check_enough_data_n, + epi_keys = args_list$check_enough_data_epi_keys, + drop_na = FALSE + ) + } # --- postprocessor @@ -176,12 +173,11 @@ arx_fcast_epi_workflow <- function( rlang::eval_tidy(trainer$args$quantile_levels) )) args_list$quantile_levels <- quantile_levels - trainer$args$quantile_levels <- enquo(quantile_levels) - f <- layer_quantile_distn(f, quantile_levels = quantile_levels) %>% + trainer$args$quantile_levels <- rlang::enquo(quantile_levels) + f <- f %>% layer_quantile_distn(quantile_levels = quantile_levels) %>% layer_point_from_distn() } else { - f <- layer_residual_quantiles( - f, + f <- f %>% layer_residual_quantiles( quantile_levels = args_list$quantile_levels, symmetrize = args_list$symmetrize, by_key = args_list$quantile_by_key From c4fce2e906814649b3a2827c5416053853d9d95e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 3 Jun 2024 15:46:11 -0500 Subject: [PATCH 44/92] happy styler --- R/arx_forecaster.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index df59a8177..98ff4f037 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -174,7 +174,8 @@ arx_fcast_epi_workflow <- function( )) args_list$quantile_levels <- quantile_levels trainer$args$quantile_levels <- rlang::enquo(quantile_levels) - f <- f %>% layer_quantile_distn(quantile_levels = quantile_levels) %>% + f <- f %>% + layer_quantile_distn(quantile_levels = quantile_levels) %>% layer_point_from_distn() } else { f <- f %>% layer_residual_quantiles( From f5ae9d1182fa773dcd58266650c239043be5091a Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 14 Jun 2024 15:37:13 -0500 Subject: [PATCH 45/92] various requested changes, 
check passes --- NAMESPACE | 1 + R/arx_classifier.R | 6 +++- R/arx_forecaster.R | 8 ++++-- R/get_test_data.R | 10 +++---- R/layer_residual_quantiles.R | 2 +- R/step_adjust_latency.R | 33 +++++++++++----------- R/utils-latency.R | 39 +++++++++++++++----------- man/bake.step_adjust_latency.Rd | 17 ----------- man/get_forecast_date_in_layer.Rd | 11 ++++++++ man/step_adjust_latency.Rd | 23 +++++++++------ vignettes/articles/symptom-surveys.Rmd | 16 +++++------ 11 files changed, 89 insertions(+), 77 deletions(-) delete mode 100644 man/bake.step_adjust_latency.Rd diff --git a/NAMESPACE b/NAMESPACE index 29b1d9591..9b8035714 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -240,6 +240,7 @@ importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) +importFrom(dplyr,join_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) importFrom(dplyr,pull) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 97c4deed1..e97d6675f 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -55,11 +55,15 @@ arx_classifier <- function( wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) + latency_adjust_fd <- if (is.null(args_list$adjust_latency)) + max(epi_data$time_value) else attributes(epi_data)$metadata$as_of + forecast_date <- args_list$forecast_date %||% latency_adjust_fd + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) preds <- forecast( wf, fill_locf = is.null(args_list$adjust_latency), n_recent = args_list$nafill_buffer, - forecast_date = args_list$forecast_date %||% max(epi_data$time_value) + forecast_date = forecast_date ) %>% as_tibble() %>% select(-time_value) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 98ff4f037..f97d2682d 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -122,8 +122,12 @@ arx_fcast_epi_workflow <- function( # forecast_date is first what they set; # if they don't and they're not adjusting latency, it 
defaults to the max time_value # if they're adjusting as_of, it defaults to the as_of - forecast_date <- args_list$forecast_date %||% - if (is.null(args_list$adjust_latency)) max(epi_data$time_value) else attributes(epi_data)$metadata$as_of + latency_adjust_fd <- if (is.null(args_list$adjust_latency)) { + max(epi_data$time_value) + } else { + attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% latency_adjust_fd target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) lags <- arx_lags_validator(predictors, args_list$lags) diff --git a/R/get_test_data.R b/R/get_test_data.R index 694e73b06..a491ed40d 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -111,11 +111,11 @@ get_test_data <- function( # Now, fill forward missing data if requested if (fill_locf) { cannot_be_used <- x %>% - filter(forecast_date - time_value <= n_recent) %>% - mutate(fillers = forecast_date - time_value > min_required) %>% - summarize( - across( - -any_of(key_colnames(recipe)), + dplyr::filter(forecast_date - time_value <= n_recent) %>% + dplyr::mutate(fillers = forecast_date - time_value > min_required) %>% + dplyr::summarise( + dplyr::across( + -tidyselect::any_of(epi_keys(recipe)), ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) ), .groups = "drop" diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index 257a951a9..b21bdcfcc 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -124,7 +124,7 @@ slather.layer_residual_quantiles <- } r <- r %>% - summarize( + dplyr::summarise( dstn = list(quantile( c(.resid, s * .resid), probs = object$quantile_levels, na.rm = TRUE diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 3b8405944..28480097d 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -1,10 +1,13 @@ -#' Adapt the pipeline to latency in the data +#' Adapt the model to latent data #' -#' In the standard case, the pipeline assumes that the 
last observation is also -#' the day from which the forecast is being made. `step_adjust_latency` uses the -#' `as_of` date of the `epi_df` as the `forecast_date`. This is most useful in -#' realtime and pseudo-prospective forecasting for data where there is some -#' delay between the day recorded and when that data is available. +#' In the standard case, the arx models assume that the last observation is also +#' the day from which the forecast is being made. But if the data has latency, +#' then you may wish to adjust the predictors (lags) and/or the outcome (ahead) +#' to compensate. This allows the model to create bleeding-edge forecasts using +#' the lags actually observed rather than anticipated. `step_adjust_latency` +#' uses the `as_of` date of the `epi_df` as the `forecast_date`. This is most +#' useful in realtime and pseudo-prospective forecasting for data where there is +#' some delay between the day recorded and when that data is available. #' #' @param recipe A recipe object. The step will be added to the #' sequence of operations for this recipe. @@ -67,8 +70,10 @@ #' @template step-return #' #' @details The step assumes that the pipeline has already applied either -#' `step_epi_ahead` or `step_epi_lag` depending on the value of -#' `"method"`, and that `step_epi_naomit` has NOT been run. +#' `step_epi_ahead` or `step_epi_lag` depending on the value of `"method"`, +#' and that `step_epi_naomit` has NOT been run. By default, the latency will +#' be determined using the arguments below, but can be set explicitly using +#' either `fixed_latency` or `fixed_forecast_date`. #' #' The `prefix` and `id` arguments are unchangeable to ensure that the code runs #' properly and to avoid inconsistency with naming. 
For `step_epi_ahead`, they @@ -97,6 +102,7 @@ #' jhu_fit #' #' @importFrom recipes detect_step +#' @importFrom rlang enquos step_adjust_latency <- function(recipe, ..., @@ -113,7 +119,7 @@ step_adjust_latency <- default = NA, skip = FALSE, columns = NULL, - id = recipes::rand_id("epi_lag")) { + id = recipes::rand_id("adjust_latency")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") @@ -173,7 +179,7 @@ step_adjust_latency <- recipes::add_step( recipe, step_adjust_latency_new( - terms = dplyr::enquos(...), + terms = enquos(...), role = role, method = method, epi_keys_checked = epi_keys_checked, @@ -273,13 +279,6 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ) } -#' various ways of handling differences between the `as_of` date and the maximum -#' time value -#' @description -#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` -#' date, rather than relative to the last day of data -#' @param new_data assumes that this already has lag/ahead columns that we need -#' to adjust #' @importFrom dplyr %>% pull #' @export bake.step_adjust_latency <- function(object, new_data, ...) { diff --git a/R/utils-latency.R b/R/utils-latency.R index 4ddd0dc08..9c5f61423 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -77,7 +77,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' `effective_shift` (shift+latency), and `new_name` (adjusted names with the #' effective_shift) #' @keywords internal -#' @importFrom dplyr rowwise %>% +#' @importFrom dplyr rowwise left_join join_by get_latent_column_tibble <- function( shift_cols, new_data, forecast_date, latency, sign_shift, info, epi_keys_checked, call = caller_env()) { @@ -121,6 +121,7 @@ get_latent_column_tibble <- function( #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
#' @keywords internal +#' @importFrom dplyr select set_forecast_date <- function(new_data, info, epi_keys_checked) { original_columns <- info %>% filter(source == "original") %>% @@ -138,16 +139,13 @@ set_forecast_date <- function(new_data, info, epi_keys_checked) { # get the minimum value across the checked epi_keys' maximum time values max_time <- new_data %>% select(all_of(original_columns)) %>% - drop_na() %>% - { - # null and "" don't work in `group_by` - if (!is.null(epi_keys_checked) && epi_keys_checked != "") { - group_by(., get(epi_keys_checked)) - } else { - . - } - } %>% - summarize(time_value = max(time_value)) %>% + drop_na() + # null and "" don't work in `group_by` + if (!is.null(epi_keys_checked) && (epi_keys_checked != "")) { + max_time <- max_time %>% group_by(get(epi_keys_checked)) + } + max_time <- max_time %>% + summarise(time_value = max(time_value)) %>% pull(time_value) %>% min() forecast_date <- attributes(new_data)$metadata$as_of @@ -174,7 +172,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked) { )) } # TODO cover the rest of the possible types for as_of and max_time... 
- if (class(max_time) == "Date") { + if (inherits(max_time, "Date")) { forecast_date <- as.Date(forecast_date) } return(forecast_date) @@ -192,7 +190,7 @@ get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shif shift_max_date <- shift_max_date %>% group_by(get(epi_keys_checked)) } shift_max_date <- shift_max_date %>% - summarize(time_value = max(time_value)) %>% + summarise(time_value = max(time_value)) %>% pull(time_value) %>% min() return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date) + shift_amount)) @@ -201,6 +199,13 @@ get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shif #' get the target date while in a layer +#' @param this_recipe the recipe to check for `step_adjust_latency` +#' @param workflow_max_time_value the `max_time` value coming out of the fit +#' workflow (this will be the maximal time value in a potentially different +#' dataset) +#' @param new_data the data we're currently working with, from which we'll take +#' a potentially different max_time_value +#' @keywords internal get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) { max_time_value <- max( workflow_max_time_value, @@ -209,14 +214,14 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new ) if (this_recipe %>% recipes::detect_step("adjust_latency")) { # get the as_of in an `adjust_latency` step, regardless of where - handpicked_as_of <- map( + handpicked_forecast_date <- map( this_recipe$steps, function(x) { - if (inherits(x, "step_adjust_latency")) x$as_of + if (inherits(x, "step_adjust_latency")) x$forecast_date } ) %>% Filter(Negate(is.null), .) 
- if (length(handpicked_as_of) > 0) { - max_time_value <- handpicked_as_of[[1]] + if (length(handpicked_forecast_date) > 0) { + max_time_value <- handpicked_forecast_date[[1]] } else { # if we haven't chosen one, use either the max_time_value or the as_of max_time_value <- max( diff --git a/man/bake.step_adjust_latency.Rd b/man/bake.step_adjust_latency.Rd deleted file mode 100644 index dac3f5509..000000000 --- a/man/bake.step_adjust_latency.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/step_adjust_latency.R -\name{bake.step_adjust_latency} -\alias{bake.step_adjust_latency} -\title{various ways of handling differences between the \code{as_of} date and the maximum -time value} -\usage{ -\method{bake}{step_adjust_latency}(object, new_data, ...) -} -\arguments{ -\item{new_data}{assumes that this already has lag/ahead columns that we need -to adjust} -} -\description{ -adjust the ahead so that we will be predicting \code{ahead} days after the \code{as_of} -date, rather than relative to the last day of data -} diff --git a/man/get_forecast_date_in_layer.Rd b/man/get_forecast_date_in_layer.Rd index 2f9e03548..c866a88e7 100644 --- a/man/get_forecast_date_in_layer.Rd +++ b/man/get_forecast_date_in_layer.Rd @@ -6,6 +6,17 @@ \usage{ get_forecast_date_in_layer(this_recipe, workflow_max_time_value, new_data) } +\arguments{ +\item{this_recipe}{the recipe to check for \code{step_adjust_latency}} + +\item{workflow_max_time_value}{the \code{max_time} value coming out of the fit +workflow (this will be the maximal time value in a potentially different +dataset)} + +\item{new_data}{the data we're currently working with, from which we'll take +a potentially different max_time_value} +} \description{ get the target date while in a layer } +\keyword{internal} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 970b9075c..c4f407b3c 100644 --- a/man/step_adjust_latency.Rd +++ 
b/man/step_adjust_latency.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/step_adjust_latency.R \name{step_adjust_latency} \alias{step_adjust_latency} -\title{Adapt the pipeline to latency in the data} +\title{Adapt the model to latent data} \usage{ step_adjust_latency( recipe, @@ -16,7 +16,7 @@ step_adjust_latency( default = NA, skip = FALSE, columns = NULL, - id = recipes::rand_id("epi_lag") + id = recipes::rand_id("adjust_latency") ) } \arguments{ @@ -97,16 +97,21 @@ An updated version of \code{recipe} with the new step added to the sequence of any existing operations. } \description{ -In the standard case, the pipeline assumes that the last observation is also -the day from which the forecast is being made. \code{step_adjust_latency} uses the -\code{as_of} date of the \code{epi_df} as the \code{forecast_date}. This is most useful in -realtime and pseudo-prospective forecasting for data where there is some -delay between the day recorded and when that data is available. +In the standard case, the arx models assume that the last observation is also +the day from which the forecast is being made. But if the data has latency, +then you may wish to adjust the predictors (lags) and/or the outcome (ahead) +to compensate. This allows the model to create bleeding-edge forecasts using +the lags actually observed rather than anticipated. \code{step_adjust_latency} +uses the \code{as_of} date of the \code{epi_df} as the \code{forecast_date}. This is most +useful in realtime and pseudo-prospective forecasting for data where there is +some delay between the day recorded and when that data is available. } \details{ The step assumes that the pipeline has already applied either -\code{step_epi_ahead} or \code{step_epi_lag} depending on the value of -\code{"method"}, and that \code{step_epi_naomit} has NOT been run. +\code{step_epi_ahead} or \code{step_epi_lag} depending on the value of \code{"method"}, +and that \code{step_epi_naomit} has NOT been run. 
By default, the latency will +be determined using the arguments below, but can be set explicitly using +either \code{fixed_latency} or \code{fixed_forecast_date}. The \code{prefix} and \code{id} arguments are unchangeable to ensure that the code runs properly and to avoid inconsistency with naming. For \code{step_epi_ahead}, they diff --git a/vignettes/articles/symptom-surveys.Rmd b/vignettes/articles/symptom-surveys.Rmd index f480db575..1e51a9963 100644 --- a/vignettes/articles/symptom-surveys.Rmd +++ b/vignettes/articles/symptom-surveys.Rmd @@ -423,7 +423,7 @@ res_err4 <- res_all4 %>% knitr::kable( res_err4 %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -472,7 +472,7 @@ res_dif4 <- res_all4 %>% knitr::kable( res_dif4 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -501,7 +501,7 @@ ggplot_colors <- c("#FC4E07", "#00AFBB", "#E7B800") ggplot(res_dif4 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -649,7 +649,7 @@ knitr::kable( res_err2 %>% select(-ends_with("diff")) %>% group_by(model, lead) %>% - summarize(err = median(err), n = length(unique(forecast_date))) %>% + summarise(err = median(err), n = length(unique(forecast_date))) %>% arrange(lead) %>% ungroup() %>% rename( "Model" = model, "Median scaled error" = err, @@ -684,7 +684,7 @@ to incorporate during periods of time where forecasting is easier. 
ggplot( res_err2 %>% group_by(model, lead, forecast_date) %>% - summarize(err = median(err)) %>% ungroup(), + summarise(err = median(err)) %>% ungroup(), aes(x = forecast_date, y = err) ) + geom_line(aes(color = model)) + @@ -722,7 +722,7 @@ res_dif2 <- res_all2 %>% knitr::kable( res_dif2 %>% group_by(model, lead) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -739,7 +739,7 @@ quite small. ```{r} ggplot(res_dif2 %>% group_by(model, lead, forecast_date) %>% - summarize(p = binom.test( + summarise(p = binom.test( x = sum(diff > 0, na.rm = TRUE), n = n(), alt = "greater" )$p.val) %>% @@ -802,7 +802,7 @@ err_by_lead <- res %>% ) %>% mutate(model = factor(model, labels = model_names[1:2])) %>% group_by(model, lead) %>% - summarize(err = median(err)) %>% + summarise(err = median(err)) %>% ungroup() ggplot(err_by_lead, aes(x = lead, y = err)) + From be9607bafceb2ccab985467c8d7e3aada7b73314 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 14 Jun 2024 15:52:31 -0500 Subject: [PATCH 46/92] style fix --- R/arx_classifier.R | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index e97d6675f..851da1e2d 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -55,8 +55,11 @@ arx_classifier <- function( wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - latency_adjust_fd <- if (is.null(args_list$adjust_latency)) - max(epi_data$time_value) else attributes(epi_data)$metadata$as_of + latency_adjust_fd <- if (is.null(args_list$adjust_latency)) { + max(epi_data$time_value) + } else { + attributes(epi_data)$metadata$as_of + } forecast_date <- args_list$forecast_date %||% latency_adjust_fd target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) preds <- forecast( From a86b3c7fe5ab9dd577ebad145f003da5ade786f1 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 14 
Jun 2024 17:47:23 -0500 Subject: [PATCH 47/92] inheritParams, correct print, test adjust subset --- R/step_adjust_latency.R | 43 +++++++++-------------- man/step_adjust_latency.Rd | 24 ++++++------- tests/testthat/test-step_adjust_latency.R | 20 +++++++++++ 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 28480097d..eaa5d4236 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -9,12 +9,6 @@ #' useful in realtime and pseudo-prospective forecasting for data where there is #' some delay between the day recorded and when that data is available. #' -#' @param recipe A recipe object. The step will be added to the -#' sequence of operations for this recipe. -#' @param ... One or more selector functions to choose variables for this step. -#' See [recipes::selections()] for more details. Typically you will not need -#' to set this manually, as the necessary adjustments will be done for the -#' predictors and outcome. #' @param method a character. Determines the method by which the #' forecast handles latency. The options are: #' - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last @@ -52,22 +46,12 @@ #' `forecast_date` is determined either via the `fixed_latency`, or is set to #' the `epi_df`'s `as_of` value if `fixed_latency` is also `NULL`. #' @param role For model terms created by this step, what analysis role should -#' they be assigned? `lag` is default a predictor while `ahead` is an outcome. -#' It should be correctly inferred and not need setting -#' @param trained A logical to indicate if the quantities for preprocessing have -#' been estimated. -#' @param columns A character string of column names to be adjusted; these -#' should be the original columns, and not the derived ones +#' they be assigned? `lag` is a predictor while `ahead` is an outcome. 
It +#' should be correctly inferred and not need setting #' @param default Determines what fills empty rows #' left by leading/lagging (defaults to NA). -#' @param skip A logical. Should the step be skipped when the -#' recipe is baked by [bake()]? While all operations are baked -#' when [prep()] is run, some operations may not be able to be -#' conducted on new data (e.g. processing the outcome variable(s)). -#' Care should be taken when using `skip = TRUE` as it may affect -#' the computations for subsequent operations. -#' @param id A unique identifier for the step #' @template step-return +#' @inheritParams recipes::step_lag #' #' @details The step assumes that the pipeline has already applied either #' `step_epi_ahead` or `step_epi_lag` depending on the value of `"method"`, @@ -189,6 +173,7 @@ step_adjust_latency <- shift_cols = relevant_shifts, default = default, keys = epi_keys(recipe), + columns = columns, skip = skip, id = id ) @@ -197,7 +182,7 @@ step_adjust_latency <- step_adjust_latency_new <- function(terms, role, trained, forecast_date, latency, shift_cols, time_type, default, - keys, method, epi_keys_checked, skip, id) { + keys, method, epi_keys_checked, columns, skip, id) { step( subclass = "adjust_latency", terms = terms, @@ -210,6 +195,7 @@ step_adjust_latency_new <- shift_cols = shift_cols, default = default, keys = keys, + columns = columns, skip = skip, id = id ) @@ -220,7 +206,7 @@ step_adjust_latency_new <- #' @importFrom glue glue prep.step_adjust_latency <- function(x, training, info = NULL, ...) { # get the columns used, even if it's all of them - terms_used <- x$columns + terms_used <- x$terms if (length(terms_used) == 0) { terms_used <- info %>% filter(role == "raw") %>% @@ -274,6 +260,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ keys = x$keys, method = x$method, epi_keys_checked = x$epi_keys_checked, + columns = recipes_eval_select(latency_cols$original_name, training, info), skip = x$skip, id = x$id ) @@ -301,14 +288,18 @@ print.step_adjust_latency <- if (!is.null(x$forecast_date)) { conj <- "with forecast date" extra_text <- x$forecast_date - } else if (!is.null(x$shift_cols)) { - conj <- "with latencies" - extra_text <- x$shift_cols + } else if (!is.null(x$latency)) { + conj <- if (length(x$latency == 1)) { + "with latency" + } else { + "with latencies" + } + extra_text <- x$latency } else { - conj <- "" + conj <- "with latency" extra_text <- "set at train time" } - print_epi_step(terms, NULL, x$trained, x$method, + print_epi_step(terms, terms, x$trained, x$method, conjunction = conj, extra_text = extra_text ) diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index c4f407b3c..fc4c55878 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -23,17 +23,15 @@ step_adjust_latency( \item{recipe}{A recipe object. The step will be added to the sequence of operations for this recipe.} -\item{...}{One or more selector functions to choose variables for this step. -See \code{\link[recipes:selections]{recipes::selections()}} for more details. Typically you will not need -to set this manually, as the necessary adjustments will be done for the -predictors and outcome.} +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[recipes:selections]{selections()}} for more details.} \item{role}{For model terms created by this step, what analysis role should -they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome. -It should be correctly inferred and not need setting} +they be assigned? \code{lag} is a predictor while \code{ahead} is an outcome. 
It +should be correctly inferred and not need setting} -\item{trained}{A logical to indicate if the quantities for preprocessing have -been estimated.} +\item{trained}{A logical to indicate if the quantities for +preprocessing have been estimated.} \item{method}{a character. Determines the method by which the forecast handles latency. The options are: @@ -81,16 +79,16 @@ the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NUL left by leading/lagging (defaults to NA).} \item{skip}{A logical. Should the step be skipped when the -recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked -when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked +when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be conducted on new data (e.g. processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} -\item{columns}{A character string of column names to be adjusted; these -should be the original columns, and not the derived ones} +\item{columns}{A character string of the selected variable names. 
This field +is a placeholder and will be populated once \code{\link[recipes:prep]{prep()}} is used.} -\item{id}{A unique identifier for the step} +\item{id}{A character string that is unique to this step to identify it.} } \value{ An updated version of \code{recipe} with the new step added to the diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index ab7817799..fb02e740d 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -225,4 +225,24 @@ test_that("epi_adjust_latency correctly extends the lags", { test_that("`step_adjust_latency` only allows one instance of itself", {}) +test_that("`step_adjust_latency` only uses the columns specified in the `...`", { + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(case_rate, method = "extend_lags") + + fit5 <- slm_fit(r5, data = real_x) + expect_equal(names(fit5$fit$fit$fit$coefficients), c("(Intercept)", "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_6_case_rate", "lag_10_case_rate")) +}) + test_that("setting fixed_* works for `step_adjust_latency`", {}) + +test_that("printing step_adjust_latency results in expected output", { + r5 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% + step_adjust_latency(case_rate, method = "extend_lags") + expect_snapshot(r5) +}) From 5b7eff1f79f44e180148fc9b1b238912abb58355 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 14 Jun 2024 18:03:23 -0500 Subject: [PATCH 48/92] space --- R/step_adjust_latency.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index eaa5d4236..5522a83b6 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -292,7 
+292,7 @@ print.step_adjust_latency <- conj <- if (length(x$latency == 1)) { "with latency" } else { - "with latencies" + "with latencies" } extra_text <- x$latency } else { From 63b02c9046b1d4d77bd67b63ddb351060ba416fa Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Sat, 22 Jun 2024 17:41:07 -0500 Subject: [PATCH 49/92] print fix and tests --- R/step_adjust_latency.R | 53 +++++++++++++++++--- man/epi_shift_single.Rd | 24 --------- tests/testthat/_snaps/step_adjust_latency.md | 39 ++++++++++++++ tests/testthat/test-step_adjust_latency.R | 5 ++ 4 files changed, 89 insertions(+), 32 deletions(-) delete mode 100644 man/epi_shift_single.Rd create mode 100644 tests/testthat/_snaps/step_adjust_latency.md diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 5522a83b6..7036b2fe1 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -280,11 +280,6 @@ bake.step_adjust_latency <- function(object, new_data, ...) { #' @export print.step_adjust_latency <- function(x, width = max(20, options$width - 35), ...) 
{ - if (length(x$terms) == 0) { - terms <- "all previous predictors" - } else { - terms <- x$terms - } if (!is.null(x$forecast_date)) { conj <- "with forecast date" extra_text <- x$forecast_date @@ -299,9 +294,51 @@ print.step_adjust_latency <- conj <- "with latency" extra_text <- "set at train time" } - print_epi_step(terms, terms, x$trained, x$method, - conjunction = conj, - extra_text = extra_text + # what follows is a somewhat modified version of print_epi_step, since passing no arguments to adjust_latency means the step applies to all relevant columns rather than to none of them + theme_div_id <- cli::cli_div( + theme = list(.pkg = list(`vec-trunc` = Inf, `vec-last` = ", ")) + ) + # this is a slightly modified copy of the body of print_epi_step + title <- trimws(x$method) + trained_text <- dplyr::if_else(x$trained, "Trained", "") + vline_seperator <- dplyr::if_else(trained_text == "", "", "|") + comma_seperator <- dplyr::if_else( + trained_text != "", true = ",", false = "" ) + extra_text <- recipes::format_ch_vec(extra_text) + width_title <- nchar(paste0( + "* ", title, ":", " ", conj, " ", extra_text, " ", vline_seperator, + " ", trained_text, " " + )) + width_diff <- cli::console_width() * 1 - width_title + if (x$trained) { + elements <- x$columns + } else { + if (length(x$terms) == 0) { + elements <- "all previous predictors" + } else { + elements <- lapply(x$terms, function(x) { + rlang::expr_deparse(rlang::quo_get_expr(x), width = Inf) + }) + elements <- vctrs::list_unchop(elements, ptype = character()) + } + } + + element_print_lengths <- cumsum(nchar(elements)) + + c(0L, cumsum(rep(2L, length(elements) - 1))) + + c(rep(5L, length(elements) - 1), 0L) + first_line <- which(width_diff >= element_print_lengths) + first_line <- unname(first_line) + first_line <- ifelse( + test = identical(first_line, integer(0)), + yes = length(element_print_lengths), + no = max(first_line) + ) + more_dots <- ifelse(first_line == length(elements), "", ", ...") + cli::cli_bullets( + c("\n {title}: \\\n {.pkg 
{cli::cli_vec(elements[seq_len(first_line)])}}\\\n {more_dots} \\\n {conj} \\\n {.pkg {extra_text}} \\\n {vline_seperator} \\\n {.emph {trained_text}}") + ) + + cli::cli_end(theme_div_id) invisible(x) } diff --git a/man/epi_shift_single.Rd b/man/epi_shift_single.Rd deleted file mode 100644 index 871879004..000000000 --- a/man/epi_shift_single.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_shift.R -\name{epi_shift_single} -\alias{epi_shift_single} -\title{Shift predictors while maintaining grouping and time_value ordering} -\usage{ -epi_shift_single(x, col, shift_val, newname, key_cols) -} -\arguments{ -\item{x}{Data frame.} - -\item{shift_val}{a single integer. Negative values produce leads.} - -\item{newname}{the name for the newly shifted column} - -\item{key_cols}{vector, or \code{NULL}. Additional grouping vars.} -} -\value{ -a list of tibbles -} -\description{ -This is a lower-level function. As such it performs no error checking. -} -\keyword{internal} diff --git a/tests/testthat/_snaps/step_adjust_latency.md b/tests/testthat/_snaps/step_adjust_latency.md new file mode 100644 index 000000000..354d02d84 --- /dev/null +++ b/tests/testthat/_snaps/step_adjust_latency.md @@ -0,0 +1,39 @@ +# printing step_adjust_latency results in expected output + + Code + r5 + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Operations + 1. Lagging: death_rate by 0, 6, 11 + 2. Lagging: case_rate by 1, 5 + 3. Leading: death_rate by 7 + 4. extend_lags: case_rate with latency set at train time + +--- + + Code + r + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Operations + 1. Leading: death_rate by 7 + 2. 
extend_ahead: all previous predictors with latency set at train time + 3. Lagging: death_rate by 0, 7, 14 + diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index fb02e740d..af36e185c 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -245,4 +245,9 @@ test_that("printing step_adjust_latency results in expected output", { step_epi_ahead(death_rate, ahead = ahead) %>% step_adjust_latency(case_rate, method = "extend_lags") expect_snapshot(r5) + r <- epi_recipe(case_death_rate_subset) %>% + step_epi_ahead(death_rate, ahead = 7) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) + expect_snapshot(r) }) From fc8b0c0fdd71e247878f2e183bf6d2cd0f86798b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 24 Jun 2024 16:45:31 -0500 Subject: [PATCH 50/92] multi-aheads do work --- tests/testthat/test-step_adjust_latency.R | 105 +++++++++++++++++----- 1 file changed, 82 insertions(+), 23 deletions(-) diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index af36e185c..7a0aac51c 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -35,7 +35,7 @@ x_lagged attributes(x_lagged)$metadata$as_of <- testing_as_of test_that("epi_adjust_latency correctly extends the lags", { - r5 <- epi_recipe(x) %>% + r1 <- epi_recipe(x) %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) %>% @@ -43,35 +43,35 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1033"), class = "simpleError") + 
expect_error(expect_warning(fit1 <- slm_fit(r1), regexp = "The shift has been adjusted by 1033"), class = "simpleError") # now trying with the as_of a reasonable distance in the future - fit5 <- slm_fit(r5, data = real_x) + fit1 <- slm_fit(r1, data = real_x) expect_equal( - names(fit5$pre$mold$predictors), + names(fit1$pre$mold$predictors), c( "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", "lag_6_case_rate", "lag_10_case_rate" ) ) - latest <- get_test_data(r5, x) - pred <- predict(fit5, latest) + latest <- get_test_data(r1, x) + pred <- predict(fit1, latest) point_pred <- pred %>% filter(!is.na(.pred)) expect_equal(nrow(point_pred), 1) expect_equal(point_pred$time_value, as.Date(testing_as_of)) expect_equal( - names(fit5$pre$mold$outcomes), + names(fit1$pre$mold$outcomes), glue::glue("ahead_{ahead}_death_rate") ) - latest <- get_test_data(r5, x) - pred <- predict(fit5, latest) - actual_solutions <- pred %>% filter(!is.na(.pred)) + latest <- get_test_data(r1, x) + pred1 <- predict(fit1, latest) + actual_solutions <- pred1 %>% filter(!is.na(.pred)) expect_equal(actual_solutions$time_value, testing_as_of) # should have four predictors, including the intercept - expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + expect_equal(length(fit1$fit$fit$fit$coefficients), 6) # result should be equivalent to just immediately doing the adjusted lags by # hand @@ -81,13 +81,13 @@ test_that("epi_adjust_latency correctly extends the lags", { step_epi_ahead(death_rate, ahead = ahead) fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) expect_equal( - fit5$fit$fit$fit$coefficients, + fit1$fit$fit$fit$coefficients, fit_hand_adj$fit$fit$fit$coefficients ) }) test_that("epi_adjust_latency correctly extends the ahead", { - r5 <- epi_recipe(x) %>% + r2 <- epi_recipe(x) %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) %>% @@ -95,24 +95,24 @@ test_that("epi_adjust_latency correctly 
extends the ahead", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5))) + expect_error(expect_warning(fit5 <- slm_fit(r2))) # real date example - fit5 <- slm_fit(r5, data = real_x) + fit2 <- slm_fit(r2, data = real_x) expect_equal( - names(fit5$pre$mold$predictors), + names(fit2$pre$mold$predictors), c( "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_1_case_rate", "lag_5_case_rate" ) ) - latest <- get_test_data(r5, x) - pred <- predict(fit5, latest) - point_pred <- pred %>% filter(!is.na(.pred)) + latest <- get_test_data(r2, x) + pred2 <- predict(fit2, latest) + point_pred2 <- pred2 %>% filter(!is.na(.pred)) # max time is still the forecast date - expect_equal(point_pred$time_value, as.Date(max_time)) + expect_equal(point_pred2$time_value, as.Date(max_time)) # target column renamed expect_equal( - names(fit5$pre$mold$outcomes), + names(fit2$pre$mold$outcomes), glue::glue("ahead_{ahead + latency}_death_rate") ) # fit an equivalent forecaster @@ -123,12 +123,71 @@ test_that("epi_adjust_latency correctly extends the ahead", { equiv_fit <- slm_fit(equivalent, data = real_x) # adjusting the ahead should do the same thing as directly adjusting the ahead expect_equal( - fit5$fit$fit$fit$coefficients, + fit2$fit$fit$fit$coefficients, equiv_fit$fit$fit$fit$coefficients ) # should have four predictors, including the intercept - expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + expect_equal(length(fit2$fit$fit$fit$coefficients), 6) +}) + +test_that("epi_adjust_latency extends multiple aheads", { + aheads <- 1:3 + r3 <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = aheads) %>% + step_adjust_latency(method = "extend_ahead") + fitter <- smooth_quantile_reg( + quantile_levels = 0.5, + 
outcome_locations = aheads, + degree = 1L + ) + epi_wf <- epi_workflow(r3, fitter) + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_error(expect_warning(fit3 <- fit(epi_wf, data = x))) + # real date example + fit3 <- fit(epi_wf, data = real_x) + expect_equal( + names(fit3$pre$mold$outcomes), + c( + "ahead_6_death_rate", "ahead_7_death_rate", "ahead_8_death_rate" + ) + ) + expect_equal( + names(fit3$pre$mold$predictors), + c( + "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r3, real_x) + pred3 <- predict(fit3, latest) + point_pred <- pred3 %>% + unnest(.pred) %>% + filter(!is.na(distn)) + # max time is still the forecast date + expect_equal( + point_pred$time_value, + rep(as.Date(max_time), 3) + ) + # fit an equivalent forecaster + equivalent <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + latency) + equiv_fit <- fit(epi_wf, data = real_x) + # adjusting the ahead should do the same thing as directly adjusting the ahead + equiv_fit + expect_equal( + fit3$fit$fit$fit$rqfit, + equiv_fit$fit$fit$fit$rqfit + ) + + # should have six coefficients, including the intercept + expect_equal(length(fit3$fit$fit$fit$rqfit$coefficients), 6) }) test_that("epi_adjust_latency fixed_* work", {}) From a570a0ee9256eb425e1a9bc32fb6e71550d7470b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 28 Jun 2024 12:42:01 -0500 Subject: [PATCH 51/92] arx_fc better fc_date info, docs --- R/arx_forecaster.R | 47 +++++++++++++------ R/canned-epipred.R | 21 ++++++++- R/import-standalone-purrr.R | 2 +- R/utils-latency.R | 2 +- tests/testthat/_snaps/snapshots.md | 71 +++++++++++++++++++++++++++++ tests/testthat/test-snapshots.R | 35 ++++++++++++++ 
tests/testthat/test-utils_latency.R | 22 +++++---- 7 files changed, 173 insertions(+), 27 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index f97d2682d..78bcd785a 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -119,16 +119,25 @@ arx_fcast_epi_workflow <- function( if (!(is.null(trainer) || is_regression(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'regression'.") } - # forecast_date is first what they set; + # forecast_date is above all what they set; # if they don't and they're not adjusting latency, it defaults to the max time_value - # if they're adjusting as_of, it defaults to the as_of - latency_adjust_fd <- if (is.null(args_list$adjust_latency)) { - max(epi_data$time_value) + # if they're adjusting, it defaults to the as_of + if (is.null(args_list$adjust_latency)) { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli::cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + } } else { - attributes(epi_data)$metadata$as_of + forecast_date_default <- attributes(epi_data)$metadata$as_of } - forecast_date <- args_list$forecast_date %||% latency_adjust_fd + forecast_date <- args_list$forecast_date %||% forecast_date_default target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + if (forecast_date + args_list$ahead != target_date) { + cli::cli_warn(c( + "`forecast_date` + `ahead` must equal `target_date`.", + i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." 
+ )) + } lags <- arx_lags_validator(predictors, args_list$lags) @@ -140,17 +149,17 @@ arx_fcast_epi_workflow <- function( } r <- r %>% step_epi_ahead(!!outcome, ahead = args_list$ahead) - method <- args_list$adjust_latency - if (!is.null(method)) { - if (method == "extend_ahead") { + method_adjust_latency <- args_list$adjust_latency + if (!is.null(method_adjust_latency)) { + if (method_adjust_latency == "extend_ahead") { r <- r %>% step_adjust_latency(all_outcomes(), fixed_forecast_date = forecast_date, - method = method + method = method_adjust_latency ) - } else if (method == "extend_lags") { + } else if (method_adjust_latency == "extend_lags") { r <- r %>% step_adjust_latency(all_predictors(), fixed_forecast_date = forecast_date, - method = method + method = method_adjust_latency ) } } @@ -215,8 +224,18 @@ arx_fcast_epi_workflow <- function( #' @param target_date Date. The date for which the forecast is intended. The #' default `NULL` will attempt to determine this automatically as #' `forecast_date + ahead`. -#' @param adjust_latency Character or `NULL`. one of the `method`s of -#' `step_adjust_latency`, or `NULL` (in which case there is no adjustment). +#' @param adjust_latency Character or `NULL`. One of the `method`s of +#' [step_adjust_latency()], or `NULL` (in which case there is no adjustment). +#' If there is a difference between the `forecast_date` and the last day of +#' data, this determines how to shift the model to account for this +#' difference. The options are: +#' - `NULL` the default, assumes the `forecast_date` is the last day of data +#' - `"extend_ahead"`: increase the `ahead` by the latency so it's relative to +#' the last day of data. If the last day of data was 3 days ago, the ahead +#' becomes `ahead+3`. +#' - `"extend_lags"`: increase the lags so they're relative to the actual forecast date. If the lags are +#' `c(0,7,14)` and the last day of data was 3 days ago, the lags become +#' `c(3,10,17)`. 
#' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce #' prediction intervals. These are created by computing the quantiles of #' training residuals. A `NULL` value will result in point forecasts only. diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 0adc0536a..227aaec09 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -81,6 +81,7 @@ print.canned_epipred <- function(x, name, ...) { } cli::cli_li("Time type: {.field {x$metadata$training$time_type}},") cli::cli_li("Using data up-to-date as of: {.field {format(x$metadata$training$as_of)}}.") + cli::cli_li("With the last data available on {.field {format(max(x$epi_workflow$original_data$time_value))}}") cli::cli_end() } fn_meta() @@ -103,10 +104,28 @@ print.canned_epipred <- function(x, name, ...) { "A total of {.val {nrow(x$predictions)}} prediction{?s}", " {?is/are} available for" )) + cli::cli_ul(c( "{.val {n_geos}} unique geographic region{?s},", "At forecast date{?s}: {.val {fds}},", - "For target date{?s}: {.val {tds}}." 
+ "For target date{?s}: {.val {tds}}," )) + if (detect_step(x$epi_workflow$pre$actions$recipe$recipe, "adjust_latency")) { + latency_step <- keep(x$epi_workflow$pre$mold$blueprint$recipe$steps, + \(x) inherits(x, "step_adjust_latency"))[[1]] + latency_per_base_col <- latency_step$shift_cols %>% + group_by(latency) %>% + reframe(variable = parent_name) %>% + distinct() %>% + mutate(latency = abs(latency)) %>% + relocate(variable, latency) + if (nrow(latency_per_base_col)>1) { + intro_text <- "Latency adjusted per column: " + } else { + intro_text <- "Latency adjusted for " + } + latency_info <- paste0(intro_text, paste(apply(latency_per_base_col, 1, paste0, collapse = "="), collapse = ", ")) + cli::cli_ul(latency_info) + } cli::cli_text("") } diff --git a/R/import-standalone-purrr.R b/R/import-standalone-purrr.R index 623142a0e..e4e83f428 100644 --- a/R/import-standalone-purrr.R +++ b/R/import-standalone-purrr.R @@ -123,7 +123,7 @@ map_if <- function(.x, .p, .f, ...) { .x } .rlang_purrr_probe <- function(.x, .p, ...) 
{ - if (is_logical(.p)) { + if (rlang::is_logical(.p)) { stopifnot(length(.p) == length(.x)) .p } else { diff --git a/R/utils-latency.R b/R/utils-latency.R index 9c5f61423..11d89ee92 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -81,7 +81,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name get_latent_column_tibble <- function( shift_cols, new_data, forecast_date, latency, sign_shift, info, epi_keys_checked, call = caller_env()) { - shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}")) + shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}"), parent_name = terms) if (is.null(latency)) { shift_cols <- shift_cols %>% rowwise() %>% diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index c103ffe0e..90b647d86 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1059,6 +1059,77 @@ 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +# arx_forecasrte output format snapshots + + Code + out1 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2022-05-31 14:08:25. + * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2021-12-31, + * For target date: 2022-01-07, + + +--- + + Code + out2 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2022-05-31 14:08:25. 
+ * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2022-01-03, + * For target date: 2022-01-10, + * Latency adjusted per column: case_rate=3, death_rate=3 + + +--- + + Code + out3 + Message + == A basic forecaster of type ARX Forecaster =================================== + + This forecaster was fit on 999-01-01. + + Training data was an with: + * Geography: state, + * Time type: day, + * Using data up-to-date as of: 2022-05-31 14:08:25. + * With the last data available on 2021-12-31 + + -- Predictions ----------------------------------------------------------------- + + A total of 56 predictions are available for + * 56 unique geographic regions, + * At forecast date: 2022-01-03, + * For target date: 2022-01-10, + * Latency adjusted for death_rate=3 + + # arx_classifier snapshots structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 25bc13bde..8627fe5ed 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -96,6 +96,41 @@ test_that("arx_forecaster snapshots", { expect_false(all(arx2$predictions == arx3$predictions)) }) +test_that("arx_forecasrte output format snapshots", { + jhu <- case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-12-01")) + out1 <- arx_forecaster( + jhu, "death_rate", + c("case_rate", "death_rate") + ) + expect_equal(as.Date(out1$metadata$forecast_created), Sys.Date()) + out1$metadata$forecast_created <- as.Date("0999-01-01") + expect_snapshot(out1) + out2 <- arx_forecaster(jhu, "case_rate", + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + quantile_levels = 1:9 / 10, + adjust_latency = "extend_lags", + forecast_date = as.Date("2022-01-03") + ) + ) + 
expect_equal(as.Date(out2$metadata$forecast_created), Sys.Date()) + out2$metadata$forecast_created <- as.Date("0999-01-01") + expect_snapshot(out2) + out3 <- arx_forecaster(jhu, "death_rate", + c("case_rate", "death_rate"), + trainer = quantile_reg(), + args_list = arx_args_list( + adjust_latency = "extend_ahead", + forecast_date = as.Date("2022-01-03") + ) + ) + expect_equal(as.Date(out3$metadata$forecast_created), Sys.Date()) + out3$metadata$forecast_created <- as.Date("0999-01-01") + expect_snapshot(out3) +}) + test_that("arx_classifier snapshots", { arc1 <- arx_classifier( case_death_rate_subset %>% diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index b8ec678cd..f52b1a950 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -37,16 +37,18 @@ modified_data <- modified_data %>% tail() as_of - (modified_data %>% filter(!is.na(ahead_4_case_rate)) %>% pull(time_value) %>% max()) all_shift_cols <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", 5, 8, "lag_8_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", 4, 11, "lag_11_death_rate", "numeric", "predictor", - "case_rate", 4, "ahead_", "ahead_4_case_rate", -5, 9, "ahead_9_case_rate", "numeric", "outcome" + ~terms, ~shift, ~prefix, ~original_name, ~parent_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", "case_rate", 5, 8, "lag_8_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", "death_rate", 4, 11, "lag_11_death_rate", "numeric", "predictor", + "case_rate", 4, "ahead_", "ahead_4_case_rate", "case_rate", -5, 9, "ahead_9_case_rate", "numeric", "outcome" ) test_recipe <- epi_recipe(modified_data) %>% step_epi_lag(case_rate, lag = c(3)) %>% step_epi_lag(death_rate, lag = 7) %>% step_epi_ahead(case_rate, ahead = 4) 
shift_cols <- construct_shift_tibble(c("case_rate", "death_rate"), test_recipe, "step_epi_lag", "lag") + + test_that("construct_shift_tibble constructs the right tibble", { expected_shift_cols <- tibble::tribble( ~terms, ~shift, ~prefix, @@ -114,17 +116,17 @@ test_that("get_latent_column_tibble assigns given latencies", { shift_cols, modified_data, as_of, 50, 1, info ) weird_latencies <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", 50, 53, "lag_53_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", 50, 57, "lag_57_death_rate", "numeric", "predictor", + ~terms, ~shift, ~prefix, ~original_name, ~parent_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", "case_rate", 50, 53, "lag_53_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", "death_rate", 50, 57, "lag_57_death_rate", "numeric", "predictor", ) expect_equal(both_lag, weird_latencies) # supposing we add the latencies by hand, and they're different, and in a different order weird_latencies <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", 70, 73, "lag_73_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", 30, 37, "lag_37_death_rate", "numeric", "predictor", + ~terms, ~shift, ~prefix, ~original_name, ~parent_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, + "case_rate", 3, "lag_", "lag_3_case_rate", "case_rate", 70, 73, "lag_73_case_rate", "numeric", "predictor", + "death_rate", 7, "lag_", "lag_7_death_rate", "death_rate", 30, 37, "lag_37_death_rate", "numeric", "predictor", ) both_lag <- get_latent_column_tibble( shift_cols, modified_data, as_of, c(death_rate = 30, case_rate = 70), 1, info From a65cad08261147a407cc3bebcbbbb9ed1d1199e2 Mon 
Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 28 Jun 2024 13:33:19 -0500 Subject: [PATCH 52/92] classifier latency ahead adjustment --- R/arx_classifier.R | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 851da1e2d..ba3fe1271 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -55,12 +55,15 @@ arx_classifier <- function( wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - latency_adjust_fd <- if (is.null(args_list$adjust_latency)) { - max(epi_data$time_value) + if (is.null(args_list$adjust_latency)) { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli::cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + } } else { - attributes(epi_data)$metadata$as_of + forecast_date_default <- attributes(epi_data)$metadata$as_of } - forecast_date <- args_list$forecast_date %||% latency_adjust_fd + forecast_date <- args_list$forecast_date %||% forecast_date_default target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) preds <- forecast( wf, @@ -132,6 +135,18 @@ arx_class_epi_workflow <- function( if (!(is.null(trainer) || is_classification(trainer))) { cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'classification'.") } + + if (is.null(args_list$adjust_latency)) { + forecast_date_default <- max(epi_data$time_value) + if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { + cli::cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + } + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% 
forecast_date_default + target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) + lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor @@ -180,7 +195,16 @@ arx_class_epi_workflow <- function( } o2 <- rlang::sym(paste0("ahead_", args_list$ahead, "_", o)) r <- r %>% - step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") %>% + step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") + method_adjust_latency <- args_list$adjust_latency + if (!is.null(method_adjust_latency)) { + # only extend_ahead is supported atm + r <- r %>% step_adjust_latency(all_outcomes(), + fixed_forecast_date = forecast_date, + method = method_adjust_latency + ) + } + r <- r %>% recipes::step_mutate( outcome_class = cut(!!o2, breaks = args_list$breaks), role = "outcome" @@ -267,6 +291,7 @@ arx_class_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = NULL, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, @@ -284,7 +309,10 @@ arx_class_args_list <- function( outcome_transform <- rlang::arg_match(outcome_transform) arg_is_scalar(ahead, n_training, horizon, log_scale) - arg_is_scalar(forecast_date, target_date, allow_null = TRUE) + arg_is_scalar(forecast_date, target_date, adjust_latency, allow_null = TRUE) + if (adjust_latency == "adjust_lags") { + cli::cli_abort("step_adjust_latency is not yet implemented for lagged differences and growth rates") + } arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags, horizon) arg_is_numeric(breaks) @@ -325,6 +353,7 @@ arx_class_args_list <- function( breaks, forecast_date, target_date, + adjust_latency, outcome_transform, max_lags, horizon, From e3a368edba126b38df5353973877f37598a0fda7 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 3 Jul 2024 15:53:53 -0500 Subject: [PATCH 53/92] refactor step_adjust_ahead to be early step --- NAMESPACE | 4 +- R/arx_classifier.R | 13 +- 
R/arx_forecaster.R | 13 +- R/canned-epipred.R | 39 +++-- R/epi_shift.R | 45 ++++-- R/layer_add_forecast_date.R | 2 +- R/step_adjust_latency.R | 124 +++++++++------- R/utils-latency.R | 94 +----------- man/arx_args_list.Rd | 16 +- man/arx_class_args_list.Rd | 16 ++ man/extend_either.Rd | 26 ---- man/get_latency.Rd | 9 +- man/get_latent_column_tibble.Rd | 41 ----- tests/testthat.R | 4 + tests/testthat/_snaps/snapshots.md | 6 +- tests/testthat/_snaps/step_adjust_latency.md | 14 +- tests/testthat/test-layer_add_forecast_date.R | 8 +- tests/testthat/test-layer_add_target_date.R | 2 +- tests/testthat/test-snapshots.R | 4 +- tests/testthat/test-step_adjust_latency.R | 87 +++++++---- tests/testthat/test-utils_latency.R | 140 ++---------------- 21 files changed, 268 insertions(+), 439 deletions(-) delete mode 100644 man/extend_either.Rd delete mode 100644 man/get_latent_column_tibble.Rd diff --git a/NAMESPACE b/NAMESPACE index 9b8035714..19e42fef5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -246,7 +246,6 @@ importFrom(dplyr,mutate) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,rename) -importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) @@ -265,6 +264,7 @@ importFrom(ggplot2,geom_linerange) importFrom(ggplot2,geom_point) importFrom(ggplot2,geom_ribbon) importFrom(glue,glue) +importFrom(hardhat,extract_recipe) importFrom(hardhat,refresh_blueprint) importFrom(hardhat,run_mold) importFrom(magrittr,"%>%") @@ -272,6 +272,7 @@ importFrom(recipes,bake) importFrom(recipes,detect_step) importFrom(recipes,prep) importFrom(recipes,rand_id) +importFrom(recipes,recipes_eval_select) importFrom(rlang,"!!!") importFrom(rlang,"!!") importFrom(rlang,"%@%") @@ -304,6 +305,7 @@ importFrom(stats,residuals) importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) +importFrom(tidyr,expand_grid) importFrom(tidyr,unnest) importFrom(vctrs,as_list_of) importFrom(vctrs,field) diff --git 
a/R/arx_classifier.R b/R/arx_classifier.R index ba3fe1271..f07ae3e71 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -194,8 +194,6 @@ arx_class_epi_workflow <- function( } } o2 <- rlang::sym(paste0("ahead_", args_list$ahead, "_", o)) - r <- r %>% - step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") method_adjust_latency <- args_list$adjust_latency if (!is.null(method_adjust_latency)) { # only extend_ahead is supported atm @@ -205,7 +203,9 @@ arx_class_epi_workflow <- function( ) } r <- r %>% - recipes::step_mutate( + step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") + r <- r %>% + step_mutate( outcome_class = cut(!!o2, breaks = args_list$breaks), role = "outcome" ) %>% @@ -223,10 +223,6 @@ arx_class_epi_workflow <- function( ) } - - forecast_date <- args_list$forecast_date %||% max(epi_data$time_value) - target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - # --- postprocessor f <- frosting() %>% layer_predict() # %>% layer_naomit() f <- layer_add_forecast_date(f, forecast_date = forecast_date) %>% @@ -310,9 +306,6 @@ arx_class_args_list <- function( arg_is_scalar(ahead, n_training, horizon, log_scale) arg_is_scalar(forecast_date, target_date, adjust_latency, allow_null = TRUE) - if (adjust_latency == "adjust_lags") { - cli::cli_abort("step_adjust_latency is not yet implemented for lagged differences and growth rates") - } arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags, horizon) arg_is_numeric(breaks) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 78bcd785a..3abbeb2eb 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -143,12 +143,7 @@ arx_fcast_epi_workflow <- function( # --- preprocessor r <- epi_recipe(epi_data) - for (l in seq_along(lags)) { - p <- predictors[l] - r <- step_epi_lag(r, !!p, lag = lags[[l]]) - } - r <- r %>% - step_epi_ahead(!!outcome, ahead = args_list$ahead) + # adjust latency if the user asks method_adjust_latency 
<- args_list$adjust_latency if (!is.null(method_adjust_latency)) { if (method_adjust_latency == "extend_ahead") { @@ -163,6 +158,12 @@ arx_fcast_epi_workflow <- function( ) } } + for (l in seq_along(lags)) { + p <- predictors[l] + r <- step_epi_lag(r, !!p, lag = lags[[l]]) + } + r <- r %>% + step_epi_ahead(!!outcome, ahead = args_list$ahead) r <- r %>% step_epi_naomit() %>% step_training_window(n_recent = args_list$n_training) diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 227aaec09..4f95f3d22 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -63,6 +63,7 @@ print.alist <- function(x, ...) { } #' @export +#' @importFrom hardhat extract_recipe print.canned_epipred <- function(x, name, ...) { d <- cli::cli_div(theme = list(rule = list("line-type" = "double"))) cli::cli_rule("A basic forecaster of type {name}") @@ -110,19 +111,33 @@ print.canned_epipred <- function(x, name, ...) { "At forecast date{?s}: {.val {fds}},", "For target date{?s}: {.val {tds}}," )) - if (detect_step(x$epi_workflow$pre$actions$recipe$recipe, "adjust_latency")) { - latency_step <- keep(x$epi_workflow$pre$mold$blueprint$recipe$steps, - \(x) inherits(x, "step_adjust_latency"))[[1]] - latency_per_base_col <- latency_step$shift_cols %>% - group_by(latency) %>% - reframe(variable = parent_name) %>% - distinct() %>% - mutate(latency = abs(latency)) %>% - relocate(variable, latency) - if (nrow(latency_per_base_col)>1) { - intro_text <- "Latency adjusted per column: " + fit_recipe <- extract_recipe(x$epi_workflow) + if (detect_step(fit_recipe, "adjust_latency")) { + is_adj_latency <- map_lgl(fit_recipe$steps, \(x) inherits(x, "step_adjust_latency")) + latency_step <- fit_recipe$steps[is_adj_latency][[1]] + # all steps after adjust_latency + later_steps <- fit_recipe$steps[-(1:which(is_adj_latency))] + if (latency_step$method == "extend_ahead") { + step_names <- "step_epi_ahead" + type_str <- "Aheads" + } else if (latency_step$method == "extend_lags") { + step_names <- 
"step_epi_lag" + type_str <- "Lags" } else { - intro_text <- "Latency adjusted for " + step_names <- "" + type_str <- "columns locf" + } + later_steps[[1]]$columns + valid_columns <- later_steps %>% + keep(\(x) inherits(x, step_names)) %>% + purrr::map("columns") %>% + reduce(c) + latency_per_base_col <- latency_step$latency_table %>% + filter(col_name %in% valid_columns) %>% mutate(latency = abs(latency)) + if (latency_step$method != "locf" && nrow(latency_per_base_col) > 1) { + intro_text <- glue::glue("{type_str} adjusted per column: ") + } else if (latency_step$method != "locf") { + intro_text <- glue::glue("{type_str} adjusted for ") } latency_info <- paste0(intro_text, paste(apply(latency_per_base_col, 1, paste0, collapse = "="), collapse = ", ")) cli::cli_ul(latency_info) diff --git a/R/epi_shift.R b/R/epi_shift.R index c91856237..43876a53a 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -21,7 +21,13 @@ epi_shift_single <- function(x, col, shift_val, newname, key_cols) { #' the future back to today #' @keywords internal get_sign <- function(object) { - if (object$prefix == "lag_") { + if (!is.null(object$prefix)) { + if (object$prefix == "lag_") { + return(1) + } else { + return(-1) + } + } else if (object$method == "extend_lags") { return(1) } else { return(-1) @@ -32,13 +38,28 @@ get_sign <- function(object) { #' checks missing in `epi_shift_single` #' @keywords internal #' @importFrom cli cli_abort +#' @importFrom tidyr expand_grid +#' @importFrom dplyr mutate left_join join_by add_shifted_columns <- function(new_data, object, amount) { sign_shift <- get_sign(object) - grid <- tidyr::expand_grid(col = object$columns, amount = amount) %>% - dplyr::mutate( - newname = glue::glue("{object$prefix}{amount}_{col}"), - shift_val = sign_shift * amount, - amount = NULL + latency_table <- attributes(new_data)$metadata$latency_table + shift_sign_lat <- attributes(new_data)$metadata$shift_sign + if (!is.null(latency_table) && + shift_sign_lat == sign_shift) { + 
#TODO this doesn't work on lags of transforms + rel_latency <- latency_table %>% filter(col_name %in% object$columns) + } else { + rel_latency <- tibble(col_name = object$columns, latency = 0L) + } + grid <- expand_grid(col = object$columns, amount = sign_shift *amount) %>% + left_join(rel_latency, by = join_by(col == col_name), ) %>% + tidyr::replace_na(list(latency = 0)) %>% + mutate( + shift_val = amount + latency) %>% + mutate( + newname = glue::glue("{object$prefix}{abs(shift_val)}_{col}"), # name is always positive + amount = NULL, + latency = NULL ) ## ensure no name clashes @@ -56,8 +77,12 @@ add_shifted_columns <- function(new_data, object, amount) { dplyr::full_join, by = ok ) - dplyr::full_join(new_data, shifted, by = ok) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(kill_time_value(ok)))) %>% - dplyr::arrange(time_value) %>% - dplyr::ungroup() + processed <- new_data %>% + full_join(shifted, by = ok) %>% + group_by(dplyr::across(dplyr::all_of(kill_time_value(ok)))) %>% + arrange(time_value) %>% + ungroup() %>% + as_epi_df() + attributes(processed)$metadata <- attributes(new_data)$metadata + return(processed) } diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 3c81ded57..c8f857c89 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -94,7 +94,7 @@ slather.layer_add_forecast_date <- function(object, components, workflow, rlang::check_dots_empty() forecast_date <- object$forecast_date %||% get_forecast_date_in_layer( - extract_preprocessor(workflow), + extract_recipe(workflow), workflow$fit$meta$max_time_value, new_data ) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 7036b2fe1..40fd7c37d 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -74,8 +74,8 @@ #' attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 #' #' r <- epi_recipe(case_death_rate_subset) %>% -#' step_epi_ahead(death_rate, ahead = 7) %>% #' step_adjust_latency(method = "extend_ahead") %>% 
+#' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r #' @@ -113,15 +113,14 @@ step_adjust_latency <- i = "Use `tidyselect` methods to choose columns to lag." )) } - if ((method == "extend_ahead") && (!detect_step(recipe, "epi_ahead"))) { - cli::cli_abort( - "If `method` is {.val extend_ahead}, then a step - must have already added an outcome." + if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { + cli::cli_warn( + "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified." ) - } else if ((method == "extend_lags") && (!detect_step(recipe, "epi_lag"))) { - cli::cli_abort( - "If `method` is {.val extend_lags} or {.val locf}, then a step - must have already added a predictor." + } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) { + cli::cli_warn( + "If `method` is {.val extend_lags} or {.val locf}, +then the previous `step_epi_lag`s won't work with modified data." ) } if (detect_step(recipe, "naomit")) { @@ -132,10 +131,18 @@ step_adjust_latency <- cli::cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` can be non-`NULL` at a time!") } + if (length(fixed_latency > 1)) { + template <- recipe$template + data_names <- names(template)[!names(template) %in% epi_keys(template)] + wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] + if (length(wrong_names) > 0) { + cli::cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}") + } + } method <- rlang::arg_match(method) terms_used <- recipes_eval_select(enquos(...), recipe$template, recipe$term_info) - if (length(terms_used) == 0) { + if (is_empty(terms_used)) { terms_used <- recipe$term_info %>% filter(role == "raw") %>% pull(variable) @@ -147,18 +154,6 @@ step_adjust_latency <- rel_step_type <- "step_epi_lag" shift_name <- "lag" } - relevant_shifts <- construct_shift_tibble(terms_used, recipe, rel_step_type, shift_name) - - if 
(!any(map_lgl( - recipe$steps, - function(recipe_step) inherits(recipe_step, rel_step_type) - ))) { - cli::cli_abort(glue::glue( - "There is no `{rel_step_type}` defined before this.", - " For the method `extend_{shift_name}` of `step_adjust_latency`,", - " at least one {shift_name} must be previously defined." - )) - } recipes::add_step( recipe, @@ -170,7 +165,7 @@ step_adjust_latency <- trained = trained, forecast_date = fixed_forecast_date, latency = fixed_latency, - shift_cols = relevant_shifts, + latency_table = NULL, default = default, keys = epi_keys(recipe), columns = columns, @@ -181,7 +176,7 @@ step_adjust_latency <- } step_adjust_latency_new <- - function(terms, role, trained, forecast_date, latency, shift_cols, time_type, default, + function(terms, role, trained, forecast_date, latency, latency_table, time_type, default, keys, method, epi_keys_checked, columns, skip, id) { step( subclass = "adjust_latency", @@ -192,7 +187,7 @@ step_adjust_latency_new <- trained = trained, forecast_date = forecast_date, latency = latency, - shift_cols = shift_cols, + latency_table = latency_table, default = default, keys = keys, columns = columns, @@ -200,14 +195,44 @@ step_adjust_latency_new <- id = id ) } - +#' @importFrom recipes recipes_eval_select +construct_latency_table <- function(x, latency, training, info) { + return(latency_table) +} # lags introduces max(lags) NA's after the max_time_value. #' @export #' @importFrom glue glue prep.step_adjust_latency <- function(x, training, info = NULL, ...) { + sign_shift <- get_sign(x) + latency <- x$latency + forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked) + # construct the latency table + latency_table <- names(training)[!names(training) %in% epi_keys(training)] %>% + tibble(col_name = .) 
+ if (length(recipes_eval_select(x$terms, training, info)) > 0) { + latency_table <- latency_table %>% filter(col_name %in% + recipes_eval_select(x$terms, training, info)) + } + + if (is.null(latency)) { + latency_table <- latency_table %>% + mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, x$epi_keys_checked)) + } else if (length(latency) > 1) { + # if latency has a length, it must also have named elements. We assign based on comparing the name in the list + # with the column names, and drop any which don't have a latency assigned + latency_table <- latency_table %>% + filter(col_name %in% names(latency)) %>% + rowwise() %>% + mutate(latency = unname(latency[names(latency) == col_name])) %>% + ungroup() + } else { + latency_table <- latency_table %>% mutate(latency = latency) + } + attributes(training)$metadata$latency_table <- latency_table # get the columns used, even if it's all of them terms_used <- x$terms - if (length(terms_used) == 0) { + # TODO replace with is_empty as in bake.recipe + if (is_empty(terms_used)) { terms_used <- info %>% filter(role == "raw") %>% pull(variable) @@ -215,20 +240,10 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ # get and check the max_time and forecast_date are the right kinds of dates forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked) - # infer the correct columns to be working with from the previous - # transformations - x$prefix <- x$shift_cols$prefix[[1]] - sign_shift <- get_sign(x) - latency_cols <- get_latent_column_tibble( - x$shift_cols, training, forecast_date, - x$latency, sign_shift, info, x$epi_keys_checked - ) - - if ((x$method == "extend_ahead") || (x$method == "extend_lags")) { # check that the shift amount isn't too extreme - latency <- max(latency_cols$latency) + latency <- max(latency_table$latency) time_type <- attributes(training)$metadata$time_type - i_latency <- which.max(latency_cols$latency) + i_latency <- which.max(latency_table$latency) if ( (grepl("day", time_type) && (latency >= 10)) || (grepl("week", time_type) && (latency >= 4)) || @@ -238,29 +253,27 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ) { cli::cli_warn(paste( "!" = paste( - "The shift has been adjusted by {latency}, ", + "The latency is {latency}, ", "which is questionable for it's `time_type` of ", - "{time_type}" + "{time_type}." 
), - "i" = "input shift: {latency_cols$shift[[i_latency]]}", - "i" = "latency adjusted shift: {latency_cols$effective_shift[[i_latency]]}", + "i" = "latency: {latency_table$latency[[i_latency]]}", "i" = "`max_time` = {max_time} -> `forecast_date` = {forecast_date}" )) } - } step_adjust_latency_new( - terms = latency_cols$original_name, - role = latency_cols$role[[1]], + terms = x$terms, + role = x$role, trained = TRUE, - shift_cols = latency_cols, forecast_date = forecast_date, - latency = unique(latency_cols$latency), + latency = unique(latency_table$latency), + latency_table = latency_table, default = x$default, keys = x$keys, method = x$method, epi_keys_checked = x$epi_keys_checked, - columns = recipes_eval_select(latency_cols$original_name, training, info), + columns = recipes_eval_select(latency_table$col_name, training, info), skip = x$skip, id = x$id ) @@ -269,14 +282,19 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { #' @importFrom dplyr %>% pull #' @export bake.step_adjust_latency <- function(object, new_data, ...) { + if (!isa(new_data, "epi_df")) { + new_data <- new_data %>% as_epi_df(as_of = object$forecast_date) + } + attributes(new_data)$metadata$latency_method <- object$method + attributes(new_data)$metadata$shift_sign <- get_sign(object) + attributes(new_data)$metadata$latency_table <- object$latency_table if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { keys <- object$keys return( - extend_either(new_data, object$shift_cols, keys) + new_data ) } } - #' @export print.step_adjust_latency <- function(x, width = max(20, options$width - 35), ...) 
{ @@ -314,8 +332,8 @@ print.step_adjust_latency <- if (x$trained) { elements <- x$columns } else { - if (length(x$terms) == 0) { - elements <- "all previous predictors" + if (is_empty(x$terms)) { + elements <- "all future predictors" } else { elements <- lapply(x$terms, function(x) { rlang::expr_deparse(rlang::quo_get_expr(x), width = Inf) diff --git a/R/utils-latency.R b/R/utils-latency.R index 11d89ee92..a567dae06 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -1,39 +1,3 @@ -#' offset each relevant column by it's appropriate latency -#' works for either adjusting aheads or lags -#' note that this may introduce new NA values when one column is shifted farther than another -#' @param shift_cols a tibble which must have the columns `column`, the name of -#' the column to adjust, `latency` the latency of the original column relative -#' to the `forecast_date`, `new_name`, the names in `column` adjusted by the -#' latencies `latency` -#' @param new_data just what is says -#' @param keys the variables which are used as keys -#' @keywords internal -extend_either <- function(new_data, shift_cols, keys) { - shifted <- - shift_cols %>% - select(original_name, latency, new_name) %>% - pmap(function(original_name, latency, new_name) { - epi_shift_single( - x = new_data, - col = original_name, - shift_val = latency, - newname = new_name, - key_cols = keys - ) - }) %>% - reduce( - dplyr::full_join, - by = keys - ) - - return(new_data %>% - select(-all_of(shift_cols$original_name)) %>% # drop the original versions - dplyr::full_join(shifted, by = keys) %>% - dplyr::group_by(dplyr::across(dplyr::all_of(keys[-1]))) %>% - dplyr::arrange(time_value) %>% - dplyr::ungroup()) -} - #' create a table of the columns to modify, their shifts, and their prefixes #' @keywords internal #' @importFrom dplyr tibble @@ -64,60 +28,6 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name return(relevant_shifts) } -#' find the columns added with the lags or 
aheads, and the amounts they have -#' been changed -#' @param shift_cols a list of columns to operate on, as created by `construct_shift_tibble` -#' @param new_data the data transformed so far -#' @param forecast_date the forecast date -#' @param latency `NULL`, int, or vector, as described in `step_epi_latency` -#' @param sign_shift -1 if ahead, 1 if lag -#' @return a tibble with columns `column` (relevant shifted names), `shift` (the -#' amount that one is shifted), `latency` (original columns difference between -#' the max `time_value` and `forecast_date` (on a per-initial column basis)), -#' `effective_shift` (shift+latency), and `new_name` (adjusted names with the -#' effective_shift) -#' @keywords internal -#' @importFrom dplyr rowwise left_join join_by -get_latent_column_tibble <- function( - shift_cols, new_data, forecast_date, latency, - sign_shift, info, epi_keys_checked, call = caller_env()) { - shift_cols <- shift_cols %>% mutate(original_name = glue::glue("{prefix}{shift}_{terms}"), parent_name = terms) - if (is.null(latency)) { - shift_cols <- shift_cols %>% - rowwise() %>% - # add the latencies to shift_cols - mutate(latency = get_latency( - new_data, forecast_date, original_name, shift, sign_shift, epi_keys_checked - )) %>% - ungroup() - } else if (length(latency) > 1) { - # if latency has a length, we assign based on comparing the name in the list with the `terms` column - shift_cols <- shift_cols %>% - rowwise() %>% - mutate(latency = unname(latency[names(latency) == terms])) %>% - ungroup() - } else { - shift_cols <- shift_cols %>% mutate(latency = latency) - } - - # add the updated names to shift_cols - shift_cols <- shift_cols %>% - mutate( - effective_shift = shift + abs(latency) - ) %>% - mutate( - new_name = glue::glue("{prefix}{effective_shift}_{terms}") - ) - info <- info %>% select(variable, type, role) - shift_cols <- left_join(shift_cols, info, by = join_by(original_name == variable)) - if (length(unique(shift_cols$role)) != 1) { - 
cli::cli_abort("not all roles are the same!", - shift_cols = shift_cols - ) - } - return(shift_cols) -} - #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. #' @keywords internal @@ -182,7 +92,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked) { #' @param sign_shift integer. 1 if lag and -1 if ahead. These represent how you #' need to shift the data to bring the 3 day lagged value to today. #' @keywords internal -get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shift, epi_keys_checked) { +get_latency <- function(new_data, forecast_date, column, sign_shift, epi_keys_checked) { shift_max_date <- new_data %>% drop_na(all_of(column)) # null and "" don't work in `group_by` @@ -193,7 +103,7 @@ get_latency <- function(new_data, forecast_date, column, shift_amount, sign_shif summarise(time_value = max(time_value)) %>% pull(time_value) %>% min() - return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date) + shift_amount)) + return(as.integer(sign_shift * (as.Date(forecast_date) - shift_max_date))) } diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 68d445872..2612d4a3e 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -42,8 +42,20 @@ max time value if there is no latency adjustment, or as the \code{as_of} of default \code{NULL} will attempt to determine this automatically as \code{forecast_date + ahead}.} -\item{adjust_latency}{Character or \code{NULL}. one of the \code{method}s of -\code{step_adjust_latency}, or \code{NULL} (in which case there is no adjustment).} +\item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{NULL} (in which case there is no adjustment). +If there is a difference between the \code{forecast_date} and the last day of +data, this determines how to shift the model to account for this +difference. 
The options are: +\itemize{ +\item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to +the last day of data. If the last day of data was 3 days ago, the ahead +becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual forecast date. If the lags are +\code{c(0,7,14)} and the last day of data was 3 days ago, the lags become +\code{c(3,10,17)}. +}} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 2c780e822..e801dcc4e 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -10,6 +10,7 @@ arx_class_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, + adjust_latency = NULL, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, @@ -43,6 +44,21 @@ max time value if there is no latency adjustment, or as the \code{as_of} of default \code{NULL} will attempt to determine this automatically as \code{forecast_date + ahead}.} +\item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{NULL} (in which case there is no adjustment). +If there is a difference between the \code{forecast_date} and the last day of +data, this determines how to shift the model to account for this +difference. The options are: +\itemize{ +\item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to +the last day of data. If the last day of data was 3 days ago, the ahead +becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual forecast date. 
If the lags are +\code{c(0,7,14)} and the last day of data was 3 days ago, the lags become +\code{c(3,10,17)}. +}} + \item{outcome_transform}{Scalar character. Whether the outcome should be created using growth rates (as the predictors are) or lagged differences. The second case is closer to the requirements for the diff --git a/man/extend_either.Rd b/man/extend_either.Rd deleted file mode 100644 index ae55fa46a..000000000 --- a/man/extend_either.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-latency.R -\name{extend_either} -\alias{extend_either} -\title{offset each relevant column by it's appropriate latency -works for either adjusting aheads or lags -note that this may introduce new NA values when one column is shifted farther than another} -\usage{ -extend_either(new_data, shift_cols, keys) -} -\arguments{ -\item{new_data}{just what is says} - -\item{shift_cols}{a tibble which must have the columns \code{column}, the name of -the column to adjust, \code{latency} the latency of the original column relative -to the \code{forecast_date}, \code{new_name}, the names in \code{column} adjusted by the -latencies \code{latency}} - -\item{keys}{the variables which are used as keys} -} -\description{ -offset each relevant column by it's appropriate latency -works for either adjusting aheads or lags -note that this may introduce new NA values when one column is shifted farther than another -} -\keyword{internal} diff --git a/man/get_latency.Rd b/man/get_latency.Rd index f5bf7f0c2..5d6d7190f 100644 --- a/man/get_latency.Rd +++ b/man/get_latency.Rd @@ -4,14 +4,7 @@ \alias{get_latency} \title{the latency is also the amount the shift is off by} \usage{ -get_latency( - new_data, - forecast_date, - column, - shift_amount, - sign_shift, - epi_keys_checked -) +get_latency(new_data, forecast_date, column, sign_shift, epi_keys_checked) } \arguments{ \item{sign_shift}{integer. 1 if lag and -1 if ahead. 
These represent how you diff --git a/man/get_latent_column_tibble.Rd b/man/get_latent_column_tibble.Rd deleted file mode 100644 index 0a0298625..000000000 --- a/man/get_latent_column_tibble.Rd +++ /dev/null @@ -1,41 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-latency.R -\name{get_latent_column_tibble} -\alias{get_latent_column_tibble} -\title{find the columns added with the lags or aheads, and the amounts they have -been changed} -\usage{ -get_latent_column_tibble( - shift_cols, - new_data, - forecast_date, - latency, - sign_shift, - info, - epi_keys_checked, - call = caller_env() -) -} -\arguments{ -\item{shift_cols}{a list of columns to operate on, as created by \code{construct_shift_tibble}} - -\item{new_data}{the data transformed so far} - -\item{forecast_date}{the forecast date} - -\item{latency}{\code{NULL}, int, or vector, as described in \code{step_epi_latency}} - -\item{sign_shift}{-1 if ahead, 1 if lag} -} -\value{ -a tibble with columns \code{column} (relevant shifted names), \code{shift} (the -amount that one is shifted), \code{latency} (original columns difference between -the max \code{time_value} and \code{forecast_date} (on a per-initial column basis)), -\code{effective_shift} (shift+latency), and \code{new_name} (adjusted names with the -effective_shift) -} -\description{ -find the columns added with the lags or aheads, and the amounts they have -been changed -} -\keyword{internal} diff --git a/tests/testthat.R b/tests/testthat.R index 296d916e8..27254bec7 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,4 +1,8 @@ library(testthat) library(epipredict) +library(parsnip) +library(workflows) +library(dplyr) + test_check("epipredict") diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 90b647d86..fb11026dd 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1059,7 +1059,7 @@ 18998, 18998, 18998, 18998, 18998), 
class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) -# arx_forecasrte output format snapshots +# arx_forecaster output format snapshots Code out1 @@ -1103,7 +1103,7 @@ * 56 unique geographic regions, * At forecast date: 2022-01-03, * For target date: 2022-01-10, - * Latency adjusted per column: case_rate=3, death_rate=3 + * Lags adjusted per column: case_rate=3, death_rate=3 --- @@ -1127,7 +1127,7 @@ * 56 unique geographic regions, * At forecast date: 2022-01-03, * For target date: 2022-01-10, - * Latency adjusted for death_rate=3 + * Aheads adjusted for death_rate=3 # arx_classifier snapshots diff --git a/tests/testthat/_snaps/step_adjust_latency.md b/tests/testthat/_snaps/step_adjust_latency.md index 354d02d84..fd49b8824 100644 --- a/tests/testthat/_snaps/step_adjust_latency.md +++ b/tests/testthat/_snaps/step_adjust_latency.md @@ -13,10 +13,10 @@ time_value: 1 -- Operations - 1. Lagging: death_rate by 0, 6, 11 - 2. Lagging: case_rate by 1, 5 - 3. Leading: death_rate by 7 - 4. extend_lags: case_rate with latency set at train time + 1. extend_lags: case_rate with latency set at train time + 2. Lagging: death_rate by 0, 6, 11 + 3. Lagging: case_rate by 1, 5 + 4. Leading: death_rate by 7 --- @@ -33,7 +33,7 @@ time_value: 1 -- Operations - 1. Leading: death_rate by 7 - 2. extend_ahead: all previous predictors with latency set at train time - 3. Lagging: death_rate by 0, 7, 14 + 1. Lagging: death_rate by 0, 7, 14 + 2. extend_ahead: all future predictors with latency set at train time + 3. 
Leading: death_rate by 7 diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index cad6e79bf..3ba41158a 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -80,10 +80,12 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { }) test_that("`layer_add_forecast_date()` infers correct date when using `adjust_latency`", { - r_latent <- epi_recipe(jhu) %>% + jhu_reasonable_date <- jhu + attributes(jhu_reasonable_date)$metadata$as_of <- as.Date("2022-01-03") + r_latent <- epi_recipe(jhu_reasonable_date) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% - step_epi_ahead(death_rate, ahead = 7) %>% step_adjust_latency(method = "extend_ahead") %>% + step_epi_ahead(death_rate, ahead = 7) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) frost_latent <- frosting() %>% @@ -91,7 +93,7 @@ test_that("`layer_add_forecast_date()` infers correct date when using `adjust_la layer_add_forecast_date() %>% layer_naomit(.pred) wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% - fit(jhu) %>% + fit(jhu_reasonable_date) %>% add_frosting(frost_latent) p_latent <- predict(wf_latent, latest) expect_equal( diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index 72f3c1d7f..8bdb3a76b 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -42,9 +42,9 @@ test_that("Use ahead + max time value from pre, fit, post", { }) test_that("latency adjust doesn't interfere with correct target date", { r_latent <- epi_recipe(jhu) %>% + step_adjust_latency(method = "extend_ahead") %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% - step_adjust_latency(method = "extend_ahead") %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) wf_latent <- 
epi_workflow(r_latent, parsnip::linear_reg()) %>% fit(jhu) diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 8627fe5ed..bdb0457de 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -96,9 +96,9 @@ test_that("arx_forecaster snapshots", { expect_false(all(arx2$predictions == arx3$predictions)) }) -test_that("arx_forecasrte output format snapshots", { +test_that("arx_forecaster output format snapshots", { jhu <- case_death_rate_subset %>% - dplyr::filter(time_value >= as.Date("2021-12-01")) + dplyr::filter(time_value >= as.Date("2021-12-01")) out1 <- arx_forecaster( jhu, "death_rate", c("case_rate", "death_rate") diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 7a0aac51c..fd1bb9675 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -35,19 +35,44 @@ x_lagged attributes(x_lagged)$metadata$as_of <- testing_as_of test_that("epi_adjust_latency correctly extends the lags", { + expect_warning(epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(method = "extend_lags")) + r1 <- epi_recipe(x) %>% + step_adjust_latency(method = "extend_lags") %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_ahead(death_rate, ahead = ahead) %>% - step_adjust_latency(method = "extend_lags") + step_epi_ahead(death_rate, ahead = ahead) + + # directly checking the shifts + baked_x <- r1 %>% prep(real_x) %>% bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal(last_dates, + tribble( + ~name, ~last_date, + "lag_16_death_rate", max_time + 16, + "lag_11_death_rate", max_time + 11, + "lag_10_case_rate", max_time + 10, + 
"lag_6_case_rate", max_time + 6, + "lag_5_death_rate", max_time + 5, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + )) + # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit1 <- slm_fit(r1), regexp = "The shift has been adjusted by 1033"), class = "simpleError") + expect_error(expect_warning(fit1 <- slm_fit(r1, data = x), regexp = "The latency is 1033"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit1 <- slm_fit(r1, data = real_x) - expect_equal( names(fit1$pre$mold$predictors), c( @@ -55,7 +80,7 @@ test_that("epi_adjust_latency correctly extends the lags", { "lag_6_case_rate", "lag_10_case_rate" ) ) - latest <- get_test_data(r1, x) + latest <- get_test_data(r1, real_x) pred <- predict(fit1, latest) point_pred <- pred %>% filter(!is.na(.pred)) expect_equal(nrow(point_pred), 1) @@ -88,10 +113,10 @@ test_that("epi_adjust_latency correctly extends the lags", { test_that("epi_adjust_latency correctly extends the ahead", { r2 <- epi_recipe(x) %>% + step_adjust_latency(method = "extend_ahead") %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_ahead(death_rate, ahead = ahead) %>% - step_adjust_latency(method = "extend_ahead") + step_epi_ahead(death_rate, ahead = ahead) # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data @@ -105,7 +130,7 @@ test_that("epi_adjust_latency correctly extends the ahead", { "lag_1_case_rate", "lag_5_case_rate" ) ) - latest <- get_test_data(r2, x) + latest <- get_test_data(r2, real_x) pred2 <- predict(fit2, latest) point_pred2 <- pred2 %>% filter(!is.na(.pred)) # max time is still the forecast date @@ -134,10 +159,10 @@ test_that("epi_adjust_latency correctly 
extends the ahead", { test_that("epi_adjust_latency extends multiple aheads", { aheads <- 1:3 r3 <- epi_recipe(x) %>% + step_adjust_latency(method = "extend_ahead") %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_ahead(death_rate, ahead = aheads) %>% - step_adjust_latency(method = "extend_ahead") + step_epi_ahead(death_rate, ahead = aheads) fitter <- smooth_quantile_reg( quantile_levels = 0.5, outcome_locations = aheads, @@ -147,7 +172,7 @@ test_that("epi_adjust_latency extends multiple aheads", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit3 <- fit(epi_wf, data = x))) + expect_error(fit3 <- fit(epi_wf, data = x)) # real date example fit3 <- fit(epi_wf, data = real_x) expect_equal( @@ -200,16 +225,16 @@ test_that("epi_adjust_latency extend_ahead uses the same adjustment when predict test_that("epi_adjust_latency works for other time types", {}) -test_that("epi_adjust_latency insist there's steps before it", { - expect_error( +test_that("epi_adjust_latency warns there's steps before it", { + expect_warning( r5 <- epi_recipe(x) %>% - step_epi_ahead(death_rate, ahead = ahead) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_adjust_latency(method = "extend_lags"), regexp = "extend_lags" ) - expect_error( + expect_warning( r5 <- epi_recipe(x) %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_ahead(death_rate, ahead = ahead) %>% step_adjust_latency(method = "extend_ahead"), regexp = "extend_ahead" ) @@ -218,28 +243,27 @@ test_that("epi_adjust_latency insist there's steps before it", { test_that("epi_adjust_latency warns against removing NA's beforehand", { expect_error( r5 <- epi_recipe(x) %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% - step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_naomit() %>% - step_adjust_latency(method = 
"extend_lags"), + step_adjust_latency(method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)), regexp = "adjust_latency needs to occur before any `NA` removal" ) }) # todo check that epi_adjust_latency errors for nonsense `as_of`'s - # todo make sure that `epi_keys_checked` works correctly for extra epi_keys test_that("epi_adjust_latency correctly extends the lags", { r5 <- epi_recipe(x_lagged) %>% + step_adjust_latency(method = "extend_lags", epi_keys_checked = NULL) %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_ahead(death_rate, ahead = ahead) %>% - step_adjust_latency(method = "extend_lags", epi_keys_checked = NULL) + step_epi_ahead(death_rate, ahead = ahead) # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The shift has been adjusted by 1033"), class = "simpleError") + expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The latency is 1033"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = x_lagged) @@ -251,7 +275,6 @@ test_that("epi_adjust_latency correctly extends the lags", { ) ) latest <- get_test_data(r5, x_lagged) - latest$time_value %>% unique() pred <- predict(fit5, latest) point_pred <- pred %>% filter(!is.na(.pred)) expect_equal(nrow(point_pred), 1) @@ -286,10 +309,10 @@ test_that("`step_adjust_latency` only allows one instance of itself", {}) test_that("`step_adjust_latency` only uses the columns specified in the `...`", { r5 <- epi_recipe(x) %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(case_rate, method = "extend_lags") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_ahead(death_rate, ahead = 
ahead) %>% - step_adjust_latency(case_rate, method = "extend_lags") + step_epi_ahead(death_rate, ahead = ahead) fit5 <- slm_fit(r5, data = real_x) expect_equal(names(fit5$fit$fit$fit$coefficients), c("(Intercept)", "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_6_case_rate", "lag_10_case_rate")) @@ -299,14 +322,16 @@ test_that("setting fixed_* works for `step_adjust_latency`", {}) test_that("printing step_adjust_latency results in expected output", { r5 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "extend_lags") %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% - step_epi_ahead(death_rate, ahead = ahead) %>% - step_adjust_latency(case_rate, method = "extend_lags") + step_epi_ahead(death_rate, ahead = ahead) expect_snapshot(r5) r <- epi_recipe(case_death_rate_subset) %>% - step_epi_ahead(death_rate, ahead = 7) %>% + step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_adjust_latency(method = "extend_ahead") %>% - step_epi_lag(death_rate, lag = c(0, 7, 14)) + step_epi_ahead(death_rate, ahead = 7) expect_snapshot(r) }) + +test_that("lags of transforms (of transforms etc) work", {}) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index f52b1a950..5a5979e2a 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -22,58 +22,31 @@ old_data <- old_data %>% modified_data <- old_data %>% dplyr::full_join( - epi_shift_single(old_data, "case_rate", -4, "ahead_4_case_rate", keys), + epi_shift_single(old_data, "case_rate", -4, "case_rate_a", keys), by = keys ) %>% dplyr::full_join( - epi_shift_single(old_data, "case_rate", 3, "lag_3_case_rate", keys), + epi_shift_single(old_data, "case_rate", 3, "case_rate_b", keys), by = keys ) %>% dplyr::full_join( - epi_shift_single(old_data, "death_rate", 7, "lag_7_death_rate", keys), + epi_shift_single(old_data, "death_rate", 7, "death_rate_a", keys), by = keys ) %>% 
arrange(time_value) -modified_data %>% tail() -as_of - (modified_data %>% filter(!is.na(ahead_4_case_rate)) %>% pull(time_value) %>% max()) -all_shift_cols <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~parent_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", "case_rate", 5, 8, "lag_8_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", "death_rate", 4, 11, "lag_11_death_rate", "numeric", "predictor", - "case_rate", 4, "ahead_", "ahead_4_case_rate", "case_rate", -5, 9, "ahead_9_case_rate", "numeric", "outcome" -) -test_recipe <- epi_recipe(modified_data) %>% - step_epi_lag(case_rate, lag = c(3)) %>% - step_epi_lag(death_rate, lag = 7) %>% - step_epi_ahead(case_rate, ahead = 4) -shift_cols <- construct_shift_tibble(c("case_rate", "death_rate"), test_recipe, "step_epi_lag", "lag") - - -test_that("construct_shift_tibble constructs the right tibble", { - expected_shift_cols <- tibble::tribble( - ~terms, ~shift, ~prefix, - "case_rate", 3, "lag_", - "death_rate", 7, "lag_" - ) - expect_equal(shift_cols, expected_shift_cols) -}) test_that("get_latency works", { - expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1, "geo_value"), 4) - expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 3, 1, "geo_value"), 5) - # get_latency does't check the shift_amount - expect_equal(get_latency(modified_data, as_of, "lag_3_case_rate", 4, 1, "geo_value"), 6) - # ahead works correctly - expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, -1, "geo_value"), -5) - # setting the wrong sign doubles the shift and gets the sign wrong - expect_equal(get_latency(modified_data, as_of, "ahead_4_case_rate", 4, 1, "geo_value"), 5 + 4 * 2) - # minimizing over everything decreases the latency - expect_equal(get_latency(modified_data, as_of, "lag_7_death_rate", 7, 1, NULL), 3) +expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 
5) +expect_equal(get_latency(modified_data, as_of, "case_rate", -1, "geo_value"), -5) +expect_equal(get_latency(modified_data, as_of, "death_rate", 1, "geo_value"), 4) +expect_equal(get_latency(modified_data, as_of, "case_rate_a", 1, "geo_value"), 5 + 4) +expect_equal(get_latency(modified_data, as_of, "case_rate_b", 1, "geo_value"), 5 - 3) +expect_equal(get_latency(modified_data, as_of, "death_rate_a", 1, "geo_value"), 4 - 7) }) test_that("get_latency infers max_time to be the minimum `max time` across the epi_keys", {}) -test_that("get_asof works", { +test_that("set_forecast_date works", { info <- tribble( ~variable, ~type, ~role, ~source, "time_value", "date", "time_value", "original", @@ -87,99 +60,6 @@ test_that("get_asof works", { expect_equal(set_forecast_date(modified_data, info, NULL), as_of) }) -test_that("get_latent_column_tibble infers latency and works correctly", { - info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") - - case_lag <- get_latent_column_tibble( - shift_cols[1, ], modified_data, as_of, NULL, 1, info, - epi_keys_checked = "geo_value" - ) - expect_equal(case_lag, all_shift_cols[1, ]) - - death_lag <- get_latent_column_tibble( - shift_cols[2, ], modified_data, as_of, NULL, 1, info, - epi_keys_checked = "geo_value" - ) - expect_equal(death_lag, all_shift_cols[2, ]) - - both_lag <- get_latent_column_tibble( - shift_cols, modified_data, as_of, NULL, 1, info, - epi_keys_checked = "geo_value" - ) - expect_equal(both_lag, all_shift_cols[1:2, ]) -}) - -test_that("get_latent_column_tibble assigns given latencies", { - # non-null latency - info <- tibble(variable = c("lag_3_case_rate", "lag_7_death_rate", "ahead_4_case_rate"), type = "numeric", role = c(rep("predictor", 2), "outcome"), source = "derived") - both_lag <- get_latent_column_tibble( - shift_cols, modified_data, as_of, 50, 1, info - ) - weird_latencies <- tibble::tribble( - 
~terms, ~shift, ~prefix, ~original_name, ~parent_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", "case_rate", 50, 53, "lag_53_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", "death_rate", 50, 57, "lag_57_death_rate", "numeric", "predictor", - ) - expect_equal(both_lag, weird_latencies) - - # supposing we add the latencies by hand, and they're different, and in a different order - weird_latencies <- tibble::tribble( - ~terms, ~shift, ~prefix, ~original_name, ~parent_name, ~latency, ~effective_shift, ~new_name, ~type, ~role, - "case_rate", 3, "lag_", "lag_3_case_rate", "case_rate", 70, 73, "lag_73_case_rate", "numeric", "predictor", - "death_rate", 7, "lag_", "lag_7_death_rate", "death_rate", 30, 37, "lag_37_death_rate", "numeric", "predictor", - ) - both_lag <- get_latent_column_tibble( - shift_cols, modified_data, as_of, c(death_rate = 30, case_rate = 70), 1, info - ) - expect_equal(both_lag, weird_latencies[1:2, ]) - - ahead_shift_cols <- construct_shift_tibble(c("case_rate"), test_recipe, "step_epi_ahead", "ahead") - case_ahead <- get_latent_column_tibble( - ahead_shift_cols, modified_data, as_of, NULL, -1, info, "geo_value" - ) - expect_equal(case_ahead, all_shift_cols[3, ]) -}) - -test_that("get_shifted_column_tibble objects to non-columns", { - non_shift_cols <- tibble(terms = "not_present", shift = 99, prefix = "lag_") - expect_error( - get_latent_column_tibble( - non_shift_cols, modified_data, as_of, NULL, 1, info - ), - regexp = "Can't subset elements that don't exist" - ) -}) - -test_that("extend_either works", { - keys <- c("geo_value", "time_value") - # extend_either doesn't differentiate between the directions, it just moves - # things - expected_post_shift <- - old_data %>% - dplyr::full_join( - epi_shift_single(old_data, "case_rate", 8, "lag_8_case_rate", keys), - by = keys - ) %>% - dplyr::full_join( - epi_shift_single(old_data, "death_rate", 11, 
"lag_11_death_rate", keys), - by = keys - ) %>% - dplyr::full_join( - epi_shift_single(old_data, "case_rate", -9, "ahead_9_case_rate", keys), - by = keys - ) %>% - dplyr::bind_rows(tibble( - geo_value = c("place1", "place2"), - time_value = as.Date(c("2021-04-23", "2021-04-24")), case_rate = c(NA, NA), death_rate = c(NA, NA), - lag_8_case_rate = c(NA, NA), lag_11_death_rate = c(NA, NA), ahead_9_case_rate = c(NA, NA) - )) %>% - arrange(time_value, geo_value) - expect_equal( - extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value, geo_value), - expected_post_shift - ) - extended <- extend_either(modified_data, all_shift_cols, keys) %>% arrange(time_value, geo_value) -}) From 09fbfd812c9535b085fade3ad78c622f6be195a0 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 3 Jul 2024 18:50:57 -0500 Subject: [PATCH 54/92] moving locf to step_adjust_ahead instead of get_test_data --- NAMESPACE | 2 + R/arx_classifier.R | 6 -- R/arx_forecaster.R | 23 +---- R/cdc_baseline_forecaster.R | 9 +- R/epi_workflow.R | 6 +- R/flatline_forecaster.R | 9 +- R/get_test_data.R | 105 +++------------------- R/step_adjust_latency.R | 81 ++++++++++------- R/utils-latency.R | 51 ++++++++++- man/arx_args_list.Rd | 11 --- man/arx_class_args_list.Rd | 11 --- man/cdc_baseline_args_list.Rd | 11 --- man/flatline_args_list.Rd | 11 --- man/get_test_data.Rd | 17 +--- man/set_forecast_date.Rd | 2 +- man/step_adjust_latency.Rd | 2 +- tests/testthat/test-epi_workflow.R | 10 +-- tests/testthat/test-get_test_data.R | 2 + tests/testthat/test-step_adjust_latency.R | 105 ++++++++++++++++++++++ 19 files changed, 228 insertions(+), 246 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 19e42fef5..899d22a1e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -240,6 +240,7 @@ importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,full_join) importFrom(dplyr,group_by) +importFrom(dplyr,group_by_at) importFrom(dplyr,join_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) @@ -306,6 
+307,7 @@ importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) importFrom(tidyr,expand_grid) +importFrom(tidyr,fill) importFrom(tidyr,unnest) importFrom(vctrs,as_list_of) importFrom(vctrs,field) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index f07ae3e71..5ee6de88a 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -67,9 +67,6 @@ arx_classifier <- function( target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) preds <- forecast( wf, - fill_locf = is.null(args_list$adjust_latency), - n_recent = args_list$nafill_buffer, - forecast_date = forecast_date ) %>% as_tibble() %>% select(-time_value) @@ -294,7 +291,6 @@ arx_class_args_list <- function( method = c("rel_change", "linear_reg"), log_scale = FALSE, additional_gr_args = list(), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ...) { @@ -312,7 +308,6 @@ arx_class_args_list <- function( arg_is_lgl(log_scale) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) if (!is.list(additional_gr_args)) { cli_abort(c( "`additional_gr_args` must be a {.cls list}.", @@ -353,7 +348,6 @@ arx_class_args_list <- function( method, log_scale, additional_gr_args, - nafill_buffer, check_enough_data_n, check_enough_data_epi_keys ), diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 3abbeb2eb..a67d07c11 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -51,14 +51,9 @@ arx_forecaster <- function( wf <- arx_fcast_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - preds <- forecast( - wf, - fill_locf = is.null(args_list$adjust_latency), - n_recent = args_list$nafill_buffer, - forecast_date = args_list$forecast_date %||% max(epi_data$time_value) - ) %>% - as_tibble() %>% - select(-time_value) + preds <- forecast(wf) %>% + tibble::as_tibble() %>% + dplyr::select(-time_value) 
structure( list( @@ -252,15 +247,6 @@ arx_fcast_epi_workflow <- function( #' `character(0)` performs no grouping. This argument only applies when #' residual quantiles are used. It is not applicable with #' `trainer = quantile_reg()`, for example. -#' @param nafill_buffer At predict time, recent values of the training data -#' are used to create a forecast. However, these can be `NA` due to, e.g., -#' data latency issues. By default, any missing values will get filled with -#' less recent data. Setting this value to `NULL` will result in 1 extra -#' recent row (beyond those required for lag creation) to be used. Note that -#' we require at least `min(lags)` rows of recent data per `geo_value` to -#' create a prediction. For this reason, setting `nafill_buffer < min(lags)` -#' will be treated as _additional_ allowed recent data rather than the -#' total amount of recent data to examine. #' @param check_enough_data_n Integer. A lower limit for the number of rows per #' epi_key that are required for training. If `NULL`, this check is ignored. #' @param check_enough_data_epi_keys Character vector. A character vector of @@ -287,7 +273,6 @@ arx_args_list <- function( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ...) 
{ @@ -305,7 +290,6 @@ arx_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) arg_is_pos(check_enough_data_n, allow_null = TRUE) arg_is_chr(check_enough_data_epi_keys, allow_null = TRUE) @@ -332,7 +316,6 @@ arx_args_list <- function( nonneg, max_lags, quantile_by_key, - nafill_buffer, check_enough_data_n, check_enough_data_epi_keys ), diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index b2e7434e2..f3ba48794 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -79,9 +79,7 @@ cdc_baseline_forecaster <- function( latest <- get_test_data( - epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, - forecast_date - ) + epi_recipe(epi_data), epi_data) f <- frosting() %>% layer_predict() %>% @@ -169,7 +167,6 @@ cdc_baseline_args_list <- function( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value", - nafill_buffer = Inf, ...) 
{ rlang::check_dots_empty() arg_is_scalar(n_training, nsims, data_frequency) @@ -183,7 +180,6 @@ cdc_baseline_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) structure( enlist( @@ -195,8 +191,7 @@ cdc_baseline_args_list <- function( nsims, symmetrize, nonneg, - quantile_by_key, - nafill_buffer + quantile_by_key ), class = c("cdc_baseline_fcast", "alist") ) diff --git a/R/epi_workflow.R b/R/epi_workflow.R index fe9b656ba..34141b732 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -267,13 +267,9 @@ forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NUL )) } } - test_data <- get_test_data( hardhat::extract_preprocessor(object), - object$original_data, - fill_locf = fill_locf, - n_recent = n_recent %||% Inf, - forecast_date = forecast_date %||% frosting_fd %||% max(object$original_data$time_value) + object$original_data ) predict(object, new_data = test_data) diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 55808b803..f2d6fa998 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -67,9 +67,7 @@ flatline_forecaster <- function( wf <- fit(wf, epi_data) preds <- suppressWarnings(forecast( wf, - fill_locf = TRUE, - n_recent = args_list$nafill_buffer, - forecast_date = forecast_date + fill_locf = TRUE )) %>% as_tibble() %>% select(-time_value) @@ -117,7 +115,6 @@ flatline_args_list <- function( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, ...) 
{ rlang::check_dots_empty() arg_is_scalar(ahead, n_training) @@ -129,7 +126,6 @@ flatline_args_list <- function( arg_is_probabilities(quantile_levels, allow_null = TRUE) arg_is_pos(n_training) if (is.finite(n_training)) arg_is_pos_int(n_training) - if (is.finite(nafill_buffer)) arg_is_pos_int(nafill_buffer, allow_null = TRUE) if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { @@ -149,8 +145,7 @@ flatline_args_list <- function( quantile_levels, symmetrize, nonneg, - quantile_by_key, - nafill_buffer + quantile_by_key ), class = c("flat_fcast", "alist") ) diff --git a/R/get_test_data.R b/R/get_test_data.R index a491ed40d..ebd6e1b7b 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -20,13 +20,6 @@ #' @param recipe A recipe object. #' @param x An epi_df. The typical usage is to #' pass the same data as that used for fitting the recipe. -#' @param fill_locf Logical. Should we use `locf` to fill in missing data? -#' @param n_recent Integer or NULL. If filling missing data with `locf = TRUE`, -#' how far back are we willing to tolerate missing data? Larger values allow -#' more filling. The default `NULL` will determine this from the -#' the `recipe`. For example, suppose `n_recent = 3`, then if the -#' 3 most recent observations in any `geo_value` are all `NA`’s, we won’t be -#' able to fill anything, and an error message will be thrown. (See details.) #' @param forecast_date By default, this is set to the maximum #' `time_value` in `x`. But if there is data latency such that recent `NA`'s #' should be filled, this may be _after_ the last available `time_value`. 
@@ -44,18 +37,8 @@ #' @export get_test_data <- function( recipe, - x, - fill_locf = FALSE, - n_recent = NULL, - forecast_date = max(x$time_value)) { - if (!is_epi_df(x)) cli_abort("`x` must be an `epi_df`.") - arg_is_lgl(fill_locf) - arg_is_scalar(fill_locf) - arg_is_scalar(n_recent, allow_null = TRUE) - if (!is.null(n_recent) && is.finite(n_recent)) { - arg_is_pos_int(n_recent, allow_null = TRUE) - } - if (!is.null(n_recent)) n_recent <- abs(n_recent) # in case they passed -Inf + x) { + if (!is_epi_df(x)) cli::cli_abort("`x` must be an `epi_df`.") check <- hardhat::check_column_names(x, colnames(recipe$template)) if (!check$ok) { @@ -64,103 +47,35 @@ get_test_data <- function( i = "The following required columns are missing: {check$missing_names}" )) } - if (class(forecast_date) != class(x$time_value)) { - cli_abort("`forecast_date` must be the same class as `x$time_value`.") - } - if (forecast_date < max(x$time_value)) { - cli_abort("`forecast_date` must be no earlier than `max(x$time_value)`") - } min_lags <- min(map_dbl(recipe$steps, ~ min(.x$lag %||% Inf)), Inf) max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0)), 0) max_horizon <- max(map_dbl(recipe$steps, ~ max(.x$horizon %||% 0)), 0) - max_slide <- max(map_dbl(recipe$steps, ~ max(.x$before %||% 0)), 0) - min_required <- max_lags + max_horizon + max_slide - if (is.null(n_recent)) n_recent <- min_required + 1 # one extra for filling - if (n_recent <= min_required) n_recent <- min_required + n_recent + keep <- max_lags + max_horizon # CHECK: Error out if insufficient training data # Probably needs a fix based on the time_type of the epi_df avail_recent <- diff(range(x$time_value)) - if (avail_recent < min_required) { - cli_abort(c( + if (avail_recent < keep) { + cli::cli_abort(c( "You supplied insufficient recent data for this recipe. ", "!" = "You need at least {min_required} days of data,", "!" = "but `x` contains only {avail_recent}." 
)) } - + max_time_value <- x %>% na.omit %>% pull(time_value) %>% max x <- arrange(x, time_value) groups <- epi_keys_only(recipe) # If we skip NA completion, we remove undesirably early time values # Happens globally, over all groups - keep <- max(n_recent, min_required + 1) - x <- filter(x, forecast_date - time_value <= keep) - - # Pad with explicit missing values up to and including the forecast_date - # x is grouped here - x <- pad_to_end(x, groups, forecast_date) %>% - group_by(across(all_of(groups))) + x <- dplyr::filter(x, max_time_value - time_value <= keep) # If all(lags > 0), then we get rid of recent data if (min_lags > 0 && min_lags < Inf) { - x <- filter(x, forecast_date - time_value >= min_lags) - } - - # Now, fill forward missing data if requested - if (fill_locf) { - cannot_be_used <- x %>% - dplyr::filter(forecast_date - time_value <= n_recent) %>% - dplyr::mutate(fillers = forecast_date - time_value > min_required) %>% - dplyr::summarise( - dplyr::across( - -tidyselect::any_of(epi_keys(recipe)), - ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) - ), - .groups = "drop" - ) %>% - select(-fillers) %>% - summarise(across(-any_of(key_colnames(recipe)), ~ any(.x))) %>% - unlist() - if (any(cannot_be_used)) { - bad_vars <- names(cannot_be_used)[cannot_be_used] - if (recipes::is_trained(recipe)) { - cli_abort(c( - "The variables {.var {bad_vars}} have too many recent missing", - `!` = "values to be filled automatically. ", - i = "You should either choose `n_recent` larger than its current ", - i = "value {n_recent}, or perform NA imputation manually, perhaps with ", - i = "{.code recipes::step_impute_*()} or with {.code tidyr::fill()}." 
- )) - } - } - x <- tidyr::fill(x, !time_value) + x <- dplyr::filter(x, max_time_value - time_value >= min_lags) } - filter(x, forecast_date - time_value <= min_required) %>% - ungroup() -} - -pad_to_end <- function(x, groups, end_date) { - itval <- guess_period(c(x$time_value, end_date), "time_value") - completed_time_values <- x %>% - group_by(across(all_of(groups))) %>% - summarise( - time_value = rlang::list2( - time_value = Seq(max(time_value) + itval, end_date, itval) - ) - ) %>% - unnest("time_value") %>% - mutate(time_value = vctrs::vec_cast(time_value, x$time_value)) - - bind_rows(x, completed_time_values) %>% - arrange(across(all_of(c("time_value", groups)))) -} - -Seq <- function(from, to, by) { - if (from > to) { - return(NULL) - } - seq(from = from, to = to, by = by) + dplyr::filter(x, max_time_value - time_value <= keep) %>% + epiprocess::ungroup() } diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 40fd7c37d..b963ce257 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -122,6 +122,8 @@ step_adjust_latency <- "If `method` is {.val extend_lags} or {.val locf}, then the previous `step_epi_lag`s won't work with modified data." ) + } else if ((method == "locf") && (length(recipe$steps) > 0)) { + cli::cli_warn("There are steps before `step_adjust_latency`. With the method {.val locf}, it is recommended to include this step before any others") } if (detect_step(recipe, "naomit")) { cli::cli_abort("adjust_latency needs to occur before any `NA` removal, @@ -205,14 +207,14 @@ construct_latency_table <- function(x, latency, training, info) { prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ sign_shift <- get_sign(x) latency <- x$latency - forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked) + forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked, latency) # construct the latency table latency_table <- names(training)[!names(training) %in% epi_keys(training)] %>% tibble(col_name = .) - if (length(recipes_eval_select(x$terms, training, info)) > 0) { + if (length(recipes_eval_select(x$terms, training, info)) > 0) { latency_table <- latency_table %>% filter(col_name %in% recipes_eval_select(x$terms, training, info)) - } + } if (is.null(latency)) { latency_table <- latency_table %>% @@ -237,30 +239,28 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { filter(role == "raw") %>% pull(variable) } - # get and check the max_time and forecast_date are the right kinds of dates - forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked) - # check that the shift amount isn't too extreme - latency <- max(latency_table$latency) - time_type <- attributes(training)$metadata$time_type - i_latency <- which.max(latency_table$latency) - if ( - (grepl("day", time_type) && (latency >= 10)) || - (grepl("week", time_type) && (latency >= 4)) || - ((time_type == "yearmonth") && (latency >= 2)) || - ((time_type == "yearquarter") && (latency >= 1)) || - ((time_type == "year") && (latency >= 1)) - ) { - cli::cli_warn(paste( - "!" = paste( - "The latency is {latency}, ", - "which is questionable for it's `time_type` of ", - "{time_type}." 
- ), - "i" = "latency: {latency_table$latency[[i_latency]]}", - "i" = "`max_time` = {max_time} -> `forecast_date` = {forecast_date}" - )) - } + # check that the shift amount isn't too extreme + latency_max <- max(abs(latency_table$latency)) + time_type <- attributes(training)$metadata$time_type + i_latency <- which.max(latency_table$latency) + if ( + (grepl("day", time_type) && (latency_max >= 10)) || + (grepl("week", time_type) && (latency_max >= 4)) || + ((time_type == "yearmonth") && (latency_max >= 2)) || + ((time_type == "yearquarter") && (latency_max >= 1)) || + ((time_type == "year") && (latency_max >= 1)) + ) { + cli::cli_warn(paste( + "!" = paste( + "The maximum latency is {latency_max}, ", + "which is questionable for it's `time_type` of ", + "{time_type}." + ), + "i" = "latency: {latency_table$latency[[i_latency]]}", + "i" = "`max_time` = {max_time} -> `forecast_date` = {forecast_date}" + )) + } step_adjust_latency_new( terms = x$terms, @@ -279,20 +279,33 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { ) } -#' @importFrom dplyr %>% pull +#' @importFrom dplyr %>% pull group_by_at +#' @importFrom tidyr fill #' @export bake.step_adjust_latency <- function(object, new_data, ...) 
{ if (!isa(new_data, "epi_df")) { + # TODO if new_data actually has keys other than geo_value and time_value, this is going to cause problems new_data <- new_data %>% as_epi_df(as_of = object$forecast_date) } - attributes(new_data)$metadata$latency_method <- object$method - attributes(new_data)$metadata$shift_sign <- get_sign(object) - attributes(new_data)$metadata$latency_table <- object$latency_table - if ((object$method == "extend_ahead") || (object$method == "extend_lags")) { + if (object$method == "extend_ahead" || object$method == "extend_lags") { + attributes(new_data)$metadata$latency_method <- object$method + attributes(new_data)$metadata$shift_sign <- get_sign(object) + attributes(new_data)$metadata$latency_table <- object$latency_table keys <- object$keys - return( - new_data - ) + return(new_data) + } else if (object$method == "locf") { + # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns + rel_keys <- setdiff(epi_keys(new_data), "time_value") + object$forecast_date + unnamed_columns <- object$columns %>% unname() + new_data %>% + pad_to_end(rel_keys, object$forecast_date) %>% + # group_by_at(rel_keys) %>% + arrange(time_value) %>% + as_tibble() %>% + tidyr::fill(.direction = "down", any_of(unnamed_columns)) %>% + ungroup() %>% + return() } } #' @export diff --git a/R/utils-latency.R b/R/utils-latency.R index a567dae06..f681358d4 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -32,7 +32,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
#' @keywords internal #' @importFrom dplyr select -set_forecast_date <- function(new_data, info, epi_keys_checked) { +set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { original_columns <- info %>% filter(source == "original") %>% pull(variable) @@ -58,7 +58,11 @@ set_forecast_date <- function(new_data, info, epi_keys_checked) { summarise(time_value = max(time_value)) %>% pull(time_value) %>% min() - forecast_date <- attributes(new_data)$metadata$as_of + if (is.null(latency)) { + forecast_date <- attributes(new_data)$metadata$as_of + } else { + forecast_date <- max_time + latency + } # make sure the as_of is sane if (!inherits(forecast_date, class(max_time)) & !inherits(forecast_date, "POSIXt")) { cli::cli_abort(paste( @@ -142,3 +146,46 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new } max_time_value } + + +fill_locf <- function(x, forecast_date) { + cannot_be_used <- x %>% + dplyr::filter(forecast_date - time_value <= n_recent) %>% + dplyr::mutate(fillers = forecast_date - time_value > keep) %>% + dplyr::summarise( + dplyr::across( + -tidyselect::any_of(epi_keys(recipe)), + ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) + ), + .groups = "drop" + ) %>% + dplyr::select(-fillers) %>% + dplyr::summarise(dplyr::across( + -tidyselect::any_of(epi_keys(recipe)), ~ any(.x) + )) %>% + unlist() + x <- tidyr::fill(x, !time_value) +} + +pad_to_end <- function(x, groups, end_date) { + itval <- epiprocess:::guess_period(c(x$time_value, end_date), "time_value") + completed_time_values <- x %>% + dplyr::group_by(dplyr::across(tidyselect::all_of(groups))) %>% + dplyr::summarise( + time_value = rlang::list2( + time_value = seq_null_swap(max(time_value) + itval, end_date, itval) + ) + ) %>% + unnest("time_value") %>% + mutate(time_value = vctrs::vec_cast(time_value, x$time_value)) + + dplyr::bind_rows(x, completed_time_values) %>% + dplyr::arrange(dplyr::across(tidyselect::all_of(c("time_value", groups)))) +} + 
+seq_null_swap <- function(from, to, by) { + if (from > to) { + return(NULL) + } + seq(from = from, to = to, by = by) +} diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 2612d4a3e..99e2f4063 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -15,7 +15,6 @@ arx_args_list( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ... @@ -76,16 +75,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{check_enough_data_n}{Integer. A lower limit for the number of rows per epi_key that are required for training. If \code{NULL}, this check is ignored.} diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index e801dcc4e..9c1832142 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -17,7 +17,6 @@ arx_class_args_list( method = c("rel_change", "linear_reg"), log_scale = FALSE, additional_gr_args = list(), - nafill_buffer = Inf, check_enough_data_n = NULL, check_enough_data_epi_keys = NULL, ... @@ -95,16 +94,6 @@ log scale.} calculation. 
See \code{\link[epiprocess:growth_rate]{epiprocess::growth_rate()}} and the related Vignette for more details.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{check_enough_data_n}{Integer. A lower limit for the number of rows per epi_key that are required for training. If \code{NULL}, this check is ignored.} diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 981c3c7e5..3870134fb 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -14,7 +14,6 @@ cdc_baseline_args_list( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = "geo_value", - nafill_buffer = Inf, ... ) } @@ -65,16 +64,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. 
For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{...}{Space to handle future expansions (unused).} } \value{ diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index 633d45020..d056ed825 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -13,7 +13,6 @@ flatline_args_list( symmetrize = TRUE, nonneg = TRUE, quantile_by_key = character(0L), - nafill_buffer = Inf, ... ) } @@ -57,16 +56,6 @@ before calculating residual quantiles. See the \code{by_key} argument to residual quantiles are used. It is not applicable with \code{trainer = quantile_reg()}, for example.} -\item{nafill_buffer}{At predict time, recent values of the training data -are used to create a forecast. However, these can be \code{NA} due to, e.g., -data latency issues. By default, any missing values will get filled with -less recent data. Setting this value to \code{NULL} will result in 1 extra -recent row (beyond those required for lag creation) to be used. Note that -we require at least \code{min(lags)} rows of recent data per \code{geo_value} to -create a prediction. For this reason, setting \code{nafill_buffer < min(lags)} -will be treated as \emph{additional} allowed recent data rather than the -total amount of recent data to examine.} - \item{...}{Space to handle future expansions (unused).} } \value{ diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index b18685d89..6fef23ba4 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -4,13 +4,7 @@ \alias{get_test_data} \title{Get test data for prediction based on longest lag period} \usage{ -get_test_data( - recipe, - x, - fill_locf = FALSE, - n_recent = NULL, - forecast_date = max(x$time_value) -) +get_test_data(recipe, x) } \arguments{ \item{recipe}{A recipe object.} @@ -18,15 +12,6 @@ get_test_data( \item{x}{An epi_df. 
The typical usage is to pass the same data as that used for fitting the recipe.} -\item{fill_locf}{Logical. Should we use \code{locf} to fill in missing data?} - -\item{n_recent}{Integer or NULL. If filling missing data with \code{locf = TRUE}, -how far back are we willing to tolerate missing data? Larger values allow -more filling. The default \code{NULL} will determine this from the -the \code{recipe}. For example, suppose \code{n_recent = 3}, then if the -3 most recent observations in any \code{geo_value} are all \code{NA}’s, we won’t be -able to fill anything, and an error message will be thrown. (See details.)} - \item{forecast_date}{By default, this is set to the maximum \code{time_value} in \code{x}. But if there is data latency such that recent \code{NA}'s should be filled, this may be \emph{after} the last available \code{time_value}.} diff --git a/man/set_forecast_date.Rd b/man/set_forecast_date.Rd index 58682bd2b..29dd98d33 100644 --- a/man/set_forecast_date.Rd +++ b/man/set_forecast_date.Rd @@ -4,7 +4,7 @@ \alias{set_forecast_date} \title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.} \usage{ -set_forecast_date(new_data, info, epi_keys_checked) +set_forecast_date(new_data, info, epi_keys_checked, latency) } \description{ Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index fc4c55878..1006a76dc 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -123,8 +123,8 @@ jhu <- case_death_rate_subset \%>\% attributes(jhu)$metadata$as_of <- max(jhu$time_value) + 3 r <- epi_recipe(case_death_rate_subset) \%>\% - step_epi_ahead(death_rate, ahead = 7) \%>\% step_adjust_latency(method = "extend_ahead") \%>\% + step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) r diff --git a/tests/testthat/test-epi_workflow.R b/tests/testthat/test-epi_workflow.R index 8bb58b0bc..ecd955cc5 100644 --- a/tests/testthat/test-epi_workflow.R +++ b/tests/testthat/test-epi_workflow.R @@ -79,17 +79,11 @@ test_that("forecast method works", { )) ) - args <- list( - fill_locf = TRUE, - n_recent = 360 * 3, - forecast_date = as.Date("2024-01-01") - ) expect_equal( - forecast(wf, !!!args), + forecast(wf), predict(wf, new_data = get_test_data( hardhat::extract_preprocessor(wf), - jhu, - !!!args + jhu )) ) }) diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R index aa799150b..3fabdea2e 100644 --- a/tests/testthat/test-get_test_data.R +++ b/tests/testthat/test-get_test_data.R @@ -44,6 +44,7 @@ test_that("expect error that geo_value or time_value does not exist", { test_that("NA fill behaves as desired", { + testthat::skip() df <- tibble::tibble( geo_value = rep(c("ca", "ny"), each = 10), time_value = rep(1:10, times = 2), @@ -81,6 +82,7 @@ test_that("NA fill behaves as desired", { }) test_that("forecast date behaves", { + testthat::skip() df <- tibble::tibble( geo_value = rep(c("ca", "ny"), each = 10), time_value = rep(1:10, times = 2), diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index fd1bb9675..0113f2399 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -316,6 +316,33 @@ test_that("`step_adjust_latency` 
only uses the columns specified in the `...`", fit5 <- slm_fit(r5, data = real_x) expect_equal(names(fit5$fit$fit$fit$coefficients), c("(Intercept)", "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", "lag_6_case_rate", "lag_10_case_rate")) + + r51 <- epi_recipe(x) %>% + step_adjust_latency(case_rate, method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + baked_x <- r51 %>% prep(real_x) %>% bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) %>% + mutate(locf_date = last_date - latency) + # iterate over all columns and make sure the latent time period has the exact same values + for (ii in seq(nrow(last_dates))) { + baked_var <- baked_x %>% + filter(last_dates[[ii,"locf_date"]] <= time_value, time_value <=last_dates[[ii,"last_date"]]) %>% + pull(last_dates[[ii,"name"]]) %>% + var + if (grepl("case_rate", last_dates[[ii, "name"]])) { + expect_equal(baked_var, 0) + } else { + expect_true(baked_var > 0) + } + } }) test_that("setting fixed_* works for `step_adjust_latency`", {}) @@ -334,4 +361,82 @@ test_that("printing step_adjust_latency results in expected output", { expect_snapshot(r) }) +test_that("data with epi_df shorn off works", {}) test_that("lags of transforms (of transforms etc) work", {}) +test_that("locf works as intended", { + expect_warning(epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_adjust_latency(method = "locf")) + + r6 <- epi_recipe(x) %>% + step_adjust_latency(method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + # directly checking the shifts + baked_x <- r6 %>% prep(real_x) %>% bake(real_x) + # 
map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) %>% + mutate(locf_date = last_date - latency) + # iterate over all columns and make sure the latent time period has the exact same values + for (ii in seq(nrow(last_dates))) { + baked_x %>% + filter(last_dates[[ii,"locf_date"]] <= time_value, time_value <=last_dates[[ii,"last_date"]]) %>% + pull(last_dates[[ii,"name"]]) %>% + var %>% + expect_equal(0) + } + + # the as_of on x is today's date, which is >970 days in the future + # also, there's no data >970 days in the past, so it gets an error trying to + # fit on no data + expect_warning(fit6 <- slm_fit(r6, data = x), regexp = "The maximum latency is 1033") + + # now trying with the as_of a reasonable distance in the future + fit6 <- slm_fit(r6, data = real_x) + expect_equal( + names(fit6$pre$mold$predictors), + c( + "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r6, real_x) + pred <- predict(fit6, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(max(point_pred$time_value), as.Date(testing_as_of)) + + expect_equal( + names(fit6$pre$mold$outcomes), + glue::glue("ahead_{ahead}_death_rate") + ) + latest <- get_test_data(r6, x) + pred1 <- predict(fit6, latest) + actual_solutions <- pred1 %>% filter(!is.na(.pred)) + expect_equal(max(actual_solutions$time_value), testing_as_of) + + # should have four predictors, including the intercept + expect_equal(length(fit6$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + # + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + locf_x <- real_x %>% 
rbind(tibble(geo_value = rep("place", latency), + time_value = max_time + 1:latency, + case_rate = rep(real_x$case_rate[nrow(x)], latency), + death_rate = rep(real_x$death_rate[nrow(x)], latency))) + fit_hand_adj <- slm_fit(hand_adjusted, data = locf_x) + expect_equal( + fit6$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) From 01dc1483669fd7d4db1c12407f2a29b30687f06e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 8 Jul 2024 13:04:09 -0500 Subject: [PATCH 55/92] hotfix from Dan --- R/utils-latency.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/utils-latency.R b/R/utils-latency.R index f681358d4..9bcb7c7a2 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -121,11 +121,11 @@ get_latency <- function(new_data, forecast_date, column, sign_shift, epi_keys_ch #' a potentially different max_time_value #' @keywords internal get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) { - max_time_value <- max( + max_time_value <- as.Date(max( workflow_max_time_value, this_recipe$max_time_value, max(new_data$time_value) - ) + )) if (this_recipe %>% recipes::detect_step("adjust_latency")) { # get the as_of in an `adjust_latency` step, regardless of where handpicked_forecast_date <- map( From a5a84a7e4f9f067e1311ef9cf5c00d5229c4da0f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 30 Jul 2024 17:37:11 -0500 Subject: [PATCH 56/92] rebase fixes, error classes, unskip latency tests --- NAMESPACE | 1 + R/canned-epipred.R | 3 +- R/cdc_baseline_forecaster.R | 3 +- R/epi_shift.R | 4 +- R/get_test_data.R | 19 +-- R/step_adjust_latency.R | 26 ++-- R/utils-latency.R | 52 +++++--- man/get_test_data.Rd | 10 -- man/step_epi_shift.Rd | 7 ++ tests/testthat/test-snapshots.R | 2 +- tests/testthat/test-step_adjust_latency.R | 147 ++++++++++++++++------ tests/testthat/test-utils_latency.R | 34 ++--- 12 files changed, 194 insertions(+), 114 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 
899d22a1e..c6ac51064 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -288,6 +288,7 @@ importFrom(rlang,enquos) importFrom(rlang,expr) importFrom(rlang,global_env) importFrom(rlang,inject) +importFrom(rlang,is_empty) importFrom(rlang,is_logical) importFrom(rlang,is_null) importFrom(rlang,is_true) diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 4f95f3d22..66c66a339 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -133,7 +133,8 @@ print.canned_epipred <- function(x, name, ...) { purrr::map("columns") %>% reduce(c) latency_per_base_col <- latency_step$latency_table %>% - filter(col_name %in% valid_columns) %>% mutate(latency = abs(latency)) + filter(col_name %in% valid_columns) %>% + mutate(latency = abs(latency)) if (latency_step$method != "locf" && nrow(latency_per_base_col) > 1) { intro_text <- glue::glue("{type_str} adjusted per column: ") } else if (latency_step$method != "locf") { diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index f3ba48794..85635cfae 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -79,7 +79,8 @@ cdc_baseline_forecaster <- function( latest <- get_test_data( - epi_recipe(epi_data), epi_data) + epi_recipe(epi_data), epi_data + ) f <- frosting() %>% layer_predict() %>% diff --git a/R/epi_shift.R b/R/epi_shift.R index 43876a53a..0bd3da30c 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -46,12 +46,12 @@ add_shifted_columns <- function(new_data, object, amount) { shift_sign_lat <- attributes(new_data)$metadata$shift_sign if (!is.null(latency_table) && shift_sign_lat == sign_shift) { - #TODO this doesn't work on lags of transforms + # TODO this doesn't work on lags of transforms rel_latency <- latency_table %>% filter(col_name %in% object$columns) } else { rel_latency <- tibble(col_name = object$columns, latency = 0L) } - grid <- expand_grid(col = object$columns, amount = sign_shift *amount) %>% + grid <- expand_grid(col = object$columns, amount = sign_shift * amount) %>% 
left_join(rel_latency, by = join_by(col == col_name), ) %>% tidyr::replace_na(list(latency = 0)) %>% mutate( diff --git a/R/get_test_data.R b/R/get_test_data.R index ebd6e1b7b..f1d83aad0 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -11,18 +11,9 @@ #' used if growth rate calculations are requested by the recipe. This is #' calculated internally. #' -#' It also optionally fills missing values -#' using the last-observation-carried-forward (LOCF) method. If this -#' is not possible (say because there would be only `NA`'s in some location), -#' it will produce an error suggesting alternative options to handle missing -#' values with more advanced techniques. -#' #' @param recipe A recipe object. #' @param x An epi_df. The typical usage is to #' pass the same data as that used for fitting the recipe. -#' @param forecast_date By default, this is set to the maximum -#' `time_value` in `x`. But if there is data latency such that recent `NA`'s -#' should be filled, this may be _after_ the last available `time_value`. #' #' @return An object of the same type as `x` with columns `geo_value`, `time_value`, any additional #' keys, as well other variables in the original dataset. @@ -35,9 +26,8 @@ #' get_test_data(recipe = rec, x = case_death_rate_subset) #' @importFrom rlang %@% #' @export -get_test_data <- function( - recipe, - x) { + +get_test_data <- function(recipe, x) { if (!is_epi_df(x)) cli::cli_abort("`x` must be an `epi_df`.") check <- hardhat::check_column_names(x, colnames(recipe$template)) @@ -63,7 +53,10 @@ get_test_data <- function( "!" = "but `x` contains only {avail_recent}." 
)) } - max_time_value <- x %>% na.omit %>% pull(time_value) %>% max + max_time_value <- x %>% + na.omit() %>% + pull(time_value) %>% + max() x <- arrange(x, time_value) groups <- epi_keys_only(recipe) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index b963ce257..11e1d84fd 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -86,7 +86,7 @@ #' jhu_fit #' #' @importFrom recipes detect_step -#' @importFrom rlang enquos +#' @importFrom rlang enquos is_empty step_adjust_latency <- function(recipe, ..., @@ -106,39 +106,43 @@ step_adjust_latency <- id = recipes::rand_id("adjust_latency")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { - cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") + cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", class = "epipredict__step_adjust_latency__epi_recipe_only") } if (!is.null(columns)) { cli::cli_abort(c("The `columns` argument must be `NULL`.", i = "Use `tidyselect` methods to choose columns to lag." - )) + ), class = "epipredict__step_adjust_latency__cols_not_null") } if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { cli::cli_warn( - "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified." + "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified.", + class = "epipredict__step_adjust_latency__misordered_step_warning" ) } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) { cli::cli_warn( "If `method` is {.val extend_lags} or {.val locf}, -then the previous `step_epi_lag`s won't work with modified data." +then the previous `step_epi_lag`s won't work with modified data.", + class = "epipredict__step_adjust_latency__misordered_step_warning" ) } else if ((method == "locf") && (length(recipe$steps) > 0)) { - cli::cli_warn("There are steps before `step_adjust_latency`. 
With the method {.val locf}, it is recommended to include this step before any others") + cli::cli_warn("There are steps before `step_adjust_latency`. With the method {.val locf}, it is recommended to include this step before any others", + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) } if (detect_step(recipe, "naomit")) { cli::cli_abort("adjust_latency needs to occur before any `NA` removal, - as columns may be moved around") + as columns may be moved around", class = "epipredict__step_adjust_latency__post_NA_error") } if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { cli::cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` - can be non-`NULL` at a time!") + can be non-`NULL` at a time!", class = "epipredict__step_adjust_latency__too_many_args_error") } if (length(fixed_latency > 1)) { template <- recipe$template data_names <- names(template)[!names(template) %in% epi_keys(template)] wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] if (length(wrong_names) > 0) { - cli::cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}") + cli::cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}", class = "epipredict__step_adjust_latency__undefined_names_error") } } @@ -258,8 +262,8 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { "{time_type}." 
), "i" = "latency: {latency_table$latency[[i_latency]]}", - "i" = "`max_time` = {max_time} -> `forecast_date` = {forecast_date}" - )) + "i" = "`max_time` = {max(training$time_value)} -> `forecast_date` = {forecast_date}" + ), class = "epipredict__prep.step_latency__very_large_latency") } step_adjust_latency_new( diff --git a/R/utils-latency.R b/R/utils-latency.R index 9bcb7c7a2..8356ae1e2 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -38,11 +38,14 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { pull(variable) # make sure that there's enough column names if (length(original_columns) < 3) { - cli::cli_abort(glue::glue( - "The original columns of `time_value`, ", - "`geo_value` and at least one signal. The current colums are \n", - paste(capture.output(object$info), collapse = "\n\n") - )) + cli::cli_abort( + glue::glue( + "The original columns of `time_value`, ", + "`geo_value` and at least one signal. The current colums are \n", + paste(capture.output(object$info), collapse = "\n\n") + ), + class = "epipredict__set_forecast_date__too_few_data_columns" + ) } # the source data determines the actual time_values # these are the non-na time_values; @@ -65,25 +68,34 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { } # make sure the as_of is sane if (!inherits(forecast_date, class(max_time)) & !inherits(forecast_date, "POSIXt")) { - cli::cli_abort(paste( - "the data matrix `forecast_date` value is {forecast_date}, ", - "and not a valid `time_type` with type ", - "matching `time_value`'s type of ", - "{class(max_time)}." - )) + cli::cli_abort( + paste( + "the data matrix `forecast_date` value is {forecast_date}, ", + "and not a valid `time_type` with type ", + "matching `time_value`'s type of ", + "{class(max_time)}." 
+ ), + class = "epipredict__set_forecast_date__wrong_time_value_type_error" + ) } if (is.null(forecast_date) || is.na(forecast_date)) { - cli::cli_warn(paste( - "epi_data's `forecast_date` was {forecast_date}, setting to ", - "the latest time value, {max_time}." - )) + cli::cli_warn( + paste( + "epi_data's `forecast_date` was {forecast_date}, setting to ", + "the latest time value, {max_time}." + ), + class = "epipredict__set_forecast_date__max_time_warning" + ) forecast_date <- max_time } else if (forecast_date < max_time) { - cli::cli_abort(paste( - "`forecast_date` ({(forecast_date)}) is before the most ", - "recent data ({max_time}). Remove before ", - "predicting." - )) + cli::cli_abort( + paste( + "`forecast_date` ({(forecast_date)}) is before the most ", + "recent data ({max_time}). Remove before ", + "predicting." + ), + class = "epipredict__set_forecast_date__misordered_forecast_date_error" + ) } # TODO cover the rest of the possible types for as_of and max_time... if (inherits(max_time, "Date")) { diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index 6fef23ba4..81649452a 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -11,10 +11,6 @@ get_test_data(recipe, x) \item{x}{An epi_df. The typical usage is to pass the same data as that used for fitting the recipe.} - -\item{forecast_date}{By default, this is set to the maximum -\code{time_value} in \code{x}. But if there is data latency such that recent \code{NA}'s -should be filled, this may be \emph{after} the last available \code{time_value}.} } \value{ An object of the same type as \code{x} with columns \code{geo_value}, \code{time_value}, any additional @@ -32,12 +28,6 @@ The minimum required (recent) data to produce a forecast is equal to the maximum lag requested (on any predictor) plus the longest horizon used if growth rate calculations are requested by the recipe. This is calculated internally. 
- -It also optionally fills missing values -using the last-observation-carried-forward (LOCF) method. If this -is not possible (say because there would be only \code{NA}'s in some location), -it will produce an error suggesting alternative options to handle missing -values with more advanced techniques. } \examples{ # create recipe diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index f0f7f2a2f..ba8262c71 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -55,6 +55,13 @@ Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{id}{A unique identifier for the step} + +\item{latency_adjustment}{a character. Determines the method by which the forecast handles data that doesn't extend to the day the forecast is made. The options are: +\itemize{ +\item \code{"extend_ahead"}: actually forecasts from the last date. E.g. if there are 3 days of latency for a 4 day ahead forecast, the ahead used in practice is actually 7. +\item \code{"locf"}: carries forward the last observed value up to the forecast date. 
+\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is +}} } \value{ An updated version of \code{recipe} with the new step added to the diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index bdb0457de..7f1d46006 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -98,7 +98,7 @@ test_that("arx_forecaster snapshots", { test_that("arx_forecaster output format snapshots", { jhu <- case_death_rate_subset %>% - dplyr::filter(time_value >= as.Date("2021-12-01")) + dplyr::filter(time_value >= as.Date("2021-12-01")) out1 <- arx_forecaster( jhu, "death_rate", c("case_rate", "death_rate") diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 0113f2399..1f8a2889f 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -1,4 +1,9 @@ library(dplyr) +# Test ideas that were dropped: +# - "epi_adjust_latency works correctly when there's gaps in the timeseries" +# - "epi_adjust_latency extend_ahead uses the same adjustment when predicting on new data after being baked" +# - "`step_adjust_latency` only allows one instance of itself" +# - "data with epi_df shorn off works" x <- tibble( geo_value = rep("place", 200), @@ -46,30 +51,34 @@ test_that("epi_adjust_latency correctly extends the lags", { step_epi_ahead(death_rate, ahead = ahead) # directly checking the shifts - baked_x <- r1 %>% prep(real_x) %>% bake(real_x) + baked_x <- r1 %>% + prep(real_x) %>% + bake(real_x) # map each column to its last non-NA value last_dates <- baked_x %>% tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% group_by(name) %>% summarise(last_date = max(time_value)) %>% arrange(desc(last_date)) - expect_equal(last_dates, - tribble( - ~name, ~last_date, - "lag_16_death_rate", max_time + 16, - "lag_11_death_rate", max_time + 11, - "lag_10_case_rate", 
max_time + 10, - "lag_6_case_rate", max_time + 6, - "lag_5_death_rate", max_time + 5, - "case_rate", max_time, - "death_rate", max_time, - "ahead_7_death_rate", max_time - 7, - )) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_16_death_rate", max_time + 16, + "lag_11_death_rate", max_time + 11, + "lag_10_case_rate", max_time + 10, + "lag_6_case_rate", max_time + 6, + "lag_5_death_rate", max_time + 5, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit1 <- slm_fit(r1, data = x), regexp = "The latency is 1033"), class = "simpleError") + expect_error(expect_warning(fit1 <- slm_fit(r1, data = x), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit1 <- slm_fit(r1, data = real_x) @@ -120,7 +129,7 @@ test_that("epi_adjust_latency correctly extends the ahead", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r2))) + expect_error(expect_warning(fit5 <- slm_fit(r2), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError") # real date example fit2 <- slm_fit(r2, data = real_x) expect_equal( @@ -172,7 +181,7 @@ test_that("epi_adjust_latency extends multiple aheads", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(fit3 <- fit(epi_wf, data = x)) + expect_error(expect_warning(fit3 <- fit(epi_wf, data = x), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError") # real date example 
fit3 <- fit(epi_wf, data = real_x) expect_equal( @@ -205,7 +214,6 @@ test_that("epi_adjust_latency extends multiple aheads", { step_epi_ahead(death_rate, ahead = ahead + latency) equiv_fit <- fit(epi_wf, data = real_x) # adjusting the ahead should do the same thing as directly adjusting the ahead - equiv_fit expect_equal( fit3$fit$fit$fit$rqfit, equiv_fit$fit$fit$fit$rqfit @@ -215,13 +223,71 @@ test_that("epi_adjust_latency extends multiple aheads", { expect_equal(length(fit3$fit$fit$fit$rqfit$coefficients), 6) }) -test_that("epi_adjust_latency fixed_* work", {}) +test_that("epi_adjust_latency fixed_forecast_date works", { + r4 <- epi_recipe(x) %>% + step_adjust_latency(method = "extend_lags", fixed_forecast_date = max_time + 14) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + expect_warning(baked_x <- r4 %>% prep(real_x) %>% bake(real_x), class = "epipredict__prep.step_latency__very_large_latency") + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_25_death_rate", max_time + 25, + "lag_20_death_rate", max_time + 20, + "lag_19_case_rate", max_time + 19, + "lag_15_case_rate", max_time + 15, + "lag_14_death_rate", max_time + 14, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) +}) + +test_that("epi_adjust_latency fixed_latency works", { + r4.1 <- epi_recipe(x) %>% + step_adjust_latency(method = "extend_lags", fixed_latency = 2) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + baked_x <- r4.1 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + 
last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_13_death_rate", max_time + 13, + "lag_8_death_rate", max_time + 8, + "lag_7_case_rate", max_time + 7, + "lag_3_case_rate", max_time + 3, + "lag_2_death_rate", max_time + 2, + "case_rate", max_time, + "death_rate", max_time, + "ahead_7_death_rate", max_time - 7, + ) + ) +}) + + # todo test variants on the columns for which this is applied # todo need to have both on columns 1, and 2 -test_that("epi_adjust_latency works correctly when there's gaps in the timeseries", {}) -test_that("epi_adjust_latency extend_ahead uses the same adjustment when predicting on new data after being baked", {}) test_that("epi_adjust_latency works for other time types", {}) @@ -263,7 +329,7 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5), regexp = "The latency is 1033"), class = "simpleError") + expect_error(expect_warning(fit5 <- slm_fit(r5, data = x), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError") # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = x_lagged) @@ -305,12 +371,11 @@ test_that("epi_adjust_latency correctly extends the lags", { ) }) -test_that("`step_adjust_latency` only allows one instance of itself", {}) test_that("`step_adjust_latency` only uses the columns specified in the `...`", { r5 <- epi_recipe(x) %>% step_adjust_latency(case_rate, method = "extend_lags") %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% 
step_epi_ahead(death_rate, ahead = ahead) @@ -319,11 +384,13 @@ test_that("`step_adjust_latency` only uses the columns specified in the `...`", r51 <- epi_recipe(x) %>% step_adjust_latency(case_rate, method = "locf") %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) - baked_x <- r51 %>% prep(real_x) %>% bake(real_x) + baked_x <- r51 %>% + prep(real_x) %>% + bake(real_x) # map each column to its last non-NA value last_dates <- baked_x %>% tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% @@ -334,9 +401,9 @@ test_that("`step_adjust_latency` only uses the columns specified in the `...`", # iterate over all columns and make sure the latent time period has the exact same values for (ii in seq(nrow(last_dates))) { baked_var <- baked_x %>% - filter(last_dates[[ii,"locf_date"]] <= time_value, time_value <=last_dates[[ii,"last_date"]]) %>% - pull(last_dates[[ii,"name"]]) %>% - var + filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% + pull(last_dates[[ii, "name"]]) %>% + var() if (grepl("case_rate", last_dates[[ii, "name"]])) { expect_equal(baked_var, 0) } else { @@ -345,8 +412,6 @@ test_that("`step_adjust_latency` only uses the columns specified in the `...`", } }) -test_that("setting fixed_* works for `step_adjust_latency`", {}) - test_that("printing step_adjust_latency results in expected output", { r5 <- epi_recipe(x) %>% step_adjust_latency(case_rate, method = "extend_lags") %>% @@ -361,8 +426,6 @@ test_that("printing step_adjust_latency results in expected output", { expect_snapshot(r) }) -test_that("data with epi_df shorn off works", {}) -test_that("lags of transforms (of transforms etc) work", {}) test_that("locf works as intended", { expect_warning(epi_recipe(x) %>% step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% @@ -375,7 +438,9 @@ test_that("locf works as 
intended", { step_epi_ahead(death_rate, ahead = ahead) # directly checking the shifts - baked_x <- r6 %>% prep(real_x) %>% bake(real_x) + baked_x <- r6 %>% + prep(real_x) %>% + bake(real_x) # map each column to its last non-NA value last_dates <- baked_x %>% tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% @@ -386,9 +451,9 @@ test_that("locf works as intended", { # iterate over all columns and make sure the latent time period has the exact same values for (ii in seq(nrow(last_dates))) { baked_x %>% - filter(last_dates[[ii,"locf_date"]] <= time_value, time_value <=last_dates[[ii,"last_date"]]) %>% - pull(last_dates[[ii,"name"]]) %>% - var %>% + filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% + pull(last_dates[[ii, "name"]]) %>% + var() %>% expect_equal(0) } @@ -430,10 +495,12 @@ test_that("locf works as intended", { step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) - locf_x <- real_x %>% rbind(tibble(geo_value = rep("place", latency), - time_value = max_time + 1:latency, - case_rate = rep(real_x$case_rate[nrow(x)], latency), - death_rate = rep(real_x$death_rate[nrow(x)], latency))) + locf_x <- real_x %>% rbind(tibble( + geo_value = rep("place", latency), + time_value = max_time + 1:latency, + case_rate = rep(real_x$case_rate[nrow(x)], latency), + death_rate = rep(real_x$death_rate[nrow(x)], latency) + )) fit_hand_adj <- slm_fit(hand_adjusted, data = locf_x) expect_equal( fit6$fit$fit$fit$coefficients, diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 5a5979e2a..09f77259f 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -36,15 +36,24 @@ modified_data <- arrange(time_value) test_that("get_latency works", { -expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 5) -expect_equal(get_latency(modified_data, 
as_of, "case_rate", -1, "geo_value"), -5) -expect_equal(get_latency(modified_data, as_of, "death_rate", 1, "geo_value"), 4) -expect_equal(get_latency(modified_data, as_of, "case_rate_a", 1, "geo_value"), 5 + 4) -expect_equal(get_latency(modified_data, as_of, "case_rate_b", 1, "geo_value"), 5 - 3) -expect_equal(get_latency(modified_data, as_of, "death_rate_a", 1, "geo_value"), 4 - 7) + expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 5) + expect_equal(get_latency(modified_data, as_of, "case_rate", -1, "geo_value"), -5) + expect_equal(get_latency(modified_data, as_of, "death_rate", 1, "geo_value"), 4) + expect_equal(get_latency(modified_data, as_of, "case_rate_a", 1, "geo_value"), 5 + 4) + expect_equal(get_latency(modified_data, as_of, "case_rate_b", 1, "geo_value"), 5 - 3) + expect_equal(get_latency(modified_data, as_of, "death_rate_a", 1, "geo_value"), 4 - 7) +}) + +test_that("get_latency infers max_time to be the minimum `max time` across grouping the specified keys", { + # place 2 is already 1 day less latent than place 1, so decreasing it's + # latency it should have no effect + place2_delayed_data <- modified_data %>% mutate(time_value = time_value + 3 * (geo_value == "place2")) + expect_equal(get_latency(place2_delayed_data, as_of, "case_rate", 1, "geo_value"), 5) + # decreaseing the latency of place1 more than 1 pushes it past place2, so at most changes the latency by 1 + place1_delayed_data <- modified_data %>% mutate(time_value = time_value + 5 * (geo_value == "place1")) + expect_equal(get_latency(place1_delayed_data, as_of, "case_rate", 1, "geo_value"), 4) }) -test_that("get_latency infers max_time to be the minimum `max time` across the epi_keys", {}) test_that("set_forecast_date works", { info <- tribble( @@ -55,16 +64,11 @@ test_that("set_forecast_date works", { "death_rate", "numeric", "raw", "original", "not_real", "numeric", "predictor", "derived" ) - expect_equal(set_forecast_date(modified_data, info, "geo_value"), as_of) 
- expect_equal(set_forecast_date(modified_data, info, ""), as_of) - expect_equal(set_forecast_date(modified_data, info, NULL), as_of) + expect_equal(set_forecast_date(modified_data, info, "geo_value", NULL), as_of) + expect_equal(set_forecast_date(modified_data, info, "", NULL), as_of) + expect_equal(set_forecast_date(modified_data, info, NULL, NULL), as_of) }) - - - - - time_range <- as.Date("2021-01-01") + 0:199 x_adjust_ahead <- tibble( geo_value = rep("place", 200), From bfde279c9c6a8f4c0596cf300ac1a976aadc4228 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 4 Sep 2024 12:29:11 -0500 Subject: [PATCH 57/92] rebase fixes round 2 --- NAMESPACE | 1 + R/step_adjust_latency.R | 8 ++++---- R/step_epi_shift.R | 23 ++++------------------- R/utils-latency.R | 5 +++-- man/step_epi_shift.Rd | 1 - tests/testthat/_snaps/snapshots.md | 30 ++++++++++++++---------------- 6 files changed, 26 insertions(+), 42 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c6ac51064..6ec287dc4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -307,6 +307,7 @@ importFrom(stats,residuals) importFrom(tibble,as_tibble) importFrom(tibble,tibble) importFrom(tidyr,crossing) +importFrom(tidyr,drop_na) importFrom(tidyr,expand_grid) importFrom(tidyr,fill) importFrom(tidyr,unnest) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 11e1d84fd..bb03f0237 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -139,7 +139,7 @@ then the previous `step_epi_lag`s won't work with modified data.", } if (length(fixed_latency > 1)) { template <- recipe$template - data_names <- names(template)[!names(template) %in% epi_keys(template)] + data_names <- names(template)[!names(template) %in% key_colnames(template)] wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] if (length(wrong_names) > 0) { cli::cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}", class = "epipredict__step_adjust_latency__undefined_names_error") @@ -173,7 
+173,7 @@ then the previous `step_epi_lag`s won't work with modified data.", latency = fixed_latency, latency_table = NULL, default = default, - keys = epi_keys(recipe), + keys = key_colnames(recipe), columns = columns, skip = skip, id = id @@ -213,7 +213,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { latency <- x$latency forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked, latency) # construct the latency table - latency_table <- names(training)[!names(training) %in% epi_keys(training)] %>% + latency_table <- names(training)[!names(training) %in% key_colnames(training)] %>% tibble(col_name = .) if (length(recipes_eval_select(x$terms, training, info)) > 0) { latency_table <- latency_table %>% filter(col_name %in% @@ -299,7 +299,7 @@ bake.step_adjust_latency <- function(object, new_data, ...) { return(new_data) } else if (object$method == "locf") { # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns - rel_keys <- setdiff(epi_keys(new_data), "time_value") + rel_keys <- setdiff(key_colnames(new_data), "time_value") object$forecast_date unnamed_columns <- object$columns %>% unname() new_data %>% diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index bbb0fc93d..c1b0f51bc 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -19,10 +19,6 @@ #' be the lag or lead for each value in the vector. Lag integers must be #' nonnegative, while ahead integers must be positive. #' @param prefix A character string that will be prefixed to the new column. -#' @param latency_adjustment a character. Determines the method by which the forecast handles data that doesn't extend to the day the forecast is made. The options are: -#' - `"extend_ahead"`: actually forecasts from the last date. E.g. if there are 3 days of latency for a 4 day ahead forecast, the ahead used in practice is actually 7. 
-#' - `"locf"`: carries forward the last observed value up to the forecast date. -#' - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that the shortest lag at predict time is #' @param default Determines what fills empty rows #' left by leading/lagging (defaults to NA). #' @param skip A logical. Should the step be skipped when the @@ -71,12 +67,6 @@ step_epi_lag <- } arg_is_nonneg_int(lag) arg_is_chr_scalar(prefix, id) - if (!is.null(columns)) { - cli::cli_abort(c( - "The `columns` argument must be `NULL`.", - i = "Use `tidyselect` methods to choose columns to lag." - )) - } recipes::add_step( recipe, step_epi_lag_new( @@ -87,8 +77,7 @@ step_epi_lag <- prefix = prefix, default = default, keys = key_colnames(recipe), - columns = columns, - latency_adjustment = latency_adjustment, + columns = NULL, skip = skip, id = id ) @@ -107,7 +96,6 @@ step_epi_ahead <- role = "outcome", prefix = "ahead_", default = NA, - columns = NULL, skip = FALSE, id = rand_id("epi_ahead")) { if (!is_epi_recipe(recipe)) { @@ -121,7 +109,7 @@ step_epi_ahead <- )) } arg_is_nonneg_int(ahead) - arg_is_chr_scalar(prefix, id, latency_adjustment) + arg_is_chr_scalar(prefix, id) recipes::add_step( recipe, step_epi_ahead_new( @@ -132,8 +120,7 @@ step_epi_ahead <- prefix = prefix, default = default, keys = key_colnames(recipe), - latency_adjustment = latency_adjustment, - columns = columns, + columns = NULL, skip = skip, id = id ) @@ -143,7 +130,7 @@ step_epi_ahead <- step_epi_lag_new <- function(terms, role, trained, lag, prefix, default, keys, - latency_adjustment, columns, skip, id) { + columns, skip, id) { recipes::step( subclass = "epi_lag", terms = terms, @@ -189,7 +176,6 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) 
{ prefix = x$prefix, default = x$default, keys = x$keys, - latency_adjustment = x$latency_adjustment, columns = recipes::recipes_eval_select(x$terms, training, info), skip = x$skip, id = x$id @@ -206,7 +192,6 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { prefix = x$prefix, default = x$default, keys = x$keys, - latency_adjustment = x$latency_adjustment, columns = recipes::recipes_eval_select(x$terms, training, info), skip = x$skip, id = x$id diff --git a/R/utils-latency.R b/R/utils-latency.R index 8356ae1e2..5498e3ede 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -32,6 +32,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. #' @keywords internal #' @importFrom dplyr select +#' @importFrom tidyr drop_na set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { original_columns <- info %>% filter(source == "original") %>% @@ -166,14 +167,14 @@ fill_locf <- function(x, forecast_date) { dplyr::mutate(fillers = forecast_date - time_value > keep) %>% dplyr::summarise( dplyr::across( - -tidyselect::any_of(epi_keys(recipe)), + -tidyselect::any_of(key_colnames(recipe)), ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) ), .groups = "drop" ) %>% dplyr::select(-fillers) %>% dplyr::summarise(dplyr::across( - -tidyselect::any_of(epi_keys(recipe)), ~ any(.x) + -tidyselect::any_of(key_colnames(recipe)), ~ any(.x) )) %>% unlist() x <- tidyr::fill(x, !time_value) diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index ba8262c71..e33cf7e0e 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -23,7 +23,6 @@ step_epi_ahead( role = "outcome", prefix = "ahead_", default = NA, - columns = NULL, skip = FALSE, id = rand_id("epi_ahead") ) diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index fb11026dd..6c439dbd8 100644 --- 
a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1036,26 +1036,24 @@ structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( - structure(list(values = c("5%" = 0.136509784083987, "95%" = 0.469979623951498 + structure(list(values = c(0.136509784083987, 0.469979623951498 ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c("5%" = 0.364597933377326, "95%" = 0.698067773244837 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c("5%" = 0.422093024752224, "95%" = 0.755562864619735 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c("5%" = 0.821955329282474, "95%" = 1.15542516914998 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + values = c(0.364597933377326, 0.698067773244837), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.422093024752224, + 0.755562864619735), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c("5%" = 0.628067077067883, "95%" = 0.961536916935394 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + values = c(0.821955329282474, 1.15542516914998), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.628067077067883, + 0.961536916935394), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( - values = c("5%" = 0.140160537291566, 
"95%" = 0.473630377159077 - ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", - "dist_default", "vctrs_rcrd", "vctrs_vctr"))), class = c("distribution", - "vctrs_vctr", "list")), forecast_date = structure(c(18997, 18997, - 18997, 18997, 18997, 18997), class = "Date"), target_date = structure(c(18998, + values = c(0.140160537291566, 0.473630377159077), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", + "list")), forecast_date = structure(c(18997, 18997, 18997, 18997, + 18997, 18997), class = "Date"), target_date = structure(c(18998, 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) From 8119e72172b44d81b688410b3d45bc74a4200790 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 4 Sep 2024 17:48:27 -0500 Subject: [PATCH 58/92] NEWS+Description, partial locf tests, docs --- DEVELOPMENT.md | 1 + NAMESPACE | 3 ++ NEWS.md | 6 +-- R/arx_forecaster.R | 10 ++--- R/epi_shift.R | 3 +- R/step_adjust_latency.R | 21 +++++---- R/utils-latency.R | 55 +++++++++++++++++++---- man/arx_args_list.Rd | 10 ++--- man/arx_class_args_list.Rd | 10 ++--- man/epi_shift_single.Rd | 24 ++++++++++ man/get_grouping_columns.Rd | 11 +++++ man/pad_to_end.Rd | 22 +++++++++ man/seq_null_swap.Rd | 12 +++++ man/step_adjust_latency.Rd | 15 ++++--- tests/testthat/test-utils_latency.R | 70 +++++++++++++++++++++++++---- 15 files changed, 218 insertions(+), 55 deletions(-) create mode 100644 man/epi_shift_single.Rd create mode 100644 man/get_grouping_columns.Rd create mode 100644 man/pad_to_end.Rd create mode 100644 man/seq_null_swap.Rd diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 67f1b3003..0335f124b 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -33,6 +33,7 @@ You can also build the docs manually and launch the site with python. 
From the t ```bash R -e 'devtools::document()' +R -e 'pkgdown::build_site()' python -m http.server -d docs ``` diff --git a/NAMESPACE b/NAMESPACE index 6ec287dc4..a93bb0c19 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -292,6 +292,8 @@ importFrom(rlang,is_empty) importFrom(rlang,is_logical) importFrom(rlang,is_null) importFrom(rlang,is_true) +importFrom(rlang,is_vector) +importFrom(rlang,list2) importFrom(rlang,set_names) importFrom(rlang,sym) importFrom(stats,as.formula) @@ -311,6 +313,7 @@ importFrom(tidyr,drop_na) importFrom(tidyr,expand_grid) importFrom(tidyr,fill) importFrom(tidyr,unnest) +importFrom(tidyselect,all_of) importFrom(vctrs,as_list_of) importFrom(vctrs,field) importFrom(vctrs,new_rcrd) diff --git a/NEWS.md b/NEWS.md index 3c3a0f942..5b907f237 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,6 @@ # epipredict (development) Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicate PR's. -# epipredict 0.2 - -- add `latency_adjustment` as an option for `add_epi_ahead`, which adjusts the `ahead` so that the prediction is `ahead` relative to the `as_of` date for the `epi_data`, rather than relative to the last day of data. # epipredict 0.1 @@ -64,3 +61,6 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Fix bug where `fit()` drops the `epi_workflow` class (also error if non-`epi_df` data is given to `epi_recipe()`), #363 - Try to retain the `epi_df` class during baking to the extent possible, #376 +- Add `latency_adjustment` as an option for `add_epi_ahead`, which adjusts the + `ahead` so that the prediction is `ahead` relative to the `as_of` date for the + `epi_data`, rather than relative to the last day of data. diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index a67d07c11..702a259eb 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -227,11 +227,11 @@ arx_fcast_epi_workflow <- function( #' difference. 
The options are: #' - `NULL` the default, assumes the `forecast_date` is the last day of data #' - `"extend_ahead"`: increase the `ahead` by the latency so it's relative to -#' the last day of data. If the last day of data was 3 days ago, the ahead -#' becomes `ahead+3`. -#' - `"extend_lags"`: increase the lags so they're relative to the actual forecast date. If the lags are -#' `c(0,7,14)` and the last day of data was 3 days ago, the lags become -#' `c(3,10,17)`. +#' the last day of data. For example, if the last day of data was 3 days ago, +#' the ahead becomes `ahead+3`. +#' - `"extend_lags"`: increase the lags so they're relative to the actual +#' forecast date. For example, if the lags are `c(0,7,14)` and the last day of +#' data was 3 days ago, the lags become `c(3,10,17)`. #' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce #' prediction intervals. These are created by computing the quantiles of #' training residuals. A `NULL` value will result in point forecasts only. diff --git a/R/epi_shift.R b/R/epi_shift.R index 0bd3da30c..9365331a9 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -55,7 +55,8 @@ add_shifted_columns <- function(new_data, object, amount) { left_join(rel_latency, by = join_by(col == col_name), ) %>% tidyr::replace_na(list(latency = 0)) %>% mutate( - shift_val = amount + latency) %>% + shift_val = amount + latency + ) %>% mutate( newname = glue::glue("{object$prefix}{abs(shift_val)}_{col}"), # name is always positive amount = NULL, diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index bb03f0237..0dd0f3600 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -3,11 +3,13 @@ #' In the standard case, the arx models assume that the last observation is also #' the day from which the forecast is being made. But if the data has latency, #' then you may wish to adjust the predictors (lags) and/or the outcome (ahead) -#' to compensate. 
This allows the model to create bleeding-edge forecasts using -#' the lags actually observed rather than anticipated. `step_adjust_latency` -#' uses the `as_of` date of the `epi_df` as the `forecast_date`. This is most -#' useful in realtime and pseudo-prospective forecasting for data where there is -#' some delay between the day recorded and when that data is available. +#' to compensate. This allows the user to create models on the most recent data, +#' regardless of latency patterns. Instead of using the last observation date, +#' `step_adjust_latency` uses the `as_of` date of the `epi_df` as the +#' `forecast_date`, potentially using different dates depending on the +#' `epi_keys`, such as geography. This is most useful in realtime and +#' pseudo-prospective forecasting for data where there is some delay between the +#' event occurring and the event being reported. #' #' @param method a character. Determines the method by which the #' forecast handles latency. The options are: @@ -17,8 +19,7 @@ #' `forecast_date` date for a 4 day ahead forecast, the ahead used in practice #' is actually 7. #' - `"locf"`: carries forward the last observed value(s) up to the forecast -#' date. See the Vignette TODO for equivalents using other steps and more -#' sophisticated methods of extrapolation. +#' date. #' - `"extend_lags"`: per `epi_key` and `predictor`, adjusts the lag so that #' the shortest lag at predict time is at the last observation. E.g. if the #' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used @@ -296,11 +297,9 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ attributes(new_data)$metadata$shift_sign <- get_sign(object) attributes(new_data)$metadata$latency_table <- object$latency_table keys <- object$keys - return(new_data) } else if (object$method == "locf") { # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") - object$forecast_date unnamed_columns <- object$columns %>% unname() new_data %>% pad_to_end(rel_keys, object$forecast_date) %>% @@ -308,9 +307,9 @@ bake.step_adjust_latency <- function(object, new_data, ...) { arrange(time_value) %>% as_tibble() %>% tidyr::fill(.direction = "down", any_of(unnamed_columns)) %>% - ungroup() %>% - return() + ungroup() } + return(new_data) } #' @export print.step_adjust_latency <- diff --git a/R/utils-latency.R b/R/utils-latency.R index 5498e3ede..b32edf7b5 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -180,22 +180,59 @@ fill_locf <- function(x, forecast_date) { x <- tidyr::fill(x, !time_value) } -pad_to_end <- function(x, groups, end_date) { +#' pad every group at the right interval +#' @description +#' Perform last observation carried forward on a group by group basis. It uses +#' `guess_period` to find the appropriate interval to fill-forward by. It +#' maintains the grouping structure it recieves. It does *not* fill any +#' "interior" `NA` values occurring in the data beforehand. +#' @param x an epi_df to be filled forward. +#' @param columns_to_complete which columns to apply completion to. 
By default every non-key column of an epi_df +#' @param groups the grouping by which to fill forward +#' @importFrom dplyr across, arrange, bind_rows, group_by, summarise +#' @importFrom tidyselect all_of +#' @importFrom rlang list2 +#' @importFrom vctrs vec_cast +#' @keywords internal +pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { + if (is.null(columns_to_complete)) { + columns_to_complete <- setdiff(names(x), key_colnames(x)) + } itval <- epiprocess:::guess_period(c(x$time_value, end_date), "time_value") - completed_time_values <- x %>% - dplyr::group_by(dplyr::across(tidyselect::all_of(groups))) %>% - dplyr::summarise( - time_value = rlang::list2( - time_value = seq_null_swap(max(time_value) + itval, end_date, itval) + # get the time values we need to fill in + completed_time_values <- + x %>% + group_by(across(all_of(groups))) %>% + summarise( + time_value = list2( + time_value = seq_null_swap(from = max(time_value) + itval, to = end_date, by = itval) ) ) %>% unnest("time_value") %>% - mutate(time_value = vctrs::vec_cast(time_value, x$time_value)) + mutate(time_value = vec_cast(time_value, x$time_value)) + # pull the last value in each group and fill forward + filled_values <- x %>% + group_by(across(all_of(groups))) %>% + slice_tail() %>% + bind_rows(completed_time_values) %>% + arrange(across(all_of(c("time_value", groups)))) %>% + fill(all_of(columns_to_complete), .direction = "down") %>% + slice(-1) - dplyr::bind_rows(x, completed_time_values) %>% - dplyr::arrange(dplyr::across(tidyselect::all_of(c("time_value", groups)))) + bind_rows(x, filled_values) %>% + arrange(across(all_of(key_colnames(x)))) %>% + ungroup() %>% + group_by(across(all_of(get_grouping_columns(x)))) } +#' return the names of the grouped columns, or `NULL` +get_grouping_columns <- function(x) { + group_names <- names(attributes(x)$groups) + head(group_names, -1) +} + +#' seq, but returns null if from is larger +#' @keywords internal seq_null_swap <- 
function(from, to, by) { if (from > to) { return(NULL) diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 99e2f4063..84e8c72d9 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -49,11 +49,11 @@ difference. The options are: \itemize{ \item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data \item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to -the last day of data. If the last day of data was 3 days ago, the ahead -becomes \code{ahead+3}. -\item \code{"extend_lags"}: increase the lags so they're relative to the actual forecast date. If the lags are -\code{c(0,7,14)} and the last day of data was 3 days ago, the lags become -\code{c(3,10,17)}. +the last day of data. For example, if the last day of data was 3 days ago, +the ahead becomes \code{ahead+3}. +\item \code{"extend_lags"}: increase the lags so they're relative to the actual +forecast date. For example, if the lags are \code{c(0,7,14)} and the last day of +data was 3 days ago, the lags become \code{c(3,10,17)}. }} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 9c1832142..3502508cb 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -51,11 +51,11 @@ difference. The options are: \itemize{ \item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data \item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to -the last day of data. If the last day of data was 3 days ago, the ahead -becomes \code{ahead+3}. -\item \code{"extend_lags"}: increase the lags so they're relative to the actual forecast date. If the lags are -\code{c(0,7,14)} and the last day of data was 3 days ago, the lags become -\code{c(3,10,17)}. +the last day of data. For example, if the last day of data was 3 days ago, +the ahead becomes \code{ahead+3}. 
+\item \code{"extend_lags"}: increase the lags so they're relative to the actual +forecast date. For example, if the lags are \code{c(0,7,14)} and the last day of +data was 3 days ago, the lags become \code{c(3,10,17)}. }} \item{outcome_transform}{Scalar character. Whether the outcome should diff --git a/man/epi_shift_single.Rd b/man/epi_shift_single.Rd new file mode 100644 index 000000000..871879004 --- /dev/null +++ b/man/epi_shift_single.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/epi_shift.R +\name{epi_shift_single} +\alias{epi_shift_single} +\title{Shift predictors while maintaining grouping and time_value ordering} +\usage{ +epi_shift_single(x, col, shift_val, newname, key_cols) +} +\arguments{ +\item{x}{Data frame.} + +\item{shift_val}{a single integer. Negative values produce leads.} + +\item{newname}{the name for the newly shifted column} + +\item{key_cols}{vector, or \code{NULL}. Additional grouping vars.} +} +\value{ +a list of tibbles +} +\description{ +This is a lower-level function. As such it performs no error checking. 
+} +\keyword{internal} diff --git a/man/get_grouping_columns.Rd b/man/get_grouping_columns.Rd new file mode 100644 index 000000000..6b653628d --- /dev/null +++ b/man/get_grouping_columns.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_grouping_columns} +\alias{get_grouping_columns} +\title{return the names of the grouped columns, or \code{NULL}} +\usage{ +get_grouping_columns(x) +} +\description{ +return the names of the grouped columns, or \code{NULL} +} diff --git a/man/pad_to_end.Rd b/man/pad_to_end.Rd new file mode 100644 index 000000000..b9cde372e --- /dev/null +++ b/man/pad_to_end.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{pad_to_end} +\alias{pad_to_end} +\title{pad every group at the right interval} +\usage{ +pad_to_end(x, groups, end_date, columns_to_complete = NULL) +} +\arguments{ +\item{x}{an epi_df to be filled forward.} + +\item{groups}{the grouping by which to fill forward} + +\item{columns_to_complete}{which columns to apply completion to. By default every non-key column of an epi_df} +} +\description{ +Perform last observation carried forward on a group by group basis. It uses +\code{guess_period} to find the appropriate interval to fill-forward by. It +maintains the grouping structure it recieves. It does \emph{not} fill any +"interior" \code{NA} values occurring in the data beforehand. 
+} +\keyword{internal} diff --git a/man/seq_null_swap.Rd b/man/seq_null_swap.Rd new file mode 100644 index 000000000..7ad5a8954 --- /dev/null +++ b/man/seq_null_swap.Rd @@ -0,0 +1,12 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{seq_null_swap} +\alias{seq_null_swap} +\title{seq, but returns null if from is larger} +\usage{ +seq_null_swap(from, to, by) +} +\description{ +seq, but returns null if from is larger +} +\keyword{internal} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 1006a76dc..3da958045 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -42,8 +42,7 @@ E.g. if there are 3 days of latency between the last observation and the \code{forecast_date} date for a 4 day ahead forecast, the ahead used in practice is actually 7. \item \code{"locf"}: carries forward the last observed value(s) up to the forecast -date. See the Vignette TODO for equivalents using other steps and more -sophisticated methods of extrapolation. +date. \item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is at the last observation. E.g. if the lags are \code{c(0,7,14)} for data that is 3 days latent, the actual lags used @@ -98,11 +97,13 @@ sequence of any existing operations. In the standard case, the arx models assume that the last observation is also the day from which the forecast is being made. But if the data has latency, then you may wish to adjust the predictors (lags) and/or the outcome (ahead) -to compensate. This allows the model to create bleeding-edge forecasts using -the lags actually observed rather than anticipated. \code{step_adjust_latency} -uses the \code{as_of} date of the \code{epi_df} as the \code{forecast_date}. This is most -useful in realtime and pseudo-prospective forecasting for data where there is -some delay between the day recorded and when that data is available. 
+to compensate. This allows the user to create models on the most recent data, +regardless of latency patterns. Instead of using the last observation date, +\code{step_adjust_latency} uses the \code{as_of} date of the \code{epi_df} as the +\code{forecast_date}, potentially using different dates depending on the +\code{epi_keys}, such as geography. This is most useful in realtime and +pseudo-prospective forecasting for data where there is some delay between the +event occurring and the event being reported. } \details{ The step assumes that the pipeline has already applied either diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 09f77259f..a23be628b 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -34,6 +34,14 @@ modified_data <- by = keys ) %>% arrange(time_value) +time_range <- as.Date("2021-01-01") + 0:199 +x_adjust_ahead <- tibble( + geo_value = rep("place", 200), + time_value = time_range, + case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, + death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 +) %>% + as_epi_df(as_of = max(time_range) + 3) test_that("get_latency works", { expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 5) @@ -69,16 +77,7 @@ test_that("set_forecast_date works", { expect_equal(set_forecast_date(modified_data, info, NULL, NULL), as_of) }) -time_range <- as.Date("2021-01-01") + 0:199 -x_adjust_ahead <- tibble( - geo_value = rep("place", 200), - time_value = time_range, - case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, - death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 -) %>% - as_epi_df(as_of = max(time_range) + 3) # confirm the delay is right - test_that("adjust_latency extend_ahead works", { # testing that POSIXct converts correctly (as well as basic types) expect_equal( @@ -100,4 +99,57 @@ test_that("extend_ahead warns in case of extreme adjustment", { 
testthat::expect_warning(adjust_latency(object, x_adjust_ahead), regexp = "The ahead has been adjusted by 100") }) +test_that("pad_to_end works correctly", { + single_ex <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "1", as.Date("1066-10-13"), 2, -.6, + "1", as.Date("1066-10-14"), NA, NA, + "1", as.Date("1066-10-15"), 1, -.5, + "2", as.Date("1066-10-13"), 3, .9 + ) %>% + as_epi_df(as_of = "1066-10-16") + expect_equal( + single_ex %>% pad_to_end("geo_value", as.Date("1066-10-16")), + rbind( + single_ex, + tibble(geo_value = "1", time_value = as.Date("1066-10-16"), a = 1, b = -.5), + tibble( + geo_value = "2", + time_value = seq.Date( + from = as.Date("1066-10-14"), + to = as.Date("1066-10-16"), + by = 1 + ), + a = 3, b = .9 + ) + ) %>% arrange(geo_value, time_value) + ) +}) + + +test_that("pad_to_end handles weeks", { + single_ex <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "1", as.Date("1066-10-14"), 2, -.6, + "1", as.Date("1066-10-21"), 1, -.5, + "2", as.Date("1066-10-14"), 3, .9 + ) %>% + as_epi_df(as_of = "1066-10-28") + expect_equal( + single_ex %>% pad_to_end("geo_value", as.Date("1066-10-28")), + rbind( + single_ex, + tibble(geo_value = "1", time_value = as.Date("1066-10-28"), a = 1, b = -.5), + tibble( + geo_value = "2", + time_value = seq.Date( + from = as.Date("1066-10-21"), + to = as.Date("1066-10-28"), + by = 7 + ), + a = 3, b = .9 + ) + ) %>% arrange(time_value, geo_value) + ) +}) # todo case where somehow columns of different roles are selected From 7816d132ee4564fe31a5026e61e719e6d81b135f Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 4 Sep 2024 18:05:58 -0500 Subject: [PATCH 59/92] testing the step --- R/step_adjust_latency.R | 2 +- tests/testthat/test-step_adjust_latency.R | 52 +++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 0dd0f3600..cdef74565 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -301,7 +301,7 @@ 
bake.step_adjust_latency <- function(object, new_data, ...) { # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") unnamed_columns <- object$columns %>% unname() - new_data %>% + new_data <- new_data %>% pad_to_end(rel_keys, object$forecast_date) %>% # group_by_at(rel_keys) %>% arrange(time_value) %>% diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 1f8a2889f..a557fc146 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -165,6 +165,58 @@ test_that("epi_adjust_latency correctly extends the ahead", { expect_equal(length(fit2$fit$fit$fit$coefficients), 6) }) +test_that("epi_adjust_latency correctly locfs", { + r1 <- epi_recipe(x) %>% + step_adjust_latency(method = "locf") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + # directly checking the shifts + baked_x <- r1 %>% + prep(real_x) %>% + bake(real_x) + # map each column to its last non-NA value + last_dates <- baked_x %>% + tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% + group_by(name) %>% + summarise(last_date = max(time_value)) %>% + arrange(desc(last_date)) + expect_equal( + last_dates, + tribble( + ~name, ~last_date, + "lag_11_death_rate", max_time + 11, + "lag_6_death_rate", max_time + 6, + "lag_5_case_rate", max_time + 5, + "lag_1_case_rate", max_time + 1, + "case_rate", max_time, + "death_rate", max_time, + "lag_0_death_rate", max_time + 0, + "ahead_7_death_rate", max_time - 7, + ) + ) + # we expect a 5-fold repetition of the last values found in the original + # epi_df + last_real <- real_x %>% + group_by(geo_value) %>% + arrange(time_value) %>% + slice_tail() %>% + ungroup() %>% + select(case_rate, death_rate) %>% + uncount(5) + # pulling just the region between the last day 
and the prediction day + filled_values <- + baked_x %>% + filter( + time_value > max(real_x$time_value), + time_value <= attributes(real_x)$metadata$as_of + ) %>% + ungroup() %>% + select(case_rate, death_rate) + expect_equal(last_real, filled_values) +}) + test_that("epi_adjust_latency extends multiple aheads", { aheads <- 1:3 r3 <- epi_recipe(x) %>% From edc2b3f641c5f2cd66a01d05f861ea3278a47b14 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 5 Sep 2024 11:43:05 -0500 Subject: [PATCH 60/92] step locf tests passing, grf pkgdown --- NAMESPACE | 3 +++ R/get_test_data.R | 2 +- R/step_adjust_latency.R | 2 +- R/utils-latency.R | 23 ++++---------------- man/get_grouping_columns.Rd | 4 ++++ tests/testthat/_snaps/snapshots.md | 26 +++++++++++++++++++++++ tests/testthat/test-snapshots.R | 13 ++++++++++++ tests/testthat/test-step_adjust_latency.R | 20 ++++++++--------- tests/testthat/test-utils_latency.R | 2 +- 9 files changed, 63 insertions(+), 32 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index a93bb0c19..bc83aa2f8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -301,6 +301,7 @@ importFrom(stats,family) importFrom(stats,lm) importFrom(stats,median) importFrom(stats,model.frame) +importFrom(stats,na.omit) importFrom(stats,poly) importFrom(stats,predict) importFrom(stats,qnorm) @@ -314,6 +315,8 @@ importFrom(tidyr,expand_grid) importFrom(tidyr,fill) importFrom(tidyr,unnest) importFrom(tidyselect,all_of) +importFrom(utils,capture.output) +importFrom(utils,head) importFrom(vctrs,as_list_of) importFrom(vctrs,field) importFrom(vctrs,new_rcrd) diff --git a/R/get_test_data.R b/R/get_test_data.R index f1d83aad0..5e74da5a1 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -25,8 +25,8 @@ #' step_epi_lag(case_rate, lag = c(0, 7, 14)) #' get_test_data(recipe = rec, x = case_death_rate_subset) #' @importFrom rlang %@% +#' @importFrom stats na.omit #' @export - get_test_data <- function(recipe, x) { if (!is_epi_df(x)) cli::cli_abort("`x` must be an `epi_df`.") diff --git 
a/R/step_adjust_latency.R b/R/step_adjust_latency.R index cdef74565..1275c3948 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -302,7 +302,7 @@ bake.step_adjust_latency <- function(object, new_data, ...) { rel_keys <- setdiff(key_colnames(new_data), "time_value") unnamed_columns <- object$columns %>% unname() new_data <- new_data %>% - pad_to_end(rel_keys, object$forecast_date) %>% + pad_to_end(rel_keys, object$forecast_date, unnamed_columns) %>% # group_by_at(rel_keys) %>% arrange(time_value) %>% as_tibble() %>% diff --git a/R/utils-latency.R b/R/utils-latency.R index b32edf7b5..35db0b484 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -33,6 +33,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' @keywords internal #' @importFrom dplyr select #' @importFrom tidyr drop_na +#' @importFrom utils capture.output set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { original_columns <- info %>% filter(source == "original") %>% @@ -161,25 +162,6 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new } -fill_locf <- function(x, forecast_date) { - cannot_be_used <- x %>% - dplyr::filter(forecast_date - time_value <= n_recent) %>% - dplyr::mutate(fillers = forecast_date - time_value > keep) %>% - dplyr::summarise( - dplyr::across( - -tidyselect::any_of(key_colnames(recipe)), - ~ all(is.na(.x[fillers])) & is.na(head(.x[!fillers], 1)) - ), - .groups = "drop" - ) %>% - dplyr::select(-fillers) %>% - dplyr::summarise(dplyr::across( - -tidyselect::any_of(key_colnames(recipe)), ~ any(.x) - )) %>% - unlist() - x <- tidyr::fill(x, !time_value) -} - #' pad every group at the right interval #' @description #' Perform last observation carried forward on a group by group basis. 
It uses @@ -226,6 +208,9 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { } #' return the names of the grouped columns, or `NULL` +#' @param x an epi_df +#' @keywords internal +#' @importFrom utils head get_grouping_columns <- function(x) { group_names <- names(attributes(x)$groups) head(group_names, -1) diff --git a/man/get_grouping_columns.Rd b/man/get_grouping_columns.Rd index 6b653628d..f8b61af42 100644 --- a/man/get_grouping_columns.Rd +++ b/man/get_grouping_columns.Rd @@ -6,6 +6,10 @@ \usage{ get_grouping_columns(x) } +\arguments{ +\item{x}{an epi_df} +} \description{ return the names of the grouped columns, or \code{NULL} } +\keyword{internal} diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 6c439dbd8..6b337c056 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1057,6 +1057,32 @@ 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ca", "fl", "ga", "ny", "pa", "tx" + ), .pred = c(0.303244704017743, 0.531332853311082, 0.588827944685979, + 0.988690249216229, 0.794801997001639, 0.306895457225321), .pred_distn = structure(list( + structure(list(values = c(0.136509784083987, 0.469979623951498 + ), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.364597933377326, 0.698067773244837), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), structure(list(values = c(0.422093024752224, + 0.755562864619735), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.821955329282474, 1.15542516914998), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr")), 
structure(list(values = c(0.628067077067883, + 0.961536916935394), quantile_levels = c(0.05, 0.95)), class = c("dist_quantiles", + "dist_default", "vctrs_rcrd", "vctrs_vctr")), structure(list( + values = c(0.140160537291566, 0.473630377159077), quantile_levels = c(0.05, + 0.95)), class = c("dist_quantiles", "dist_default", "vctrs_rcrd", + "vctrs_vctr"))), class = c("distribution", "vctrs_vctr", + "list")), forecast_date = structure(c(18997, 18997, 18997, 18997, + 18997, 18997), class = "Date"), target_date = structure(c(18998, + 18998, 18998, 18998, 18998, 18998), class = "Date")), row.names = c(NA, + -6L), class = c("tbl_df", "tbl", "data.frame")) + # arx_forecaster output format snapshots Code diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 7f1d46006..003fc8319 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -94,6 +94,19 @@ test_that("arx_forecaster snapshots", { ) # not the same predictions expect_false(all(arx2$predictions == arx3$predictions)) + + + arx4 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = arx_args_list( + ahead = 1L, + adjust_latency = "locf" + ) + ) + # consistency check + expect_snapshot_tibble(arx3$predictions) }) test_that("arx_forecaster output format snapshots", { diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index a557fc146..75656989a 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -186,14 +186,14 @@ test_that("epi_adjust_latency correctly locfs", { last_dates, tribble( ~name, ~last_date, - "lag_11_death_rate", max_time + 11, - "lag_6_death_rate", max_time + 6, - "lag_5_case_rate", max_time + 5, - "lag_1_case_rate", max_time + 1, - "case_rate", max_time, - "death_rate", max_time, - "lag_0_death_rate", max_time + 0, - "ahead_7_death_rate", max_time - 7, + "lag_11_death_rate", max_time + 16, + 
"lag_6_death_rate", max_time + 11, + "lag_5_case_rate", max_time + 10, + "lag_1_case_rate", max_time + 6, + "case_rate", max_time + 5, + "death_rate", max_time + 5, + "lag_0_death_rate", max_time + 5, + "ahead_7_death_rate", max_time - 2, ) ) # we expect a 5-fold repetition of the last values found in the original @@ -204,7 +204,7 @@ test_that("epi_adjust_latency correctly locfs", { slice_tail() %>% ungroup() %>% select(case_rate, death_rate) %>% - uncount(5) + tidyr::uncount(5) # pulling just the region between the last day and the prediction day filled_values <- baked_x %>% @@ -450,7 +450,7 @@ test_that("`step_adjust_latency` only uses the columns specified in the `...`", summarise(last_date = max(time_value)) %>% arrange(desc(last_date)) %>% mutate(locf_date = last_date - latency) - # iterate over all columns and make sure the latent time period has the exact same values + # iterate over all columns and make sure the latent time period has the exact same values (so the variance is zero) for (ii in seq(nrow(last_dates))) { baked_var <- baked_x %>% filter(last_dates[[ii, "locf_date"]] <= time_value, time_value <= last_dates[[ii, "last_date"]]) %>% diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index a23be628b..78f294564 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -149,7 +149,7 @@ test_that("pad_to_end handles weeks", { ), a = 3, b = .9 ) - ) %>% arrange(time_value, geo_value) + ) %>% arrange(geo_value, time_value) ) }) # todo case where somehow columns of different roles are selected From acc5fa0fdddc4ee00be827f5fe199838a7b72896 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 5 Sep 2024 23:01:40 -0500 Subject: [PATCH 61/92] locf correct on NA at end columns --- R/step_adjust_latency.R | 8 ++----- R/utils-latency.R | 35 +++++++++++++++++++++++------ tests/testthat/test-utils_latency.R | 17 ++++++++++++-- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git 
a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 1275c3948..defbd9262 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -101,7 +101,6 @@ step_adjust_latency <- epi_keys_checked = c("geo_value"), fixed_latency = NULL, fixed_forecast_date = NULL, - default = NA, skip = FALSE, columns = NULL, id = recipes::rand_id("adjust_latency")) { @@ -173,7 +172,6 @@ then the previous `step_epi_lag`s won't work with modified data.", forecast_date = fixed_forecast_date, latency = fixed_latency, latency_table = NULL, - default = default, keys = key_colnames(recipe), columns = columns, skip = skip, @@ -183,8 +181,8 @@ then the previous `step_epi_lag`s won't work with modified data.", } step_adjust_latency_new <- - function(terms, role, trained, forecast_date, latency, latency_table, time_type, default, - keys, method, epi_keys_checked, columns, skip, id) { + function(terms, role, trained, forecast_date, latency, latency_table, + time_type, keys, method, epi_keys_checked, columns, skip, id) { step( subclass = "adjust_latency", terms = terms, @@ -195,7 +193,6 @@ step_adjust_latency_new <- forecast_date = forecast_date, latency = latency, latency_table = latency_table, - default = default, keys = keys, columns = columns, skip = skip, @@ -274,7 +271,6 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { forecast_date = forecast_date, latency = unique(latency_table$latency), latency_table = latency_table, - default = x$default, keys = x$keys, method = x$method, epi_keys_checked = x$epi_keys_checked, diff --git a/R/utils-latency.R b/R/utils-latency.R index 35db0b484..9148c7c2b 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -171,10 +171,10 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new #' @param x an epi_df to be filled forward. #' @param columns_to_complete which columns to apply completion to. 
By default every non-key column of an epi_df #' @param groups the grouping by which to fill forward -#' @importFrom dplyr across, arrange, bind_rows, group_by, summarise #' @importFrom tidyselect all_of #' @importFrom rlang list2 #' @importFrom vctrs vec_cast +#' @importFrom dplyr across arrange bind_rows group_by summarise #' @keywords internal pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { if (is.null(columns_to_complete)) { @@ -193,15 +193,20 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { unnest("time_value") %>% mutate(time_value = vec_cast(time_value, x$time_value)) # pull the last value in each group and fill forward - filled_values <- x %>% - group_by(across(all_of(groups))) %>% - slice_tail() %>% + grouped_and_arranged <- x %>% + arrange(across(all_of(c("time_value", groups)))) %>% + group_by(across(all_of(groups))) + values_to_fill <- grouped_and_arranged %>% + slice(min(across(columns_to_complete, count_single_column)):n()) + + filled_values <- values_to_fill %>% bind_rows(completed_time_values) %>% arrange(across(all_of(c("time_value", groups)))) %>% fill(all_of(columns_to_complete), .direction = "down") %>% - slice(-1) - - bind_rows(x, filled_values) %>% + slice(-1) # remove the oirginal rows + grouped_and_arranged %>% + slice(1:min(across(columns_to_complete, count_single_column))) %>% + bind_rows(filled_values) %>% arrange(across(all_of(key_colnames(x)))) %>% ungroup() %>% group_by(across(all_of(get_grouping_columns(x)))) @@ -216,6 +221,22 @@ get_grouping_columns <- function(x) { head(group_names, -1) } +#' get the location of the last real value +#' @param col the relevant column +#' @param from_last instead of the number of columns +#' @keywords internal +count_single_column <- function(col, from_last) { + run_lengths <- rle(is.na(col)) + n_el <- length(col) + if (tail(run_lengths$values, 1)) { + n_at_end <- tail(run_lengths$lengths, 1) + return(n_el - n_at_end) + } else { + return(n_el) + } 
+} + + #' seq, but returns null if from is larger #' @keywords internal seq_null_swap <- function(from, to, by) { diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 78f294564..e0f35d7fa 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -103,15 +103,19 @@ test_that("pad_to_end works correctly", { single_ex <- tribble( ~geo_value, ~time_value, ~a, ~b, "1", as.Date("1066-10-13"), 2, -.6, + # internal NA "1", as.Date("1066-10-14"), NA, NA, "1", as.Date("1066-10-15"), 1, -.5, - "2", as.Date("1066-10-13"), 3, .9 + "2", as.Date("1066-10-13"), 3, .9, + # note these are intentionally out of order + "3", as.Date("1066-10-14"), 2.5, NA, + "3", as.Date("1066-10-13"), 2, -.6, ) %>% as_epi_df(as_of = "1066-10-16") expect_equal( single_ex %>% pad_to_end("geo_value", as.Date("1066-10-16")), rbind( - single_ex, + single_ex[-5, ], tibble(geo_value = "1", time_value = as.Date("1066-10-16"), a = 1, b = -.5), tibble( geo_value = "2", @@ -121,6 +125,15 @@ test_that("pad_to_end works correctly", { by = 1 ), a = 3, b = .9 + ), + tibble( + geo_value = "3", + time_value = seq.Date( + from = as.Date("1066-10-14"), + to = as.Date("1066-10-16"), + by = 1 + ), + a = 2.5, b = -0.6 ) ) %>% arrange(geo_value, time_value) ) From a11fa5f2593226e59e033dc0f5b6513ce82a63c4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 9 Sep 2024 14:47:41 -0500 Subject: [PATCH 62/92] docs along with more extensive tests --- NAMESPACE | 4 +- R/arx_forecaster.R | 20 +- R/epi_shift.R | 5 +- R/epi_workflow.R | 1 - R/step_adjust_latency.R | 209 +++++++++++----- R/utils-latency.R | 53 +++- man/arx_args_list.Rd | 22 +- man/arx_class_args_list.Rd | 22 +- man/cdc_baseline_args_list.Rd | 12 +- man/flatline_args_list.Rd | 17 +- man/step_adjust_latency.Rd | 228 +++++++++++++++--- tests/testthat/test-layer_add_forecast_date.R | 4 +- tests/testthat/test-step_adjust_latency.R | 93 +++++-- tests/testthat/test-utils_latency.R | 21 ++ 14 files 
changed, 555 insertions(+), 156 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index bc83aa2f8..ba319cecd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -244,13 +244,16 @@ importFrom(dplyr,group_by_at) importFrom(dplyr,join_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,n) importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,rename) +importFrom(dplyr,rowwise) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,summarize) importFrom(dplyr,tibble) +importFrom(dplyr,tribble) importFrom(dplyr,ungroup) importFrom(epiprocess,epi_slide) importFrom(epiprocess,growth_rate) @@ -273,7 +276,6 @@ importFrom(recipes,bake) importFrom(recipes,detect_step) importFrom(recipes,prep) importFrom(recipes,rand_id) -importFrom(recipes,recipes_eval_select) importFrom(rlang,"!!!") importFrom(rlang,"!!") importFrom(rlang,"%@%") diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 702a259eb..42729ed85 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -213,18 +213,18 @@ arx_fcast_epi_workflow <- function( #' @param n_training Integer. An upper limit for the number of rows per #' key that are used for training #' (in the time unit of the `epi_df`). -#' @param forecast_date Date. The date on which the forecast is created. The -#' default `NULL` will attempt to determine this automatically either as the -#' max time value if there is no latency adjustment, or as the `as_of` of -#' `epi_data` if `adjust_latency` is non-`NULL`. -#' @param target_date Date. The date for which the forecast is intended. The -#' default `NULL` will attempt to determine this automatically as -#' `forecast_date + ahead`. +#' @param forecast_date Date. The date from which the forecast is occurring. +#' The default `NULL` will determine this automatically from either +#' 1. the maximum time value for which there's data if there is no latency +#' adjustment (the default case), or +#' 2. 
the `as_of` date of `epi_data` if `adjust_latency` is +#' non-`NULL`. +#' @param target_date Date. The date that is being forecast. The default `NULL` +#' will determine this automatically as `forecast_date + ahead`. #' @param adjust_latency Character or `NULL`. One of the `method`s of #' [step_adjust_latency()], or `NULL` (in which case there is no adjustment). -#' If there is a difference between the `forecast_date` and the last day of -#' data, this determines how to shift the model to account for this -#' difference. The options are: +#' If the `forecast_date` is after the last day of data, this determines how +#' to shift the model to account for this difference. The options are: #' - `NULL` the default, assumes the `forecast_date` is the last day of data #' - `"extend_ahead"`: increase the `ahead` by the latency so it's relative to #' the last day of data. For example, if the last day of data was 3 days ago, diff --git a/R/epi_shift.R b/R/epi_shift.R index 9365331a9..e0b68b579 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -46,9 +46,12 @@ add_shifted_columns <- function(new_data, object, amount) { shift_sign_lat <- attributes(new_data)$metadata$shift_sign if (!is.null(latency_table) && shift_sign_lat == sign_shift) { - # TODO this doesn't work on lags of transforms + # are we adding an unreasonable amount of latency? 
+ check_interminable_latency(new_data, latency_table, object$columns, attributes(new_data)$metadata$forecast_date) + # get the actually used latencies rel_latency <- latency_table %>% filter(col_name %in% object$columns) } else { + # adding zero if there's no latency table rel_latency <- tibble(col_name = object$columns, latency = 0L) } grid <- expand_grid(col = object$columns, amount = sign_shift * amount) %>% diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 34141b732..19a30d73a 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -43,7 +43,6 @@ epi_workflow <- function(preprocessor = NULL, spec = NULL, postprocessor = NULL) if (!is_null(postprocessor)) { out <- add_postprocessor(out, postprocessor) } - out } diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index defbd9262..e4968920e 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -1,16 +1,133 @@ #' Adapt the model to latent data #' +#' @description #' In the standard case, the arx models assume that the last observation is also #' the day from which the forecast is being made. But if the data has latency, #' then you may wish to adjust the predictors (lags) and/or the outcome (ahead) -#' to compensate. This allows the user to create models on the most recent data, -#' regardless of latency patterns. Instead of using the last observation date, -#' `step_adjust_latency` uses the `as_of` date of the `epi_df` as the -#' `forecast_date`, potentially using different dates depending on the -#' `epi_keys`, such as geography. This is most useful in realtime and +#' to compensate. +#' This is most useful in realtime and #' pseudo-prospective forecasting for data where there is some delay between the #' event occurring and the event being reported. #' +#' @details +#' This step allows the user to create models on the most recent +#' data, automatically accounting for latency patterns. 
Instead of using the last observation +#' date, `step_adjust_latency` uses the `as_of` date of the `epi_df` as the +#' `forecast_date`, and adjusts the model so that there is data available. To +#' demonstrate some of the subtleties, let's consider a toy dataset: +#' ```{r toy_df} +#' toy_df <- tribble( +#' ~geo_value, ~time_value, ~a, ~b, +#' "ma", as.Date("2015-01-11"), 20, 6, +#' "ma", as.Date("2015-01-12"), 23, NA, +#' "ma", as.Date("2015-01-13"), 25, NA, +#' "ca", as.Date("2015-01-11"), 100, 5, +#' "ca", as.Date("2015-01-12"), 103, 10, +#' ) %>% +#' as_epi_df(as_of = as.Date("2015-01-14")) +#' ``` +#' If we're looking to predict the value on the 15th, forecasting from the 14th (the `as_of` date above), +#' there are two issues we will need to address: +#' 1. `"ca"` is latent by 2 days, whereas `"ma"` is latent by 1 +#' 2. if we want to use `b` as an exogenous variable, for `"ma"` it is latent by 3 days instead of just 1. +#' +#' Regardless of `method`, `epi_keys_checked="geo_value"` guarantees that the +#' difference between `"ma"` and `"ca"` is accounted for by making the +#' latency adjustment at least 2. For some comparison, here's what the various +#' methods will do: +#' +#' ## `locf` +#' Short for "last observation carried forward", `locf` assumes that every day +#' between the last observation and the forecast day is exactly the same. +#' This is a very straightforward assumption, but wrecks any features that +#' depend on changes in value over time, such as the growth rate, or even +#' adjacent lags. A more robust version of this falls under the heading of +#' nowcasting, an eventual aim for this package.
On the toy dataset, it +#' doesn't matter which day we're trying to predict, since it just fills +#' forward to the `forecast_date`: +#' ```{r toy_df} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' step_adjust_latency(method="locf") +#' +#' toy_recipe %>% +#' prep(toy_df) %>% +#' bake(toy_df) %>% +#' arrange(geo_value, time_value) +#' ``` +#' +#' ## `extend_lags` +#' `extend_lags` increases the lags so that they are guaranteed to have +#' data. This has the advantage of being applicable on +#' a per-column basis; if cases and deaths are reported at different +#' latencies, the lags for each are adjusted separately. In the toy example: +#' ```{r toy_df} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' step_adjust_latency(method="extend_lags") %>% +#' step_epi_lag(a,lag=1) %>% +#' step_epi_lag(b,lag=1) %>% +#' step_epi_ahead(a, ahead=1) +#' +#' toy_recipe %>% +#' prep(toy_df) %>% +#' bake(toy_df) %>% +#' arrange(geo_value, time_value) +#' ``` +#' The maximum latency in column `a` is 2 days, so the lag is increased to 3, +#' while the max latency in column `b` is 3, so the same lag is increased to +#' 4; both of these changes are reflected in the column names. Meanwhile the +#' ahead is unaffected. +#' +#' As a side-note, lag/ahead can be somewhat ambiguous about direction. Here, +#' the values are brought forward in time, so that for a given row, column +#' `lag_3_a` represents the value 3 days before. +#' +#' ## `extend_ahead` +#' `extend_ahead` increases the ahead, turning a 3 day ahead forecast +#' into a 7 day one; this has the advantage of simplicity and is reflective of +#' the actual modelling task, but potentially leaves information unused if +#' different data sources have different latencies; it must use the latency of +#' the most latent data to ensure there is data available.
In the toy example: +#' ```{r toy_df} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' step_adjust_latency(method="extend_ahead") %>% +#' step_epi_lag(a,lag=0) %>% +#' step_epi_ahead(a, ahead=1) +#' +#' toy_recipe %>% +#' prep(toy_df) %>% +#' bake(toy_df) %>% +#' arrange(geo_value, time_value) +#' ``` +#' Even though we're doing a 1 day ahead forecast, because our worst latency +#' is 3 days from column `b`'s `"ma"` data, our outcome column is `ahead_4_a` +#' (so 4 days ahead). If we want to ignore any latency in column `b`, we need +#' to explicitly set the columns to consider while adjusting like this: +#' `step_adjust_latency(a, method="extend_ahead")`. +#' +#' # Programmatic details +#' `step_adjust_latency` uses the metadata, such as `time_type` and `as_of`, of +#' the `epi_df` used in the initial prep step, rather than baking or +#' prediction. This means reusing the same forecaster on new data is not +#' advised, though typically it is not advised in general. +#' +#' The latency adjustment only applies to columns created after this step, so +#' this step should go before both `step_epi_ahead` and `step_epi_lag`. This will work: +#' ```{r} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' # non-lag steps +#' step_adjust_latency(a, method = "extend_lags") %>% +#' step_epi_lag(a, lag=0) # other steps +#' ``` +#' while this will not: +#' ```{r} +#' toy_recipe <- epi_recipe(toy_df) %>% +#' step_epi_lag(a, lag=0) %>% +#' step_adjust_latency(a, method = "extend_lags") +#' ``` +#' If you create columns that you then apply lags to (such as +#' `step_growth_rate()`), these should be created before +#' `step_adjust_latency`, so any subsequent latency can be addressed. +#' #' @param method a character. Determines the method by which the #' forecast handles latency. The options are: #' - `"extend_ahead"`: Lengthen the ahead so that forecasting from the last @@ -25,22 +142,22 @@ #' lags are `c(0,7,14)` for data that is 3 days latent, the actual lags used #' become `c(3,10,17)`.
#' @param epi_keys_checked a character vector. A list of keys to group by before -#' finding the `max_time_value`. The default value of this is -#' `c("geo_value")`, but it can be any collection of `epi_keys`. Different -#' locations may have different latencies; to produce a forecast at every -#' location, we need to use the largest latency across every location; this -#' means taking `max_time_value` to be the minimum of the `max_time_value`s -#' for each `geo_value` (or whichever collection of keys are specified). If -#' `NULL` or an empty character vector, it will take the maximum across all -#' values, irrespective of any keys. +#' finding the `max_time_value` (the last day of data), defaulting to +#' `geo_value`. Different locations may have different latencies; to produce a +#' forecast at every location, we need to guarantee data at every location by +#' using the largest latency across every location; this means taking +#' `max_time_value` to be the minimum of the `max_time_value`s for each set of +#' key values (so the earliest date). If `NULL` or an empty character vector, +#' it will take the maximum across all values, irrespective of any keys. +#' +#' Note that this is a separate concern from different latencies across +#' different *data columns*, which is only handled by the choice of `method`. #' @param fixed_latency either a positive integer, or a labeled positive integer #' vector. Cannot be set at the same time as `fixed_forecast_date`. If #' non-`NULL`, the amount to offset the ahead or lag by. If a single integer, #' this is used for all columns; if a labeled vector, the labels must #' correspond to the base column names (before lags/aheads). If `NULL`, the -#' latency is the distance between the `epi_df`'s `max_time_value` and either -#' the `fixed_forecast_date` or the `epi_df`'s `as_of` field (the default for -#' `forecast_date`). +#' latency is the distance between the `epi_df`'s `max_time_value` and the `forecast_date`. 
#' @param fixed_forecast_date either a date of the same kind used in the #' `epi_df`, or `NULL`. Exclusive with `fixed_latency`. If a date, it gives #' the date from which the forecast is actually occurring. If `NULL`, the @@ -49,21 +166,9 @@ #' @param role For model terms created by this step, what analysis role should #' they be assigned? `lag` is a predictor while `ahead` is an outcome. It #' should be correctly inferred and not need setting -#' @param default Determines what fills empty rows -#' left by leading/lagging (defaults to NA). #' @template step-return #' @inheritParams recipes::step_lag #' -#' @details The step assumes that the pipeline has already applied either -#' `step_epi_ahead` or `step_epi_lag` depending on the value of `"method"`, -#' and that `step_epi_naomit` has NOT been run. By default, the latency will -#' be determined using the arguments below, but can be set explicitly using -#' either `fixed_latency` or `fixed_forecast_date`. -#' -#' The `prefix` and `id` arguments are unchangeable to ensure that the code runs -#' properly and to avoid inconsistency with naming. For `step_epi_ahead`, they -#' are always set to `"ahead_"` and `"epi_ahead"` respectively, while for -#' `step_epi_lag`, they are set to `"lag_"` and `"epi_lag`, respectively. 
#' #' @family row operation steps #' @rdname step_adjust_latency @@ -88,6 +193,7 @@ #' #' @importFrom recipes detect_step #' @importFrom rlang enquos is_empty +#' @importFrom dplyr tribble n step_adjust_latency <- function(recipe, ..., @@ -172,6 +278,7 @@ then the previous `step_epi_lag`s won't work with modified data.", forecast_date = fixed_forecast_date, latency = fixed_latency, latency_table = NULL, + metadata = NULL, keys = key_colnames(recipe), columns = columns, skip = skip, @@ -182,7 +289,8 @@ then the previous `step_epi_lag`s won't work with modified data.", step_adjust_latency_new <- function(terms, role, trained, forecast_date, latency, latency_table, - time_type, keys, method, epi_keys_checked, columns, skip, id) { + metadata, time_type, keys, method, epi_keys_checked, columns, skip, + id) { step( subclass = "adjust_latency", terms = terms, @@ -193,19 +301,17 @@ step_adjust_latency_new <- forecast_date = forecast_date, latency = latency, latency_table = latency_table, + metadata = metadata, keys = keys, columns = columns, skip = skip, id = id ) } -#' @importFrom recipes recipes_eval_select -construct_latency_table <- function(x, latency, training, info) { - return(latency_table) -} # lags introduces max(lags) NA's after the max_time_value. #' @export #' @importFrom glue glue +#' @importFrom dplyr rowwise prep.step_adjust_latency <- function(x, training, info = NULL, ...) { sign_shift <- get_sign(x) latency <- x$latency @@ -220,6 +326,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { if (is.null(latency)) { latency_table <- latency_table %>% + rowwise() %>% mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, x$epi_keys_checked)) } else if (length(latency) > 1) { # if latency has a length, it must also have named elements. We assign based on comparing the name in the list @@ -242,27 +349,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ pull(variable) } - # check that the shift amount isn't too extreme - latency_max <- max(abs(latency_table$latency)) - time_type <- attributes(training)$metadata$time_type - i_latency <- which.max(latency_table$latency) - if ( - (grepl("day", time_type) && (latency_max >= 10)) || - (grepl("week", time_type) && (latency_max >= 4)) || - ((time_type == "yearmonth") && (latency_max >= 2)) || - ((time_type == "yearquarter") && (latency_max >= 1)) || - ((time_type == "year") && (latency_max >= 1)) - ) { - cli::cli_warn(paste( - "!" = paste( - "The maximum latency is {latency_max}, ", - "which is questionable for it's `time_type` of ", - "{time_type}." - ), - "i" = "latency: {latency_table$latency[[i_latency]]}", - "i" = "`max_time` = {max(training$time_value)} -> `forecast_date` = {forecast_date}" - ), class = "epipredict__prep.step_latency__very_large_latency") - } + step_adjust_latency_new( terms = x$terms, @@ -271,6 +358,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { forecast_date = forecast_date, latency = unique(latency_table$latency), latency_table = latency_table, + metadata = attributes(training)$metadata, keys = x$keys, method = x$method, epi_keys_checked = x$epi_keys_checked, @@ -286,23 +374,28 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { bake.step_adjust_latency <- function(object, new_data, ...) 
{ if (!isa(new_data, "epi_df")) { # TODO if new_data actually has keys other than geo_value and time_value, this is going to cause problems - new_data <- new_data %>% as_epi_df(as_of = object$forecast_date) + new_data <- as_epi_df(new_data) + attributes(new_data)$metadata <- object$metadata + attributes(new_data)$metadata$as_of <- object$forecast_date } if (object$method == "extend_ahead" || object$method == "extend_lags") { attributes(new_data)$metadata$latency_method <- object$method attributes(new_data)$metadata$shift_sign <- get_sign(object) attributes(new_data)$metadata$latency_table <- object$latency_table + attributes(new_data)$metadata$forecast_date <- object$forecast_date keys <- object$keys } else if (object$method == "locf") { # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") - unnamed_columns <- object$columns %>% unname() + modified_columns <- object$columns %>% unname() + check_interminable_latency(new_data, object$latency_table, modified_columns, object$forecast_date) + new_data <- new_data %>% - pad_to_end(rel_keys, object$forecast_date, unnamed_columns) %>% + pad_to_end(rel_keys, object$forecast_date, modified_columns) %>% # group_by_at(rel_keys) %>% arrange(time_value) %>% as_tibble() %>% - tidyr::fill(.direction = "down", any_of(unnamed_columns)) %>% + tidyr::fill(.direction = "down", any_of(modified_columns)) %>% ungroup() } return(new_data) diff --git a/R/utils-latency.R b/R/utils-latency.R index 9148c7c2b..bd0e4803e 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -197,15 +197,15 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { arrange(across(all_of(c("time_value", groups)))) %>% group_by(across(all_of(groups))) values_to_fill <- grouped_and_arranged %>% - slice(min(across(columns_to_complete, count_single_column)):n()) - + slice(min(across(all_of(columns_to_complete), count_single_column)):n()) 
filled_values <- values_to_fill %>% bind_rows(completed_time_values) %>% arrange(across(all_of(c("time_value", groups)))) %>% fill(all_of(columns_to_complete), .direction = "down") %>% slice(-1) # remove the oirginal rows + grouped_and_arranged %>% - slice(1:min(across(columns_to_complete, count_single_column))) %>% + slice(1:min(across(all_of(columns_to_complete), count_single_column))) %>% bind_rows(filled_values) %>% arrange(across(all_of(key_colnames(x)))) %>% ungroup() %>% @@ -245,3 +245,50 @@ seq_null_swap <- function(from, to, by) { } seq(from = from, to = to, by = by) } + + +#' warn when the latency is larger than would be reasonable +#' @param dataset the epi_df +#' @param latency_table the whole collection of latencies +#' @param target_columns the names of the columns that we're adjusting, and whether its unreasonably latent +#' @keywords internal +check_interminable_latency <- function(dataset, latency_table, target_columns, forecast_date, call = caller_env()) { + # check that the shift amount isn't too extreme + rel_latency_table <- latency_table %>% + filter(col_name %in% target_columns) + # no relevant columns, so this error definitely isn't happening + if (nrow(rel_latency_table) == 0) { + return() + } + latency_max <- rel_latency_table %>% + pull(latency) %>% + abs() %>% + max() + time_type <- attributes(dataset)$metadata$time_type + i_latency <- which.max(latency_table$latency) + if ( + (grepl("day", time_type) && (latency_max >= 28)) || + (grepl("week", time_type) && (latency_max >= 4)) || + ((time_type == "yearmonth") && (latency_max >= 2)) || + ((time_type == "yearquarter") && (latency_max >= 1)) || + ((time_type == "year") && (latency_max >= 1)) + ) { + max_time_value <- dataset %>% + filter(!is.na(!!(latency_table[[i_latency, "col_name"]]))) %>% + pull(time_value) %>% + max() + cli::cli_warn( + message = c( + paste( + "The maximum latency is {latency_max}, ", + "which is questionable for it's `time_type` of ", + "{time_type}." 
+ ), + "i" = "latency: {latency_table$latency[[i_latency]]}", + "i" = "`max_time` = {max_time_value} -> `forecast_date` = {forecast_date}" + ), + class = "epipredict__prep.step_latency__very_large_latency", + call = call + ) + } +} diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 84e8c72d9..214726fde 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -32,20 +32,22 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. The -default \code{NULL} will attempt to determine this automatically either as the -max time value if there is no latency adjustment, or as the \code{as_of} of -\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. The -default \code{NULL} will attempt to determine this automatically as -\code{forecast_date + ahead}.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} \item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of \code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{NULL} (in which case there is no adjustment). -If there is a difference between the \code{forecast_date} and the last day of -data, this determines how to shift the model to account for this -difference. 
The options are: +If the \code{forecast_date} is after the last day of data, this determines how +to shift the model to account for this difference. The options are: \itemize{ \item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data \item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 3502508cb..185b868e5 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -34,20 +34,22 @@ date for which forecasts should be produced.} key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. The -default \code{NULL} will attempt to determine this automatically either as the -max time value if there is no latency adjustment, or as the \code{as_of} of -\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. The -default \code{NULL} will attempt to determine this automatically as -\code{forecast_date + ahead}.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} \item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of \code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{NULL} (in which case there is no adjustment). 
-If there is a difference between the \code{forecast_date} and the last day of -data, this determines how to shift the model to account for this -difference. The options are: +If the \code{forecast_date} is after the last day of data, this determines how +to shift the model to account for this difference. The options are: \itemize{ \item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data \item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to diff --git a/man/cdc_baseline_args_list.Rd b/man/cdc_baseline_args_list.Rd index 3870134fb..4a8c13113 100644 --- a/man/cdc_baseline_args_list.Rd +++ b/man/cdc_baseline_args_list.Rd @@ -33,10 +33,14 @@ set of prediction horizons for \code{\link[=layer_cdc_flatline_quantiles]{layer_ key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. The -default \code{NULL} will attempt to determine this automatically either as the -max time value if there is no latency adjustment, or as the \code{as_of} of -\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. 
These are created by computing the quantiles of diff --git a/man/flatline_args_list.Rd b/man/flatline_args_list.Rd index d056ed825..401850efe 100644 --- a/man/flatline_args_list.Rd +++ b/man/flatline_args_list.Rd @@ -28,14 +28,17 @@ So for example, \code{ahead = 7} will create residuals by comparing values key that are used for training (in the time unit of the \code{epi_df}).} -\item{forecast_date}{Date. The date on which the forecast is created. The -default \code{NULL} will attempt to determine this automatically either as the -max time value if there is no latency adjustment, or as the \code{as_of} of -\code{epi_data} if \code{adjust_latency} is non-\code{NULL}.} +\item{forecast_date}{Date. The date from which the forecast is occurring. +The default \code{NULL} will determine this automatically from either +\enumerate{ +\item the maximum time value for which there's data if there is no latency +adjustment (the default case), or +\item the \code{as_of} date of \code{epi_data} if \code{adjust_latency} is +non-\code{NULL}. +}} -\item{target_date}{Date. The date for which the forecast is intended. The -default \code{NULL} will attempt to determine this automatically as -\code{forecast_date + ahead}.} +\item{target_date}{Date. The date that is being forecast. The default \code{NULL} +will determine this automatically as \code{forecast_date + ahead}.} \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 3da958045..2b5f282d9 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -13,7 +13,6 @@ step_adjust_latency( epi_keys_checked = c("geo_value"), fixed_latency = NULL, fixed_forecast_date = NULL, - default = NA, skip = FALSE, columns = NULL, id = recipes::rand_id("adjust_latency") @@ -50,23 +49,23 @@ become \code{c(3,10,17)}. }} \item{epi_keys_checked}{a character vector. 
A list of keys to group by before -finding the \code{max_time_value}. The default value of this is -\code{c("geo_value")}, but it can be any collection of \code{epi_keys}. Different -locations may have different latencies; to produce a forecast at every -location, we need to use the largest latency across every location; this -means taking \code{max_time_value} to be the minimum of the \code{max_time_value}s -for each \code{geo_value} (or whichever collection of keys are specified). If -\code{NULL} or an empty character vector, it will take the maximum across all -values, irrespective of any keys.} +finding the \code{max_time_value} (the last day of data), defaulting to +\code{geo_value}. Different locations may have different latencies; to produce a +forecast at every location, we need to guarantee data at every location by +using the largest latency across every location; this means taking +\code{max_time_value} to be the minimum of the \code{max_time_value}s for each set of +key values (so the earliest date). If \code{NULL} or an empty character vector, +it will take the maximum across all values, irrespective of any keys. + +Note that this is a separate concern from different latencies across +different \emph{data columns}, which is only handled by the choice of \code{method}.} \item{fixed_latency}{either a positive integer, or a labeled positive integer vector. Cannot be set at the same time as \code{fixed_forecast_date}. If non-\code{NULL}, the amount to offset the ahead or lag by. If a single integer, this is used for all columns; if a labeled vector, the labels must correspond to the base column names (before lags/aheads). 
If \code{NULL}, the -latency is the distance between the \code{epi_df}'s \code{max_time_value} and either -the \code{fixed_forecast_date} or the \code{epi_df}'s \code{as_of} field (the default for -\code{forecast_date}).} +latency is the distance between the \code{epi_df}'s \code{max_time_value} and the \code{forecast_date}.} \item{fixed_forecast_date}{either a date of the same kind used in the \code{epi_df}, or \code{NULL}. Exclusive with \code{fixed_latency}. If a date, it gives @@ -74,9 +73,6 @@ the date from which the forecast is actually occurring. If \code{NULL}, the \code{forecast_date} is determined either via the \code{fixed_latency}, or is set to the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NULL}.} -\item{default}{Determines what fills empty rows -left by leading/lagging (defaults to NA).} - \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be @@ -97,26 +93,198 @@ sequence of any existing operations. In the standard case, the arx models assume that the last observation is also the day from which the forecast is being made. But if the data has latency, then you may wish to adjust the predictors (lags) and/or the outcome (ahead) -to compensate. This allows the user to create models on the most recent data, -regardless of latency patterns. Instead of using the last observation date, -\code{step_adjust_latency} uses the \code{as_of} date of the \code{epi_df} as the -\code{forecast_date}, potentially using different dates depending on the -\code{epi_keys}, such as geography. This is most useful in realtime and +to compensate. +This is most useful in realtime and pseudo-prospective forecasting for data where there is some delay between the event occurring and the event being reported. 
} \details{ -The step assumes that the pipeline has already applied either -\code{step_epi_ahead} or \code{step_epi_lag} depending on the value of \code{"method"}, -and that \code{step_epi_naomit} has NOT been run. By default, the latency will -be determined using the arguments below, but can be set explicitly using -either \code{fixed_latency} or \code{fixed_forecast_date}. - -The \code{prefix} and \code{id} arguments are unchangeable to ensure that the code runs -properly and to avoid inconsistency with naming. For \code{step_epi_ahead}, they -are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for -\code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. +This step allows the user to create models on the most recent +data, automatically accounting for latency patterns. Instead of using the last observation +date, \code{step_adjust_latency} uses the \code{as_of} date of the \code{epi_df} as the +\code{forecast_date}, and adjusts the model so that there is data available. To +demonstrate some of the subtleties, let's consider a toy dataset: + +\if{html}{\out{
}}\preformatted{toy_df <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "ma", as.Date("2015-01-11"), 20, 6, + "ma", as.Date("2015-01-12"), 23, NA, + "ma", as.Date("2015-01-13"), 25, NA, + "ca", as.Date("2015-01-11"), 100, 5, + "ca", as.Date("2015-01-12"), 103, 10, +) \%>\% + as_epi_df(as_of = as.Date("2015-01-14")) +}\if{html}{\out{
}} + +If we're looking to predict the value on the 15th, forecasting from the 14th (the \code{as_of} date above), +there are two issues we will need to address: +\enumerate{ +\item \code{"ca"} is latent by 2 days, whereas \code{"ma"} is latent by 1 +\item if we want to use \code{b} as an exogenous variable, for \code{"ma"} it is latent by 3 days instead of just 1. +} + +Regardless of \code{method}, \code{epi_keys_checked="geo_value"} guarantees that the +difference between \code{"ma"} and \code{"ca"} is accounted for by making the +latency adjustment at least 2. For some comparison, here's what the various +methods will do: +\subsection{\code{locf}}{ + +Short for "last observation carried forward", \code{locf} assumes that every day +between the last observation and the forecast day is exactly the same. +This is a very straightforward assumption, but wrecks any features that +depend on changes in value over time, such as the growth rate, or even +adjacent lags. A more robust version of this falls under the heading of +nowcasting, an eventual aim for this package. On the toy dataset, it +doesn't matter which day we're trying to predict, since it just fills +forward to the \code{forecast_date}: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_adjust_latency(method="locf") + +toy_recipe \%>\% + prep(toy_df) \%>\% + bake(toy_df) \%>\% + arrange(geo_value, time_value) +#> An `epi_df` object, 8 x 4 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2015-01-14 +#> +#> # A tibble: 8 x 4 +#> geo_value time_value a b +#> * +#> 1 ca 2015-01-11 100 5 +#> 2 ca 2015-01-12 103 10 +#> 3 ca 2015-01-13 103 10 +#> 4 ca 2015-01-14 103 10 +#> 5 ma 2015-01-11 20 6 +#> 6 ma 2015-01-12 23 6 +#> 7 ma 2015-01-13 25 6 +#> 8 ma 2015-01-14 25 6 +}\if{html}{\out{
}} +} + +\subsection{\code{extend_lags}}{ + +\code{extend_lags} increases the lags so that they are guaranteed to have +data. This has the advantage of being applicable on +a per-column basis; if cases and deaths are reported at different +latencies, the lags for each are adjusted separately. In the toy example: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_adjust_latency(method="extend_lags") \%>\% + step_epi_lag(a,lag=1) \%>\% + step_epi_lag(b,lag=1) \%>\% + step_epi_ahead(a, ahead=1) + +toy_recipe \%>\% + prep(toy_df) \%>\% + bake(toy_df) \%>\% + arrange(geo_value, time_value) +#> An `epi_df` object, 21 x 7 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2015-01-14 +#> +#> # A tibble: 21 x 7 +#> geo_value time_value a b lag_3_a lag_4_b ahead_1_a +#> * +#> 1 ca 2015-01-10 NA NA NA NA 100 +#> 2 ca 2015-01-11 100 5 NA NA 103 +#> 3 ca 2015-01-12 103 10 NA NA NA +#> 4 ca 2015-01-13 NA NA NA NA NA +#> 5 ca 2015-01-14 NA NA 100 NA NA +#> 6 ca 2015-01-15 NA NA 103 5 NA +#> 7 ca 2015-01-16 NA NA NA 10 NA +#> 8 ca 2015-01-17 NA NA NA NA NA +#> 9 ca 2015-01-18 NA NA NA NA NA +#> 10 ca 2015-01-19 NA NA NA NA NA +#> # i 11 more rows +}\if{html}{\out{
}} + +The maximum latency in column \code{a} is 2 days, so the lag is increased to 3, +while the max latency in column \code{b} is 3, so the same lag is increased to +4; both of these changes are reflected in the column names. Meanwhile the +ahead is unaffected. + +As a side-note, lag/ahead can be somewhat ambiguous about direction. Here, +the values are brought forward in time, so that for a given row, column +\code{lag_3_a} represents the value 3 days before. +} + +\subsection{\code{extend_ahead}}{ + +\code{extend_ahead} increases the ahead, turning a 3 day ahead forecast +into a 7 day one; this has the advantage of simplicity and is reflective of +the actual modelling task, but potentially leaves information unused if +different data sources have different latencies; it must use the latency of +the most latent data to ensure there is data available. In the toy example: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_adjust_latency(method="extend_ahead") \%>\% + step_epi_lag(a,lag=0) \%>\% + step_epi_ahead(a, ahead=1) + +toy_recipe \%>\% + prep(toy_df) \%>\% + bake(toy_df) \%>\% + arrange(geo_value, time_value) +#> An `epi_df` object, 10 x 6 with metadata: +#> * geo_type = state +#> * time_type = day +#> * as_of = 2015-01-14 +#> +#> # A tibble: 10 x 6 +#> geo_value time_value a b lag_0_a ahead_3_a +#> * +#> 1 ca 2015-01-08 NA NA NA 100 +#> 2 ca 2015-01-09 NA NA NA 103 +#> 3 ca 2015-01-11 100 5 100 NA +#> 4 ca 2015-01-12 103 10 103 NA +#> 5 ma 2015-01-08 NA NA NA 20 +#> 6 ma 2015-01-09 NA NA NA 23 +#> 7 ma 2015-01-10 NA NA NA 25 +#> 8 ma 2015-01-11 20 6 20 NA +#> 9 ma 2015-01-12 23 NA 23 NA +#> 10 ma 2015-01-13 25 NA 25 NA +}\if{html}{\out{
}} + +Even though we're doing a 1 day ahead forecast, because our worst latency +is 3 days from column \code{b}'s \code{"ma"} data, our outcome column is \code{ahead_4_a} +(so 4 days ahead). If we want to ignore any latency in column \code{b}, we need +to explicitly set the columns to consider while adjusting like this: +\code{step_adjust_latency(a, method="extend_ahead")}. +} } +\section{Programmatic details}{ +\code{step_adjust_latency} uses the metadata, such as \code{time_type} and \code{as_of}, of +the \code{epi_df} used in the initial prep step, rather than baking or +prediction. This means reusing the same forecaster on new data is not +advised, though typically it is not advised in general. + +The latency adjustment only applies to columns created after this step, so +this step should go before both \code{step_epi_ahead} and \code{step_epi_lag}. This will work: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + # non-lag steps + step_adjust_latency(a, method = "extend_lags") \%>\% + step_epi_lag(a, lag=0) # other steps +}\if{html}{\out{
}} + +while this will not: + +\if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% + step_epi_lag(a, lag=0) \%>\% + step_adjust_latency(a, method = "extend_lags") +#> Warning: If `method` is "extend_lags" or "locf", then the previous +#> `step_epi_lag`s won't work with modified data. +}\if{html}{\out{
}} + +If you create columns that you then apply lags to (such as +\code{step_growth_rate()}), these should be created before +\code{step_adjust_latency}, so any subsequent latency can be addressed. +} + \examples{ jhu <- case_death_rate_subset \%>\% dplyr::filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 3ba41158a..49a5e7820 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -95,7 +95,9 @@ test_that("`layer_add_forecast_date()` infers correct date when using `adjust_la wf_latent <- epi_workflow(r_latent, parsnip::linear_reg()) %>% fit(jhu_reasonable_date) %>% add_frosting(frost_latent) - p_latent <- predict(wf_latent, latest) + reasonable_date <- jhu %>% + dplyr::filter(time_value >= max(time_value) - 14) + p_latent <- predict(wf_latent, reasonable_date) expect_equal( p_latent$forecast_date, rep(as.Date("2022-01-03"), times = 3) diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 75656989a..1e9ac4d15 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -11,7 +11,7 @@ x <- tibble( case_rate = sqrt(1:200) + atan(0.1 * 1:200) + sin(5 * 1:200) + 1, death_rate = atan(0.1 * 1:200) + cos(5 * 1:200) + 1 ) %>% - as_epi_df(as_of = as.POSIXct("2024-05-17")) + as_epi_df(as_of = as.POSIXct("2024-09-17")) max_time <- max(x$time_value) class(attributes(x)$metadata$as_of) as_of <- attributes(x)$metadata$as_of @@ -78,7 +78,16 @@ test_that("epi_adjust_latency correctly extends the lags", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - expect_error(expect_warning(fit1 <- slm_fit(r1, data = x), class = "epipredict__prep.step_latency__very_large_latency"), class = 
"simpleError") + expect_error( + expect_warning( + expect_warning( + fit1 <- slm_fit(r1, data = x), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "epipredict__prep.step_latency__very_large_latency" + ), + class = "simpleError" + ) # now trying with the as_of a reasonable distance in the future fit1 <- slm_fit(r1, data = real_x) @@ -281,7 +290,9 @@ test_that("epi_adjust_latency fixed_forecast_date works", { step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) - expect_warning(baked_x <- r4 %>% prep(real_x) %>% bake(real_x), class = "epipredict__prep.step_latency__very_large_latency") + baked_x <- r4 %>% + prep(real_x) %>% + bake(real_x) # map each column to its last non-NA value last_dates <- baked_x %>% tidyr::pivot_longer(cols = contains("rate"), values_drop_na = TRUE) %>% @@ -341,7 +352,7 @@ test_that("epi_adjust_latency fixed_latency works", { -test_that("epi_adjust_latency works for other time types", {}) +# test_that("epi_adjust_latency works for other time types", {}) test_that("epi_adjust_latency warns there's steps before it", { expect_warning( @@ -368,35 +379,31 @@ test_that("epi_adjust_latency warns against removing NA's beforehand", { regexp = "adjust_latency needs to occur before any `NA` removal" ) }) -# todo check that epi_adjust_latency errors for nonsense `as_of`'s +# TODO check that epi_adjust_latency errors for nonsense `as_of`'s -# todo make sure that `epi_keys_checked` works correctly for extra epi_keys -test_that("epi_adjust_latency correctly extends the lags", { +# TODO make sure that `epi_keys_checked` works correctly for extra epi_keys + +test_that("epi_adjust_latency correctly extends the lags when there are different delays per-geo", { r5 <- epi_recipe(x_lagged) %>% - step_adjust_latency(method = "extend_lags", epi_keys_checked = NULL) %>% + step_adjust_latency(method = "extend_lags") %>% step_epi_lag(death_rate, lag = c(0, 6, 
11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) - # the as_of on x is today's date, which is >970 days in the future - # also, there's no data >970 days in the past, so it gets an error trying to - # fit on no data - expect_error(expect_warning(fit5 <- slm_fit(r5, data = x), class = "epipredict__prep.step_latency__very_large_latency"), class = "simpleError") - # now trying with the as_of a reasonable distance in the future fit5 <- slm_fit(r5, data = x_lagged) expect_equal( names(fit5$pre$mold$predictors), c( - "lag_5_death_rate", "lag_11_death_rate", "lag_16_death_rate", - "lag_6_case_rate", "lag_10_case_rate" + "lag_6_death_rate", "lag_12_death_rate", "lag_17_death_rate", + "lag_7_case_rate", "lag_11_case_rate" ) ) latest <- get_test_data(r5, x_lagged) pred <- predict(fit5, latest) point_pred <- pred %>% filter(!is.na(.pred)) expect_equal(nrow(point_pred), 1) - expect_equal(point_pred$time_value, as.Date(testing_as_of)) + expect_equal(point_pred$time_value, as.Date(testing_as_of) + 1) expect_equal( names(fit5$pre$mold$outcomes), @@ -405,7 +412,7 @@ test_that("epi_adjust_latency correctly extends the lags", { latest <- get_test_data(r5, x) pred <- predict(fit5, latest) actual_solutions <- pred %>% filter(!is.na(.pred)) - expect_equal(actual_solutions$time_value, testing_as_of) + expect_equal(actual_solutions$time_value, testing_as_of + 1) # should have four predictors, including the intercept expect_equal(length(fit5$fit$fit$fit$coefficients), 6) @@ -413,8 +420,8 @@ test_that("epi_adjust_latency correctly extends the lags", { # result should be equivalent to just immediately doing the adjusted lags by # hand hand_adjusted <- epi_recipe(x) %>% - step_epi_lag(death_rate, lag = c(5, 11, 16)) %>% - step_epi_lag(case_rate, lag = c(6, 10)) %>% + step_epi_lag(death_rate, lag = c(6, 12, 17)) %>% + step_epi_lag(case_rate, lag = c(7, 11)) %>% step_epi_ahead(death_rate, ahead = ahead) fit_hand_adj <- slm_fit(hand_adjusted, data = 
real_x) expect_equal( @@ -423,6 +430,50 @@ test_that("epi_adjust_latency correctly extends the lags", { ) }) +test_that("epi_adjust_latency correctly extends the ahead when there are different delays per-geo", { + r5 <- epi_recipe(x_lagged) %>% + step_adjust_latency(method = "extend_ahead") %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead) + + fit5 <- slm_fit(r5, data = x_lagged) + expect_equal( + names(fit5$pre$mold$predictors), + c( + "lag_0_death_rate", "lag_6_death_rate", "lag_11_death_rate", + "lag_1_case_rate", "lag_5_case_rate" + ) + ) + latest <- get_test_data(r5, x_lagged) + pred <- predict(fit5, latest) + point_pred <- pred %>% filter(!is.na(.pred)) + expect_equal(nrow(point_pred), 1) + expect_equal(point_pred$time_value, as.Date(max_time)) + joint_latency <- latency + 1 + expect_equal( + names(fit5$pre$mold$outcomes), + glue::glue("ahead_{ahead+joint_latency}_death_rate") + ) + actual_solutions <- pred %>% filter(!is.na(.pred)) + expect_equal(actual_solutions$time_value, as.Date(max_time)) + + # should have four predictors, including the intercept + expect_equal(length(fit5$fit$fit$fit$coefficients), 6) + + # result should be equivalent to just immediately doing the adjusted lags by + # hand + hand_adjusted <- epi_recipe(x) %>% + step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% + step_epi_lag(case_rate, lag = c(1, 5)) %>% + step_epi_ahead(death_rate, ahead = ahead + joint_latency) + + fit_hand_adj <- slm_fit(hand_adjusted, data = real_x) + expect_equal( + fit5$fit$fit$fit$coefficients, + fit_hand_adj$fit$fit$fit$coefficients + ) +}) test_that("`step_adjust_latency` only uses the columns specified in the `...`", { r5 <- epi_recipe(x) %>% @@ -512,7 +563,9 @@ test_that("locf works as intended", { # the as_of on x is today's date, which is >970 days in the future # also, there's no data >970 days in the past, so it gets an error trying to # fit on no data - 
expect_warning(fit6 <- slm_fit(r6, data = x), regexp = "The maximum latency is 1033") + expect_warning(fit6 <- slm_fit(r6, data = x), + class = "epipredict__prep.step_latency__very_large_latency" + ) # now trying with the as_of a reasonable distance in the future fit6 <- slm_fit(r6, data = real_x) diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index e0f35d7fa..6d71ee6eb 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -43,6 +43,23 @@ x_adjust_ahead <- tibble( ) %>% as_epi_df(as_of = max(time_range) + 3) +modified_data %>% arrange(geo_value, desc(time_value)) +modified_data %>% + group_by(geo_value) %>% + filter(!is.na(case_rate)) %>% + summarise(max(time_value)) +as_of + +toy_df <- tribble( + ~geo_value, ~time_value, ~a, ~b, + "ma", as.Date("2015-01-11"), 20, 6, + "ma", as.Date("2015-01-12"), 23, NA, + "ma", as.Date("2015-01-13"), 25, NA, + "ca", as.Date("2015-01-11"), 100, 5, + "ca", as.Date("2015-01-12"), 103, 10, +) %>% + as_epi_df(as_of = as.Date("2015-01-14")) + test_that("get_latency works", { expect_equal(get_latency(modified_data, as_of, "case_rate", 1, "geo_value"), 5) expect_equal(get_latency(modified_data, as_of, "case_rate", -1, "geo_value"), -5) @@ -50,6 +67,10 @@ test_that("get_latency works", { expect_equal(get_latency(modified_data, as_of, "case_rate_a", 1, "geo_value"), 5 + 4) expect_equal(get_latency(modified_data, as_of, "case_rate_b", 1, "geo_value"), 5 - 3) expect_equal(get_latency(modified_data, as_of, "death_rate_a", 1, "geo_value"), 4 - 7) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "a", 1, "geo_value"), 2) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "a", -1, "geo_value"), -2) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "b", 1, "geo_value"), 3) + expect_equal(get_latency(toy_df, as.Date("2015-01-14"), "b", -1, "geo_value"), -3) }) test_that("get_latency infers max_time to be the minimum `max time` across grouping the 
specified keys", { From 71694d717b9e8936f81cc2e76662ae85088f2d04 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 9 Sep 2024 17:23:07 -0500 Subject: [PATCH 63/92] non-timezone dependent printing tests --- tests/testthat/_snaps/snapshots.md | 6 +++--- tests/testthat/test-snapshots.R | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 6b337c056..236632c15 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1095,7 +1095,7 @@ Training data was an with: * Geography: state, * Time type: day, - * Using data up-to-date as of: 2022-05-31 14:08:25. + * Using data up-to-date as of: 2022-05-31. * With the last data available on 2021-12-31 -- Predictions ----------------------------------------------------------------- @@ -1118,7 +1118,7 @@ Training data was an with: * Geography: state, * Time type: day, - * Using data up-to-date as of: 2022-05-31 14:08:25. + * Using data up-to-date as of: 2022-05-31. * With the last data available on 2021-12-31 -- Predictions ----------------------------------------------------------------- @@ -1142,7 +1142,7 @@ Training data was an with: * Geography: state, * Time type: day, - * Using data up-to-date as of: 2022-05-31 14:08:25. + * Using data up-to-date as of: 2022-05-31. 
* With the last data available on 2021-12-31 -- Predictions ----------------------------------------------------------------- diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 003fc8319..2cfce9521 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -112,6 +112,7 @@ test_that("arx_forecaster snapshots", { test_that("arx_forecaster output format snapshots", { jhu <- case_death_rate_subset %>% dplyr::filter(time_value >= as.Date("2021-12-01")) + attributes(jhu)$metadata$as_of <- as.Date(attributes(jhu)$metadata$as_of) out1 <- arx_forecaster( jhu, "death_rate", c("case_rate", "death_rate") From aa9f1e30e90866b51eb7a770a8222cbc0f5458f5 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 9 Sep 2024 18:26:56 -0500 Subject: [PATCH 64/92] arx_forecaster consistency check and tests --- R/arx_forecaster.R | 19 +++++++------------ tests/testthat/test-arx_forecaster.R | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 12 deletions(-) create mode 100644 tests/testthat/test-arx_forecaster.R diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 42729ed85..89a3f418a 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -120,7 +120,10 @@ arx_fcast_epi_workflow <- function( if (is.null(args_list$adjust_latency)) { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { - cli::cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + cli::cli_warn( + "The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is actually occurring {forecast_date_default}.", + class = "epipredict__arx_forecaster__forecast_date_defaulting" + ) } } else { forecast_date_default <- attributes(epi_data)$metadata$as_of @@ -128,10 +131,11 @@ arx_fcast_epi_workflow <- function( forecast_date <- 
args_list$forecast_date %||% forecast_date_default target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) if (forecast_date + args_list$ahead != target_date) { - cli::cli_warn(c( + cli::cli_abort(c( "`forecast_date` + `ahead` must equal `target_date`.", i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - )) + ), + class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date") } lags <- arx_lags_validator(predictors, args_list$lags) @@ -293,15 +297,6 @@ arx_args_list <- function( arg_is_pos(check_enough_data_n, allow_null = TRUE) arg_is_chr(check_enough_data_epi_keys, allow_null = TRUE) - if (!is.null(forecast_date) && !is.null(target_date)) { - if (forecast_date + ahead != target_date) { - cli_warn(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - )) - } - } - max_lags <- max(lags) structure( enlist( diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R new file mode 100644 index 000000000..bb4dbd085 --- /dev/null +++ b/tests/testthat/test-arx_forecaster.R @@ -0,0 +1,26 @@ +train_data <- jhu_csse_daily_subset +test_that("arx_forecaster warns if forecast date beyond the implicit one", { + bad_date <- max(train_data$time_value)+300 + expect_warning( + arx1 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(forecast_date = bad_date)) + ), + class = "epipredict__arx_forecaster__forecast_date_defaulting" + ) +}) + +test_that("arx_forecaster errors if forecast date, target date, and ahead are inconsistent", { + max_date <- max(train_data$time_value) + expect_error( + arx1 <- arx_forecaster( + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(ahead = 5, target_date = max_date, forecast_date = max_date)) + ), + class = 
"epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date" + ) +}) From 899ea5143c99ab1e77274c73495822f08953d4ab Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2024 16:25:29 -0500 Subject: [PATCH 65/92] arx_forecaster updates --- R/arx_forecaster.R | 9 +++++++++ R/step_adjust_latency.R | 10 ++-------- tests/testthat/test-arx_args_list.R | 7 ++++--- tests/testthat/test-arx_forecaster.R | 2 +- 4 files changed, 16 insertions(+), 12 deletions(-) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 89a3f418a..fed506a7e 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -297,6 +297,15 @@ arx_args_list <- function( arg_is_pos(check_enough_data_n, allow_null = TRUE) arg_is_chr(check_enough_data_epi_keys, allow_null = TRUE) + if (!is.null(forecast_date) && !is.null(target_date)) { + if (forecast_date + ahead != target_date) { + cli_abort(c( + "`forecast_date` + `ahead` must equal `target_date`.", + i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}."), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date") + } + } + max_lags <- max(lags) structure( enlist( diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index e4968920e..9c1d015a2 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -212,7 +212,8 @@ step_adjust_latency <- id = recipes::rand_id("adjust_latency")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { - cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", class = "epipredict__step_adjust_latency__epi_recipe_only") + cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", + class = "epipredict__step_adjust_latency__epi_recipe_only") } if (!is.null(columns)) { cli::cli_abort(c("The `columns` argument must be `NULL`.", @@ -251,14 +252,7 @@ then the previous `step_epi_lag`s won't work with modified data.", cli::cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}", 
class = "epipredict__step_adjust_latency__undefined_names_error") } } - method <- rlang::arg_match(method) - terms_used <- recipes_eval_select(enquos(...), recipe$template, recipe$term_info) - if (is_empty(terms_used)) { - terms_used <- recipe$term_info %>% - filter(role == "raw") %>% - pull(variable) - } if (method == "extend_ahead") { rel_step_type <- "step_epi_ahead" shift_name <- "ahead" diff --git a/tests/testthat/test-arx_args_list.R b/tests/testthat/test-arx_args_list.R index 03cbc0025..4ae8ec3f5 100644 --- a/tests/testthat/test-arx_args_list.R +++ b/tests/testthat/test-arx_args_list.R @@ -26,11 +26,12 @@ test_that("arx_args checks inputs", { expect_snapshot(error = TRUE, arx_args_list(n_training_min = "de")) expect_snapshot(error = TRUE, arx_args_list(epi_keys = 1)) - expect_warning(arx_args_list( + expect_error(arx_args_list( forecast_date = as.Date("2022-01-01"), - target_date = as.Date("2022-01-03"), + target_date = as.Date("2022-01-04"), ahead = 1L - )) + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date") }) test_that("arx forecaster disambiguates quantiles", { diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R index bb4dbd085..d5603aebe 100644 --- a/tests/testthat/test-arx_forecaster.R +++ b/tests/testthat/test-arx_forecaster.R @@ -21,6 +21,6 @@ test_that("arx_forecaster errors if forecast date, target date, and ahead are in c("death_rate_7d_av", "case_rate_7d_av"), args_list = (arx_args_list(ahead = 5, target_date = max_date, forecast_date = max_date)) ), - class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date" + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" ) }) From af45eaf2c9c8dd5aca4454f8abad5c0f0dd3e06b Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2024 16:41:07 -0500 Subject: [PATCH 66/92] arx_classifier addition --- R/arx_classifier.R | 26 +++++++++++++++----------- man/check_interminable_latency.Rd | 25 
+++++++++++++++++++++++++ man/count_single_column.Rd | 17 +++++++++++++++++ tests/testthat/_snaps/snapshots.md | 27 +++++++++++++++++++++++++++ tests/testthat/test-snapshots.R | 9 +++++++++ 5 files changed, 93 insertions(+), 11 deletions(-) create mode 100644 man/check_interminable_latency.Rd create mode 100644 man/count_single_column.Rd diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 5ee6de88a..45d432bb7 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -158,13 +158,13 @@ arx_class_epi_workflow <- function( additional_gr_args_list = args_list$additional_gr_args ) for (l in seq_along(lags)) { - p <- predictors[l] - p <- as.character(glue::glue_data(args_list, "gr_{horizon}_{method}_{p}")) - r <- step_epi_lag(r, !!p, lag = lags[[l]]) + pred_names <- predictors[l] + pred_names <- as.character(glue::glue_data(args_list, "gr_{horizon}_{method}_{pred_names}")) + r <- step_epi_lag(r, !!pred_names, lag = lags[[l]]) } # ------- outcome if (args_list$outcome_transform == "lag_difference") { - o <- as.character( + pre_out_name <- as.character( glue::glue_data(args_list, "lag_diff_{horizon}_{outcome}") ) r <- r %>% @@ -175,7 +175,7 @@ arx_class_epi_workflow <- function( ) } if (args_list$outcome_transform == "growth_rate") { - o <- as.character( + pre_out_name <- as.character( glue::glue_data(args_list, "gr_{horizon}_{method}_{outcome}") ) if (!(outcome %in% predictors)) { @@ -190,20 +190,24 @@ arx_class_epi_workflow <- function( ) } } - o2 <- rlang::sym(paste0("ahead_", args_list$ahead, "_", o)) + ahead_out_name <- rlang::sym(paste0("ahead_", args_list$ahead, "_", pre_out_name)) method_adjust_latency <- args_list$adjust_latency if (!is.null(method_adjust_latency)) { # only extend_ahead is supported atm - r <- r %>% step_adjust_latency(all_outcomes(), + r <- r %>% step_adjust_latency(!!pre_out_name, fixed_forecast_date = forecast_date, method = method_adjust_latency ) } r <- r %>% - step_epi_ahead(!!o, ahead = args_list$ahead, role = "pre-outcome") + 
step_epi_ahead(!!pre_out_name, ahead = args_list$ahead, role = "pre-outcome") r <- r %>% step_mutate( - outcome_class = cut(!!o2, breaks = args_list$breaks), + across( + starts_with("ahead"), + ~cut(.x, breaks = args_list$breaks), + .names = "outcome_class", + .unpack = TRUE), role = "outcome" ) %>% step_epi_naomit() %>% @@ -322,8 +326,8 @@ arx_class_args_list <- function( if (forecast_date + ahead != target_date) { cli::cli_warn(c( "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - )) + i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}."), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date") } } diff --git a/man/check_interminable_latency.Rd b/man/check_interminable_latency.Rd new file mode 100644 index 000000000..8ae9ec6df --- /dev/null +++ b/man/check_interminable_latency.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{check_interminable_latency} +\alias{check_interminable_latency} +\title{warn when the latency is larger than would be reasonable} +\usage{ +check_interminable_latency( + dataset, + latency_table, + target_columns, + forecast_date, + call = caller_env() +) +} +\arguments{ +\item{dataset}{the epi_df} + +\item{latency_table}{the whole collection of latencies} + +\item{target_columns}{the names of the columns that we're adjusting, and whether its unreasonably latent} +} +\description{ +warn when the latency is larger than would be reasonable +} +\keyword{internal} diff --git a/man/count_single_column.Rd b/man/count_single_column.Rd new file mode 100644 index 000000000..090f064e4 --- /dev/null +++ b/man/count_single_column.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{count_single_column} +\alias{count_single_column} +\title{get the location of the last real value} +\usage{ 
+count_single_column(col, from_last) +} +\arguments{ +\item{col}{the relevant column} + +\item{from_last}{instead of the number of columns} +} +\description{ +get the location of the last real value +} +\keyword{internal} diff --git a/tests/testthat/_snaps/snapshots.md b/tests/testthat/_snaps/snapshots.md index 236632c15..9ce6502d1 100644 --- a/tests/testthat/_snaps/snapshots.md +++ b/tests/testthat/_snaps/snapshots.md @@ -1181,3 +1181,30 @@ 18999, 18999, 18999, 18999, 18999, 18999, 18999), class = "Date")), row.names = c(NA, -53L), class = c("tbl_df", "tbl", "data.frame")) +--- + + structure(list(geo_value = c("ak", "al", "ar", "az", "ca", "co", + "ct", "dc", "de", "fl", "ga", "gu", "hi", "ia", "id", "il", "in", + "ks", "ky", "la", "ma", "me", "mi", "mn", "mo", "mp", "ms", "mt", + "nc", "nd", "ne", "nh", "nj", "nm", "nv", "ny", "oh", "ok", "or", + "pa", "pr", "ri", "sc", "sd", "tn", "tx", "ut", "va", "vt", "wa", + "wi", "wv", "wy"), .pred_class = structure(c(1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, + 1L), levels = c("(-Inf,0.25]", "(0.25, Inf]"), class = "factor"), + forecast_date = structure(c(18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, 18994, + 18994, 18994, 18994), class = "Date"), target_date = structure(c(19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 
19001, 19001, 19001, 19001, 19001, 19001, 19001, + 19001, 19001, 19001, 19001, 19001, 19001, 19001), class = "Date")), row.names = c(NA, + -53L), class = c("tbl_df", "tbl", "data.frame")) + diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index 2cfce9521..d8b4a7734 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -153,4 +153,13 @@ test_that("arx_classifier snapshots", { c("case_rate", "death_rate") ) expect_snapshot_tibble(arc1$predictions) + max_date <-case_death_rate_subset$time_value %>% max + arc2 <- arx_classifier( + case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "extend_ahead", forecast_date = max_date+2) + ) + expect_snapshot_tibble(arc2$predictions) }) From 1155d30c4387547aeb5b2385671fb41afc1c5378 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 10 Sep 2024 17:18:27 -0500 Subject: [PATCH 67/92] formatting --- R/arx_classifier.R | 16 ++++++++++------ R/arx_forecaster.R | 23 ++++++++++++++--------- R/step_adjust_latency.R | 3 ++- tests/testthat/test-arx_args_list.R | 14 ++++++++------ tests/testthat/test-arx_forecaster.R | 18 +++++++++--------- tests/testthat/test-snapshots.R | 4 ++-- 6 files changed, 45 insertions(+), 33 deletions(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 45d432bb7..32a58cd04 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -205,9 +205,10 @@ arx_class_epi_workflow <- function( step_mutate( across( starts_with("ahead"), - ~cut(.x, breaks = args_list$breaks), + ~ cut(.x, breaks = args_list$breaks), .names = "outcome_class", - .unpack = TRUE), + .unpack = TRUE + ), role = "outcome" ) %>% step_epi_naomit() %>% @@ -324,10 +325,13 @@ arx_class_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn(c( - "`forecast_date` + `ahead` must 
equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}."), - class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date") + cli::cli_warn( + c( + "`forecast_date` + `ahead` must equal `target_date`.", + i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) } } diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index fed506a7e..fdc1e88a1 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -131,11 +131,13 @@ arx_fcast_epi_workflow <- function( forecast_date <- args_list$forecast_date %||% forecast_date_default target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) if (forecast_date + args_list$ahead != target_date) { - cli::cli_abort(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - ), - class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date") + cli::cli_abort( + c( + "`forecast_date` + `ahead` must equal `target_date`.", + i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." + ), + class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date" + ) } lags <- arx_lags_validator(predictors, args_list$lags) @@ -299,10 +301,13 @@ arx_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli_abort(c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}."), - class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date") + cli_abort( + c( + "`forecast_date` + `ahead` must equal `target_date`.", + i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." 
+ ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) } } diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 9c1d015a2..4caf17d8e 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -213,7 +213,8 @@ step_adjust_latency <- arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", - class = "epipredict__step_adjust_latency__epi_recipe_only") + class = "epipredict__step_adjust_latency__epi_recipe_only" + ) } if (!is.null(columns)) { cli::cli_abort(c("The `columns` argument must be `NULL`.", diff --git a/tests/testthat/test-arx_args_list.R b/tests/testthat/test-arx_args_list.R index 4ae8ec3f5..9bbff0135 100644 --- a/tests/testthat/test-arx_args_list.R +++ b/tests/testthat/test-arx_args_list.R @@ -26,12 +26,14 @@ test_that("arx_args checks inputs", { expect_snapshot(error = TRUE, arx_args_list(n_training_min = "de")) expect_snapshot(error = TRUE, arx_args_list(epi_keys = 1)) - expect_error(arx_args_list( - forecast_date = as.Date("2022-01-01"), - target_date = as.Date("2022-01-04"), - ahead = 1L - ), - class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date") + expect_error( + arx_args_list( + forecast_date = as.Date("2022-01-01"), + target_date = as.Date("2022-01-04"), + ahead = 1L + ), + class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" + ) }) test_that("arx forecaster disambiguates quantiles", { diff --git a/tests/testthat/test-arx_forecaster.R b/tests/testthat/test-arx_forecaster.R index d5603aebe..4145a064d 100644 --- a/tests/testthat/test-arx_forecaster.R +++ b/tests/testthat/test-arx_forecaster.R @@ -1,12 +1,12 @@ train_data <- jhu_csse_daily_subset test_that("arx_forecaster warns if forecast date beyond the implicit one", { - bad_date <- max(train_data$time_value)+300 + bad_date <- max(train_data$time_value) + 300 expect_warning( arx1 <- arx_forecaster( - train_data, - 
"death_rate_7d_av", - c("death_rate_7d_av", "case_rate_7d_av"), - args_list = (arx_args_list(forecast_date = bad_date)) + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(forecast_date = bad_date)) ), class = "epipredict__arx_forecaster__forecast_date_defaulting" ) @@ -16,10 +16,10 @@ test_that("arx_forecaster errors if forecast date, target date, and ahead are in max_date <- max(train_data$time_value) expect_error( arx1 <- arx_forecaster( - train_data, - "death_rate_7d_av", - c("death_rate_7d_av", "case_rate_7d_av"), - args_list = (arx_args_list(ahead = 5, target_date = max_date, forecast_date = max_date)) + train_data, + "death_rate_7d_av", + c("death_rate_7d_av", "case_rate_7d_av"), + args_list = (arx_args_list(ahead = 5, target_date = max_date, forecast_date = max_date)) ), class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" ) diff --git a/tests/testthat/test-snapshots.R b/tests/testthat/test-snapshots.R index d8b4a7734..fc3ee890b 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -153,13 +153,13 @@ test_that("arx_classifier snapshots", { c("case_rate", "death_rate") ) expect_snapshot_tibble(arc1$predictions) - max_date <-case_death_rate_subset$time_value %>% max + max_date <- case_death_rate_subset$time_value %>% max() arc2 <- arx_classifier( case_death_rate_subset %>% dplyr::filter(time_value >= as.Date("2021-11-01")), "death_rate", c("case_rate", "death_rate"), - args_list = arx_class_args_list(adjust_latency = "extend_ahead", forecast_date = max_date+2) + args_list = arx_class_args_list(adjust_latency = "extend_ahead", forecast_date = max_date + 2) ) expect_snapshot_tibble(arc2$predictions) }) From 343551f0e5294b7f0141d7fdd42845def2335937 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2024 12:32:56 -0500 Subject: [PATCH 68/92] various minor fixes caught in pre-review --- R/arx_forecaster.R | 11 ++++++++++- R/epi_recipe.R | 3 ++- 
R/epi_workflow.R | 8 +++++--- R/flatline_forecaster.R | 6 +++--- R/frosting.R | 3 ++- R/get_test_data.R | 5 +++-- R/step_epi_shift.R | 4 ++++ R/step_growth_rate.R | 3 ++- R/step_lag_difference.R | 3 ++- man/forecast.epi_workflow.Rd | 4 +--- man/step_adjust_latency.Rd | 4 ++-- tests/testthat/test-layer_add_forecast_date.R | 7 ------- tests/testthat/test-population_scaling.R | 1 - tests/testthat/test-utils-shift.R | 0 14 files changed, 36 insertions(+), 26 deletions(-) delete mode 100644 tests/testthat/test-utils-shift.R diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index fdc1e88a1..291413b5b 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -51,7 +51,16 @@ arx_forecaster <- function( wf <- arx_fcast_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - preds <- forecast(wf) %>% + # get the forecast date for the forecast function + if (is.null(args_list$adjust_latency)) { + forecast_date_default <- max(epi_data$time_value) + } else { + forecast_date_default <- attributes(epi_data)$metadata$as_of + } + forecast_date <- args_list$forecast_date %||% forecast_date_default + + + preds <- forecast(wf, forecast_date = forecast_date) %>% tibble::as_tibble() %>% dplyr::select(-time_value) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index edb7f352b..684642075 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -431,7 +431,7 @@ prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) { if (is.null(training)) { - cli::cli_warn(paste( + cli::cli_warn(c( "!" = "No training data was supplied to {.fn prep}.", "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", "!" 
= "store the full template data in the object.", @@ -583,6 +583,7 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { new_data } + kill_levels <- function(x, keys) { for (i in which(names(x) %in% keys)) x[[i]] <- list(values = NA, ordered = NA) x diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 19a30d73a..619fb6d55 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -43,6 +43,7 @@ epi_workflow <- function(preprocessor = NULL, spec = NULL, postprocessor = NULL) if (!is_null(postprocessor)) { out <- add_postprocessor(out, postprocessor) } + out } @@ -154,9 +155,10 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' #' preds <- predict(wf, latest) #' preds +#' @importFrom cli cli_abort predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), ...) { if (!workflows::is_trained_workflow(object)) { - cli::cli_abort(c( + cli_abort(c( "Can't predict on an untrained epi_workflow.", i = "Do you need to call `fit()`?" )) @@ -232,7 +234,6 @@ print.epi_workflow <- function(x, ...) { #' #' @param object An epi workflow. #' @param ... Not used. -#' @param fill_locf Logical. Should we use locf to fill in missing data? #' @param n_recent Integer or NULL. If filling missing data with locf = TRUE, #' how far back are we willing to tolerate missing data? Larger values allow #' more filling. The default NULL will determine this from the the recipe. For @@ -246,7 +247,7 @@ print.epi_workflow <- function(x, ...) { #' @return A forecast tibble. 
#' #' @export -forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) { +forecast.epi_workflow <- function(object, ..., n_recent = NULL, forecast_date = NULL) { rlang::check_dots_empty() if (!object$trained) { @@ -266,6 +267,7 @@ forecast.epi_workflow <- function(object, ..., fill_locf = FALSE, n_recent = NUL )) } } + test_data <- get_test_data( hardhat::extract_preprocessor(object), object$original_data diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index f2d6fa998..fe072a42b 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -66,8 +66,7 @@ flatline_forecaster <- function( wf <- epi_workflow(r, eng, f) wf <- fit(wf, epi_data) preds <- suppressWarnings(forecast( - wf, - fill_locf = TRUE + wf )) %>% as_tibble() %>% select(-time_value) @@ -106,6 +105,7 @@ flatline_forecaster <- function( #' flatline_args_list() #' flatline_args_list(symmetrize = FALSE) #' flatline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) +#' @importFrom cli cli_abort cli_warn flatline_args_list <- function( ahead = 7L, n_training = Inf, @@ -129,7 +129,7 @@ flatline_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn(c( + cli_warn(c( "`forecast_date` + `ahead` must equal `target_date`.", i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." )) diff --git a/R/frosting.R b/R/frosting.R index 13d2293a6..6bc103465 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -356,6 +356,7 @@ apply_frosting.default <- function(workflow, components, ...) 
{ #' @rdname apply_frosting #' @importFrom rlang is_null +#' @importFrom cli cli_abort cli_warn #' @param type,opts forwarded (along with `...`) to [`predict.model_fit()`] and #' [`slather()`] for supported layers #' @export @@ -374,7 +375,7 @@ apply_frosting.epi_workflow <- } if (!has_postprocessor_frosting(workflow)) { - cli::cli_warn(paste( + cli_warn(paste( "Only postprocessors of class {.cls frosting} are allowed.", "Returning unpostprocessed predictions." )) diff --git a/R/get_test_data.R b/R/get_test_data.R index 5e74da5a1..f070e97bb 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -26,9 +26,10 @@ #' get_test_data(recipe = rec, x = case_death_rate_subset) #' @importFrom rlang %@% #' @importFrom stats na.omit +#' @importFrom cli cli_abort cli_warn #' @export get_test_data <- function(recipe, x) { - if (!is_epi_df(x)) cli::cli_abort("`x` must be an `epi_df`.") + if (!is_epi_df(x)) cli_abort("`x` must be an `epi_df`.") check <- hardhat::check_column_names(x, colnames(recipe$template)) if (!check$ok) { @@ -47,7 +48,7 @@ get_test_data <- function(recipe, x) { # Probably needs a fix based on the time_type of the epi_df avail_recent <- diff(range(x$time_value)) if (avail_recent < keep) { - cli::cli_abort(c( + cli_abort(c( "You supplied insufficient recent data for this recipe. ", "!" = "You need at least {min_required} days of data,", "!" = "but `x` contains only {avail_recent}." diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index c1b0f51bc..f09e8ae29 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -67,6 +67,7 @@ step_epi_lag <- } arg_is_nonneg_int(lag) arg_is_chr_scalar(prefix, id) + recipes::add_step( recipe, step_epi_lag_new( @@ -110,6 +111,7 @@ step_epi_ahead <- } arg_is_nonneg_int(ahead) arg_is_chr_scalar(prefix, id) + recipes::add_step( recipe, step_epi_ahead_new( @@ -204,6 +206,7 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { bake.step_epi_lag <- function(object, new_data, ...) 
{ add_shifted_columns(new_data, object, object$lag) } + #' @export bake.step_epi_ahead <- function(object, new_data, ...) { add_shifted_columns(new_data, object, object$ahead) @@ -218,6 +221,7 @@ print.step_epi_lag <- function(x, width = max(20, options()$width - 30), ...) { invisible(x) } + #' @export print.step_epi_ahead <- function(x, width = max(20, options()$width - 30), ...) { print_epi_step(x$columns, x$terms, x$trained, "Leading", diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index 70d0ac2ab..d9d52684d 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -30,6 +30,7 @@ #' #' @family row operation steps #' @importFrom epiprocess growth_rate +#' @importFrom cli cli_abort #' @export #' @examples #' r <- epi_recipe(case_death_rate_subset) %>% @@ -58,7 +59,7 @@ step_growth_rate <- arg_is_pos_int(horizon) arg_is_scalar(horizon) if (!is.null(replace_Inf)) { - if (length(replace_Inf) != 1L) cli::cli_abort("`replace_Inf` must be a scalar.") + if (length(replace_Inf) != 1L) cli_abort("`replace_Inf` must be a scalar.") if (!is.na(replace_Inf)) arg_is_numeric(replace_Inf) } arg_is_chr(role) diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index e8c7c36f7..bee5f5c40 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -13,6 +13,7 @@ #' #' #' @family row operation steps +#' @importFrom cli cli_abort cli_warn #' @export #' @examples #' r <- epi_recipe(case_death_rate_subset) %>% @@ -32,7 +33,7 @@ step_lag_difference <- skip = FALSE, id = rand_id("lag_diff")) { if (!is_epi_recipe(recipe)) { - cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") + cli_abort("This recipe step can only operate on an {.cls epi_recipe}.") } arg_is_pos_int(horizon) arg_is_chr(role) diff --git a/man/forecast.epi_workflow.Rd b/man/forecast.epi_workflow.Rd index b9f6870b8..22f8cf4bb 100644 --- a/man/forecast.epi_workflow.Rd +++ b/man/forecast.epi_workflow.Rd @@ -4,15 +4,13 @@ \alias{forecast.epi_workflow} \title{Produce a 
forecast from an epi workflow} \usage{ -\method{forecast}{epi_workflow}(object, ..., fill_locf = FALSE, n_recent = NULL, forecast_date = NULL) +\method{forecast}{epi_workflow}(object, ..., n_recent = NULL, forecast_date = NULL) } \arguments{ \item{object}{An epi workflow.} \item{...}{Not used.} -\item{fill_locf}{Logical. Should we use locf to fill in missing data?} - \item{n_recent}{Integer or NULL. If filling missing data with locf = TRUE, how far back are we willing to tolerate missing data? Larger values allow more filling. The default NULL will determine this from the the recipe. For diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 2b5f282d9..9b10d1822 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -276,8 +276,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous -#> `step_epi_lag`s won't work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with modified +#> data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 49a5e7820..491bf5e20 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -64,13 +64,6 @@ test_that("Do not specify a forecast_date in `layer_add_forecast_date()`", { layer_naomit(.pred) wf3 <- wf %>% add_frosting(f3) - # this warning has been removed - # expect_warning( - # p3 <- predict(wf3, latest), - # "forecast_date is less than the most recent update date of the data." - # ) - p3 <- predict(wf3, latest) - p3 expect_silent(p3 <- predict(wf3, latest)) expect_equal(ncol(p3), 4L) expect_s3_class(p3, "epi_df") diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index fbae404ad..d18be65f5 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -193,7 +193,6 @@ test_that("Postprocessing to get cases from case rate", { test_that("test joining by default columns", { - skip() jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% dplyr::select(geo_value, time_value, case_rate) diff --git a/tests/testthat/test-utils-shift.R b/tests/testthat/test-utils-shift.R deleted file mode 100644 index e69de29bb..000000000 From 60827a36d638df3ddf994fa0bd74db8524ca16ab Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2024 13:46:26 -0500 Subject: [PATCH 69/92] spurious join_by tests removed --- tests/testthat/_snaps/population_scaling.md | 32 --------------------- tests/testthat/test-population_scaling.R | 10 +++---- 2 files changed, 5 insertions(+), 37 deletions(-) delete mode 100644 tests/testthat/_snaps/population_scaling.md diff --git a/tests/testthat/_snaps/population_scaling.md b/tests/testthat/_snaps/population_scaling.md deleted file mode 100644 index 6d75fcc22..000000000 --- 
a/tests/testthat/_snaps/population_scaling.md +++ /dev/null @@ -1,32 +0,0 @@ -# test joining by default columns - - Code - prep <- prep(r, jhu) - Message - Joining with `by = join_by(geo_value)` - Joining with `by = join_by(geo_value)` - ---- - - Code - b <- bake(prep, jhu) - Message - Joining with `by = join_by(geo_value)` - Joining with `by = join_by(geo_value)` - ---- - - Code - wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) %>% add_frosting(f) - Message - Joining with `by = join_by(geo_value)` - Joining with `by = join_by(geo_value)` - ---- - - Code - p <- predict(wf, latest) - Message - Joining with `by = join_by(geo_value)` - Joining with `by = join_by(geo_value)` - diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index d18be65f5..646afdf76 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -214,9 +214,9 @@ test_that("test joining by default columns", { recipes::step_naomit(recipes::all_predictors()) %>% recipes::step_naomit(recipes::all_outcomes(), skip = TRUE) - expect_snapshot(prep <- prep(r, jhu)) + prep <- prep(r, jhu) - expect_snapshot(b <- bake(prep, jhu)) + b <- bake(prep, jhu) f <- frosting() %>% layer_predict() %>% @@ -228,12 +228,12 @@ test_that("test joining by default columns", { df_pop_col = "values" ) - expect_snapshot(wf <- epi_workflow( + wf <- epi_workflow( r, parsnip::linear_reg() ) %>% fit(jhu) %>% - add_frosting(f)) + add_frosting(f) latest <- get_test_data( recipe = r, @@ -246,7 +246,7 @@ test_that("test joining by default columns", { ) - expect_snapshot(p <- predict(wf, latest)) + p <- predict(wf, latest) From 60b3fad8ff06714121299fff379751cbfdb36bfd Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Wed, 11 Sep 2024 16:07:47 -0500 Subject: [PATCH 70/92] vignettes: no more fill_locf, some missing data --- vignettes/articles/sliding.Rmd | 4 ++-- vignettes/articles/smooth-qr.Rmd | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) 
diff --git a/vignettes/articles/sliding.Rmd b/vignettes/articles/sliding.Rmd index 1556c4a72..908caf307 100644 --- a/vignettes/articles/sliding.Rmd +++ b/vignettes/articles/sliding.Rmd @@ -328,7 +328,7 @@ confirmed_incidence_prop <- pub_covidcast( geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = states, - issues = epirange(20000101, 20211231) + issues = epirange(20000101, 20221231) ) %>% select(geo_value, time_value, version = issue, case_rate = value) %>% arrange(geo_value, time_value) %>% @@ -341,7 +341,7 @@ deaths_incidence_prop <- pub_covidcast( geo_type = "state", time_values = epirange(20200301, 20211231), geo_values = states, - issues = epirange(20000101, 20211231) + issues = epirange(20000101, 20221231) ) %>% select(geo_value, time_value, version = issue, death_rate = value) %>% arrange(geo_value, time_value) %>% diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd index 3d626b2e1..b93c726f6 100644 --- a/vignettes/articles/smooth-qr.Rmd +++ b/vignettes/articles/smooth-qr.Rmd @@ -195,7 +195,7 @@ smooth_fc <- function(x, aheads = 1:28, degree = 3L, quantiles = 0.5, fd) { the_fit <- ewf %>% fit(x) - latest <- get_test_data(rec, x, fill_locf = TRUE) + latest <- get_test_data(rec, x) preds <- predict(the_fit, new_data = latest) %>% mutate(forecast_date = fd, target_date = fd + ahead) %>% From 28c5863fc88e6c3c6344160db0e874ec44a53072 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 12 Sep 2024 15:45:06 -0500 Subject: [PATCH 71/92] drop :: for cli, many dplyr commands --- R/arx_classifier.R | 8 ++--- R/arx_forecaster.R | 6 ++-- R/autoplot.R | 2 +- R/compat-purrr.R | 4 +-- R/compat-recipes.R | 2 +- R/create-layer.R | 2 +- R/dist_quantiles.R | 12 ++++---- R/epi_recipe.R | 52 +++++++++++++++----------------- R/epi_shift.R | 7 ++--- R/epi_workflow.R | 7 ++--- R/flatline_forecaster.R | 1 - R/flusight_hub_formatter.R | 4 +-- R/frosting.R | 9 +++--- R/get_test_data.R | 7 ++--- R/layer_cdc_flatline_quantiles.R | 8 
++--- R/layer_residual_quantiles.R | 14 ++++----- R/layers.R | 4 +-- R/make_quantile_reg.R | 2 +- R/pivot_quantiles.R | 6 ++-- R/step_adjust_latency.R | 16 +++++----- R/step_growth_rate.R | 1 - R/step_lag_difference.R | 1 - R/step_population_scaling.R | 2 +- R/tidy.R | 2 +- R/utils-enframer.R | 2 +- R/utils-latency.R | 10 +++--- R/utils-misc.R | 4 +-- R/utils-shift.R | 12 ++++---- 28 files changed, 98 insertions(+), 109 deletions(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 32a58cd04..bbb3433c1 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -58,7 +58,7 @@ arx_classifier <- function( if (is.null(args_list$adjust_latency)) { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { - cli::cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") } } else { forecast_date_default <- attributes(epi_data)$metadata$as_of @@ -136,7 +136,7 @@ arx_class_epi_workflow <- function( if (is.null(args_list$adjust_latency)) { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { - cli::cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") + cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") } } else { forecast_date_default <- attributes(epi_data)$metadata$as_of @@ -150,7 +150,7 @@ arx_class_epi_workflow <- function( # ------- predictors r <- epi_recipe(epi_data) %>% step_growth_rate( - dplyr::all_of(predictors), + all_of(predictors), role = "grp", horizon = args_list$horizon, 
method = args_list$method, @@ -325,7 +325,7 @@ arx_class_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli::cli_warn( + cli_warn( c( "`forecast_date` + `ahead` must equal `target_date`.", i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 291413b5b..a3363e689 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -62,7 +62,7 @@ arx_forecaster <- function( preds <- forecast(wf, forecast_date = forecast_date) %>% tibble::as_tibble() %>% - dplyr::select(-time_value) + select(-time_value) structure( list( @@ -129,7 +129,7 @@ arx_fcast_epi_workflow <- function( if (is.null(args_list$adjust_latency)) { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { - cli::cli_warn( + cli_warn( "The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is actually occurring {forecast_date_default}.", class = "epipredict__arx_forecaster__forecast_date_defaulting" ) @@ -140,7 +140,7 @@ arx_fcast_epi_workflow <- function( forecast_date <- args_list$forecast_date %||% forecast_date_default target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) if (forecast_date + args_list$ahead != target_date) { - cli::cli_abort( + cli_abort( c( "`forecast_date` + `ahead` must equal `target_date`.", i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." 
diff --git a/R/autoplot.R b/R/autoplot.R index 648c74e33..8bded03a3 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -217,7 +217,7 @@ autoplot.canned_epipred <- function( ewf <- object$epi_workflow predictions <- object$predictions %>% - dplyr::rename(time_value = target_date) + rename(time_value = target_date) autoplot(ewf, predictions, .color_by = .color_by, .facet_by = .facet_by, diff --git a/R/compat-purrr.R b/R/compat-purrr.R index e06038e44..6ec7df02f 100644 --- a/R/compat-purrr.R +++ b/R/compat-purrr.R @@ -11,11 +11,11 @@ map_vec <- function(.x, .f, ...) { map_dfr <- function(.x, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map(.x, .f, ...) - dplyr::bind_rows(res, .id = .id) + bind_rows(res, .id = .id) } map2_dfr <- function(.x, .y, .f, ..., .id = NULL) { .f <- rlang::as_function(.f, env = rlang::global_env()) res <- map2(.x, .y, .f, ...) - dplyr::bind_rows(res, .id = .id) + bind_rows(res, .id = .id) } diff --git a/R/compat-recipes.R b/R/compat-recipes.R index 0e4a557cb..f90367497 100644 --- a/R/compat-recipes.R +++ b/R/compat-recipes.R @@ -18,7 +18,7 @@ inline_check <- function(x) { funs <- fun_calls(x) funs <- funs[!(funs %in% c("~", "+", "-"))] if (length(funs) > 0) { - cli::cli_abort(paste0( + cli_abort(paste0( "No in-line functions should be used here; ", "use steps to define baking actions." )) diff --git a/R/create-layer.R b/R/create-layer.R index 0268a906f..7179cb7e3 100644 --- a/R/create-layer.R +++ b/R/create-layer.R @@ -22,7 +22,7 @@ create_layer <- function(name = NULL, open = rlang::is_interactive()) { if (substr(name, 1, 5) == "layer") { nn <- substring(name, 6) if (substr(nn, 1, 1) == "_") nn <- substring(nn, 2) - cli::cli_abort( + cli_abort( c('`name` should not begin with "layer" or "layer_".', i = 'Did you mean to use `create_layer("{ nn }")`?' 
) diff --git a/R/dist_quantiles.R b/R/dist_quantiles.R index dd97ec809..8930bdeaa 100644 --- a/R/dist_quantiles.R +++ b/R/dist_quantiles.R @@ -23,7 +23,7 @@ new_quantiles <- function(values = double(1), quantile_levels = double(1)) { quantile_levels <- quantile_levels[o] } if (is.unsorted(values, na.rm = TRUE)) { - cli::cli_abort("`values[order(quantile_levels)]` produces unsorted quantiles.") + cli_abort("`values[order(quantile_levels)]` produces unsorted quantiles.") } new_rcrd(list(values = values, quantile_levels = quantile_levels), @@ -102,14 +102,14 @@ validate_dist_quantiles <- function(values, quantile_levels) { ) length_diff <- vctrs::list_sizes(values) != vctrs::list_sizes(quantile_levels) if (any(length_diff)) { - cli::cli_abort(c( + cli_abort(c( "`values` and `quantile_levels` must have common length.", i = "Mismatches found at position(s): {.val {which(length_diff)}}." )) } level_duplication <- map_lgl(quantile_levels, vctrs::vec_duplicate_any) if (any(level_duplication)) { - cli::cli_abort(c( + cli_abort(c( "`quantile_levels` must not be duplicated.", i = "Duplicates found at position(s): {.val {which(level_duplication)}}." )) @@ -171,7 +171,7 @@ quantile_extrapolate <- function(x, tau_out, middle) { return(qvals[match(tau_out, tau)]) } if (length(tau) < 2) { - cli::cli_abort( + cli_abort( "Quantile extrapolation is not possible with fewer than 2 quantiles." ) return(qvals_out) @@ -209,7 +209,7 @@ quantile_extrapolate <- function(x, tau_out, middle) { v = c(qvals, qvals_out[indm]) ) %>% dplyr::distinct(q, .keep_all = TRUE) %>% - dplyr::arrange(q) + arrange(q) } if (any(indl)) { qvals_out[indl] <- tail_extrapolate(tau_out[indl], utils::head(qv, 2)) @@ -267,7 +267,7 @@ Ops.dist_quantiles <- function(e1, e2) { } tau <- union(tau1, tau2) if (all(is_dist)) { - cli::cli_abort( + cli_abort( "You can't perform arithmetic between two distributions like this." 
) } else { diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 684642075..7db9b9179 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -61,15 +61,13 @@ epi_recipe.epi_df <- attr(x, "decay_to_tibble") <- FALSE if (!is.null(formula)) { if (!is.null(vars)) { - cli::cli_abort( - paste0( - "This `vars` specification will be ignored ", - "when a formula is used" - ) - ) + cli_abort(paste0( + "This `vars` specification will be ignored ", + "when a formula is used" + )) } if (!is.null(roles)) { - cli::cli_abort( + cli_abort( paste0( "This `roles` specification will be ignored ", "when a formula is used" @@ -82,10 +80,10 @@ epi_recipe.epi_df <- } if (is.null(vars)) vars <- colnames(x) if (any(table(vars) > 1)) { - cli::cli_abort("`vars` should have unique members") + cli_abort("`vars` should have unique members") } if (any(!(vars %in% colnames(x)))) { - cli::cli_abort("1 or more elements of `vars` are not in the data") + cli_abort("1 or more elements of `vars` are not in the data") } keys <- key_colnames(x) # we know x is an epi_df @@ -96,14 +94,14 @@ epi_recipe.epi_df <- ## Check and add roles when available if (!is.null(roles)) { if (length(roles) != length(vars)) { - cli::cli_abort(paste( + cli_abort(paste( "The number of roles should be the same as the number of ", "variables." 
)) } var_info$role <- roles } else { - var_info <- var_info %>% dplyr::filter(!(variable %in% keys)) + var_info <- var_info %>% filter(!(variable %in% keys)) var_info$role <- "raw" } ## Now we add the keys when necessary @@ -113,12 +111,12 @@ epi_recipe.epi_df <- ) ## Add types - var_info <- dplyr::full_join(recipes:::get_types(x), var_info, by = "variable") + var_info <- full_join(recipes:::get_types(x), var_info, by = "variable") var_info$source <- "original" ## arrange to easy order var_info <- var_info %>% - dplyr::arrange(factor( + arrange(factor( role, levels = union( c("predictor", "outcome", "time_value", "geo_value", "key"), @@ -394,7 +392,7 @@ adjust_epi_recipe.epi_workflow <- function(x, which_step, ..., blueprint = defau #' @export adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default_epi_recipe_blueprint()) { if (!(is.numeric(which_step) || is.character(which_step))) { - cli::cli_abort( + cli_abort( c("`which_step` must be a number or a character.", i = "`which_step` has class {.cls {class(which_step)[1]}}." ) @@ -407,7 +405,7 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default if (!starts_with_step) which_step <- paste0("step_", which_step) if (!(which_step %in% step_names)) { - cli::cli_abort(c( + cli_abort(c( "`which_step` does not appear in the available `epi_recipe` step names. ", i = "The step names are {.val {step_names}}." )) @@ -416,7 +414,7 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default if (length(which_step_idx) == 1) { x$steps[[which_step_idx]] <- update(x$steps[[which_step_idx]], ...) } else { - cli::cli_abort(c( + cli_abort(c( "`which_step` is not unique. Matches steps: {.val {which_step_idx}}.", i = "Please use the step number instead for precise alterations." )) @@ -431,7 +429,7 @@ prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) 
{ if (is.null(training)) { - cli::cli_warn(c( + cli_warn(c( "!" = "No training data was supplied to {.fn prep}.", "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", "!" = "store the full template data in the object.", @@ -441,7 +439,7 @@ prep.epi_recipe <- function( } training <- recipes:::check_training_set(training, x, fresh) training <- epi_check_training_set(training, x) - training <- dplyr::relocate(training, dplyr::all_of(key_colnames(training))) + training <- relocate(training, all_of(key_colnames(training))) tr_data <- recipes:::train_info(training) keys <- key_colnames(x) @@ -456,7 +454,7 @@ prep.epi_recipe <- function( } skippers <- map_lgl(x$steps, recipes:::is_skipable) if (any(skippers) & !retain) { - cli::cli_warn(paste( + cli_warn(paste( "Since some operations have `skip = TRUE`, using ", "`retain = TRUE` will allow those steps results to ", "be accessible." @@ -464,7 +462,7 @@ prep.epi_recipe <- function( } if (fresh) x$term_info <- x$var_info - running_info <- x$term_info %>% dplyr::mutate(number = 0, skip = FALSE) + running_info <- x$term_info %>% mutate(number = 0, skip = FALSE) for (i in seq(along.with = x$steps)) { needs_tuning <- map_lgl(x$steps[[i]], recipes:::is_tune) if (any(needs_tuning)) { @@ -474,7 +472,7 @@ prep.epi_recipe <- function( "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ", arg, ". Do you want to use a tuning function such as `tune_grid()`?" 
) - cli::cli_abort(msg) + cli_abort(msg) } note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) if (!x$steps[[i]]$trained | fresh) { @@ -489,7 +487,7 @@ prep.epi_recipe <- function( ) training <- bake(x$steps[[i]], new_data = training) if (!tibble::is_tibble(training)) { - cli::cli_abort("`bake()` methods should always return {.cls tibble}.") + cli_abort("`bake()` methods should always return {.cls tibble}.") } if (!is_epi_df(training)) { # tidymodels killed our class @@ -501,7 +499,7 @@ prep.epi_recipe <- function( other_keys = metadata$other_keys %||% character() ) } - training <- dplyr::relocate(training, all_of(key_colnames(training))) + training <- relocate(training, all_of(key_colnames(training))) x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) if (!is.na(x$steps[[i]]$role)) { new_vars <- setdiff(x$term_info$variable, running_info$variable) @@ -514,7 +512,7 @@ prep.epi_recipe <- function( recipes:::changelog(log_changes, before_nms, names(training), x$steps[[i]]) running_info <- rbind( running_info, - dplyr::mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) + mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) ) } else { if (verbose) cat(note, "[pre-trained]\n") @@ -546,9 +544,9 @@ prep.epi_recipe <- function( x$orig_lvls <- orig_lvls x$retained <- retain x$last_term_info <- running_info %>% - dplyr::group_by(variable) %>% - dplyr::arrange(dplyr::desc(number)) %>% - dplyr::summarise( + group_by(variable) %>% + arrange(dplyr::desc(number)) %>% + summarise( type = list(dplyr::first(type)), role = list(unique(unlist(role))), source = dplyr::first(source), diff --git a/R/epi_shift.R b/R/epi_shift.R index e0b68b579..815e1e48f 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -37,9 +37,8 @@ get_sign <- function(object) { #' backend for both `bake.step_epi_ahead` and `bake.step_epi_lag`, performs the #' checks missing in `epi_shift_single` #' @keywords internal -#' @importFrom cli cli_abort #' @importFrom tidyr 
expand_grid -#' @importFrom dplyr mutate left_join join_by +#' @importFrom dplyr join_by add_shifted_columns <- function(new_data, object, amount) { sign_shift <- get_sign(object) latency_table <- attributes(new_data)$metadata$latency_table @@ -78,12 +77,12 @@ add_shifted_columns <- function(new_data, object, amount) { ok <- object$keys shifted <- reduce( pmap(grid, epi_shift_single, x = new_data, key_cols = ok), - dplyr::full_join, + full_join, by = ok ) processed <- new_data %>% full_join(shifted, by = ok) %>% - group_by(dplyr::across(dplyr::all_of(kill_time_value(ok)))) %>% + group_by(across(all_of(kill_time_value(ok)))) %>% arrange(time_value) %>% ungroup() %>% as_epi_df() diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 619fb6d55..7819a6d73 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -155,7 +155,6 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' #' preds <- predict(wf, latest) #' preds -#' @importFrom cli cli_abort predict.epi_workflow <- function(object, new_data, type = NULL, opts = list(), ...) { if (!workflows::is_trained_workflow(object)) { cli_abort(c( @@ -188,12 +187,10 @@ augment.epi_workflow <- function(x, new_data, ...) { if (is_epi_df(predictions)) { join_by <- key_colnames(predictions) } else { - cli::cli_abort( - c( + cli_abort(c( "Cannot determine how to join `new_data` with the `predictions`.", "Try converting `new_data` to an {.cls epi_df} with `as_epi_df(new_data)`." 
- ) - ) + )) } complete_overlap <- intersect(names(new_data), join_by) if (length(complete_overlap) < length(join_by)) { diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index fe072a42b..b26e3c2c1 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -105,7 +105,6 @@ flatline_forecaster <- function( #' flatline_args_list() #' flatline_args_list(symmetrize = FALSE) #' flatline_args_list(quantile_levels = c(.1, .3, .7, .9), n_training = 120) -#' @importFrom cli cli_abort cli_warn flatline_args_list <- function( ahead = 7L, n_training = Inf, diff --git a/R/flusight_hub_formatter.R b/R/flusight_hub_formatter.R index 3e0eb1aaa..c1aa00b82 100644 --- a/R/flusight_hub_formatter.R +++ b/R/flusight_hub_formatter.R @@ -1,7 +1,7 @@ location_to_abbr <- function(location) { dictionary <- state_census %>% - dplyr::mutate(fips = sprintf("%02d", fips)) %>% + mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr @@ -12,7 +12,7 @@ location_to_abbr <- function(location) { abbr_to_location <- function(abbr) { dictionary <- state_census %>% - dplyr::mutate(fips = sprintf("%02d", fips)) %>% + mutate(fips = sprintf("%02d", fips)) %>% dplyr::transmute( location = dplyr::case_match(fips, "00" ~ "US", .default = fips), abbr diff --git a/R/frosting.R b/R/frosting.R index 6bc103465..2672bcdd1 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -89,7 +89,7 @@ validate_has_postprocessor <- function(x, ..., call = caller_env()) { "The workflow must have a frosting postprocessor.", i = "Provide one with `add_frosting()`." ) - cli::cli_abort(message, call = call) + cli_abort(message, call = call) } invisible(x) } @@ -288,7 +288,7 @@ new_frosting <- function() { #' p frosting <- function(layers = NULL, requirements = NULL) { if (!is_null(layers) || !is_null(requirements)) { - cli::cli_abort( + cli_abort( "Currently, no arguments to `frosting()` are allowed to be non-null." 
) } @@ -356,7 +356,6 @@ apply_frosting.default <- function(workflow, components, ...) { #' @rdname apply_frosting #' @importFrom rlang is_null -#' @importFrom cli cli_abort cli_warn #' @param type,opts forwarded (along with `...`) to [`predict.model_fit()`] and #' [`slather()`] for supported layers #' @export @@ -368,7 +367,7 @@ apply_frosting.epi_workflow <- components$predictions <- predict( the_fit, components$forged$predictors, ... ) - components$predictions <- dplyr::bind_cols( + components$predictions <- bind_cols( components$keys, components$predictions ) return(components) @@ -382,7 +381,7 @@ apply_frosting.epi_workflow <- components$predictions <- predict( the_fit, components$forged$predictors, type, opts, ... ) - components$predictions <- dplyr::bind_cols( + components$predictions <- bind_cols( components$keys, components$predictions ) return(components) diff --git a/R/get_test_data.R b/R/get_test_data.R index f070e97bb..af43fc012 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -26,7 +26,6 @@ #' get_test_data(recipe = rec, x = case_death_rate_subset) #' @importFrom rlang %@% #' @importFrom stats na.omit -#' @importFrom cli cli_abort cli_warn #' @export get_test_data <- function(recipe, x) { if (!is_epi_df(x)) cli_abort("`x` must be an `epi_df`.") @@ -63,13 +62,13 @@ get_test_data <- function(recipe, x) { # If we skip NA completion, we remove undesirably early time values # Happens globally, over all groups - x <- dplyr::filter(x, max_time_value - time_value <= keep) + x <- filter(x, max_time_value - time_value <= keep) # If all(lags > 0), then we get rid of recent data if (min_lags > 0 && min_lags < Inf) { - x <- dplyr::filter(x, max_time_value - time_value >= min_lags) + x <- filter(x, max_time_value - time_value >= min_lags) } - dplyr::filter(x, max_time_value - time_value <= keep) %>% + filter(x, max_time_value - time_value <= keep) %>% epiprocess::ungroup() } diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R 
index daeaa1a3e..fd61c4045 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -163,7 +163,7 @@ slather.layer_cdc_flatline_quantiles <- } the_fit <- workflows::extract_fit_parsnip(workflow) if (!inherits(the_fit, "_flatline")) { - cli::cli_warn(c( + cli_warn(c( "Predictions for this workflow were not produced by the {.cls flatline}", "{.pkg parsnip} engine. Results may be unexpected. See {.fn epipredict::flatline}." )) @@ -176,7 +176,7 @@ slather.layer_cdc_flatline_quantiles <- if (length(object$by_key) > 0L) { cols_in_preds <- hardhat::check_column_names(p, object$by_key) if (!cols_in_preds$ok) { - cli::cli_warn(paste( + cli_warn(paste( "Predicted values are missing key columns: {.val {cols_in_preds$missing_names}}.", "Ignoring these." )) @@ -184,7 +184,7 @@ slather.layer_cdc_flatline_quantiles <- if (inherits(the_fit, "_flatline")) { cols_in_resids <- hardhat::check_column_names(r, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(paste( + cli_warn(paste( "Existing residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) @@ -201,7 +201,7 @@ slather.layer_cdc_flatline_quantiles <- ) cols_in_resids <- hardhat::check_column_names(key_cols, object$by_key) if (!cols_in_resids$ok) { - cli::cli_warn(paste( + cli_warn(paste( "Requested residuals are missing key columns: {.val {cols_in_resids$missing_names}}.", "Ignoring these." )) diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index b21bdcfcc..1b623adfa 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -102,7 +102,7 @@ slather.layer_residual_quantiles <- common <- intersect(object$by_key, names(key_cols)) excess <- setdiff(object$by_key, names(key_cols)) if (length(excess) > 0L) { - cli::cli_warn(paste( + cli_warn(paste( "Requested residual grouping key(s) {.val {excess}} are unavailable ", "in the original data. Grouping by the remainder: {.val {common}}." 
)) @@ -113,7 +113,7 @@ slather.layer_residual_quantiles <- if (length(common_in_r) == length(common)) { r <- left_join(key_cols, r, by = common_in_r) } else { - cli::cli_warn(paste( + cli_warn(paste( "Some grouping keys are not in data.frame returned by the", "`residuals()` method. Groupings may not be correct." )) @@ -124,7 +124,7 @@ slather.layer_residual_quantiles <- } r <- r %>% - dplyr::summarise( + summarise( dstn = list(quantile( c(.resid, s * .resid), probs = object$quantile_levels, na.rm = TRUE @@ -132,7 +132,7 @@ slather.layer_residual_quantiles <- ) # Check for NA if (any(sapply(r$dstn, is.na))) { - cli::cli_abort(c( + cli_abort(c( "Residual quantiles could not be calculated due to missing residuals.", i = "This may be due to `n_train` < `ahead` in your {.cls epi_recipe}." )) @@ -149,7 +149,7 @@ slather.layer_residual_quantiles <- grab_residuals <- function(the_fit, components) { if (the_fit$spec$mode != "regression") { - cli::cli_abort("For meaningful residuals, the predictor should be a regression model.") + cli_abort("For meaningful residuals, the predictor should be a regression model.") } r <- stats::residuals(the_fit$fit) if (!is.null(r)) { # Got something from the method @@ -157,7 +157,7 @@ grab_residuals <- function(the_fit, components) { if (".resid" %in% names(r)) { # success return(r) } else { # failure - cli::cli_warn(c( + cli_warn(c( "The `residuals()` method for objects of class {.cls {cl}} results in", "a data frame without a column named `.resid`.", i = "Residual quantiles will be calculated directly from the", @@ -168,7 +168,7 @@ grab_residuals <- function(the_fit, components) { } else if (is.vector(drop(r))) { # also success return(tibble(.resid = drop(r))) } else { # failure - cli::cli_warn(paste( + cli_warn(paste( "The `residuals()` method for objects of class {.cls {cl}} results in an", "object that is neither a data frame with a column named `.resid`,", "nor something coercible to a vector.", diff --git a/R/layers.R b/R/layers.R 
index aa515a917..2a7568d51 100644 --- a/R/layers.R +++ b/R/layers.R @@ -148,8 +148,8 @@ validate_layer <- function(x, ..., arg = rlang::caller_arg(x), call = caller_env()) { rlang::check_dots_empty() if (!is_layer(x)) { - cli::cli_abort( - "{arg} must be a frosting layer, not a {.cls {class(x)[[1]]}}.", +- cli_abort( +- "{arg} must be a frosting layer, not a {.cls {class(x)[[1]]}}.", .call = call ) } diff --git a/R/make_quantile_reg.R b/R/make_quantile_reg.R index 2157aa470..9e653184c 100644 --- a/R/make_quantile_reg.R +++ b/R/make_quantile_reg.R @@ -37,7 +37,7 @@ quantile_reg <- function(mode = "regression", engine = "rq", quantile_levels = 0 if (any(quantile_levels > 1)) cli_abort("All `quantile_levels` must be less than 1.") if (any(quantile_levels < 0)) cli_abort("All `quantile_levels` must be greater than 0.") if (is.unsorted(quantile_levels)) { - cli::cli_warn("Sorting `quantile_levels` to increasing order.") + cli_warn("Sorting `quantile_levels` to increasing order.") quantile_levels <- sort(quantile_levels) } args <- list(quantile_levels = rlang::enquo(quantile_levels), method = rlang::enquo(method)) diff --git a/R/pivot_quantiles.R b/R/pivot_quantiles.R index 4abf9d257..b01dc392c 100644 --- a/R/pivot_quantiles.R +++ b/R/pivot_quantiles.R @@ -71,7 +71,7 @@ pivot_quantiles_longer <- function(.data, ..., .ignore_length_check = FALSE) { .data <- .data %>% tidyr::unnest(all_of(col), names_sep = "_") } } else { - cli::cli_abort(paste( + cli_abort(paste( "Some selected columns contain different numbers of quantiles.", "The result would be a {.emph very} long {.cls tibble}.", "To do this anyway, rerun with `.ignore_length_check = TRUE`." @@ -115,7 +115,7 @@ pivot_quantiles_wider <- function(.data, ...) 
{ checks <- map_lgl(cols, ~ diff(range(vctrs::list_sizes(.data[[.x]]))) == 0L) if (!all(checks)) { nms <- cols[!checks] - cli::cli_abort(c( + cli_abort(c( "Quantiles must be the same length and have the same set of taus.", i = "Check failed for variables(s) {.var {nms}}." )) @@ -157,7 +157,7 @@ validate_pivot_quantiles <- function(.data, ...) { dqs <- map_lgl(cols, ~ is_dist_quantiles(.data[[.x]])) if (!all(dqs)) { nms <- cols[!dqs] - cli::cli_abort( + cli_abort( "Variables(s) {.var {nms}} are not `dist_quantiles`. Cannot pivot them." ) } diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 4caf17d8e..be359fc9c 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -212,37 +212,37 @@ step_adjust_latency <- id = recipes::rand_id("adjust_latency")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { - cli::cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", + cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", class = "epipredict__step_adjust_latency__epi_recipe_only" ) } if (!is.null(columns)) { - cli::cli_abort(c("The `columns` argument must be `NULL`.", + cli_abort(c("The `columns` argument must be `NULL`.", i = "Use `tidyselect` methods to choose columns to lag." 
), class = "epipredict__step_adjust_latency__cols_not_null") } if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { - cli::cli_warn( + cli_warn( "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified.", class = "epipredict__step_adjust_latency__misordered_step_warning" ) } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) { - cli::cli_warn( + cli_warn( "If `method` is {.val extend_lags} or {.val locf}, then the previous `step_epi_lag`s won't work with modified data.", class = "epipredict__step_adjust_latency__misordered_step_warning" ) } else if ((method == "locf") && (length(recipe$steps) > 0)) { - cli::cli_warn("There are steps before `step_adjust_latency`. With the method {.val locf}, it is recommended to include this step before any others", + cli_warn("There are steps before `step_adjust_latency`. With the method {.val locf}, it is recommended to include this step before any others", class = "epipredict__step_adjust_latency__misordered_step_warning" ) } if (detect_step(recipe, "naomit")) { - cli::cli_abort("adjust_latency needs to occur before any `NA` removal, + cli_abort("adjust_latency needs to occur before any `NA` removal, as columns may be moved around", class = "epipredict__step_adjust_latency__post_NA_error") } if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { - cli::cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` + cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` can be non-`NULL` at a time!", class = "epipredict__step_adjust_latency__too_many_args_error") } if (length(fixed_latency > 1)) { @@ -250,7 +250,7 @@ then the previous `step_epi_lag`s won't work with modified data.", data_names <- names(template)[!names(template) %in% key_colnames(template)] wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] if (length(wrong_names) > 0) { - cli::cli_abort("{.val fixed_latency} contains names not in the template 
dataset: {wrong_names}", class = "epipredict__step_adjust_latency__undefined_names_error") + cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}", class = "epipredict__step_adjust_latency__undefined_names_error") } } method <- rlang::arg_match(method) diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index d9d52684d..00bf9bd87 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -30,7 +30,6 @@ #' #' @family row operation steps #' @importFrom epiprocess growth_rate -#' @importFrom cli cli_abort #' @export #' @examples #' r <- epi_recipe(case_death_rate_subset) %>% diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index bee5f5c40..39ae1ba59 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -13,7 +13,6 @@ #' #' #' @family row operation steps -#' @importFrom cli cli_abort cli_warn #' @export #' @examples #' r <- epi_recipe(case_death_rate_subset) %>% diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 4e4d3aa26..76068109c 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -165,7 +165,7 @@ bake.step_population_scaling <- function(object, new_data, ...) { hardhat::validate_column_names(object$df, joinby$y) if (object$suffix != "_scaled" && object$create_new == FALSE) { - cli::cli_warn(c( + cli_warn(c( "Custom `suffix` {.val {object$suffix}} was ignored in `step_population_scaling`.", i = "Perhaps `create_new` should be {.val {TRUE}}?" )) diff --git a/R/tidy.R b/R/tidy.R index 61b298411..8fc06398a 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -96,7 +96,7 @@ tidy.frosting <- function(x, number = NA, id = NA, ...) { #' @export tidy.layer <- function(x, ...) { - cli::cli_abort( + cli_abort( "No `tidy()` method exists for a layer with class: {.cls {class(x)}}." 
) } diff --git a/R/utils-enframer.R b/R/utils-enframer.R index 0a8152906..723bd6a9c 100644 --- a/R/utils-enframer.R +++ b/R/utils-enframer.R @@ -8,7 +8,7 @@ enframer <- function(df, x, fill = NA) { if (any(names(df) %in% x)) { stop("In enframer: some new cols match existing column names") } - for (v in x) df <- dplyr::mutate(df, !!v := fill) + for (v in x) df <- mutate(df, !!v := fill) df } diff --git a/R/utils-latency.R b/R/utils-latency.R index bd0e4803e..9398f8188 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -40,7 +40,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { pull(variable) # make sure that there's enough column names if (length(original_columns) < 3) { - cli::cli_abort( + cli_abort( glue::glue( "The original columns of `time_value`, ", "`geo_value` and at least one signal. The current colums are \n", @@ -70,7 +70,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { } # make sure the as_of is sane if (!inherits(forecast_date, class(max_time)) & !inherits(forecast_date, "POSIXt")) { - cli::cli_abort( + cli_abort( paste( "the data matrix `forecast_date` value is {forecast_date}, ", "and not a valid `time_type` with type ", @@ -81,7 +81,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { ) } if (is.null(forecast_date) || is.na(forecast_date)) { - cli::cli_warn( + cli_warn( paste( "epi_data's `forecast_date` was {forecast_date}, setting to ", "the latest time value, {max_time}." @@ -90,7 +90,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { ) forecast_date <- max_time } else if (forecast_date < max_time) { - cli::cli_abort( + cli_abort( paste( "`forecast_date` ({(forecast_date)}) is before the most ", "recent data ({max_time}). 
Remove before ", @@ -277,7 +277,7 @@ check_interminable_latency <- function(dataset, latency_table, target_columns, f filter(!is.na(!!(latency_table[[i_latency, "col_name"]]))) %>% pull(time_value) %>% max() - cli::cli_warn( + cli_warn( message = c( paste( "The maximum latency is {latency_max}, ", diff --git a/R/utils-misc.R b/R/utils-misc.R index 8fc16a968..5cc8b364d 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -34,7 +34,7 @@ check_pname <- function(res, preds, object, newname = NULL) { grab_forged_keys <- function(forged, workflow, new_data) { forged_roles <- names(forged$extras$roles) - extras <- dplyr::bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")]) + extras <- bind_cols(forged$extras$roles[forged_roles %in% c("geo_value", "time_value", "key")]) # 1. these are the keys in the test data after prep/bake new_keys <- names(extras) # 2. these are the keys in the training data @@ -42,7 +42,7 @@ grab_forged_keys <- function(forged, workflow, new_data) { # 3. these are the keys in the test data as input new_df_keys <- key_colnames(new_data, extra_keys = setdiff(new_keys, c("geo_value", "time_value"))) if (!(setequal(old_keys, new_df_keys) && setequal(new_keys, new_df_keys))) { - cli::cli_warn(paste( + cli_warn(paste( "Not all epi keys that were present in the training data are available", "in `new_data`. Predictions will have only the available keys." 
)) diff --git a/R/utils-shift.R b/R/utils-shift.R index 75a9a6bd1..7b9de6a08 100644 --- a/R/utils-shift.R +++ b/R/utils-shift.R @@ -12,7 +12,7 @@ adjust_latency <- function(object, new_data) { } else if (method == "extend_ahead") { as_of <- attributes(new_data)$metadata$as_of if (FALSE && (typeof(as_of) != typeof(new_data$time_value))) { - cli::cli_abort(paste( + cli_abort(paste( "the data matrix `as_of` value is {as_of}, ", "and not a valid `time_type` with type ", "matching `time_value`'s type of ", @@ -27,13 +27,13 @@ adjust_latency <- function(object, new_data) { max_time <- max(time_values) shift_amount <- as.Date(as_of) - max_time if (is.null(as_of) || is.na(as_of)) { - cli::cli_warn(paste( + cli_warn(paste( "epi_data's `as_of` was {as_of}, setting to ", "the latest time value, {max_time}." )) as_of <- max_time } else if (as_of < max_time) { - cli::cli_abort(paste( + cli_abort(paste( "`as_of` ({(as_of)}) is before the most ", "recent data ({max_time}). Remove before ", "predicting." @@ -47,7 +47,7 @@ adjust_latency <- function(object, new_data) { ((time_type == "yearmonth") && (shift_amount >= 2)) || ((time_type == "yearquarter") && (shift_amount >= 1)) || ((time_type == "year") && (shift_amount >= 1))) { - cli::cli_warn(paste( + cli_warn(paste( "!" = glue::glue( "The ahead has been adjusted by {shift_amount}, ", "which is questionable for it's `time_type` of ", @@ -60,10 +60,10 @@ adjust_latency <- function(object, new_data) { } return(effective_ahead) } else { - cli::cli_abort("the `time_value` column of `new_data` is empty") + cli_abort("the `time_value` column of `new_data` is empty") } } else { - cli::cli_abort(paste( + cli_abort(paste( "Latency adjustment method {method} has not yet ", "been implemented for `step_epi_ahead`." 
)) From 63db4d5802f1af54d43b2931a36d5e140d5d1aa6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 13 Sep 2024 10:55:58 -0500 Subject: [PATCH 72/92] various recommendations --- DEVELOPMENT.md | 1 + R/arx_classifier.R | 2 +- R/arx_forecaster.R | 15 +++--- R/canned-epipred.R | 4 +- R/cdc_baseline_forecaster.R | 4 +- R/epi_shift.R | 6 +-- R/epi_workflow.R | 6 +-- R/flatline_forecaster.R | 8 ++-- R/import-standalone-purrr.R | 2 +- R/layers.R | 6 +-- R/step_adjust_latency.R | 42 ++++++++--------- R/utils-latency.R | 12 +---- R/utils-shift.R | 71 ----------------------------- man/adjust_latency.Rd | 14 ------ man/arx_args_list.Rd | 8 ++-- man/arx_class_args_list.Rd | 6 +-- man/count_single_column.Rd | 4 +- man/step_adjust_latency.Rd | 30 +++--------- tests/testthat/test-utils_latency.R | 22 --------- 19 files changed, 60 insertions(+), 203 deletions(-) delete mode 100644 R/utils-shift.R delete mode 100644 man/adjust_latency.Rd diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 0335f124b..8bc251e77 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -32,6 +32,7 @@ The `main` version is available at `file:////epidatr/epipredict/inde You can also build the docs manually and launch the site with python. 
From the terminal, this looks like ```bash +R -e 'pkgdown::clean_site()' R -e 'devtools::document()' R -e 'pkgdown::build_site()' python -m http.server -d docs diff --git a/R/arx_classifier.R b/R/arx_classifier.R index bbb3433c1..2bd2c30a1 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -204,7 +204,7 @@ arx_class_epi_workflow <- function( r <- r %>% step_mutate( across( - starts_with("ahead"), + starts_with("ahead_"), ~ cut(.x, breaks = args_list$breaks), .names = "outcome_class", .unpack = TRUE diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index a3363e689..14d8b112b 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -126,7 +126,7 @@ arx_fcast_epi_workflow <- function( # forecast_date is above all what they set; # if they don't and they're not adjusting latency, it defaults to the max time_value # if they're adjusting, it defaults to the as_of - if (is.null(args_list$adjust_latency)) { + if (args_list$adjust_latency == "none") { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { cli_warn( @@ -236,11 +236,11 @@ arx_fcast_epi_workflow <- function( #' non-`NULL`. #' @param target_date Date. The date that is being forecast. The default `NULL` #' will determine this automatically as `forecast_date + ahead`. -#' @param adjust_latency Character or `NULL`. One of the `method`s of -#' [step_adjust_latency()], or `NULL` (in which case there is no adjustment). +#' @param adjust_latency Character. One of the `method`s of +#' [step_adjust_latency()], or `"none"` (in which case there is no adjustment). #' If the `forecast_date` is after the last day of data, this determines how #' to shift the model to account for this difference. 
The options are: -#' - `NULL` the default, assumes the `forecast_date` is the last day of data +#' - `"none"` the default, assumes the `forecast_date` is the last day of data #' - `"extend_ahead"`: increase the `ahead` by the latency so it's relative to #' the last day of data. For example, if the last day of data was 3 days ago, #' the ahead becomes `ahead+3`. @@ -283,7 +283,7 @@ arx_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, - adjust_latency = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -296,9 +296,10 @@ arx_args_list <- function( .lags <- lags if (is.list(lags)) lags <- unlist(lags) - arg_is_scalar(ahead, n_training, symmetrize, nonneg) + adjust_latency <- rlang::arg_match(adjust_latency) + arg_is_scalar(ahead, n_training, symmetrize, nonneg, adjust_latency) arg_is_chr(quantile_by_key, allow_empty = TRUE) - arg_is_scalar(forecast_date, target_date, adjust_latency, allow_null = TRUE) + arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags) arg_is_lgl(symmetrize, nonneg) diff --git a/R/canned-epipred.R b/R/canned-epipred.R index 66c66a339..1e088426e 100644 --- a/R/canned-epipred.R +++ b/R/canned-epipred.R @@ -113,7 +113,7 @@ print.canned_epipred <- function(x, name, ...) { )) fit_recipe <- extract_recipe(x$epi_workflow) if (detect_step(fit_recipe, "adjust_latency")) { - is_adj_latency <- map_lgl(fit_recipe$steps, \(x) inherits(x, "step_adjust_latency")) + is_adj_latency <- map_lgl(fit_recipe$steps, function(x) inherits(x, "step_adjust_latency")) latency_step <- fit_recipe$steps[is_adj_latency][[1]] # all steps after adjust_latency later_steps <- fit_recipe$steps[-(1:which(is_adj_latency))] @@ -129,7 +129,7 @@ print.canned_epipred <- function(x, name, ...) 
{ } later_steps[[1]]$columns valid_columns <- later_steps %>% - keep(\(x) inherits(x, step_names)) %>% + keep(function(x) inherits(x, step_names)) %>% purrr::map("columns") %>% reduce(c) latency_per_base_col <- latency_step$latency_table %>% diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index 85635cfae..3352c5159 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -78,9 +78,7 @@ cdc_baseline_forecaster <- function( # target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) - latest <- get_test_data( - epi_recipe(epi_data), epi_data - ) + latest <- get_test_data(epi_recipe(epi_data), epi_data) f <- frosting() %>% layer_predict() %>% diff --git a/R/epi_shift.R b/R/epi_shift.R index 815e1e48f..7088fb341 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -45,8 +45,6 @@ add_shifted_columns <- function(new_data, object, amount) { shift_sign_lat <- attributes(new_data)$metadata$shift_sign if (!is.null(latency_table) && shift_sign_lat == sign_shift) { - # are we adding an unreasonable amount of latency? - check_interminable_latency(new_data, latency_table, object$columns, attributes(new_data)$metadata$forecast_date) # get the actually used latencies rel_latency <- latency_table %>% filter(col_name %in% object$columns) } else { @@ -56,9 +54,7 @@ add_shifted_columns <- function(new_data, object, amount) { grid <- expand_grid(col = object$columns, amount = sign_shift * amount) %>% left_join(rel_latency, by = join_by(col == col_name), ) %>% tidyr::replace_na(list(latency = 0)) %>% - mutate( - shift_val = amount + latency - ) %>% + mutate(shift_val = amount + latency) %>% mutate( newname = glue::glue("{object$prefix}{abs(shift_val)}_{col}"), # name is always positive amount = NULL, diff --git a/R/epi_workflow.R b/R/epi_workflow.R index 7819a6d73..ff2393ecc 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -188,9 +188,9 @@ augment.epi_workflow <- function(x, new_data, ...) 
{ join_by <- key_colnames(predictions) } else { cli_abort(c( - "Cannot determine how to join `new_data` with the `predictions`.", - "Try converting `new_data` to an {.cls epi_df} with `as_epi_df(new_data)`." - )) + "Cannot determine how to join `new_data` with the `predictions`.", + "Try converting `new_data` to an {.cls epi_df} with `as_epi_df(new_data)`." + )) } complete_overlap <- intersect(names(new_data), join_by) if (length(complete_overlap) < length(join_by)) { diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index b26e3c2c1..59f54bd86 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -63,11 +63,9 @@ flatline_forecaster <- function( eng <- linear_reg(engine = "flatline") - wf <- epi_workflow(r, eng, f) - wf <- fit(wf, epi_data) - preds <- suppressWarnings(forecast( - wf - )) %>% + wf <- epi_workflow(r, eng, f) %>% + fit(epi_data) + preds <- suppressWarnings(forecast(wf)) %>% as_tibble() %>% select(-time_value) diff --git a/R/import-standalone-purrr.R b/R/import-standalone-purrr.R index e4e83f428..623142a0e 100644 --- a/R/import-standalone-purrr.R +++ b/R/import-standalone-purrr.R @@ -123,7 +123,7 @@ map_if <- function(.x, .p, .f, ...) { .x } .rlang_purrr_probe <- function(.x, .p, ...) 
{ - if (rlang::is_logical(.p)) { + if (is_logical(.p)) { stopifnot(length(.p) == length(.x)) .p } else { diff --git a/R/layers.R b/R/layers.R index 2a7568d51..538fcad1b 100644 --- a/R/layers.R +++ b/R/layers.R @@ -148,9 +148,9 @@ validate_layer <- function(x, ..., arg = rlang::caller_arg(x), call = caller_env()) { rlang::check_dots_empty() if (!is_layer(x)) { -- cli_abort( -- "{arg} must be a frosting layer, not a {.cls {class(x)[[1]]}}.", - .call = call + cli_abort( + "{arg} must be a frosting layer, not a {.cls {class(x)[[1]]}}.", + call = call ) } invisible(x) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index be359fc9c..d5776940e 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -163,9 +163,9 @@ #' the date from which the forecast is actually occurring. If `NULL`, the #' `forecast_date` is determined either via the `fixed_latency`, or is set to #' the `epi_df`'s `as_of` value if `fixed_latency` is also `NULL`. -#' @param role For model terms created by this step, what analysis role should -#' they be assigned? `lag` is a predictor while `ahead` is an outcome. It -#' should be correctly inferred and not need setting +#' @param check_latency_length bool, determines whether to warn if the latency +#' is unusually high. Turn off if you know your forecast is going to be far +#' into the future. 
#' @template step-return #' @inheritParams recipes::step_lag #' @@ -197,30 +197,22 @@ step_adjust_latency <- function(recipe, ..., - role = NA, - trained = FALSE, method = c( "extend_ahead", "locf", "extend_lags" ), - epi_keys_checked = c("geo_value"), + epi_keys_checked = NULL, fixed_latency = NULL, fixed_forecast_date = NULL, - skip = FALSE, - columns = NULL, - id = recipes::rand_id("adjust_latency")) { + check_latency_length = TRUE, + id = rand_id("adjust_latency")) { arg_is_chr_scalar(id, method) if (!is_epi_recipe(recipe)) { cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", class = "epipredict__step_adjust_latency__epi_recipe_only" ) } - if (!is.null(columns)) { - cli_abort(c("The `columns` argument must be `NULL`.", - i = "Use `tidyselect` methods to choose columns to lag." - ), class = "epipredict__step_adjust_latency__cols_not_null") - } if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { cli_warn( "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified.", @@ -261,22 +253,25 @@ then the previous `step_epi_lag`s won't work with modified data.", rel_step_type <- "step_epi_lag" shift_name <- "lag" } - + if (is.null(epi_keys_checked)) { + epi_keys_checked <- kill_time_value(key_colnames(recipe$template)) + } recipes::add_step( recipe, step_adjust_latency_new( terms = enquos(...), - role = role, + role = NA, method = method, epi_keys_checked = epi_keys_checked, - trained = trained, + check_latency_length = check_latency_length, + trained = FALSE, forecast_date = fixed_forecast_date, latency = fixed_latency, latency_table = NULL, metadata = NULL, keys = key_colnames(recipe), - columns = columns, - skip = skip, + columns = NULL, + skip = FALSE, id = id ) ) @@ -284,7 +279,7 @@ then the previous `step_epi_lag`s won't work with modified data.", step_adjust_latency_new <- function(terms, role, trained, forecast_date, latency, latency_table, - metadata, time_type, keys, method, 
epi_keys_checked, columns, skip, + metadata, time_type, keys, method, epi_keys_checked, check_latency_length, columns, skip, id) { step( subclass = "adjust_latency", @@ -292,6 +287,7 @@ step_adjust_latency_new <- role = role, method = method, epi_keys_checked = epi_keys_checked, + check_latency_length = check_latency_length, trained = trained, forecast_date = forecast_date, latency = latency, @@ -337,7 +333,6 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { attributes(training)$metadata$latency_table <- latency_table # get the columns used, even if it's all of them terms_used <- x$terms - # TODO replace with is_empty as in bake.recipe if (is_empty(terms_used)) { terms_used <- info %>% filter(role == "raw") %>% @@ -357,6 +352,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { keys = x$keys, method = x$method, epi_keys_checked = x$epi_keys_checked, + check_latency_length = x$check_latency_length, columns = recipes_eval_select(latency_table$col_name, training, info), skip = x$skip, id = x$id @@ -383,7 +379,9 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") modified_columns <- object$columns %>% unname() - check_interminable_latency(new_data, object$latency_table, modified_columns, object$forecast_date) + if (object$check_latency_length) { + check_interminable_latency(new_data, object$latency_table, modified_columns, object$forecast_date) + } new_data <- new_data %>% pad_to_end(rel_keys, object$forecast_date, modified_columns) %>% diff --git a/R/utils-latency.R b/R/utils-latency.R index 9398f8188..f53f01ca3 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -223,17 +223,9 @@ get_grouping_columns <- function(x) { #' get the location of the last real value #' @param col the relevant column -#' @param from_last instead of the number of columns #' @keywords internal -count_single_column <- function(col, from_last) { - run_lengths <- rle(is.na(col)) - n_el <- length(col) - if (tail(run_lengths$values, 1)) { - n_at_end <- tail(run_lengths$lengths, 1) - return(n_el - n_at_end) - } else { - return(n_el) - } +count_single_column <- function(col) { + max(which(!is.na(col))) } diff --git a/R/utils-shift.R b/R/utils-shift.R deleted file mode 100644 index 7b9de6a08..000000000 --- a/R/utils-shift.R +++ /dev/null @@ -1,71 +0,0 @@ -#' various ways of handling differences between the `as_of` date and the maximum -#' time value -#' @description -#' adjust the ahead so that we will be predicting `ahead` days after the `as_of` -#' date, rather than relative to the last day of data -#' @keywords internal -adjust_latency <- function(object, new_data) { - method <- object$latency_adjustment - ahead <- object$ahead - if (is.na(method) || is.null(method) || method == "None") { - return(object$ahead) - } else if (method == "extend_ahead") { - as_of <- attributes(new_data)$metadata$as_of - if (FALSE && (typeof(as_of) != typeof(new_data$time_value))) { - cli_abort(paste( - "the data 
matrix `as_of` value is {as_of}, ", - "and not a valid `time_type` with type ", - "matching `time_value`'s type of ", - "{typeof(new_data$time_value)}." - )) - } - # adjust the ahead so that we're predicting relative to the as_of date, - # rather - # than the last day of data - time_values <- new_data$time_value - if (length(time_values) > 0) { - max_time <- max(time_values) - shift_amount <- as.Date(as_of) - max_time - if (is.null(as_of) || is.na(as_of)) { - cli_warn(paste( - "epi_data's `as_of` was {as_of}, setting to ", - "the latest time value, {max_time}." - )) - as_of <- max_time - } else if (as_of < max_time) { - cli_abort(paste( - "`as_of` ({(as_of)}) is before the most ", - "recent data ({max_time}). Remove before ", - "predicting." - )) - } - effective_ahead <- as.integer(shift_amount + ahead) - time_type <- attributes(new_data)$metadata$time_type - - if ((grepl("day", time_type) && (shift_amount >= 10)) || - (grepl("week", time_type) && (shift_amount >= 4)) || - ((time_type == "yearmonth") && (shift_amount >= 2)) || - ((time_type == "yearquarter") && (shift_amount >= 1)) || - ((time_type == "year") && (shift_amount >= 1))) { - cli_warn(paste( - "!" = glue::glue( - "The ahead has been adjusted by {shift_amount}, ", - "which is questionable for it's `time_type` of ", - "{time_type}" - ), - "i" = "input ahead: {ahead}", - "i" = "shifted ahead: {effective_ahead}", - "i" = "max_time = {max_time} -> as_of = {as_of}" - )) - } - return(effective_ahead) - } else { - cli_abort("the `time_value` column of `new_data` is empty") - } - } else { - cli_abort(paste( - "Latency adjustment method {method} has not yet ", - "been implemented for `step_epi_ahead`." 
- )) - } -} diff --git a/man/adjust_latency.Rd b/man/adjust_latency.Rd deleted file mode 100644 index eaebd5c29..000000000 --- a/man/adjust_latency.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-shift.R -\name{adjust_latency} -\alias{adjust_latency} -\title{various ways of handling differences between the \code{as_of} date and the maximum -time value} -\usage{ -adjust_latency(object, new_data) -} -\description{ -adjust the ahead so that we will be predicting \code{ahead} days after the \code{as_of} -date, rather than relative to the last day of data -} -\keyword{internal} diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 214726fde..39ef23e09 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -10,7 +10,7 @@ arx_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, - adjust_latency = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -44,12 +44,12 @@ non-\code{NULL}. \item{target_date}{Date. The date that is being forecast. The default \code{NULL} will determine this automatically as \code{forecast_date + ahead}.} -\item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of -\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{NULL} (in which case there is no adjustment). +\item{adjust_latency}{Character. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{"none"} (in which case there is no adjustment). If the \code{forecast_date} is after the last day of data, this determines how to shift the model to account for this difference. 
The options are: \itemize{ -\item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"none"} the default, assumes the \code{forecast_date} is the last day of data \item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to the last day of data. For example, if the last day of data was 3 days ago, the ahead becomes \code{ahead+3}. diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 185b868e5..4873d59ae 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -46,12 +46,12 @@ non-\code{NULL}. \item{target_date}{Date. The date that is being forecast. The default \code{NULL} will determine this automatically as \code{forecast_date + ahead}.} -\item{adjust_latency}{Character or \code{NULL}. One of the \code{method}s of -\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{NULL} (in which case there is no adjustment). +\item{adjust_latency}{Character. One of the \code{method}s of +\code{\link[=step_adjust_latency]{step_adjust_latency()}}, or \code{"none"} (in which case there is no adjustment). If the \code{forecast_date} is after the last day of data, this determines how to shift the model to account for this difference. The options are: \itemize{ -\item \code{NULL} the default, assumes the \code{forecast_date} is the last day of data +\item \code{"none"} the default, assumes the \code{forecast_date} is the last day of data \item \code{"extend_ahead"}: increase the \code{ahead} by the latency so it's relative to the last day of data. For example, if the last day of data was 3 days ago, the ahead becomes \code{ahead+3}. 
diff --git a/man/count_single_column.Rd b/man/count_single_column.Rd index 090f064e4..7922511e7 100644 --- a/man/count_single_column.Rd +++ b/man/count_single_column.Rd @@ -4,12 +4,10 @@ \alias{count_single_column} \title{get the location of the last real value} \usage{ -count_single_column(col, from_last) +count_single_column(col) } \arguments{ \item{col}{the relevant column} - -\item{from_last}{instead of the number of columns} } \description{ get the location of the last real value diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9b10d1822..80d6aa3ef 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -7,15 +7,12 @@ step_adjust_latency( recipe, ..., - role = NA, - trained = FALSE, method = c("extend_ahead", "locf", "extend_lags"), - epi_keys_checked = c("geo_value"), + epi_keys_checked = NULL, fixed_latency = NULL, fixed_forecast_date = NULL, - skip = FALSE, - columns = NULL, - id = recipes::rand_id("adjust_latency") + check_latency_length = TRUE, + id = rand_id("adjust_latency") ) } \arguments{ @@ -25,13 +22,6 @@ sequence of operations for this recipe.} \item{...}{One or more selector functions to choose variables for this step. See \code{\link[recipes:selections]{selections()}} for more details.} -\item{role}{For model terms created by this step, what analysis role should -they be assigned? \code{lag} is a predictor while \code{ahead} is an outcome. It -should be correctly inferred and not need setting} - -\item{trained}{A logical to indicate if the quantities for -preprocessing have been estimated.} - \item{method}{a character. Determines the method by which the forecast handles latency. The options are: \itemize{ @@ -73,15 +63,9 @@ the date from which the forecast is actually occurring. If \code{NULL}, the \code{forecast_date} is determined either via the \code{fixed_latency}, or is set to the \code{epi_df}'s \code{as_of} value if \code{fixed_latency} is also \code{NULL}.} -\item{skip}{A logical. 
Should the step be skipped when the -recipe is baked by \code{\link[recipes:bake]{bake()}}? While all operations are baked -when \code{\link[recipes:prep]{prep()}} is run, some operations may not be able to be -conducted on new data (e.g. processing the outcome variable(s)). -Care should be taken when using \code{skip = TRUE} as it may affect -the computations for subsequent operations.} - -\item{columns}{A character string of the selected variable names. This field -is a placeholder and will be populated once \code{\link[recipes:prep]{prep()}} is used.} +\item{check_latency_length}{bool, determines whether to warn if the latency +is unusually high. Turn off if you know your forecast is going to be far +into the future.} \item{id}{A character string that is unique to this step to identify it.} } @@ -276,8 +260,6 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with modified -#> data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index 6d71ee6eb..d408ba7d8 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -98,28 +98,6 @@ test_that("set_forecast_date works", { expect_equal(set_forecast_date(modified_data, info, NULL, NULL), as_of) }) -# confirm the delay is right -test_that("adjust_latency extend_ahead works", { - # testing that POSIXct converts correctly (as well as basic types) - expect_equal( - attributes(x_adjust_ahead)$metadata$as_of - max(x_adjust_ahead$time_value), - as.difftime(3, units = "days") - ) - object <- list(latency_adjustment = "extend_ahead", ahead = 7) - expect_no_error(adjusted_ahead <- adjust_latency(object, x_adjust_ahead)) - expect_type(adjusted_ahead, "integer") - expect_equal(adjusted_ahead, 3 + 7) -}) - -test_that("extend_ahead warns in case of extreme adjustment", { - # warns if the ahead is relatively small - attributes(x_adjust_ahead)$metadata$as_of <- - max(x_adjust_ahead$time_value) + 100 - object <- list(latency_adjustment = "extend_ahead", ahead = 7) - attributes(x_adjust_ahead)$metadata$time_type - testthat::expect_warning(adjust_latency(object, x_adjust_ahead), regexp = "The ahead has been adjusted by 100") -}) - test_that("pad_to_end works correctly", { single_ex <- tribble( ~geo_value, ~time_value, ~a, ~b, From b5ed1b3a16a52dbf11fb887eb5b5788cf66b40a6 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 13 Sep 2024 17:54:09 -0500 Subject: [PATCH 73/92] remaining recs besides metadata to term_info --- DESCRIPTION | 1 - NAMESPACE | 1 - R/arx_classifier.R | 18 +-- R/arx_forecaster.R | 8 +- R/step_adjust_latency.R | 91 ++++++++------- R/utils-latency.R | 104 ++++++++++++------ man/arx_args_list.Rd | 4 + man/arx_class_args_list.Rd | 6 +- ..._forecast_date.Rd => get_forecast_date.Rd} | 6 +- man/get_grouping_columns.Rd | 15 --- man/get_latency_table.Rd | 25 +++++ 
man/step_adjust_latency.Rd | 2 + tests/testthat/test-utils_latency.R | 8 +- 13 files changed, 179 insertions(+), 110 deletions(-) rename man/{set_forecast_date.Rd => get_forecast_date.Rd} (71%) delete mode 100644 man/get_grouping_columns.Rd create mode 100644 man/get_latency_table.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 552857a36..d89f9b276 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -41,7 +41,6 @@ Imports: recipes (>= 1.0.4), rlang (>= 1.1.0), purrr, - smoothqr, stats, tibble, tidyr, diff --git a/NAMESPACE b/NAMESPACE index ba319cecd..4c5ec75ac 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -318,7 +318,6 @@ importFrom(tidyr,fill) importFrom(tidyr,unnest) importFrom(tidyselect,all_of) importFrom(utils,capture.output) -importFrom(utils,head) importFrom(vctrs,as_list_of) importFrom(vctrs,field) importFrom(vctrs,new_rcrd) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 2bd2c30a1..9cc76052f 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -55,7 +55,7 @@ arx_classifier <- function( wf <- arx_class_epi_workflow(epi_data, outcome, predictors, trainer, args_list) wf <- fit(wf, epi_data) - if (is.null(args_list$adjust_latency)) { + if (args_list$adjust_latency == "none") { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring {forecast_date}.") @@ -133,7 +133,7 @@ arx_class_epi_workflow <- function( cli_abort("`trainer` must be a {.pkg parsnip} model of mode 'classification'.") } - if (is.null(args_list$adjust_latency)) { + if (args_list$adjust_latency == "none") { forecast_date_default <- max(epi_data$time_value) if (!is.null(args_list$forecast_date) && args_list$forecast_date != forecast_date_default) { cli_warn("The specified forecast date {args_list$forecast_date} doesn't match the date from which the forecast is occurring 
{forecast_date}.") @@ -190,9 +190,9 @@ arx_class_epi_workflow <- function( ) } } - ahead_out_name <- rlang::sym(paste0("ahead_", args_list$ahead, "_", pre_out_name)) + ahead_out_name <- glue::glue("ahead_[0-9]*_{pre_out_name}") method_adjust_latency <- args_list$adjust_latency - if (!is.null(method_adjust_latency)) { + if (method_adjust_latency != "none") { # only extend_ahead is supported atm r <- r %>% step_adjust_latency(!!pre_out_name, fixed_forecast_date = forecast_date, @@ -204,7 +204,7 @@ arx_class_epi_workflow <- function( r <- r %>% step_mutate( across( - starts_with("ahead_"), + matches(ahead_out_name), ~ cut(.x, breaks = args_list$breaks), .names = "outcome_class", .unpack = TRUE @@ -289,7 +289,8 @@ arx_class_args_list <- function( n_training = Inf, forecast_date = NULL, target_date = NULL, - adjust_latency = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, @@ -305,8 +306,9 @@ arx_class_args_list <- function( method <- rlang::arg_match(method) outcome_transform <- rlang::arg_match(outcome_transform) - arg_is_scalar(ahead, n_training, horizon, log_scale) - arg_is_scalar(forecast_date, target_date, adjust_latency, allow_null = TRUE) + adjust_latency <- rlang::arg_match(adjust_latency) + arg_is_scalar(ahead, n_training, horizon, log_scale, adjust_latency, warn_latency) + arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) arg_is_nonneg_int(ahead, lags, horizon) arg_is_numeric(breaks) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 14d8b112b..4d7dab60e 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -52,7 +52,7 @@ arx_forecaster <- function( wf <- fit(wf, epi_data) # get the forecast date for the forecast function - if (is.null(args_list$adjust_latency)) { + if (args_list$adjust_latency == "none") { forecast_date_default <- 
max(epi_data$time_value) } else { forecast_date_default <- attributes(epi_data)$metadata$as_of @@ -247,6 +247,8 @@ arx_fcast_epi_workflow <- function( #' - `"extend_lags"`: increase the lags so they're relative to the actual #' forecast date. For example, if the lags are `c(0,7,14)` and the last day of #' data was 3 days ago, the lags become `c(3,10,17)`. +#' @param warn_latency by default, `step_adjust_latency` warns the user if the +#' latency is large. If this is `FALSE`, that warning is turned off. #' @param quantile_levels Vector or `NULL`. A vector of probabilities to produce #' prediction intervals. These are created by computing the quantiles of #' training residuals. A `NULL` value will result in point forecasts only. @@ -284,6 +286,7 @@ arx_args_list <- function( forecast_date = NULL, target_date = NULL, adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -297,7 +300,7 @@ arx_args_list <- function( if (is.list(lags)) lags <- unlist(lags) adjust_latency <- rlang::arg_match(adjust_latency) - arg_is_scalar(ahead, n_training, symmetrize, nonneg, adjust_latency) + arg_is_scalar(ahead, n_training, symmetrize, nonneg, adjust_latency, warn_latency) arg_is_chr(quantile_by_key, allow_empty = TRUE) arg_is_scalar(forecast_date, target_date, allow_null = TRUE) arg_is_date(forecast_date, target_date, allow_null = TRUE) @@ -331,6 +334,7 @@ arx_args_list <- function( forecast_date, target_date, adjust_latency, + warn_latency, symmetrize, nonneg, max_lags, diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index d5776940e..78aaafe7e 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -265,11 +265,11 @@ then the previous `step_epi_lag`s won't work with modified data.", epi_keys_checked = epi_keys_checked, check_latency_length = check_latency_length, trained = FALSE, - forecast_date = fixed_forecast_date, + fixed_forecast_date = 
fixed_forecast_date, + forecast_date = NULL, latency = fixed_latency, latency_table = NULL, metadata = NULL, - keys = key_colnames(recipe), columns = NULL, skip = FALSE, id = id @@ -278,8 +278,8 @@ then the previous `step_epi_lag`s won't work with modified data.", } step_adjust_latency_new <- - function(terms, role, trained, forecast_date, latency, latency_table, - metadata, time_type, keys, method, epi_keys_checked, check_latency_length, columns, skip, + function(terms, role, trained, fixed_forecast_date, forecast_date, latency, latency_table, + metadata, time_type, method, epi_keys_checked, check_latency_length, columns, skip, id) { step( subclass = "adjust_latency", @@ -289,47 +289,28 @@ step_adjust_latency_new <- epi_keys_checked = epi_keys_checked, check_latency_length = check_latency_length, trained = trained, + fixed_forecast_date = fixed_forecast_date, forecast_date = forecast_date, latency = latency, latency_table = latency_table, metadata = metadata, - keys = keys, columns = columns, skip = skip, id = id ) } + # lags introduces max(lags) NA's after the max_time_value. #' @export #' @importFrom glue glue #' @importFrom dplyr rowwise prep.step_adjust_latency <- function(x, training, info = NULL, ...) { - sign_shift <- get_sign(x) latency <- x$latency - forecast_date <- x$forecast_date %||% set_forecast_date(training, info, x$epi_keys_checked, latency) - # construct the latency table - latency_table <- names(training)[!names(training) %in% key_colnames(training)] %>% - tibble(col_name = .) 
- if (length(recipes_eval_select(x$terms, training, info)) > 0) { - latency_table <- latency_table %>% filter(col_name %in% - recipes_eval_select(x$terms, training, info)) - } + forecast_date <- x$fixed_forecast_date %||% get_forecast_date(training, info, x$epi_keys_checked, latency) - if (is.null(latency)) { - latency_table <- latency_table %>% - rowwise() %>% - mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, x$epi_keys_checked)) - } else if (length(latency) > 1) { - # if latency has a length, it must also have named elements. We assign based on comparing the name in the list - # with the column names, and drop any which don't have a latency assigned - latency_table <- latency_table %>% - filter(col_name %in% names(latency)) %>% - rowwise() %>% - mutate(latency = unname(latency[names(latency) == col_name])) %>% - ungroup() - } else { - latency_table <- latency_table %>% mutate(latency = latency) - } + latency_table <- get_latency_table( + training, NULL, forecast_date, latency, + get_sign(x), x$epi_keys_checked, info, x$terms) attributes(training)$metadata$latency_table <- latency_table # get the columns used, even if it's all of them terms_used <- x$terms @@ -338,18 +319,16 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { filter(role == "raw") %>% pull(variable) } - - - + step_adjust_latency_new( terms = x$terms, role = x$role, trained = TRUE, + fixed_forecast_date = x$fixed_forecast_date, forecast_date = forecast_date, - latency = unique(latency_table$latency), + latency = x$latency, latency_table = latency_table, metadata = attributes(training)$metadata, - keys = x$keys, method = x$method, epi_keys_checked = x$epi_keys_checked, check_latency_length = x$check_latency_length, @@ -363,18 +342,50 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { #' @importFrom tidyr fill #' @export bake.step_adjust_latency <- function(object, new_data, ...) 
{ - if (!isa(new_data, "epi_df")) { - # TODO if new_data actually has keys other than geo_value and time_value, this is going to cause problems + if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) { new_data <- as_epi_df(new_data) attributes(new_data)$metadata <- object$metadata attributes(new_data)$metadata$as_of <- object$forecast_date + } else { + latency <- object$latency + current_forecast_date <- object$fixed_forecast_date %||% + get_forecast_date( + new_data, NULL, object$epi_keys_checked, latency, c(key_colnames(new_data), object$columns) + ) + local_latency_table <- get_latency_table( + new_data, object$columns, current_forecast_date, latency, + get_sign(object), object$epi_keys_checked, NULL, NULL) + comparison_table <- local_latency_table %>% + ungroup() %>% + dplyr::full_join(object$latency_table %>% ungroup(), by = join_by(col_name), suffix = c(".bake",".prep")) %>% + mutate(bakeMprep = latency.bake - latency.prep) + if (any(comparison_table$bakeMprep > 0)) { + cli_abort(paste0( + "There is more latency at bake time than there was at prep time.", + " You will need to fit a model with more latency to predict on this dataset."), + class = "epipredict__latency__bake_prep_difference_error", + latency_table = comparison_table) + } + if (any(comparison_table$bakeMprep < 0)) { + cli_warn(paste0( + "There is less latency at bake time than there was at prep time.", + " This will still fit, but will discard the most recent data."), + class = "epipredict__latency__bake_prep_difference_warn", + latency_table = comparison_table) + } + if (current_forecast_date != object$forecast_date){ + cli_warn(paste0( + "The forecast date differs from the one set at train time; ", + " this means any dates added by `layer_forecast_date` will be inaccurate."), + class = "epipredict__latency__bake_prep_forecast_date_warn" + ) + } } if (object$method == "extend_ahead" || object$method == "extend_lags") { attributes(new_data)$metadata$latency_method <- 
object$method attributes(new_data)$metadata$shift_sign <- get_sign(object) attributes(new_data)$metadata$latency_table <- object$latency_table attributes(new_data)$metadata$forecast_date <- object$forecast_date - keys <- object$keys } else if (object$method == "locf") { # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") @@ -399,13 +410,13 @@ print.step_adjust_latency <- if (!is.null(x$forecast_date)) { conj <- "with forecast date" extra_text <- x$forecast_date - } else if (!is.null(x$latency)) { - conj <- if (length(x$latency == 1)) { + } else if (!is.null(x$latency_table)) { + conj <- if (nrow(x$latency) == 1) { "with latency" } else { "with latencies" } - extra_text <- x$latency + extra_text <- unique(x$latency_table$latency) } else { conj <- "with latency" extra_text <- "set at train time" diff --git a/R/utils-latency.R b/R/utils-latency.R index f53f01ca3..8e8655e2a 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -15,7 +15,7 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name return(NULL) } rel_list <- recipe$steps %>% - purrr::map(extract_named_rates) %>% + map(extract_named_rates) %>% unlist(recursive = FALSE) %>% split(c("term", "shift", "prefix")) relevant_shifts <- tibble( @@ -34,26 +34,28 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name #' @importFrom dplyr select #' @importFrom tidyr drop_na #' @importFrom utils capture.output -set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { - original_columns <- info %>% - filter(source == "original") %>% - pull(variable) - # make sure that there's enough column names - if (length(original_columns) < 3) { - cli_abort( - glue::glue( - "The original columns of `time_value`, ", - "`geo_value` and at least one signal. 
The current colums are \n", - paste(capture.output(object$info), collapse = "\n\n") - ), - class = "epipredict__set_forecast_date__too_few_data_columns" - ) +get_forecast_date <- function(new_data, info, epi_keys_checked, latency, columns = NULL) { + if (is.null(columns)) { + columns <- info %>% + filter(source == "original") %>% + pull(variable) + # make sure that there's enough column names + if (length(columns) < 3) { + cli_abort( + glue::glue( + "The original columns of `time_value`, ", + "`geo_value` and at least one signal. The current colums are \n", + paste(capture.output(object$info), collapse = "\n\n") + ), + class = "epipredict__get_forecast_date__too_few_data_columns" + ) + } } # the source data determines the actual time_values # these are the non-na time_values; # get the minimum value across the checked epi_keys' maximum time values max_time <- new_data %>% - select(all_of(original_columns)) %>% + select(all_of(columns)) %>% drop_na() # null and "" don't work in `group_by` if (!is.null(epi_keys_checked) && (epi_keys_checked != "")) { @@ -77,7 +79,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { "matching `time_value`'s type of ", "{class(max_time)}." ), - class = "epipredict__set_forecast_date__wrong_time_value_type_error" + class = "epipredict__get_forecast_date__wrong_time_value_type_error" ) } if (is.null(forecast_date) || is.na(forecast_date)) { @@ -86,7 +88,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { "epi_data's `forecast_date` was {forecast_date}, setting to ", "the latest time value, {max_time}." ), - class = "epipredict__set_forecast_date__max_time_warning" + class = "epipredict__get_forecast_date__max_time_warning" ) forecast_date <- max_time } else if (forecast_date < max_time) { @@ -96,7 +98,7 @@ set_forecast_date <- function(new_data, info, epi_keys_checked, latency) { "recent data ({max_time}). Remove before ", "predicting." 
), - class = "epipredict__set_forecast_date__misordered_forecast_date_error" + class = "epipredict__get_forecast_date__misordered_forecast_date_error" ) } # TODO cover the rest of the possible types for as_of and max_time... @@ -135,7 +137,7 @@ get_latency <- function(new_data, forecast_date, column, sign_shift, epi_keys_ch #' a potentially different max_time_value #' @keywords internal get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new_data) { - max_time_value <- as.Date(max( + forecast_date <- as.Date(max( workflow_max_time_value, this_recipe$max_time_value, max(new_data$time_value) @@ -149,16 +151,16 @@ get_forecast_date_in_layer <- function(this_recipe, workflow_max_time_value, new } ) %>% Filter(Negate(is.null), .) if (length(handpicked_forecast_date) > 0) { - max_time_value <- handpicked_forecast_date[[1]] + forecast_date <- handpicked_forecast_date[[1]] } else { # if we haven't chosen one, use either the max_time_value or the as_of - max_time_value <- max( - max_time_value, + forecast_date <- max( + forecast_date, attributes(new_data)$metadata$as_of ) } } - max_time_value + forecast_date } @@ -208,17 +210,7 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { slice(1:min(across(all_of(columns_to_complete), count_single_column))) %>% bind_rows(filled_values) %>% arrange(across(all_of(key_colnames(x)))) %>% - ungroup() %>% - group_by(across(all_of(get_grouping_columns(x)))) -} - -#' return the names of the grouped columns, or `NULL` -#' @param x an epi_df -#' @keywords internal -#' @importFrom utils head -get_grouping_columns <- function(x) { - group_names <- names(attributes(x)$groups) - head(group_names, -1) + ungroup() } #' get the location of the last real value @@ -284,3 +276,45 @@ check_interminable_latency <- function(dataset, latency_table, target_columns, f ) } } + +#' create the latency table +#' This is a table of column names and the latency adjustment necessary for that column. 
An example: +#' +#' col_name latency +#' +#' 1 case_rate 5 +#' 2 death_rate 5 +#' @keywords internal +#' @importFrom dplyr rowwise +get_latency_table <- function(training, columns, forecast_date, latency, sign_shift, epi_keys_checked, info, terms) { + if (is.null(columns)) { + columns <- recipes_eval_select(terms, training, info) + } + # construct the latency table + latency_table <- names(training)[!names(training) %in% key_colnames(training)] %>% + tibble(col_name = .) + if (length(columns) > 0) { + latency_table <- latency_table %>% filter(col_name %in% + columns) + } + + if (is.null(latency)) { + latency_table <- latency_table %>% + rowwise() %>% + mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, epi_keys_checked)) + } else if (length(latency) > 1) { + # if latency has a length, it must also have named elements. We assign based on comparing the name in the list + # with the column names, and drop any which don't have a latency assigned + latency_table <- latency_table %>% + filter(col_name %in% names(latency)) %>% + rowwise() %>% + mutate(latency = unname(latency[names(latency) == col_name])) + } else { + tmp_latency_table <- latency_table %>% + rowwise() %>% + mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, epi_keys_checked)) + if (latency ) + latency_table <- latency_table %>% mutate(latency = latency) + } + return(latency_table %>% ungroup()) +} diff --git a/man/arx_args_list.Rd b/man/arx_args_list.Rd index 39ef23e09..f28cdefab 100644 --- a/man/arx_args_list.Rd +++ b/man/arx_args_list.Rd @@ -11,6 +11,7 @@ arx_args_list( forecast_date = NULL, target_date = NULL, adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, quantile_levels = c(0.05, 0.95), symmetrize = TRUE, nonneg = TRUE, @@ -58,6 +59,9 @@ forecast date. For example, if the lags are \code{c(0,7,14)} and the last day of data was 3 days ago, the lags become \code{c(3,10,17)}. 
}} +\item{warn_latency}{by default, \code{step_adjust_latency} warns the user if the +latency is large. If this is \code{FALSE}, that warning is turned off.} + \item{quantile_levels}{Vector or \code{NULL}. A vector of probabilities to produce prediction intervals. These are created by computing the quantiles of training residuals. A \code{NULL} value will result in point forecasts only.} diff --git a/man/arx_class_args_list.Rd b/man/arx_class_args_list.Rd index 4873d59ae..a229b67c0 100644 --- a/man/arx_class_args_list.Rd +++ b/man/arx_class_args_list.Rd @@ -10,7 +10,8 @@ arx_class_args_list( n_training = Inf, forecast_date = NULL, target_date = NULL, - adjust_latency = NULL, + adjust_latency = c("none", "extend_ahead", "extend_lags", "locf"), + warn_latency = TRUE, outcome_transform = c("growth_rate", "lag_difference"), breaks = 0.25, horizon = 7L, @@ -60,6 +61,9 @@ forecast date. For example, if the lags are \code{c(0,7,14)} and the last day of data was 3 days ago, the lags become \code{c(3,10,17)}. }} +\item{warn_latency}{by default, \code{step_adjust_latency} warns the user if the +latency is large. If this is \code{FALSE}, that warning is turned off.} + \item{outcome_transform}{Scalar character. Whether the outcome should be created using growth rates (as the predictors are) or lagged differences. 
The second case is closer to the requirements for the diff --git a/man/set_forecast_date.Rd b/man/get_forecast_date.Rd similarity index 71% rename from man/set_forecast_date.Rd rename to man/get_forecast_date.Rd index 29dd98d33..6a35fff04 100644 --- a/man/set_forecast_date.Rd +++ b/man/get_forecast_date.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-latency.R -\name{set_forecast_date} -\alias{set_forecast_date} +\name{get_forecast_date} +\alias{get_forecast_date} \title{Extract the as_of for the forecast date, and make sure there's nothing very off about it.} \usage{ -set_forecast_date(new_data, info, epi_keys_checked, latency) +get_forecast_date(new_data, info, epi_keys_checked, latency, columns = NULL) } \description{ Extract the as_of for the forecast date, and make sure there's nothing very off about it. diff --git a/man/get_grouping_columns.Rd b/man/get_grouping_columns.Rd deleted file mode 100644 index f8b61af42..000000000 --- a/man/get_grouping_columns.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils-latency.R -\name{get_grouping_columns} -\alias{get_grouping_columns} -\title{return the names of the grouped columns, or \code{NULL}} -\usage{ -get_grouping_columns(x) -} -\arguments{ -\item{x}{an epi_df} -} -\description{ -return the names of the grouped columns, or \code{NULL} -} -\keyword{internal} diff --git a/man/get_latency_table.Rd b/man/get_latency_table.Rd new file mode 100644 index 000000000..ae309c944 --- /dev/null +++ b/man/get_latency_table.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{get_latency_table} +\alias{get_latency_table} +\title{create the latency table +This is a table of column names and the latency adjustment necessary for that column. 
An example:} +\usage{ +get_latency_table( + training, + columns, + forecast_date, + latency, + sign_shift, + epi_keys_checked, + info, + terms +) +} +\description{ +col_name latency +\if{html}{\out{}} \if{html}{\out{}} +1 case_rate 5 +2 death_rate 5 +} +\keyword{internal} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 80d6aa3ef..6ea9581c0 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -260,6 +260,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with modified +#> data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/test-utils_latency.R b/tests/testthat/test-utils_latency.R index d408ba7d8..3ba6453a0 100644 --- a/tests/testthat/test-utils_latency.R +++ b/tests/testthat/test-utils_latency.R @@ -84,7 +84,7 @@ test_that("get_latency infers max_time to be the minimum `max time` across group }) -test_that("set_forecast_date works", { +test_that("get_forecast_date works", { info <- tribble( ~variable, ~type, ~role, ~source, "time_value", "date", "time_value", "original", @@ -93,9 +93,9 @@ test_that("set_forecast_date works", { "death_rate", "numeric", "raw", "original", "not_real", "numeric", "predictor", "derived" ) - expect_equal(set_forecast_date(modified_data, info, "geo_value", NULL), as_of) - expect_equal(set_forecast_date(modified_data, info, "", NULL), as_of) - expect_equal(set_forecast_date(modified_data, info, NULL, NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, "geo_value", NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, "", NULL), as_of) + expect_equal(get_forecast_date(modified_data, info, NULL, NULL), as_of) }) test_that("pad_to_end works correctly", { From 50cecaefd7227819ba16320fe3b8996b02c675ee Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 16 Sep 2024 14:49:38 -0500 Subject: [PATCH 74/92] moving step checks to a separate function, styler --- R/arx_classifier.R | 2 +- R/step_adjust_latency.R | 83 +++++++++++++------------------------- R/utils-latency.R | 57 +++++++++++++++++++++++++- man/step_adjust_latency.Rd | 4 +- 4 files changed, 87 insertions(+), 59 deletions(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 9cc76052f..c5166a176 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -192,7 +192,7 @@ arx_class_epi_workflow <- function( } ahead_out_name <- glue::glue("ahead_[0-9]*_{pre_out_name}") method_adjust_latency <- args_list$adjust_latency - if (method_adjust_latency != "none") { + if (method_adjust_latency 
!= "none") { # only extend_ahead is supported atm r <- r %>% step_adjust_latency(!!pre_out_name, fixed_forecast_date = forecast_date, diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 78aaafe7e..1ac3a4fc8 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -207,44 +207,9 @@ step_adjust_latency <- fixed_forecast_date = NULL, check_latency_length = TRUE, id = rand_id("adjust_latency")) { - arg_is_chr_scalar(id, method) - if (!is_epi_recipe(recipe)) { - cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", - class = "epipredict__step_adjust_latency__epi_recipe_only" - ) - } - if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { - cli_warn( - "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't be modified.", - class = "epipredict__step_adjust_latency__misordered_step_warning" - ) - } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) { - cli_warn( - "If `method` is {.val extend_lags} or {.val locf}, -then the previous `step_epi_lag`s won't work with modified data.", - class = "epipredict__step_adjust_latency__misordered_step_warning" - ) - } else if ((method == "locf") && (length(recipe$steps) > 0)) { - cli_warn("There are steps before `step_adjust_latency`. 
With the method {.val locf}, it is recommended to include this step before any others", - class = "epipredict__step_adjust_latency__misordered_step_warning" - ) - } - if (detect_step(recipe, "naomit")) { - cli_abort("adjust_latency needs to occur before any `NA` removal, - as columns may be moved around", class = "epipredict__step_adjust_latency__post_NA_error") - } - if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { - cli_abort("Only one of `fixed_latency` and `fixed_forecast_date` - can be non-`NULL` at a time!", class = "epipredict__step_adjust_latency__too_many_args_error") - } - if (length(fixed_latency > 1)) { - template <- recipe$template - data_names <- names(template)[!names(template) %in% key_colnames(template)] - wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] - if (length(wrong_names) > 0) { - cli_abort("{.val fixed_latency} contains names not in the template dataset: {wrong_names}", class = "epipredict__step_adjust_latency__undefined_names_error") - } - } + step_adjust_latency_checks( + id, method, recipe, fixed_latency, fixed_forecast_date + ) method <- rlang::arg_match(method) if (method == "extend_ahead") { rel_step_type <- "step_epi_ahead" @@ -310,7 +275,8 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { latency_table <- get_latency_table( training, NULL, forecast_date, latency, - get_sign(x), x$epi_keys_checked, info, x$terms) + get_sign(x), x$epi_keys_checked, info, x$terms + ) attributes(training)$metadata$latency_table <- latency_table # get the columns used, even if it's all of them terms_used <- x$terms @@ -319,7 +285,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { filter(role == "raw") %>% pull(variable) } - + step_adjust_latency_new( terms = x$terms, role = x$role, @@ -354,29 +320,38 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ ) local_latency_table <- get_latency_table( new_data, object$columns, current_forecast_date, latency, - get_sign(object), object$epi_keys_checked, NULL, NULL) + get_sign(object), object$epi_keys_checked, NULL, NULL + ) comparison_table <- local_latency_table %>% ungroup() %>% - dplyr::full_join(object$latency_table %>% ungroup(), by = join_by(col_name), suffix = c(".bake",".prep")) %>% + dplyr::full_join(object$latency_table %>% ungroup(), by = join_by(col_name), suffix = c(".bake", ".prep")) %>% mutate(bakeMprep = latency.bake - latency.prep) if (any(comparison_table$bakeMprep > 0)) { - cli_abort(paste0( - "There is more latency at bake time than there was at prep time.", - " You will need to fit a model with more latency to predict on this dataset."), + cli_abort( + paste0( + "There is more latency at bake time than there was at prep time.", + " You will need to fit a model with more latency to predict on this dataset." + ), class = "epipredict__latency__bake_prep_difference_error", - latency_table = comparison_table) + latency_table = comparison_table + ) } if (any(comparison_table$bakeMprep < 0)) { - cli_warn(paste0( - "There is less latency at bake time than there was at prep time.", - " This will still fit, but will discard the most recent data."), + cli_warn( + paste0( + "There is less latency at bake time than there was at prep time.", + " This will still fit, but will discard the most recent data." 
+ ), class = "epipredict__latency__bake_prep_difference_warn", - latency_table = comparison_table) + latency_table = comparison_table + ) } - if (current_forecast_date != object$forecast_date){ - cli_warn(paste0( - "The forecast date differs from the one set at train time; ", - " this means any dates added by `layer_forecast_date` will be inaccurate."), + if (current_forecast_date != object$forecast_date) { + cli_warn( + paste0( + "The forecast date differs from the one set at train time; ", + " this means any dates added by `layer_forecast_date` will be inaccurate." + ), class = "epipredict__latency__bake_prep_forecast_date_warn" ) } diff --git a/R/utils-latency.R b/R/utils-latency.R index 8e8655e2a..e3df8bb61 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -313,8 +313,61 @@ get_latency_table <- function(training, columns, forecast_date, latency, sign_sh tmp_latency_table <- latency_table %>% rowwise() %>% mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, epi_keys_checked)) - if (latency ) - latency_table <- latency_table %>% mutate(latency = latency) + if (latency) { + latency_table <- latency_table %>% mutate(latency = latency) + } } return(latency_table %>% ungroup()) } + + +#' checks: the recipe type, whether a previous step is the relevant epi_shift, +#' that either `fixed_latency` or `fixed_forecast_date` is non-null, and that +#' `fixed_latency` only references columns that exist at the time of the step +#' inclusion +#' @keywords internal +step_adjust_latency_checks <- function(id, method, recipe, fixed_latency, fixed_forecast_date, call = caller_env()) { + arg_is_chr_scalar(id, method) + if (!is_epi_recipe(recipe)) { + cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", + class = "epipredict__step_adjust_latency__epi_recipe_only" + ) + } + if ((method == "extend_ahead") && (detect_step(recipe, "epi_ahead"))) { + cli_warn( + "If `method` is {.val extend_ahead}, then the previous `step_epi_ahead` won't 
be modified.", + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) + } else if ((method == "extend_lags") && detect_step(recipe, "epi_lag")) { + cli_warn( + "If `method` is {.val extend_lags} or {.val locf}, +then the previous `step_epi_lag`s won't work with modified data.", + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) + } else if ((method == "locf") && (length(recipe$steps) > 0)) { + cli_warn( + paste0( + "There are steps before `step_adjust_latency`.", + " With the method {.val locf}, it is recommended to include this step before any others" + ), + class = "epipredict__step_adjust_latency__misordered_step_warning" + ) + } + if (!is.null(fixed_latency) && !is.null(fixed_forecast_date)) { + cli_abort( + "Only one of `fixed_latency` and `fixed_forecast_date` can be non-`NULL` at a time!", + class = "epipredict__step_adjust_latency__too_many_args_error" + ) + } + if (length(fixed_latency > 1)) { + template <- recipe$template + data_names <- names(template)[!names(template) %in% key_colnames(template)] + wrong_names <- names(fixed_latency)[!names(fixed_latency) %in% data_names] + if (length(wrong_names) > 0) { + cli_abort( + "{.val fixed_latency} contains names not in the template dataset: {wrong_names}", + class = "epipredict__step_adjust_latency__undefined_names_error" + ) + } + } +} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 6ea9581c0..e1e516a3c 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -260,8 +260,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with modified -#> data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous +#> `step_epi_lag`s won't work with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as From 7f18662a5622d5052f23e54b1f7f848e2f34afd8 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 16 Sep 2024 15:08:16 -0500 Subject: [PATCH 75/92] NAomit can happen before (but probably shouldn't) --- tests/testthat/test-step_adjust_latency.R | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 1e9ac4d15..93d4b0660 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -369,16 +369,6 @@ test_that("epi_adjust_latency warns there's steps before it", { ) }) -test_that("epi_adjust_latency warns against removing NA's beforehand", { - expect_error( - r5 <- epi_recipe(x) %>% - step_epi_naomit() %>% - step_adjust_latency(method = "extend_lags") %>% - step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% - step_epi_lag(case_rate, lag = c(1, 5)), - regexp = "adjust_latency needs to occur before any `NA` removal" - ) -}) # TODO check that epi_adjust_latency errors for nonsense `as_of`'s From 4b37f7ef9fed757c503f43eae9d0bbaf6d2535e6 Mon Sep 17 00:00:00 2001 From: "Daniel J. McDonald" Date: Mon, 16 Sep 2024 15:33:17 -0700 Subject: [PATCH 76/92] draft implementation Strategy: 1. Add the grid created in ahead/lag steps to the object 2. At bake time, directly modify these if needed. 
--- R/epi_recipe.R | 4 +++ R/epi_shift.R | 11 ++------ R/step_adjust_latency.R | 50 ++++++++++++++++++---------------- R/step_epi_shift.R | 22 ++++++++++++--- R/utils-latency.R | 60 ++++++++++++++++++++++++++++++++++------- 5 files changed, 102 insertions(+), 45 deletions(-) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 7db9b9179..44b7b5a5b 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -568,6 +568,10 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { } composition <- "tibble" } + + ## Latency checks/adjustments + object <- adjust_recipe_latency_before_bake(object) + new_data <- NextMethod("bake") if (!is.null(meta)) { # Baking should have dropped epi_df-ness and metadata. Re-infer some diff --git a/R/epi_shift.R b/R/epi_shift.R index 7088fb341..2787d33fd 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -51,15 +51,8 @@ add_shifted_columns <- function(new_data, object, amount) { # adding zero if there's no latency table rel_latency <- tibble(col_name = object$columns, latency = 0L) } - grid <- expand_grid(col = object$columns, amount = sign_shift * amount) %>% - left_join(rel_latency, by = join_by(col == col_name), ) %>% - tidyr::replace_na(list(latency = 0)) %>% - mutate(shift_val = amount + latency) %>% - mutate( - newname = glue::glue("{object$prefix}{abs(shift_val)}_{col}"), # name is always positive - amount = NULL, - latency = NULL - ) + grid <- object$shift_grid %>% + mutate(newname = glue::glue("{object$prefix}{abs(shift_val)}_{col}")) ## ensure no name clashes new_data_names <- colnames(new_data) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 1ac3a4fc8..bbeb07932 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -211,13 +211,6 @@ step_adjust_latency <- id, method, recipe, fixed_latency, fixed_forecast_date ) method <- rlang::arg_match(method) - if (method == "extend_ahead") { - rel_step_type <- "step_epi_ahead" - shift_name <- "ahead" - } else if (method == "extend_lags") { - 
rel_step_type <- "step_epi_lag" - shift_name <- "lag" - } if (is.null(epi_keys_checked)) { epi_keys_checked <- kill_time_value(key_colnames(recipe$template)) } @@ -226,15 +219,15 @@ step_adjust_latency <- step_adjust_latency_new( terms = enquos(...), role = NA, - method = method, - epi_keys_checked = epi_keys_checked, - check_latency_length = check_latency_length, trained = FALSE, fixed_forecast_date = fixed_forecast_date, forecast_date = NULL, latency = fixed_latency, latency_table = NULL, metadata = NULL, + method = method, + epi_keys_checked = epi_keys_checked, + check_latency_length = check_latency_length, columns = NULL, skip = FALSE, id = id @@ -243,22 +236,22 @@ step_adjust_latency <- } step_adjust_latency_new <- - function(terms, role, trained, fixed_forecast_date, forecast_date, latency, latency_table, - metadata, time_type, method, epi_keys_checked, check_latency_length, columns, skip, - id) { + function(terms, role, trained, fixed_forecast_date, forecast_date, latency, + latency_table, metadata, method, epi_keys_checked, + check_latency_length, columns, skip, id) { step( subclass = "adjust_latency", terms = terms, role = role, - method = method, - epi_keys_checked = epi_keys_checked, - check_latency_length = check_latency_length, trained = trained, fixed_forecast_date = fixed_forecast_date, forecast_date = forecast_date, latency = latency, latency_table = latency_table, metadata = metadata, + method = method, + epi_keys_checked = epi_keys_checked, + check_latency_length = check_latency_length, columns = columns, skip = skip, id = id @@ -271,13 +264,13 @@ step_adjust_latency_new <- #' @importFrom dplyr rowwise prep.step_adjust_latency <- function(x, training, info = NULL, ...) 
{ latency <- x$latency - forecast_date <- x$fixed_forecast_date %||% get_forecast_date(training, info, x$epi_keys_checked, latency) + forecast_date <- x$fixed_forecast_date %||% + get_forecast_date(training, info, x$epi_keys_checked, latency) latency_table <- get_latency_table( training, NULL, forecast_date, latency, get_sign(x), x$epi_keys_checked, info, x$terms ) - attributes(training)$metadata$latency_table <- latency_table # get the columns used, even if it's all of them terms_used <- x$terms if (is_empty(terms_used)) { @@ -310,13 +303,14 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { bake.step_adjust_latency <- function(object, new_data, ...) { if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) { new_data <- as_epi_df(new_data) - attributes(new_data)$metadata <- object$metadata + attributes(new_data)$metadata <- object$metadata # DJM: why? detected above? attributes(new_data)$metadata$as_of <- object$forecast_date } else { latency <- object$latency current_forecast_date <- object$fixed_forecast_date %||% get_forecast_date( - new_data, NULL, object$epi_keys_checked, latency, c(key_colnames(new_data), object$columns) + new_data, NULL, object$epi_keys_checked, latency, + c(key_colnames(new_data), object$columns) ) local_latency_table <- get_latency_table( new_data, object$columns, current_forecast_date, latency, @@ -324,7 +318,11 @@ bake.step_adjust_latency <- function(object, new_data, ...) { ) comparison_table <- local_latency_table %>% ungroup() %>% - dplyr::full_join(object$latency_table %>% ungroup(), by = join_by(col_name), suffix = c(".bake", ".prep")) %>% + dplyr::full_join( + object$latency_table %>% ungroup(), + by = join_by(col_name), + suffix = c(".bake", ".prep") + ) %>% mutate(bakeMprep = latency.bake - latency.prep) if (any(comparison_table$bakeMprep > 0)) { cli_abort( @@ -362,11 +360,14 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ attributes(new_data)$metadata$latency_table <- object$latency_table attributes(new_data)$metadata$forecast_date <- object$forecast_date } else if (object$method == "locf") { - # locf doesn't need to mess with the metadata at all, it just forward-fills the requested columns + # locf doesn't need to mess with the metadata at all, it just forward-fills + # the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") modified_columns <- object$columns %>% unname() if (object$check_latency_length) { - check_interminable_latency(new_data, object$latency_table, modified_columns, object$forecast_date) + check_interminable_latency( + new_data, object$latency_table, modified_columns, object$forecast_date + ) } new_data <- new_data %>% @@ -379,6 +380,9 @@ bake.step_adjust_latency <- function(object, new_data, ...) { } return(new_data) } + + + #' @export print.step_adjust_latency <- function(x, width = max(20, options$width - 35), ...) { diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index f09e8ae29..6006fac19 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -79,6 +79,7 @@ step_epi_lag <- default = default, keys = key_colnames(recipe), columns = NULL, + shift_grid = NULL, skip = skip, id = id ) @@ -123,6 +124,7 @@ step_epi_ahead <- default = default, keys = key_colnames(recipe), columns = NULL, + shift_grid = NULL, skip = skip, id = id ) @@ -132,7 +134,7 @@ step_epi_ahead <- step_epi_lag_new <- function(terms, role, trained, lag, prefix, default, keys, - columns, skip, id) { + columns, shift_grid, skip, id) { recipes::step( subclass = "epi_lag", terms = terms, @@ -143,6 +145,7 @@ step_epi_lag_new <- default = default, keys = keys, columns = columns, + shift_grid = shift_grid, skip = skip, id = id ) @@ -150,7 +153,7 @@ step_epi_lag_new <- step_epi_ahead_new <- function(terms, role, trained, ahead, prefix, default, keys, - columns, skip, id) { + columns, shift_grid, skip, id) { recipes::step( subclass = "epi_ahead", terms = terms, @@ -161,6 
+164,7 @@ step_epi_ahead_new <- default = default, keys = keys, columns = columns, + shift_grid = shift_grid, skip = skip, id = id ) @@ -170,6 +174,10 @@ step_epi_ahead_new <- #' @export prep.step_epi_lag <- function(x, training, info = NULL, ...) { + columns <- recipes::recipes_eval_select(x$terms, training, info) + sgn <- get_sign(x) + shift_grid <- expand_grid(col = columns, shift_val = sgn * x$lag) + step_epi_lag_new( terms = x$terms, role = x$role, @@ -178,7 +186,8 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { prefix = x$prefix, default = x$default, keys = x$keys, - columns = recipes::recipes_eval_select(x$terms, training, info), + columns = columns, + shift_grid = shift_grid, skip = x$skip, id = x$id ) @@ -186,6 +195,10 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { #' @export prep.step_epi_ahead <- function(x, training, info = NULL, ...) { + columns <- recipes::recipes_eval_select(x$terms, training, info) + sgn <- get_sign(x) + shift_grid <- expand_grid(col = columns, shift_val = sgn * x$ahead) + step_epi_ahead_new( terms = x$terms, role = x$role, @@ -194,7 +207,8 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) 
{ prefix = x$prefix, default = x$default, keys = x$keys, - columns = recipes::recipes_eval_select(x$terms, training, info), + columns = columns, + shift_grid = shift_grid, skip = x$skip, id = x$id ) diff --git a/R/utils-latency.R b/R/utils-latency.R index e3df8bb61..d56a163a3 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -29,6 +29,35 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name } +adjust_recipe_latency_before_bake <- function(object) { + if (detect_step(object, "adjust_latency")) { + latency_step <- object$steps[[which(tidy(object)$type == "adjust_latency")]] + if (length(latency_step) > 1) { + cli_abort("Only one `step_adjust_latency()` is allowed.") + } + latency_table <- latency_step$latency_table + method <- latency_step$method + if (method == "extend_ahead") { + loc <- which(tidy(object)$type == "epi_ahead") + sign_shift <- -1 + } else if (method == "extend_lags") { + loc <- which(tidy(object))$type == "epi_lag" + sign_shift <- 1 + } + if (method != "locf") { + for (s in seq_along(loc)) { + sgrid <- object$steps[[s]]$shift_grid + sgrid <- sgrid %>% + left_join(latency_table, by = join_by(col == col_name)) %>% + tidyr::replace_na(list(latency = 0)) %>% + mutate(shift_val = shift_val + latency, amount = NULL) + object$steps[[s]]$shift_grid <- sgrid + } + } + } + object +} + #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. #' @keywords internal #' @importFrom dplyr select @@ -277,6 +306,10 @@ check_interminable_latency <- function(dataset, latency_table, target_columns, f } } +`%nin%` <- function(x, table) { + !(x %in% table) +} + #' create the latency table #' This is a table of column names and the latency adjustment necessary for that column. 
An example: #' @@ -286,33 +319,38 @@ check_interminable_latency <- function(dataset, latency_table, target_columns, f #' 2 death_rate 5 #' @keywords internal #' @importFrom dplyr rowwise -get_latency_table <- function(training, columns, forecast_date, latency, sign_shift, epi_keys_checked, info, terms) { +get_latency_table <- function(training, columns, forecast_date, latency, + sign_shift, epi_keys_checked, info, terms) { if (is.null(columns)) { columns <- recipes_eval_select(terms, training, info) } # construct the latency table - latency_table <- names(training)[!names(training) %in% key_colnames(training)] %>% - tibble(col_name = .) + latency_table <- tibble(col_name = names(training)) %>% + filter(col_name %nin% key_colnames(training)) if (length(columns) > 0) { - latency_table <- latency_table %>% filter(col_name %in% - columns) + latency_table <- latency_table %>% filter(col_name %in% columns) } if (is.null(latency)) { latency_table <- latency_table %>% rowwise() %>% - mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, epi_keys_checked)) + mutate(latency = get_latency( + training, forecast_date, col_name, sign_shift, epi_keys_checked + )) } else if (length(latency) > 1) { - # if latency has a length, it must also have named elements. We assign based on comparing the name in the list + # if latency has a length, it must also have named elements. 
+ # We assign based on comparing the name in the list # with the column names, and drop any which don't have a latency assigned latency_table <- latency_table %>% filter(col_name %in% names(latency)) %>% rowwise() %>% mutate(latency = unname(latency[names(latency) == col_name])) } else { - tmp_latency_table <- latency_table %>% + latency_table <- latency_table %>% rowwise() %>% - mutate(latency = get_latency(training, forecast_date, col_name, sign_shift, epi_keys_checked)) + mutate(latency = get_latency( + training, forecast_date, col_name, sign_shift, epi_keys_checked + )) if (latency) { latency_table <- latency_table %>% mutate(latency = latency) } @@ -328,6 +366,10 @@ get_latency_table <- function(training, columns, forecast_date, latency, sign_sh #' @keywords internal step_adjust_latency_checks <- function(id, method, recipe, fixed_latency, fixed_forecast_date, call = caller_env()) { arg_is_chr_scalar(id, method) + if (detect_step(recipe, "adjust_latency")) { + cli_abort("Only one `step_adjust_latency()` can be included in a recipe.", + class = "epipredict__step_adjust_latency__multiple_steps") + } if (!is_epi_recipe(recipe)) { cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", class = "epipredict__step_adjust_latency__epi_recipe_only" From 9a7abc1c02b02e62bcdc36e3005be321d148912b Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Mon, 16 Sep 2024 15:34:10 -0700 Subject: [PATCH 77/92] refactor the utility fun --- R/utils-latency.R | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/R/utils-latency.R b/R/utils-latency.R index d56a163a3..a3c0c679e 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -28,34 +28,33 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name return(relevant_shifts) } - -adjust_recipe_latency_before_bake <- function(object) { - if (detect_step(object, "adjust_latency")) { - latency_step <- object$steps[[which(tidy(object)$type == "adjust_latency")]] - if (length(latency_step) > 1) { - cli_abort("Only one `step_adjust_latency()` is allowed.") - } +# +adjust_recipe_latency_before_bake <- function(rec) { + if (detect_step(rec, "adjust_latency")) { + step_types <- tidy(rec)$type + # can only be 1, or we would have aborted on recipe creation + latency_step <- rec$steps[[which(step_types == "adjust_latency")]] latency_table <- latency_step$latency_table method <- latency_step$method if (method == "extend_ahead") { - loc <- which(tidy(object)$type == "epi_ahead") + loc <- which(step_types == "epi_ahead") sign_shift <- -1 } else if (method == "extend_lags") { - loc <- which(tidy(object))$type == "epi_lag" + loc <- which(step_types == "epi_lag") sign_shift <- 1 } if (method != "locf") { for (s in seq_along(loc)) { - sgrid <- object$steps[[s]]$shift_grid + sgrid <- rec$steps[[s]]$shift_grid sgrid <- sgrid %>% left_join(latency_table, by = join_by(col == col_name)) %>% tidyr::replace_na(list(latency = 0)) %>% mutate(shift_val = shift_val + latency, amount = NULL) - object$steps[[s]]$shift_grid <- sgrid + rec$steps[[s]]$shift_grid <- sgrid } } } - object + rec } #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. From 82f1165a50af1a4127b8145520ebbe6a7363cf51 Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Mon, 16 Sep 2024 15:41:50 -0700 Subject: [PATCH 78/92] only do processing if locf --- R/step_adjust_latency.R | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index bbeb07932..afd991a07 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -354,12 +354,7 @@ bake.step_adjust_latency <- function(object, new_data, ...) { ) } } - if (object$method == "extend_ahead" || object$method == "extend_lags") { - attributes(new_data)$metadata$latency_method <- object$method - attributes(new_data)$metadata$shift_sign <- get_sign(object) - attributes(new_data)$metadata$latency_table <- object$latency_table - attributes(new_data)$metadata$forecast_date <- object$forecast_date - } else if (object$method == "locf") { + if (object$method == "locf") { # locf doesn't need to mess with the metadata at all, it just forward-fills # the requested columns rel_keys <- setdiff(key_colnames(new_data), "time_value") From 9044d5a5564139e34176a84dc9967539c2d10262 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 16 Sep 2024 17:09:27 -0700 Subject: [PATCH 79/92] Update R/utils-latency.R Co-authored-by: David Weber --- R/utils-latency.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils-latency.R b/R/utils-latency.R index a3c0c679e..6ebcd04e2 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -44,7 +44,7 @@ adjust_recipe_latency_before_bake <- function(rec) { sign_shift <- 1 } if (method != "locf") { - for (s in seq_along(loc)) { + for (s in loc) { sgrid <- rec$steps[[s]]$shift_grid sgrid <- sgrid %>% left_join(latency_table, by = join_by(col == col_name)) %>% From 4a4a90cd7247105ee930c95e77235ba6255d79fb Mon Sep 17 00:00:00 2001 From: "Daniel J. 
McDonald" Date: Mon, 16 Sep 2024 17:43:47 -0700 Subject: [PATCH 80/92] fix needles/haystack bug --- R/epi_shift.R | 15 ++------------- R/step_epi_shift.R | 13 +++++++++++-- R/utils-latency.R | 3 ++- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/R/epi_shift.R b/R/epi_shift.R index 2787d33fd..f9d5501cc 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -40,19 +40,8 @@ get_sign <- function(object) { #' @importFrom tidyr expand_grid #' @importFrom dplyr join_by add_shifted_columns <- function(new_data, object, amount) { - sign_shift <- get_sign(object) - latency_table <- attributes(new_data)$metadata$latency_table - shift_sign_lat <- attributes(new_data)$metadata$shift_sign - if (!is.null(latency_table) && - shift_sign_lat == sign_shift) { - # get the actually used latencies - rel_latency <- latency_table %>% filter(col_name %in% object$columns) - } else { - # adding zero if there's no latency table - rel_latency <- tibble(col_name = object$columns, latency = 0L) - } - grid <- object$shift_grid %>% - mutate(newname = glue::glue("{object$prefix}{abs(shift_val)}_{col}")) + + grid <- object$shift_grid ## ensure no name clashes new_data_names <- colnames(new_data) diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 6006fac19..098dd7eff 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -176,7 +176,12 @@ step_epi_ahead_new <- prep.step_epi_lag <- function(x, training, info = NULL, ...) { columns <- recipes::recipes_eval_select(x$terms, training, info) sgn <- get_sign(x) - shift_grid <- expand_grid(col = columns, shift_val = sgn * x$lag) + shift_grid <- expand_grid( + col = columns, + shift_val = sgn * x$lag, + newname = glue::glue("{x$prefix}{abs(shift_val)}_{col}") + ) + step_epi_lag_new( terms = x$terms, @@ -197,7 +202,11 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { prep.step_epi_ahead <- function(x, training, info = NULL, ...) 
{ columns <- recipes::recipes_eval_select(x$terms, training, info) sgn <- get_sign(x) - shift_grid <- expand_grid(col = columns, shift_val = sgn * x$ahead) + shift_grid <- expand_grid( + col = columns, + shift_val = sgn * x$ahead, + newname = glue::glue("{x$prefix}{abs(shift_val)}_{col}") + ) step_epi_ahead_new( terms = x$terms, diff --git a/R/utils-latency.R b/R/utils-latency.R index 6ebcd04e2..1c3b5ea2b 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -45,11 +45,12 @@ adjust_recipe_latency_before_bake <- function(rec) { } if (method != "locf") { for (s in loc) { + pfx <- rec$steps[[s]]$prefix sgrid <- rec$steps[[s]]$shift_grid sgrid <- sgrid %>% left_join(latency_table, by = join_by(col == col_name)) %>% tidyr::replace_na(list(latency = 0)) %>% - mutate(shift_val = shift_val + latency, amount = NULL) + mutate(shift_val = shift_val + latency, amount = NULL, latency = NULL) rec$steps[[s]]$shift_grid <- sgrid } } From b1298299244dfbecfca0a3e7c00ad1bb44277343 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 17 Sep 2024 15:51:12 -0500 Subject: [PATCH 81/92] some formatting --- R/epi_shift.R | 1 - R/utils-latency.R | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/epi_shift.R b/R/epi_shift.R index f9d5501cc..3d82a321f 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -40,7 +40,6 @@ get_sign <- function(object) { #' @importFrom tidyr expand_grid #' @importFrom dplyr join_by add_shifted_columns <- function(new_data, object, amount) { - grid <- object$shift_grid ## ensure no name clashes diff --git a/R/utils-latency.R b/R/utils-latency.R index 1c3b5ea2b..fd01280bc 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -368,7 +368,8 @@ step_adjust_latency_checks <- function(id, method, recipe, fixed_latency, fixed_ arg_is_chr_scalar(id, method) if (detect_step(recipe, "adjust_latency")) { cli_abort("Only one `step_adjust_latency()` can be included in a recipe.", - class = "epipredict__step_adjust_latency__multiple_steps") + class = 
"epipredict__step_adjust_latency__multiple_steps" + ) } if (!is_epi_recipe(recipe)) { cli_abort("This recipe step can only operate on an {.cls epi_recipe}.", From 6e5d6fc5bc67ced3764b16549671078cb6ed12b5 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 17 Sep 2024 18:07:28 -0500 Subject: [PATCH 82/92] fixed --- R/epi_recipe.R | 10 ++- R/epi_shift.R | 2 +- R/step_adjust_latency.R | 55 ++------------- R/step_epi_shift.R | 57 +++++++++++----- R/utils-latency.R | 108 +++++++++++++++++++++--------- man/add_shifted_columns.Rd | 2 +- man/step_adjust_latency_checks.Rd | 25 +++++++ 7 files changed, 158 insertions(+), 101 deletions(-) create mode 100644 man/step_adjust_latency_checks.Rd diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 44b7b5a5b..3d72b7d19 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -498,6 +498,13 @@ prep.epi_recipe <- function( as_of = metadata$as_of, other_keys = metadata$other_keys %||% character() ) + } else if ( + inherits(x$steps[[i]], "step_adjust_latency") && + (x$steps[[i]]$method == "extend_ahead" || x$steps[[i]]$method == "extend_lags") + ) { + # pass along the latency table and shift sign from the latency adjustment step + attributes(training)$metadata$latency_table <- x$steps[[i]]$latency_table + attributes(training)$metadata$latency_sign <- x$steps[[i]]$latency_sign } training <- relocate(training, all_of(key_colnames(training))) x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) @@ -569,9 +576,6 @@ bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { composition <- "tibble" } - ## Latency checks/adjustments - object <- adjust_recipe_latency_before_bake(object) - new_data <- NextMethod("bake") if (!is.null(meta)) { # Baking should have dropped epi_df-ness and metadata. 
Re-infer some diff --git a/R/epi_shift.R b/R/epi_shift.R index 3d82a321f..3113d0bf7 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -39,7 +39,7 @@ get_sign <- function(object) { #' @keywords internal #' @importFrom tidyr expand_grid #' @importFrom dplyr join_by -add_shifted_columns <- function(new_data, object, amount) { +add_shifted_columns <- function(new_data, object) { grid <- object$shift_grid ## ensure no name clashes diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index afd991a07..3b6d21467 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -224,6 +224,7 @@ step_adjust_latency <- forecast_date = NULL, latency = fixed_latency, latency_table = NULL, + latency_sign = NULL, metadata = NULL, method = method, epi_keys_checked = epi_keys_checked, @@ -237,7 +238,7 @@ step_adjust_latency <- step_adjust_latency_new <- function(terms, role, trained, fixed_forecast_date, forecast_date, latency, - latency_table, metadata, method, epi_keys_checked, + latency_table, latency_sign, metadata, method, epi_keys_checked, check_latency_length, columns, skip, id) { step( subclass = "adjust_latency", @@ -248,6 +249,7 @@ step_adjust_latency_new <- forecast_date = forecast_date, latency = latency, latency_table = latency_table, + latency_sign = latency_sign, metadata = metadata, method = method, epi_keys_checked = epi_keys_checked, @@ -287,6 +289,7 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { forecast_date = forecast_date, latency = x$latency, latency_table = latency_table, + latency_sign = get_sign(x), metadata = attributes(training)$metadata, method = x$method, epi_keys_checked = x$epi_keys_checked, @@ -303,56 +306,10 @@ prep.step_adjust_latency <- function(x, training, info = NULL, ...) { bake.step_adjust_latency <- function(object, new_data, ...) 
{ if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) { new_data <- as_epi_df(new_data) - attributes(new_data)$metadata <- object$metadata # DJM: why? detected above? + attributes(new_data)$metadata <- object$metadata attributes(new_data)$metadata$as_of <- object$forecast_date } else { - latency <- object$latency - current_forecast_date <- object$fixed_forecast_date %||% - get_forecast_date( - new_data, NULL, object$epi_keys_checked, latency, - c(key_colnames(new_data), object$columns) - ) - local_latency_table <- get_latency_table( - new_data, object$columns, current_forecast_date, latency, - get_sign(object), object$epi_keys_checked, NULL, NULL - ) - comparison_table <- local_latency_table %>% - ungroup() %>% - dplyr::full_join( - object$latency_table %>% ungroup(), - by = join_by(col_name), - suffix = c(".bake", ".prep") - ) %>% - mutate(bakeMprep = latency.bake - latency.prep) - if (any(comparison_table$bakeMprep > 0)) { - cli_abort( - paste0( - "There is more latency at bake time than there was at prep time.", - " You will need to fit a model with more latency to predict on this dataset." - ), - class = "epipredict__latency__bake_prep_difference_error", - latency_table = comparison_table - ) - } - if (any(comparison_table$bakeMprep < 0)) { - cli_warn( - paste0( - "There is less latency at bake time than there was at prep time.", - " This will still fit, but will discard the most recent data." - ), - class = "epipredict__latency__bake_prep_difference_warn", - latency_table = comparison_table - ) - } - if (current_forecast_date != object$forecast_date) { - cli_warn( - paste0( - "The forecast date differs from the one set at train time; ", - " this means any dates added by `layer_forecast_date` will be inaccurate." 
- ), - class = "epipredict__latency__bake_prep_forecast_date_warn" - ) - } + compare_bake_prep_latencies(object, new_data) } if (object$method == "locf") { # locf doesn't need to mess with the metadata at all, it just forward-fills diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 098dd7eff..8c6b8803a 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -80,6 +80,7 @@ step_epi_lag <- keys = key_colnames(recipe), columns = NULL, shift_grid = NULL, + latency_adjusted = FALSE, skip = skip, id = id ) @@ -125,6 +126,7 @@ step_epi_ahead <- keys = key_colnames(recipe), columns = NULL, shift_grid = NULL, + latency_adjusted = FALSE, skip = skip, id = id ) @@ -134,7 +136,7 @@ step_epi_ahead <- step_epi_lag_new <- function(terms, role, trained, lag, prefix, default, keys, - columns, shift_grid, skip, id) { + columns, shift_grid, latency_adjusted, skip, id) { recipes::step( subclass = "epi_lag", terms = terms, @@ -146,6 +148,7 @@ step_epi_lag_new <- keys = keys, columns = columns, shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = skip, id = id ) @@ -153,7 +156,7 @@ step_epi_lag_new <- step_epi_ahead_new <- function(terms, role, trained, ahead, prefix, default, keys, - columns, shift_grid, skip, id) { + columns, shift_grid, latency_adjusted, skip, id) { recipes::step( subclass = "epi_ahead", terms = terms, @@ -165,6 +168,7 @@ step_epi_ahead_new <- keys = keys, columns = columns, shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = skip, id = id ) @@ -175,13 +179,20 @@ step_epi_ahead_new <- #' @export prep.step_epi_lag <- function(x, training, info = NULL, ...) 
{ columns <- recipes::recipes_eval_select(x$terms, training, info) - sgn <- get_sign(x) - shift_grid <- expand_grid( - col = columns, - shift_val = sgn * x$lag, - newname = glue::glue("{x$prefix}{abs(shift_val)}_{col}") - ) - + if (!x$latency_adjusted) { + tmp <- create_shift_grid( + x$prefix, + x$lag, + get_sign(x), + columns, + attributes(training)$metadata$latency_table, + attributes(training)$metadata$latency_sign + ) + shift_grid <- tmp[[1]] + latency_adjusted <- tmp[[2]] + } else { + shift_grid <- x$shift_grid + } step_epi_lag_new( terms = x$terms, @@ -193,6 +204,7 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { keys = x$keys, columns = columns, shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = x$skip, id = x$id ) @@ -201,12 +213,20 @@ prep.step_epi_lag <- function(x, training, info = NULL, ...) { #' @export prep.step_epi_ahead <- function(x, training, info = NULL, ...) { columns <- recipes::recipes_eval_select(x$terms, training, info) - sgn <- get_sign(x) - shift_grid <- expand_grid( - col = columns, - shift_val = sgn * x$ahead, - newname = glue::glue("{x$prefix}{abs(shift_val)}_{col}") - ) + if (!x$latency_adjusted) { + tmp <- create_shift_grid( + x$prefix, + x$ahead, + get_sign(x), + columns, + attributes(training)$metadata$latency_table, + attributes(training)$metadata$latency_sign + ) + shift_grid <- tmp[[1]] + latency_adjusted <- tmp[[2]] + } else { + shift_grid <- x$shift_grid + } step_epi_ahead_new( terms = x$terms, @@ -218,6 +238,7 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { keys = x$keys, columns = columns, shift_grid = shift_grid, + latency_adjusted = latency_adjusted, skip = x$skip, id = x$id ) @@ -227,12 +248,14 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { #' @export bake.step_epi_lag <- function(object, new_data, ...) 
{ - add_shifted_columns(new_data, object, object$lag) + names(object) + object$shift_grid + add_shifted_columns(new_data, object) } #' @export bake.step_epi_ahead <- function(object, new_data, ...) { - add_shifted_columns(new_data, object, object$ahead) + add_shifted_columns(new_data, object) } #' @export diff --git a/R/utils-latency.R b/R/utils-latency.R index fd01280bc..dc5fa9af8 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -28,36 +28,6 @@ construct_shift_tibble <- function(terms_used, recipe, rel_step_type, shift_name return(relevant_shifts) } -# -adjust_recipe_latency_before_bake <- function(rec) { - if (detect_step(rec, "adjust_latency")) { - step_types <- tidy(rec)$type - # can only be 1, or we would have aborted on recipe creation - latency_step <- rec$steps[[which(step_types == "adjust_latency")]] - latency_table <- latency_step$latency_table - method <- latency_step$method - if (method == "extend_ahead") { - loc <- which(step_types == "epi_ahead") - sign_shift <- -1 - } else if (method == "extend_lags") { - loc <- which(step_types == "epi_lag") - sign_shift <- 1 - } - if (method != "locf") { - for (s in loc) { - pfx <- rec$steps[[s]]$prefix - sgrid <- rec$steps[[s]]$shift_grid - sgrid <- sgrid %>% - left_join(latency_table, by = join_by(col == col_name)) %>% - tidyr::replace_na(list(latency = 0)) %>% - mutate(shift_val = shift_val + latency, amount = NULL, latency = NULL) - rec$steps[[s]]$shift_grid <- sgrid - } - } - } - rec -} - #' Extract the as_of for the forecast date, and make sure there's nothing very off about it. 
#' @keywords internal #' @importFrom dplyr select @@ -414,3 +384,81 @@ then the previous `step_epi_lag`s won't work with modified data.", } } } + +compare_bake_prep_latencies <- function(object, new_data, call = caller_env()) { + latency <- object$latency + current_forecast_date <- object$fixed_forecast_date %||% + get_forecast_date( + new_data, NULL, object$epi_keys_checked, latency, + c(key_colnames(new_data), object$columns) + ) + local_latency_table <- get_latency_table( + new_data, object$columns, current_forecast_date, latency, + get_sign(object), object$epi_keys_checked, NULL, NULL + ) + comparison_table <- local_latency_table %>% + ungroup() %>% + dplyr::full_join( + object$latency_table %>% ungroup(), + by = join_by(col_name), + suffix = c(".bake", ".prep") + ) %>% + mutate(bakeMprep = latency.bake - latency.prep) + if (any(comparison_table$bakeMprep > 0)) { + cli_abort( + paste0( + "There is more latency at bake time than there was at prep time.", + " You will need to fit a model with more latency to predict on this dataset." + ), + class = "epipredict__latency__bake_prep_difference_error", + latency_table = comparison_table, + call = call + ) + } + if (any(comparison_table$bakeMprep < 0)) { + cli_warn( + paste0( + "There is less latency at bake time than there was at prep time.", + " This will still fit, but will discard the most recent data." + ), + class = "epipredict__latency__bake_prep_difference_warn", + latency_table = comparison_table, + call = call + ) + } + if (current_forecast_date != object$forecast_date) { + cli_warn( + paste0( + "The forecast date differs from the one set at train time; ", + " this means any dates added by `layer_forecast_date` will be inaccurate." 
+ ), + class = "epipredict__latency__bake_prep_forecast_date_warn", + call = call + ) + } +} + + +#' @keywords internal +create_shift_grid <- function(prefix, amount, target_sign, columns, latency_table, latency_sign) { + if (!is.null(latency_table) && + latency_sign == target_sign) { + # get the actually used latencies + rel_latency <- latency_table %>% filter(col_name %in% columns) + latency_adjusted <- TRUE + } else { + # adding zero if there's no latency table + rel_latency <- tibble(col_name = columns, latency = 0L) + latency_adjusted <- FALSE + } + shift_grid <- expand_grid(col = columns, amount = target_sign * amount) %>% + left_join(rel_latency, by = join_by(col == col_name), ) %>% + tidyr::replace_na(list(latency = 0)) %>% + mutate(shift_val = amount + latency) %>% + mutate( + newname = glue::glue("{prefix}{abs(shift_val)}_{col}"), # name is always positive + amount = NULL, + latency = NULL + ) + return(list(shift_grid, latency_adjusted)) +} diff --git a/man/add_shifted_columns.Rd b/man/add_shifted_columns.Rd index d7aba745b..aad22e805 100644 --- a/man/add_shifted_columns.Rd +++ b/man/add_shifted_columns.Rd @@ -5,7 +5,7 @@ \title{backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the checks missing in \code{epi_shift_single}} \usage{ -add_shifted_columns(new_data, object, amount) +add_shifted_columns(new_data, object) } \description{ backend for both \code{bake.step_epi_ahead} and \code{bake.step_epi_lag}, performs the diff --git a/man/step_adjust_latency_checks.Rd b/man/step_adjust_latency_checks.Rd new file mode 100644 index 000000000..baed1fb9b --- /dev/null +++ b/man/step_adjust_latency_checks.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils-latency.R +\name{step_adjust_latency_checks} +\alias{step_adjust_latency_checks} +\title{checks: the recipe type, whether a previous step is the relevant epi_shift, +that either \code{fixed_latency} or 
\code{fixed_forecast_date} is non-null, and that +\code{fixed_latency} only references columns that exist at the time of the step +inclusion} +\usage{ +step_adjust_latency_checks( + id, + method, + recipe, + fixed_latency, + fixed_forecast_date, + call = caller_env() +) +} +\description{ +checks: the recipe type, whether a previous step is the relevant epi_shift, +that either \code{fixed_latency} or \code{fixed_forecast_date} is non-null, and that +\code{fixed_latency} only references columns that exist at the time of the step +inclusion +} +\keyword{internal} From 388ccc1992e5943ed9866d198224489d62bf96ee Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 17 Sep 2024 18:07:35 -0500 Subject: [PATCH 83/92] single letter variables are impossible to search --- R/epi_recipe.R | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 3d72b7d19..d287937ee 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -463,8 +463,8 @@ prep.epi_recipe <- function( if (fresh) x$term_info <- x$var_info running_info <- x$term_info %>% mutate(number = 0, skip = FALSE) - for (i in seq(along.with = x$steps)) { - needs_tuning <- map_lgl(x$steps[[i]], recipes:::is_tune) + for (ii in seq(along.with = x$steps)) { + needs_tuning <- map_lgl(x$steps[[ii]], recipes:::is_tune) if (any(needs_tuning)) { arg <- names(needs_tuning)[needs_tuning] arg <- paste0("'", arg, "'", collapse = ", ") @@ -474,18 +474,18 @@ prep.epi_recipe <- function( ) cli_abort(msg) } - note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) - if (!x$steps[[i]]$trained | fresh) { + note <- paste("oper", ii, gsub("_", " ", class(x$steps[[ii]])[1])) + if (!x$steps[[ii]]$trained || fresh) { if (verbose) { cat(note, "[training]", "\n") } before_nms <- names(training) before_template <- training[1, ] - x$steps[[i]] <- prep(x$steps[[i]], + x$steps[[ii]] <- prep(x$steps[[ii]], training = training, info = x$term_info ) - training <- bake(x$steps[[i]], 
new_data = training) + training <- bake(x$steps[[ii]], new_data = training) if (!tibble::is_tibble(training)) { cli_abort("`bake()` methods should always return {.cls tibble}.") } @@ -499,27 +499,27 @@ prep.epi_recipe <- function( other_keys = metadata$other_keys %||% character() ) } else if ( - inherits(x$steps[[i]], "step_adjust_latency") && - (x$steps[[i]]$method == "extend_ahead" || x$steps[[i]]$method == "extend_lags") + inherits(x$steps[[ii]], "step_adjust_latency") && + (x$steps[[ii]]$method == "extend_ahead" || x$steps[[ii]]$method == "extend_lags") ) { # pass along the latency table and shift sign from the latency adjustment step - attributes(training)$metadata$latency_table <- x$steps[[i]]$latency_table - attributes(training)$metadata$latency_sign <- x$steps[[i]]$latency_sign + attributes(training)$metadata$latency_table <- x$steps[[ii]]$latency_table + attributes(training)$metadata$latency_sign <- x$steps[[ii]]$latency_sign } training <- relocate(training, all_of(key_colnames(training))) x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) - if (!is.na(x$steps[[i]]$role)) { + if (!is.na(x$steps[[ii]]$role)) { new_vars <- setdiff(x$term_info$variable, running_info$variable) pos_new_var <- x$term_info$variable %in% new_vars pos_new_and_na_role <- pos_new_var & is.na(x$term_info$role) pos_new_and_na_source <- pos_new_var & is.na(x$term_info$source) - x$term_info$role[pos_new_and_na_role] <- x$steps[[i]]$role + x$term_info$role[pos_new_and_na_role] <- x$steps[[ii]]$role x$term_info$source[pos_new_and_na_source] <- "derived" } recipes:::changelog(log_changes, before_nms, names(training), x$steps[[i]]) running_info <- rbind( running_info, - mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) + mutate(x$term_info, number = ii, skip = x$steps[[ii]]$skip) ) } else { if (verbose) cat(note, "[pre-trained]\n") From 54993d25fb62d413facb40921789d6aad001e301 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 17 Sep 2024 18:28:44 -0500 
Subject: [PATCH 84/92] lat adj tag for steps which have been modified --- R/step_epi_shift.R | 16 ++++++- tests/testthat/_snaps/step_adjust_latency.md | 47 +++++++++++++++++++- tests/testthat/test-step_adjust_latency.R | 7 +-- 3 files changed, 64 insertions(+), 6 deletions(-) diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 8c6b8803a..b04efd06f 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -260,9 +260,15 @@ bake.step_epi_ahead <- function(object, new_data, ...) { #' @export print.step_epi_lag <- function(x, width = max(20, options()$width - 30), ...) { + if (x$latency_adjusted && x$trained) { + lag <- x$shift_grid$shift_val + lag <- c(lag, "(lat adj)") + } else { + lag <- x$lag + } print_epi_step(x$columns, x$terms, x$trained, "Lagging", conjunction = "by", - extra_text = x$lag + extra_text = lag ) invisible(x) } @@ -270,9 +276,15 @@ print.step_epi_lag <- function(x, width = max(20, options()$width - 30), ...) { #' @export print.step_epi_ahead <- function(x, width = max(20, options()$width - 30), ...) { + if (x$latency_adjusted && x$trained) { + ahead <- x$shift_grid$shift_val + ahead <- c(ahead, "(lat adj)") + } else { + ahead <- x$ahead + } print_epi_step(x$columns, x$terms, x$trained, "Leading", conjunction = "by", - extra_text = x$ahead + extra_text = ahead ) invisible(x) } diff --git a/tests/testthat/_snaps/step_adjust_latency.md b/tests/testthat/_snaps/step_adjust_latency.md index fd49b8824..e37ae07ea 100644 --- a/tests/testthat/_snaps/step_adjust_latency.md +++ b/tests/testthat/_snaps/step_adjust_latency.md @@ -21,7 +21,30 @@ --- Code - r + prep(r5, real_x) + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Training information + Training data contained 200 data points and no incomplete rows. + + -- Operations + 1. extend_lags: case_rate with forecast date 2021-07-24 | Trained + 2. 
Lagging: death_rate by 0, 6, 11, (lat adj) | Trained + 3. Lagging: case_rate by 6, 10, (lat adj) | Trained + 4. Leading: death_rate by 7 | Trained + +--- + + Code + r6 Message -- Epi Recipe ------------------------------------------------------------------ @@ -37,3 +60,25 @@ 2. extend_ahead: all future predictors with latency set at train time 3. Leading: death_rate by 7 +--- + + Code + prep(r6, case_death_rate_subset) + Message + + -- Epi Recipe ------------------------------------------------------------------ + + -- Inputs + Number of variables by role + raw: 2 + geo_value: 1 + time_value: 1 + + -- Training information + Training data contained 20496 data points and no incomplete rows. + + -- Operations + 1. Lagging: death_rate by 0, 7, 14 | Trained + 2. extend_ahead: case_rate, ... with forecast date 2022-05-31 | Trained + 3. Leading: death_rate by -158, (lat adj) | Trained + diff --git a/tests/testthat/test-step_adjust_latency.R b/tests/testthat/test-step_adjust_latency.R index 93d4b0660..2a9ea4419 100644 --- a/tests/testthat/test-step_adjust_latency.R +++ b/tests/testthat/test-step_adjust_latency.R @@ -49,7 +49,6 @@ test_that("epi_adjust_latency correctly extends the lags", { step_epi_lag(death_rate, lag = c(0, 6, 11)) %>% step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) - # directly checking the shifts baked_x <- r1 %>% prep(real_x) %>% @@ -512,11 +511,13 @@ test_that("printing step_adjust_latency results in expected output", { step_epi_lag(case_rate, lag = c(1, 5)) %>% step_epi_ahead(death_rate, ahead = ahead) expect_snapshot(r5) - r <- epi_recipe(case_death_rate_subset) %>% + expect_snapshot(prep(r5, real_x)) + r6 <- epi_recipe(case_death_rate_subset) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_adjust_latency(method = "extend_ahead") %>% step_epi_ahead(death_rate, ahead = 7) - expect_snapshot(r) + expect_snapshot(r6) + expect_snapshot(prep(r6, case_death_rate_subset)) }) test_that("locf works as intended", { 
From e70d5530bb1b6dfe11838b55c6bebe067d5b831e Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 20 Sep 2024 12:02:58 -0500 Subject: [PATCH 85/92] remove actual changes to prep.epi_recipe --- R/epi_recipe.R | 7 ------- R/step_adjust_latency.R | 3 +++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index d287937ee..769229b39 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -498,13 +498,6 @@ prep.epi_recipe <- function( as_of = metadata$as_of, other_keys = metadata$other_keys %||% character() ) - } else if ( - inherits(x$steps[[ii]], "step_adjust_latency") && - (x$steps[[ii]]$method == "extend_ahead" || x$steps[[ii]]$method == "extend_lags") - ) { - # pass along the latency table and shift sign from the latency adjustment step - attributes(training)$metadata$latency_table <- x$steps[[ii]]$latency_table - attributes(training)$metadata$latency_sign <- x$steps[[ii]]$latency_sign } training <- relocate(training, all_of(key_colnames(training))) x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index 3b6d21467..433fbb328 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -329,6 +329,9 @@ bake.step_adjust_latency <- function(object, new_data, ...) 
{ as_tibble() %>% tidyr::fill(.direction = "down", any_of(modified_columns)) %>% ungroup() + } else if (object$method == "extend_lags" || object$method == "extend_ahead") { + attributes(new_data)$metadata$latency_table <- object$latency_table + attributes(new_data)$metadata$latency_sign <- object$latency_sign } return(new_data) } From b3c96b346a0b7a6d4e0ad9ed52dc88aa07bdc031 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Fri, 27 Sep 2024 17:47:14 -0500 Subject: [PATCH 86/92] various requests and rebasing on dev --- R/arx_classifier.R | 8 ++++---- R/arx_forecaster.R | 14 +++----------- R/epi_recipe.R | 2 +- R/utils-enframer.R | 23 ----------------------- 4 files changed, 8 insertions(+), 39 deletions(-) delete mode 100644 R/utils-enframer.R diff --git a/R/arx_classifier.R b/R/arx_classifier.R index c5166a176..6dcffde88 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -190,10 +190,10 @@ arx_class_epi_workflow <- function( ) } } + # regex that will match any amount of adjustment for the ahead ahead_out_name <- glue::glue("ahead_[0-9]*_{pre_out_name}") method_adjust_latency <- args_list$adjust_latency if (method_adjust_latency != "none") { - # only extend_ahead is supported atm r <- r %>% step_adjust_latency(!!pre_out_name, fixed_forecast_date = forecast_date, method = method_adjust_latency @@ -328,9 +328,9 @@ arx_class_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { cli_warn( - c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." + paste0( + "`forecast_date` {.val {forecast_date}} +", + " `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}." 
), class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" ) diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 4d7dab60e..bfd5eaec1 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -61,7 +61,7 @@ arx_forecaster <- function( preds <- forecast(wf, forecast_date = forecast_date) %>% - tibble::as_tibble() %>% + as_tibble() %>% select(-time_value) structure( @@ -140,11 +140,7 @@ arx_fcast_epi_workflow <- function( forecast_date <- args_list$forecast_date %||% forecast_date_default target_date <- args_list$target_date %||% (forecast_date + args_list$ahead) if (forecast_date + args_list$ahead != target_date) { - cli_abort( - c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - ), + cli_abort("`forecast_date` {.val {forecast_date}} + `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}.", class = "epipredict__arx_forecaster__inconsistent_target_ahead_forecaste_date" ) } @@ -314,11 +310,7 @@ arx_args_list <- function( if (!is.null(forecast_date) && !is.null(target_date)) { if (forecast_date + ahead != target_date) { - cli_abort( - c( - "`forecast_date` + `ahead` must equal `target_date`.", - i = "{.val {forecast_date}} + {.val {ahead}} != {.val {target_date}}." - ), + cli_abort("`forecast_date` {.val {forecast_date}} + `ahead` {.val {ahead}} must equal `target_date` {.val {target_date}}.", class = "epipredict__arx_args__inconsistent_target_ahead_forecaste_date" ) } diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 769229b39..311b9d073 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -94,7 +94,7 @@ epi_recipe.epi_df <- ## Check and add roles when available if (!is.null(roles)) { if (length(roles) != length(vars)) { - cli_abort(paste( + cli_abort(paste0( "The number of roles should be the same as the number of ", "variables." 
)) diff --git a/R/utils-enframer.R b/R/utils-enframer.R deleted file mode 100644 index 723bd6a9c..000000000 --- a/R/utils-enframer.R +++ /dev/null @@ -1,23 +0,0 @@ -enframer <- function(df, x, fill = NA) { - stopifnot(is.data.frame(df)) - stopifnot(length(fill) == 1 || length(fill) == nrow(df)) - arg_is_chr(x, allow_null = TRUE) - if (is.null(x)) { - return(df) - } - if (any(names(df) %in% x)) { - stop("In enframer: some new cols match existing column names") - } - for (v in x) df <- mutate(df, !!v := fill) - df -} - -enlist <- function(...) { - # converted to thin wrapper around - rlang::dots_list( - ..., - .homonyms = "error", - .named = TRUE, - .check_assign = TRUE - ) -} From 86c46a457f72ccbd6829f2b305368ab685b26761 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 30 Sep 2024 11:23:09 -0500 Subject: [PATCH 87/92] updating after rebase --- R/arx_classifier.R | 2 -- R/utils-latency.R | 8 ++--- R/utils-misc.R | 11 +++++++ man/{seq_null_swap.Rd => seq_forward.Rd} | 6 ++-- man/step_adjust_latency.Rd | 4 +-- tests/testthat/test-get_test_data.R | 2 +- tests/testthat/test-pad_to_end.R | 37 ------------------------ tests/testthat/test-step_epi_naomit.R | 6 ++-- 8 files changed, 24 insertions(+), 52 deletions(-) rename man/{seq_null_swap.Rd => seq_forward.Rd} (76%) delete mode 100644 tests/testthat/test-pad_to_end.R diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 6dcffde88..3d2daa978 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -370,5 +370,3 @@ print.arx_class <- function(x, ...) { name <- "ARX Classifier" NextMethod(name = name, ...) 
} - -# this is a trivial change to induce a check diff --git a/R/utils-latency.R b/R/utils-latency.R index dc5fa9af8..311656ac1 100644 --- a/R/utils-latency.R +++ b/R/utils-latency.R @@ -183,12 +183,11 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { } itval <- epiprocess:::guess_period(c(x$time_value, end_date), "time_value") # get the time values we need to fill in - completed_time_values <- - x %>% + completed_time_values <- x %>% group_by(across(all_of(groups))) %>% summarise( time_value = list2( - time_value = seq_null_swap(from = max(time_value) + itval, to = end_date, by = itval) + time_value = seq_forward(from = max(time_value) + itval, to = end_date, by = itval) ) ) %>% unnest("time_value") %>% @@ -197,6 +196,7 @@ pad_to_end <- function(x, groups, end_date, columns_to_complete = NULL) { grouped_and_arranged <- x %>% arrange(across(all_of(c("time_value", groups)))) %>% group_by(across(all_of(groups))) + values_to_fill <- grouped_and_arranged %>% slice(min(across(all_of(columns_to_complete), count_single_column)):n()) filled_values <- values_to_fill %>% @@ -222,7 +222,7 @@ count_single_column <- function(col) { #' seq, but returns null if from is larger #' @keywords internal -seq_null_swap <- function(from, to, by) { +seq_forward <- function(from, to, by) { if (from > to) { return(NULL) } diff --git a/R/utils-misc.R b/R/utils-misc.R index 5cc8b364d..a1e0f025f 100644 --- a/R/utils-misc.R +++ b/R/utils-misc.R @@ -75,3 +75,14 @@ is_classification <- function(trainer) { is_regression <- function(trainer) { get_parsnip_mode(trainer) %in% c("regression", "unknown") } + + +enlist <- function(...) 
{ + # converted to thin wrapper around + rlang::dots_list( + ..., + .homonyms = "error", + .named = TRUE, + .check_assign = TRUE + ) +} diff --git a/man/seq_null_swap.Rd b/man/seq_forward.Rd similarity index 76% rename from man/seq_null_swap.Rd rename to man/seq_forward.Rd index 7ad5a8954..9b3da6e55 100644 --- a/man/seq_null_swap.Rd +++ b/man/seq_forward.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils-latency.R -\name{seq_null_swap} -\alias{seq_null_swap} +\name{seq_forward} +\alias{seq_forward} \title{seq, but returns null if from is larger} \usage{ -seq_null_swap(from, to, by) +seq_forward(from, to, by) } \description{ seq, but returns null if from is larger diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index e1e516a3c..f0ee41390 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -260,8 +260,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous -#> `step_epi_lag`s won't work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with +#> modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R index 3fabdea2e..5f315c499 100644 --- a/tests/testthat/test-get_test_data.R +++ b/tests/testthat/test-get_test_data.R @@ -1,4 +1,4 @@ -library(dplyr) +suppressPackageStartupMessages(library(dplyr)) test_that("return expected number of rows and returned dataset is ungrouped", { r <- epi_recipe(case_death_rate_subset) %>% step_epi_ahead(death_rate, ahead = 7) %>% diff --git a/tests/testthat/test-pad_to_end.R b/tests/testthat/test-pad_to_end.R deleted file mode 100644 index 6949f06ac..000000000 --- a/tests/testthat/test-pad_to_end.R +++ /dev/null @@ -1,37 +0,0 @@ -test_that("test set padding works", { - dat <- tibble::tibble( - gr1 = rep(c("a", "b"), times = c(3, 4)), - time_value = c(1:3, 1:4), - value = 1:7 - ) %>% arrange(time_value, gr1) - expect_identical(pad_to_end(dat, "gr1", 3), dat) - expect_equal(nrow(pad_to_end(dat, "gr1", 4)), 8L) - p <- pad_to_end(dat, "gr1", 5) - expect_equal(nrow(p), 10L) - expect_identical(p$gr1, rep(c("a", "b"), times = 5)) - expect_identical(p$time_value, rep(1:5, each = 2)) - expect_identical(p$value, as.integer(c(1, 4, 2, 5, 3, 6, NA, 7, NA, NA))) - - dat <- dat %>% arrange(gr1) - dat$gr2 <- c("c", "c", "d", "c", "c", "d", "d") - dat <- dat %>% arrange(time_value) - # don't treat it as a group - p <- pad_to_end(dat, "gr1", 4) - expect_identical(nrow(p), 8L) - expect_identical(p$gr2, c(rep("c", 4), "d", "d", NA, "d")) - - # treat it as a group (needs different time_value) - dat$time_value <- c(1, 1, 2, 2, 1, 1, 2) # double - p <- pad_to_end(dat, c("gr1", "gr2"), 2) - expect_equal(nrow(p), 8L) - expect_identical(p$gr1, rep(c("a", "a", "b", "b"), times = 2)) - expect_identical(p$gr2, rep(c("c", "d"), times = 4)) - expect_identical(p$time_value, rep(c(1, 2), each = 4)) - expect_identical(p$value, as.integer(c(1, 3, 4, 6, 2, NA, 5, 7))) - - # make sure it maintains the epi_df - dat <- dat 
%>% - dplyr::rename(geo_value = gr1) %>% - as_epi_df(other_keys = "gr2") - expect_s3_class(pad_to_end(dat, "geo_value", 2), "epi_df") -}) diff --git a/tests/testthat/test-step_epi_naomit.R b/tests/testthat/test-step_epi_naomit.R index 0e5e1750f..2f361ec98 100644 --- a/tests/testthat/test-step_epi_naomit.R +++ b/tests/testthat/test-step_epi_naomit.R @@ -1,6 +1,6 @@ -library(dplyr) -library(parsnip) -library(workflows) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(parsnip)) +suppressPackageStartupMessages(library(workflows)) # Random generated dataset x <- tibble( From 90edb465ef109b1790543a2aea214d02995710f4 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 30 Sep 2024 11:28:11 -0500 Subject: [PATCH 88/92] final requests --- R/arx_classifier.R | 9 +++++++-- tests/testthat/test-snapshots.R | 20 ++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/R/arx_classifier.R b/R/arx_classifier.R index 3d2daa978..0aec0e362 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -191,9 +191,14 @@ arx_class_epi_workflow <- function( } } # regex that will match any amount of adjustment for the ahead - ahead_out_name <- glue::glue("ahead_[0-9]*_{pre_out_name}") + ahead_out_name_regex <- glue::glue("ahead_[0-9]*_{pre_out_name}") method_adjust_latency <- args_list$adjust_latency if (method_adjust_latency != "none") { + if (method_adjust_latency != "extend_ahead") { + cli_abort("only extend_ahead is currently supported", + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) + } r <- r %>% step_adjust_latency(!!pre_out_name, fixed_forecast_date = forecast_date, method = method_adjust_latency @@ -204,7 +209,7 @@ arx_class_epi_workflow <- function( r <- r %>% step_mutate( across( - matches(ahead_out_name), + matches(ahead_out_name_regex), ~ cut(.x, breaks = args_list$breaks), .names = "outcome_class", .unpack = TRUE diff --git a/tests/testthat/test-snapshots.R 
b/tests/testthat/test-snapshots.R index fc3ee890b..da8635ae0 100644 --- a/tests/testthat/test-snapshots.R +++ b/tests/testthat/test-snapshots.R @@ -162,4 +162,24 @@ test_that("arx_classifier snapshots", { args_list = arx_class_args_list(adjust_latency = "extend_ahead", forecast_date = max_date + 2) ) expect_snapshot_tibble(arc2$predictions) + expect_error( + arc3 <- arx_classifier( + case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "extend_lags", forecast_date = max_date + 2) + ), + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) + expect_error( + arc4 <- arx_classifier( + case_death_rate_subset %>% + dplyr::filter(time_value >= as.Date("2021-11-01")), + "death_rate", + c("case_rate", "death_rate"), + args_list = arx_class_args_list(adjust_latency = "locf", forecast_date = max_date + 2) + ), + class = "epipredict__arx_classifier__adjust_latency_unsupported_method" + ) }) From ce99138f08802dd6369837fa0704addae108ba6c Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 30 Sep 2024 12:57:29 -0500 Subject: [PATCH 89/92] only adding metadata if given an epi_df originally --- .Rbuildignore | 3 ++- R/epi_shift.R | 13 +++++++++---- R/step_epi_shift.R | 2 -- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 510725267..dc41e6223 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -20,4 +20,5 @@ ^doc$ ^Meta$ ^.lintr$ -^.venv$ \ No newline at end of file +^.venv$ +^inst/templates$ diff --git a/R/epi_shift.R b/R/epi_shift.R index 3113d0bf7..367e26285 100644 --- a/R/epi_shift.R +++ b/R/epi_shift.R @@ -60,9 +60,14 @@ add_shifted_columns <- function(new_data, object) { processed <- new_data %>% full_join(shifted, by = ok) %>% group_by(across(all_of(kill_time_value(ok)))) %>% - arrange(time_value) %>% - ungroup() %>% - as_epi_df() - attributes(processed)$metadata <- 
attributes(new_data)$metadata + arrange(time_value) + if (inherits(new_data, "epi_df")) { + processed <- processed %>% + ungroup() %>% + as_epi_df( + as_of = attributes(new_data)$metadata$as_of, + other_keys = attributes(new_data)$metadata$other_keys + ) + } return(processed) } diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index b04efd06f..a4bcee52e 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -248,8 +248,6 @@ prep.step_epi_ahead <- function(x, training, info = NULL, ...) { #' @export bake.step_epi_lag <- function(object, new_data, ...) { - names(object) - object$shift_grid add_shifted_columns(new_data, object) } From c6800bbcf4cda861b65d8eb820da2ee21fd65f9d Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Mon, 30 Sep 2024 17:19:24 -0500 Subject: [PATCH 90/92] snapshot updates --- tests/testthat/_snaps/step_epi_shift.md | 2 +- tests/testthat/_snaps/step_growth_rate.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/_snaps/step_epi_shift.md b/tests/testthat/_snaps/step_epi_shift.md index 44c828118..eaf495995 100644 --- a/tests/testthat/_snaps/step_epi_shift.md +++ b/tests/testthat/_snaps/step_epi_shift.md @@ -30,7 +30,7 @@ Code slm_fit(r4) Condition - Error in `bake()`: + Error in `add_shifted_columns()`: ! Name collision occured in The following variable name already exists: "lag_7_death_rate". diff --git a/tests/testthat/_snaps/step_growth_rate.md b/tests/testthat/_snaps/step_growth_rate.md index 5a3ac6f44..0912977e3 100644 --- a/tests/testthat/_snaps/step_growth_rate.md +++ b/tests/testthat/_snaps/step_growth_rate.md @@ -117,5 +117,5 @@ step_growth_rate(r, value, replace_Inf = c(1, 2)) Condition Error in `step_growth_rate()`: - ! replace_Inf must be a scalar. + ! `replace_Inf` must be a scalar. 
From 561570e35aec7155d8422523764730a0473dfeca Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 1 Oct 2024 16:05:19 -0500 Subject: [PATCH 91/92] description and News --- DESCRIPTION | 2 +- NEWS.md | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d89f9b276..d3369cf23 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.1.0 +Version: 0.1.1 Authors@R: c( person("Daniel J.", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), diff --git a/NEWS.md b/NEWS.md index 5b907f237..ef71394a2 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,13 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicate PR's. +# epipredict 0.2 + +## features +- Add `step_adjust_latency`, which give several methods to adjust the forecast if the `forecast_date` is after the last day of data. + +## bugfixes + # epipredict 0.1 - simplify `layer_residual_quantiles()` to avoid timesuck in `utils::methods()` @@ -61,6 +68,3 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Fix bug where `fit()` drops the `epi_workflow` class (also error if non-`epi_df` data is given to `epi_recipe()`), #363 - Try to retain the `epi_df` class during baking to the extent possible, #376 -- Add `latency_adjustment` as an option for `add_epi_ahead`, which adjusts the - `ahead` so that the prediction is `ahead` relative to the `as_of` date for the - `epi_data`, rather than relative to the last day of data. 
From 053b501ba216c861f2e9d0e5c073afbb2aaae038 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Tue, 1 Oct 2024 16:21:10 -0500 Subject: [PATCH 92/92] rerererebase --- NAMESPACE | 1 - R/get_test_data.R | 2 ++ man/step_epi_shift.Rd | 7 ------ tests/testthat/_snaps/enframer.md | 32 ------------------------ tests/testthat/_snaps/pivot_quantiles.md | 4 +-- tests/testthat/test-enframer.R | 13 ---------- 6 files changed, 3 insertions(+), 56 deletions(-) delete mode 100644 tests/testthat/_snaps/enframer.md delete mode 100644 tests/testthat/test-enframer.R diff --git a/NAMESPACE b/NAMESPACE index 4c5ec75ac..86b77716b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -294,7 +294,6 @@ importFrom(rlang,is_empty) importFrom(rlang,is_logical) importFrom(rlang,is_null) importFrom(rlang,is_true) -importFrom(rlang,is_vector) importFrom(rlang,list2) importFrom(rlang,set_names) importFrom(rlang,sym) diff --git a/R/get_test_data.R b/R/get_test_data.R index af43fc012..8de698301 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -41,6 +41,8 @@ get_test_data <- function(recipe, x) { min_lags <- min(map_dbl(recipe$steps, ~ min(.x$lag %||% Inf)), Inf) max_lags <- max(map_dbl(recipe$steps, ~ max(.x$lag %||% 0)), 0) max_horizon <- max(map_dbl(recipe$steps, ~ max(.x$horizon %||% 0)), 0) + max_slide <- max(map_dbl(recipe$steps, ~ max(.x$before %||% 0)), 0) + min_required <- max_lags + max_horizon + max_slide keep <- max_lags + max_horizon # CHECK: Error out if insufficient training data diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index e33cf7e0e..30ac05d16 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -54,13 +54,6 @@ Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.} \item{id}{A unique identifier for the step} - -\item{latency_adjustment}{a character. Determines the method by which the forecast handles data that doesn't extend to the day the forecast is made. 
The options are: -\itemize{ -\item \code{"extend_ahead"}: actually forecasts from the last date. E.g. if there are 3 days of latency for a 4 day ahead forecast, the ahead used in practice is actually 7. -\item \code{"locf"}: carries forward the last observed value up to the forecast date. -\item \code{"extend_lags"}: per \code{epi_key} and \code{predictor}, adjusts the lag so that the shortest lag at predict time is -}} } \value{ An updated version of \code{recipe} with the new step added to the diff --git a/tests/testthat/_snaps/enframer.md b/tests/testthat/_snaps/enframer.md deleted file mode 100644 index 4b05dbff3..000000000 --- a/tests/testthat/_snaps/enframer.md +++ /dev/null @@ -1,32 +0,0 @@ -# enframer errors/works as needed - - Code - enframer(1:5, letters[1]) - Condition - Error in `enframer()`: - ! is.data.frame(df) is not TRUE - ---- - - Code - enframer(data.frame(a = 1:5), 1:3) - Condition - Error in `enframer()`: - ! `x` must be of type . - ---- - - Code - enframer(data.frame(a = 1:5), letters[1:3]) - Condition - Error in `enframer()`: - ! In enframer: some new cols match existing column names - ---- - - Code - enframer(data.frame(aa = 1:5), letters[1:2], fill = 1:4) - Condition - Error in `enframer()`: - ! length(fill) == 1 || length(fill) == nrow(df) is not TRUE - diff --git a/tests/testthat/_snaps/pivot_quantiles.md b/tests/testthat/_snaps/pivot_quantiles.md index 184eb62a6..ca775a18f 100644 --- a/tests/testthat/_snaps/pivot_quantiles.md +++ b/tests/testthat/_snaps/pivot_quantiles.md @@ -45,7 +45,5 @@ pivot_quantiles_longer(tib, d1, d3) Condition Error in `pivot_quantiles_longer()`: - ! Some selected columns contain different numbers of quantiles. - The result would be a very long . - To do this anyway, rerun with `.ignore_length_check = TRUE`. + ! Some selected columns contain different numbers of quantiles. The result would be a very long . To do this anyway, rerun with `.ignore_length_check = TRUE`. 
diff --git a/tests/testthat/test-enframer.R b/tests/testthat/test-enframer.R deleted file mode 100644 index 0926c587b..000000000 --- a/tests/testthat/test-enframer.R +++ /dev/null @@ -1,13 +0,0 @@ -test_that("enframer errors/works as needed", { - template1 <- data.frame(aa = 1:5, a = NA, b = NA, c = NA) - template2 <- data.frame(aa = 1:5, a = 2:6, b = 2:6, c = 2:6) - expect_snapshot(error = TRUE, enframer(1:5, letters[1])) - expect_snapshot(error = TRUE, enframer(data.frame(a = 1:5), 1:3)) - expect_snapshot(error = TRUE, enframer(data.frame(a = 1:5), letters[1:3])) - expect_identical(enframer(data.frame(aa = 1:5), letters[1:3]), template1) - expect_snapshot(error = TRUE, enframer(data.frame(aa = 1:5), letters[1:2], fill = 1:4)) - expect_identical( - enframer(data.frame(aa = 1:5), letters[1:3], fill = 2:6), - template2 - ) -})