diff --git a/DESCRIPTION b/DESCRIPTION index 6d1217587..d0366f22b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: epipredict Title: Basic epidemiology forecasting methods -Version: 0.0.20 +Version: 0.1.0 Authors@R: c( person("Daniel", "McDonald", , "daniel@stat.ubc.ca", role = c("aut", "cre")), person("Ryan", "Tibshirani", , "ryantibs@cmu.edu", role = "aut"), @@ -23,7 +23,7 @@ URL: https://github.com/cmu-delphi/epipredict/, https://cmu-delphi.github.io/epipredict BugReports: https://github.com/cmu-delphi/epipredict/issues/ Depends: - epiprocess (>= 0.7.5), + epiprocess (>= 0.7.12), parsnip (>= 1.0.0), R (>= 3.5.0) Imports: diff --git a/NAMESPACE b/NAMESPACE index 23c5adeaf..c20b8c801 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,8 +29,6 @@ S3method(bake,step_training_window) S3method(detect_layer,frosting) S3method(detect_layer,workflow) S3method(epi_recipe,default) -S3method(epi_recipe,epi_df) -S3method(epi_recipe,formula) S3method(extract_argument,epi_workflow) S3method(extract_argument,frosting) S3method(extract_argument,layer) @@ -96,6 +94,8 @@ S3method(print,step_naomit) S3method(print,step_population_scaling) S3method(print,step_training_window) S3method(quantile,dist_quantiles) +S3method(recipe,epi_df) +S3method(recipes::recipe,formula) S3method(refresh_blueprint,default_epi_recipe_blueprint) S3method(residuals,flatline) S3method(run_mold,default_epi_recipe_blueprint) @@ -152,7 +152,6 @@ export(default_epi_recipe_blueprint) export(detect_layer) export(dist_quantiles) export(epi_recipe) -export(epi_recipe_blueprint) export(epi_workflow) export(extract_argument) export(extract_frosting) @@ -183,13 +182,12 @@ export(layer_residual_quantiles) export(layer_threshold) export(layer_unnest) export(nested_quantiles) -export(new_default_epi_recipe_blueprint) -export(new_epi_recipe_blueprint) export(pivot_quantiles_longer) export(pivot_quantiles_wider) export(prep) export(quantile_reg) export(rand_id) +export(recipe) export(remove_epi_recipe) 
export(remove_frosting) export(remove_model) @@ -264,6 +262,7 @@ importFrom(magrittr,"%>%") importFrom(recipes,bake) importFrom(recipes,prep) importFrom(recipes,rand_id) +importFrom(recipes,recipe) importFrom(rlang,"!!!") importFrom(rlang,"!!") importFrom(rlang,"%@%") diff --git a/R/arx_classifier.R b/R/arx_classifier.R index ca6a3537b..9c1972e88 100644 --- a/R/arx_classifier.R +++ b/R/arx_classifier.R @@ -10,7 +10,7 @@ #' be real-valued. Conversion of this data to unordered classes is handled #' internally based on the `breaks` argument to [arx_class_args_list()]. #' If discrete classes are already in the `epi_df`, it is recommended to -#' code up a classifier from scratch using [epi_recipe()]. +#' code up a classifier from scratch using [recipe()]. #' @param trainer A `{parsnip}` model describing the type of estimation. #' For now, we enforce `mode = "classification"`. Typical values are #' [parsnip::logistic_reg()] or [parsnip::multinom_reg()]. More complicated @@ -129,7 +129,7 @@ arx_class_epi_workflow <- function( # --- preprocessor # ------- predictors - r <- epi_recipe(epi_data) %>% + r <- recipe(epi_data) %>% step_growth_rate( dplyr::all_of(predictors), role = "grp", diff --git a/R/arx_forecaster.R b/R/arx_forecaster.R index 37c9aae86..b7d06f322 100644 --- a/R/arx_forecaster.R +++ b/R/arx_forecaster.R @@ -122,7 +122,7 @@ arx_fcast_epi_workflow <- function( lags <- arx_lags_validator(predictors, args_list$lags) # --- preprocessor - r <- epi_recipe(epi_data) + r <- recipe(epi_data) for (l in seq_along(lags)) { p <- predictors[l] r <- step_epi_lag(r, !!p, lag = lags[[l]]) diff --git a/R/autoplot.R b/R/autoplot.R index d35850fd6..dab763fe0 100644 --- a/R/autoplot.R +++ b/R/autoplot.R @@ -32,7 +32,7 @@ ggplot2::autoplot #' jhu <- case_death_rate_subset %>% #' filter(time_value >= as.Date("2021-11-01")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' 
step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -56,7 +56,7 @@ ggplot2::autoplot #' # ------- Show multiple horizons #' #' p <- lapply(c(7, 14, 21, 28), function(h) { -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = h) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -184,7 +184,10 @@ autoplot.epi_workflow <- function( } if (".pred" %in% names(predictions)) { - ntarget_dates <- n_distinct(predictions$time_value) + ntarget_dates <- dplyr::n_distinct(predictions$time_value) + if (distributional::is_distribution(predictions$.pred)) { + predictions <- dplyr::mutate(predictions, .pred = median(.pred)) + } if (ntarget_dates > 1L) { bp <- bp + geom_line( diff --git a/R/blueprint-epi_recipe-default.R b/R/blueprint-epi_recipe-default.R index 886cd5512..4e72ae297 100644 --- a/R/blueprint-epi_recipe-default.R +++ b/R/blueprint-epi_recipe-default.R @@ -1,111 +1,69 @@ -#' Recipe blueprint that accounts for `epi_df` panel data -#' -#' Used for simplicity. See [hardhat::new_recipe_blueprint()] or -#' [hardhat::default_recipe_blueprint()] for more details. -#' -#' @inheritParams hardhat::new_recipe_blueprint +#' Default epi_recipe blueprint #' -#' @details The `bake_dependent_roles` are automatically set to `epi_df` defaults. -#' @return A recipe blueprint. +#' Recipe blueprint that accounts for `epi_df` panel data +#' Used for simplicity. See [hardhat::default_recipe_blueprint()] for more +#' details. This subclass is nearly the same, except it ensures that +#' downstream processing doesn't drop the epi_df class from the data. #' -#' @keywords internal +#' @inheritParams hardhat::default_recipe_blueprint +#' @return A `epi_recipe` blueprint. 
#' @export -new_epi_recipe_blueprint <- - function(intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - composition = "tibble", - ptypes = NULL, recipe = NULL, ..., subclass = character()) { - hardhat::new_recipe_blueprint( - intercept = intercept, - allow_novel_levels = allow_novel_levels, - fresh = fresh, - composition = composition, - ptypes = ptypes, - recipe = recipe, - ..., - subclass = c(subclass, "epi_recipe_blueprint") - ) - } - - -#' @rdname new_epi_recipe_blueprint -#' @export -epi_recipe_blueprint <- - function(intercept = FALSE, allow_novel_levels = FALSE, - fresh = TRUE, - composition = "tibble") { - new_epi_recipe_blueprint( - intercept = intercept, - allow_novel_levels = allow_novel_levels, - fresh = fresh, - composition = composition - ) - } +#' @keywords internal +default_epi_recipe_blueprint <- function(intercept = FALSE, + allow_novel_levels = FALSE, + fresh = TRUE, + strings_as_factors = FALSE, + composition = "tibble") { + new_default_epi_recipe_blueprint( + intercept = intercept, + allow_novel_levels = allow_novel_levels, + fresh = fresh, + strings_as_factors = strings_as_factors, + composition = composition + ) +} -#' @rdname new_epi_recipe_blueprint -#' @export -default_epi_recipe_blueprint <- - function(intercept = FALSE, allow_novel_levels = FALSE, fresh = TRUE, - composition = "tibble") { - new_default_epi_recipe_blueprint( - intercept = intercept, - allow_novel_levels = allow_novel_levels, - fresh = fresh, - composition = composition - ) - } +new_default_epi_recipe_blueprint <- function(intercept = FALSE, + allow_novel_levels = TRUE, + fresh = TRUE, + strings_as_factors = FALSE, + composition = "tibble", + ptypes = NULL, + recipe = NULL, + extra_role_ptypes = NULL, + ..., + subclass = character()) { + hardhat::new_recipe_blueprint( + intercept = intercept, + allow_novel_levels = allow_novel_levels, + fresh = fresh, + strings_as_factors = strings_as_factors, + composition = composition, + ptypes = ptypes, + recipe = recipe, + 
extra_role_ptypes = extra_role_ptypes, + ..., + subclass = c(subclass, "default_epi_recipe_blueprint", "default_recipe_blueprint") + ) +} -#' @rdname new_epi_recipe_blueprint -#' @inheritParams hardhat::new_default_recipe_blueprint -#' @export -new_default_epi_recipe_blueprint <- - function(intercept = FALSE, allow_novel_levels = FALSE, - fresh = TRUE, - composition = "tibble", ptypes = NULL, recipe = NULL, - extra_role_ptypes = NULL, ..., subclass = character()) { - new_epi_recipe_blueprint( - intercept = intercept, - allow_novel_levels = allow_novel_levels, - fresh = fresh, - composition = composition, - ptypes = ptypes, - recipe = recipe, - extra_role_ptypes = extra_role_ptypes, - ..., - subclass = c(subclass, "default_epi_recipe_blueprint", "default_recipe_blueprint") - ) - } #' @importFrom hardhat run_mold #' @export run_mold.default_epi_recipe_blueprint <- function(blueprint, ..., data) { rlang::check_dots_empty0(...) - # blueprint <- hardhat:::patch_recipe_default_blueprint(blueprint) - cleaned <- mold_epi_recipe_default_clean(blueprint = blueprint, data = data) - blueprint <- cleaned$blueprint - data <- cleaned$data + # we don't do the "cleaning" in `hardhat:::run_mold.default_recipe_blueprint` + # That function drops the epi_df class without any recourse. + # The only way we should be here at all is if `data` is an epi_df, but just + # in case... + if (!is_epi_df(data)) { + cli_warn("`data` is not an {.cls epi_df}. 
It has class {.cls {class(data)}}.") + } hardhat:::mold_recipe_default_process(blueprint = blueprint, data = data) } -mold_epi_recipe_default_clean <- function(blueprint, data) { - hardhat:::check_data_frame_or_matrix(data) - if (!is_epi_df(data)) data <- hardhat:::coerce_to_tibble(data) - hardhat:::new_mold_clean(blueprint, data) -} - #' @importFrom hardhat refresh_blueprint #' @export refresh_blueprint.default_epi_recipe_blueprint <- function(blueprint) { do.call(new_default_epi_recipe_blueprint, as.list(blueprint)) } - - -## removing this function? -# er_check_is_data_like <- function(.x, .x_nm) { -# if (rlang::is_missing(.x_nm)) { -# .x_nm <- rlang::as_label(rlang::enexpr(.x)) -# } -# if (!hardhat:::is_new_data_like(.x)) { -# hardhat:::glubort("`{.x_nm}` must be a data.frame or a matrix, not a {class1(.x)}.") -# } -# .x -# } diff --git a/R/cdc_baseline_forecaster.R b/R/cdc_baseline_forecaster.R index 74af5e443..976255fb8 100644 --- a/R/cdc_baseline_forecaster.R +++ b/R/cdc_baseline_forecaster.R @@ -36,25 +36,25 @@ #' cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") #' preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn) #' -#' if (require(ggplot2)) { -#' forecast_date <- unique(preds$forecast_date) -#' four_states <- c("ca", "pa", "wa", "ny") -#' preds %>% -#' filter(geo_value %in% four_states) %>% -#' ggplot(aes(target_date)) + -#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + -#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + -#' geom_line(aes(y = .pred), color = "orange") + -#' geom_line( -#' data = weekly_deaths %>% filter(geo_value %in% four_states), -#' aes(x = time_value, y = deaths) -#' ) + -#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + -#' labs(x = "Date", y = "Weekly deaths") + -#' facet_wrap(~geo_value, scales = "free_y") + -#' theme_bw() + -#' geom_vline(xintercept = forecast_date) -#' } +#' library(ggplot2) +#' forecast_date <- unique(preds$forecast_date) +#' four_states 
<- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = weekly_deaths %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = deaths) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Weekly deaths") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) +#' cdc_baseline_forecaster <- function( epi_data, outcome, @@ -68,7 +68,7 @@ cdc_baseline_forecaster <- function( outcome <- rlang::sym(outcome) - r <- epi_recipe(epi_data) %>% + r <- recipe(epi_data) %>% step_epi_ahead(!!outcome, ahead = args_list$data_frequency, skip = TRUE) %>% recipes::update_role(!!outcome, new_role = "predictor") %>% recipes::add_role(tidyselect::all_of(keys), new_role = "predictor") %>% @@ -79,7 +79,7 @@ cdc_baseline_forecaster <- function( latest <- get_test_data( - epi_recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, + recipe(epi_data), epi_data, TRUE, args_list$nafill_buffer, forecast_date ) diff --git a/R/epi_recipe.R b/R/epi_recipe.R index 88ba605cd..c3a18d3cb 100644 --- a/R/epi_recipe.R +++ b/R/epi_recipe.R @@ -1,224 +1,15 @@ -#' Create a epi_recipe for preprocessing data -#' -#' A recipe is a description of the steps to be applied to a data set in -#' order to prepare it for data analysis. This is a loose wrapper -#' around [recipes::recipe()] to properly handle the additional -#' columns present in an `epi_df` -#' -#' @aliases epi_recipe epi_recipe.default epi_recipe.formula #' @import recipes #' @export epi_recipe <- function(x, ...) { + deprecate_soft("This function is being deprecated. 
Use `recipe()` instead.") UseMethod("epi_recipe") } - -#' @rdname epi_recipe #' @export epi_recipe.default <- function(x, ...) { - ## if not a formula or an epi_df, we just pass to recipes::recipe - if (is.matrix(x) || is.data.frame(x) || tibble::is_tibble(x)) { - x <- x[1, , drop = FALSE] - } - cli_warn( - "epi_recipe has been called with a non-epi_df object, returning a regular recipe. Various - step_epi_* functions will not work." - ) - recipes::recipe(x, ...) + recipe(x, ...) } -#' @rdname epi_recipe -#' @inheritParams recipes::recipe -#' @param roles A character string (the same length of `vars`) that -#' describes a single role that the variable will take. This value could be -#' anything but common roles are `"outcome"`, `"predictor"`, -#' `"time_value"`, and `"geo_value"` -#' @param ... Further arguments passed to or from other methods (not currently -#' used). -#' @param formula A model formula. No in-line functions should be used here -#' (e.g. `log(x)`, `x:y`, etc.) and minus signs are not allowed. These types of -#' transformations should be enacted using `step` functions in this package. -#' Dots are allowed as are simple multivariate outcome terms (i.e. no need for -#' `cbind`; see Examples). -#' @param x,data A data frame, tibble, or epi_df of the *template* data set -#' (see below). 
This is always coerced to the first row to avoid memory issues -#' @inherit recipes::recipe return -#' -#' @export -#' @examples -#' library(dplyr) -#' library(recipes) -#' jhu <- case_death_rate_subset %>% -#' filter(time_value > "2021-08-01") %>% -#' arrange(geo_value, time_value) -#' -#' r <- epi_recipe(jhu) %>% -#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% -#' step_epi_ahead(death_rate, ahead = 7) %>% -#' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% -#' step_naomit(all_predictors()) %>% -#' # below, `skip` means we don't do this at predict time -#' step_naomit(all_outcomes(), skip = TRUE) -#' -#' r -epi_recipe.epi_df <- - function(x, formula = NULL, ..., vars = NULL, roles = NULL) { - if (!is.null(formula)) { - if (!is.null(vars)) { - rlang::abort( - paste0( - "This `vars` specification will be ignored ", - "when a formula is used" - ) - ) - } - if (!is.null(roles)) { - rlang::abort( - paste0( - "This `roles` specification will be ignored ", - "when a formula is used" - ) - ) - } - - obj <- epi_recipe.formula(formula, x, ...) - return(obj) - } - if (is.null(vars)) vars <- colnames(x) - if (any(table(vars) > 1)) { - rlang::abort("`vars` should have unique members") - } - if (any(!(vars %in% colnames(x)))) { - rlang::abort("1 or more elements of `vars` are not in the data") - } - - keys <- key_colnames(x) # we know x is an epi_df - - var_info <- tibble(variable = vars) - key_roles <- c("geo_value", "time_value", rep("key", length(keys) - 2)) - - ## Check and add roles when available - if (!is.null(roles)) { - if (length(roles) != length(vars)) { - rlang::abort(c( - "The number of roles should be the same as the number of ", - "variables." 
- )) - } - var_info$role <- roles - } else { - var_info <- var_info %>% dplyr::filter(!(variable %in% keys)) - var_info$role <- "raw" - } - ## Now we add the keys when necessary - var_info <- dplyr::union( - var_info, - tibble::tibble(variable = keys, role = key_roles) - ) - - ## Add types - var_info <- dplyr::full_join(recipes:::get_types(x), var_info, by = "variable") - var_info$source <- "original" - - ## arrange to easy order - var_info <- var_info %>% - dplyr::arrange(factor( - role, - levels = union( - c("predictor", "outcome", "time_value", "geo_value", "key"), - unique(role) - ) # anything else - )) - - ## Return final object of class `recipe` - out <- list( - var_info = var_info, - term_info = var_info, - steps = NULL, - template = x[1, ], - max_time_value = max(x$time_value), - levels = NULL, - retained = NA - ) - class(out) <- c("epi_recipe", "recipe") - out - } - - -#' @rdname epi_recipe -#' @importFrom rlang abort -#' @export -epi_recipe.formula <- function(formula, data, ...) { - # we ensure that there's only 1 row in the template - data <- data[1, ] - # check for minus: - if (!epiprocess::is_epi_df(data)) { - cli_warn( - "epi_recipe has been called with a non-epi_df object, returning a regular recipe. Various - step_epi_* functions will not work." - ) - return(recipes::recipe(formula, data, ...)) - } - - f_funcs <- recipes:::fun_calls(formula, data) - if (any(f_funcs == "-")) { - abort("`-` is not allowed in a recipe formula. Use `step_rm()` instead.") - } - - # Check for other in-line functions - args <- epi_form2args(formula, data, ...) - obj <- epi_recipe.epi_df( - x = args$x, - formula = NULL, - ..., - vars = args$vars, - roles = args$roles - ) - obj -} - - -# slightly modified version of `form2args()` in {recipes} -epi_form2args <- function(formula, data, ...) 
{ - if (!rlang::is_formula(formula)) formula <- as.formula(formula) - - ## check for in-line formulas - recipes:::inline_check(formula, data) - - ## use rlang to get both sides of the formula - outcomes <- recipes:::get_lhs_vars(formula, data) - predictors <- recipes:::get_rhs_vars(formula, data, no_lhs = TRUE) - keys <- key_colnames(data) - - ## if . was used on the rhs, subtract out the outcomes - predictors <- predictors[!(predictors %in% outcomes)] - ## if . was used anywhere, remove epi_keys - if (rlang::f_lhs(formula) == ".") { - outcomes <- outcomes[!(outcomes %in% keys)] - } - if (rlang::f_rhs(formula) == ".") { - predictors <- predictors[!(predictors %in% keys)] - } - - ## get `vars` from rhs, lhs. keys get added downstream - vars <- c(predictors, outcomes) - ## subset data columns - data <- data[, union(vars, keys)] - - ## derive roles - roles <- rep("predictor", length(predictors)) - if (length(outcomes) > 0) { - roles <- c(roles, rep("outcome", length(outcomes))) - } - # if (length(keys) > 0) { - # roles <- c(roles, c("time_value", rep("key", length(keys) - 1))) - # } - - ## pass to recipe.default with vars and roles - list(x = data, vars = vars, roles = roles) -} - - #' Test for `epi_recipe` #' @@ -274,7 +65,7 @@ is_epi_recipe <- function(x) { #' filter(time_value > "2021-08-01") %>% #' arrange(geo_value, time_value) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -286,7 +77,7 @@ is_epi_recipe <- function(x) { #' #' workflow #' -#' r2 <- epi_recipe(jhu) %>% +#' r2 <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) #' @@ -356,12 +147,12 @@ update_epi_recipe <- function(x, recipe, ..., blueprint = default_epi_recipe_blu #' #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' r <- 
epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() #' -#' wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +#' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) #' latest <- jhu %>% #' filter(time_value >= max(time_value) - 14) #' @@ -430,20 +221,16 @@ adjust_epi_recipe.epi_recipe <- function(x, which_step, ..., blueprint = default x } -# unfortunately, almost everything the same as in prep.recipe except string/fctr handling + #' @export prep.epi_recipe <- function( x, training = NULL, fresh = FALSE, verbose = FALSE, retain = TRUE, log_changes = FALSE, strings_as_factors = TRUE, ...) { - if (is.null(training)) { - cli::cli_warn(c( - "!" = "No training data was supplied to {.fn prep}.", - "!" = "Unlike a {.cls recipe}, an {.cls epi_recipe} does not ", - "!" = "store the full template data in the object.", - "!" = "Please supply the training data to the {.fn prep} function,", - "!" = "to avoid addtional warning messages." 
- )) + if (!strings_as_factors) { + return(NextMethod("prep")) } + # workaround to avoid converting strings2factors with recipes::prep.recipe() + # We do the conversion here, then set it to FALSE training <- recipes:::check_training_set(training, x, fresh) training <- epi_check_training_set(training, x) training <- dplyr::relocate(training, dplyr::all_of(key_colnames(training))) @@ -452,115 +239,28 @@ prep.epi_recipe <- function( orig_lvls <- lapply(training, recipes:::get_levels) orig_lvls <- kill_levels(orig_lvls, keys) - if (strings_as_factors) { - lvls <- lapply(training, recipes:::get_levels) - lvls <- kill_levels(lvls, keys) - training <- recipes:::strings2factors(training, lvls) - } else { - lvls <- NULL - } - skippers <- map_lgl(x$steps, recipes:::is_skipable) - if (any(skippers) & !retain) { - cli::cli_warn(c( - "Since some operations have `skip = TRUE`, using ", - "`retain = TRUE` will allow those steps results to ", - "be accessible." - )) - } - if (fresh) x$term_info <- x$var_info - - running_info <- x$term_info %>% dplyr::mutate(number = 0, skip = FALSE) - for (i in seq(along.with = x$steps)) { - needs_tuning <- map_lgl(x$steps[[i]], recipes:::is_tune) - if (any(needs_tuning)) { - arg <- names(needs_tuning)[needs_tuning] - arg <- paste0("'", arg, "'", collapse = ", ") - msg <- paste0( - "You cannot `prep()` a tuneable recipe. Argument(s) with `tune()`: ", - arg, ". Do you want to use a tuning function such as `tune_grid()`?" 
- ) - rlang::abort(msg) - } - note <- paste("oper", i, gsub("_", " ", class(x$steps[[i]])[1])) - if (!x$steps[[i]]$trained | fresh) { - if (verbose) { - cat(note, "[training]", "\n") - } - before_nms <- names(training) - before_template <- training[1, ] - x$steps[[i]] <- prep(x$steps[[i]], - training = training, - info = x$term_info - ) - training <- bake(x$steps[[i]], new_data = training) - if (!tibble::is_tibble(training)) { - cli::cli_abort("`bake()` methods should always return {.cls tibble}.") - } - if (!is_epi_df(training)) { - # tidymodels killed our class - # for now, we only allow step_epi_* to alter the metadata - training <- dplyr::dplyr_reconstruct( - as_epi_df(training), before_template - ) - } - training <- dplyr::relocate(training, all_of(key_colnames(training))) - x$term_info <- recipes:::merge_term_info(get_types(training), x$term_info) - if (!is.na(x$steps[[i]]$role)) { - new_vars <- setdiff(x$term_info$variable, running_info$variable) - pos_new_var <- x$term_info$variable %in% new_vars - pos_new_and_na_role <- pos_new_var & is.na(x$term_info$role) - pos_new_and_na_source <- pos_new_var & is.na(x$term_info$source) - x$term_info$role[pos_new_and_na_role] <- x$steps[[i]]$role - x$term_info$source[pos_new_and_na_source] <- "derived" - } - recipes:::changelog(log_changes, before_nms, names(training), x$steps[[i]]) - running_info <- rbind( - running_info, - dplyr::mutate(x$term_info, number = i, skip = x$steps[[i]]$skip) - ) - } else { - if (verbose) cat(note, "[pre-trained]\n") - } - } - if (strings_as_factors) { - lvls <- lapply(training, recipes:::get_levels) - lvls <- kill_levels(lvls, keys) - check_lvls <- recipes:::has_lvls(lvls) - if (!any(check_lvls)) lvls <- NULL - } else { - lvls <- NULL - } - if (retain) { - if (verbose) { - cat( - "The retained training set is ~", - format(utils::object.size(training), units = "Mb", digits = 2), - " in memory.\n\n" - ) - } - x$template <- training - } else { - x$template <- training[0, ] - } - 
x$max_time_value <- max(training$time_value) - x$tr_info <- tr_data + lvls <- lapply(training, recipes:::get_levels) + lvls <- kill_levels(lvls, keys) # don't do anything to the epi_keys + training <- recipes:::strings2factors(training, lvls) + + x <- NextMethod("prep", + training = training, fresh = fresh, + verbose = verbose, + retain = retain, log_changes = log_changes, + strings_as_factors = FALSE, ... + ) + # Now, we undo the conversion. + + lvls <- lapply(x$template, recipes:::get_levels) + lvls <- kill_levels(lvls, keys) + check_lvls <- recipes:::has_lvls(lvls) + if (!any(check_lvls)) lvls <- NULL x$levels <- lvls x$orig_lvls <- orig_lvls - x$retained <- retain - x$last_term_info <- running_info %>% - dplyr::group_by(variable) %>% - dplyr::arrange(dplyr::desc(number)) %>% - dplyr::summarise( - type = list(dplyr::first(type)), - role = list(unique(unlist(role))), - source = dplyr::first(source), - number = dplyr::first(number), - skip = dplyr::first(skip), - .groups = "keep" - ) x } + #' @export bake.epi_recipe <- function(object, new_data, ..., composition = "epi_df") { meta <- NULL diff --git a/R/epi_workflow.R b/R/epi_workflow.R index b059a81d0..369b96eb1 100644 --- a/R/epi_workflow.R +++ b/R/epi_workflow.R @@ -22,7 +22,7 @@ #' @examples #' jhu <- case_death_rate_subset #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -87,7 +87,7 @@ is_epi_workflow <- function(x) { #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) #' @@ -139,16 +139,17 @@ fit.epi_workflow <- function(object, data, ..., control = workflows::control_wor #' @name predict-epi_workflow #' @export #' @examples +#' library(dplyr) #' jhu 
<- case_death_rate_subset #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% #' step_epi_naomit() #' -#' wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) -#' latest <- jhu %>% dplyr::filter(time_value >= max(time_value) - 14) +#' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) +#' latest <- jhu %>% filter(time_value >= max(time_value) - 14) #' #' preds <- predict(wf, latest) #' preds diff --git a/R/flatline_forecaster.R b/R/flatline_forecaster.R index 55808b803..75241f712 100644 --- a/R/flatline_forecaster.R +++ b/R/flatline_forecaster.R @@ -41,7 +41,7 @@ flatline_forecaster <- function( outcome <- rlang::sym(outcome) - r <- epi_recipe(epi_data) %>% + r <- recipe(epi_data) %>% step_epi_ahead(!!outcome, ahead = args_list$ahead, skip = TRUE) %>% recipes::update_role(!!outcome, new_role = "predictor") %>% recipes::add_role(tidyselect::all_of(keys), new_role = "predictor") %>% diff --git a/R/frosting.R b/R/frosting.R index 8474edbdf..6d1e9196c 100644 --- a/R/frosting.R +++ b/R/frosting.R @@ -11,7 +11,7 @@ #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) #' @@ -130,7 +130,7 @@ update_frosting <- function(x, frosting, ...) 
{ #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() @@ -268,15 +268,16 @@ new_frosting <- function() { #' wf <- epi_workflow() %>% add_frosting(f) #' #' # A more realistic example +#' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() #' -#' wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +#' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) #' #' f <- frosting() %>% #' layer_predict() %>% diff --git a/R/get_test_data.R b/R/get_test_data.R index 694e73b06..b50a35fb2 100644 --- a/R/get_test_data.R +++ b/R/get_test_data.R @@ -35,7 +35,7 @@ #' keys, as well other variables in the original dataset. 
#' @examples #' # create recipe -#' rec <- epi_recipe(case_death_rate_subset) %>% +#' rec <- recipe(case_death_rate_subset) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_lag(case_rate, lag = c(0, 7, 14)) diff --git a/R/layer_add_forecast_date.R b/R/layer_add_forecast_date.R index 02395f960..0c15998ac 100644 --- a/R/layer_add_forecast_date.R +++ b/R/layer_add_forecast_date.R @@ -22,7 +22,7 @@ #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_add_target_date.R b/R/layer_add_target_date.R index 9176fb593..eb2f76c6a 100644 --- a/R/layer_add_target_date.R +++ b/R/layer_add_target_date.R @@ -23,7 +23,7 @@ #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_cdc_flatline_quantiles.R b/R/layer_cdc_flatline_quantiles.R index 8d16ba32f..926198e11 100644 --- a/R/layer_cdc_flatline_quantiles.R +++ b/R/layer_cdc_flatline_quantiles.R @@ -55,13 +55,14 @@ #' @export #' #' @examples +#' library(recipes) #' library(dplyr) -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- recipe(case_death_rate_subset) %>% #' # data is "daily", so we fit this to 1 ahead, the result will contain #' # 1 day ahead residuals #' step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) %>% -#' recipes::update_role(death_rate, new_role = "predictor") %>% -#' recipes::add_role(time_value, geo_value, new_role = "predictor") +#' update_role(death_rate, new_role = "predictor") %>% +#' add_role(time_value, geo_value, 
new_role = "predictor") #' #' forecast_date <- max(case_death_rate_subset$time_value) #' @@ -82,24 +83,23 @@ #' pivot_quantiles_wider(.pred_distn) %>% #' mutate(target_date = forecast_date + ahead) #' -#' if (require("ggplot2")) { -#' four_states <- c("ca", "pa", "wa", "ny") -#' preds %>% -#' filter(geo_value %in% four_states) %>% -#' ggplot(aes(target_date)) + -#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + -#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + -#' geom_line(aes(y = .pred), color = "orange") + -#' geom_line( -#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), -#' aes(x = time_value, y = death_rate) -#' ) + -#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + -#' labs(x = "Date", y = "Death rate") + -#' facet_wrap(~geo_value, scales = "free_y") + -#' theme_bw() + -#' geom_vline(xintercept = forecast_date) -#' } +#' library(ggplot2) +#' four_states <- c("ca", "pa", "wa", "ny") +#' preds %>% +#' filter(geo_value %in% four_states) %>% +#' ggplot(aes(target_date)) + +#' geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + +#' geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + +#' geom_line(aes(y = .pred), color = "orange") + +#' geom_line( +#' data = case_death_rate_subset %>% filter(geo_value %in% four_states), +#' aes(x = time_value, y = death_rate) +#' ) + +#' scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + +#' labs(x = "Date", y = "Death rate") + +#' facet_wrap(~geo_value, scales = "free_y") + +#' theme_bw() + +#' geom_vline(xintercept = forecast_date) layer_cdc_flatline_quantiles <- function( frosting, ..., diff --git a/R/layer_naomit.R b/R/layer_naomit.R index 209a663b4..5c46699fe 100644 --- a/R/layer_naomit.R +++ b/R/layer_naomit.R @@ -15,7 +15,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' 
step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) #' diff --git a/R/layer_point_from_distn.R b/R/layer_point_from_distn.R index f14008748..71184cad0 100644 --- a/R/layer_point_from_distn.R +++ b/R/layer_point_from_distn.R @@ -20,7 +20,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_population_scaling.R b/R/layer_population_scaling.R index 9275d910c..c5406b46b 100644 --- a/R/layer_population_scaling.R +++ b/R/layer_population_scaling.R @@ -54,7 +54,7 @@ #' #' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_population_scaling( #' df = pop_data, #' df_pop_col = "value", diff --git a/R/layer_predict.R b/R/layer_predict.R index 6ca17ac24..b49729ac4 100644 --- a/R/layer_predict.R +++ b/R/layer_predict.R @@ -20,7 +20,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_predictive_distn.R b/R/layer_predictive_distn.R index b28e0c765..6cbb58cfb 100644 --- a/R/layer_predictive_distn.R +++ b/R/layer_predictive_distn.R @@ -24,7 +24,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_quantile_distn.R b/R/layer_quantile_distn.R index 5f87ded29..d1c3a9d24 100644 --- a/R/layer_quantile_distn.R 
+++ b/R/layer_quantile_distn.R @@ -26,7 +26,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_residual_quantiles.R b/R/layer_residual_quantiles.R index eae151905..2e08494f2 100644 --- a/R/layer_residual_quantiles.R +++ b/R/layer_residual_quantiles.R @@ -18,7 +18,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layer_threshold_preds.R b/R/layer_threshold_preds.R index 56f8059ab..da397fb18 100644 --- a/R/layer_threshold_preds.R +++ b/R/layer_threshold_preds.R @@ -25,7 +25,7 @@ #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value < "2021-03-08", geo_value %in% c("ak", "ca", "ar")) -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() diff --git a/R/layers.R b/R/layers.R index aa515a917..ee26e63f9 100644 --- a/R/layers.R +++ b/R/layers.R @@ -44,12 +44,14 @@ layer <- function(subclass, ..., .prefix = "layer_") { #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -#' r <- epi_recipe(jhu) %>% +#' +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() -#' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) -#' latest <- jhu %>% filter(time_value >= max(time_value) - 14) +#' wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +#' latest <- jhu %>% +#' 
filter(time_value >= max(time_value) - 14) #' #' # Specify a `forecast_date` that is greater than or equal to `as_of` date #' f <- frosting() %>% diff --git a/R/model-methods.R b/R/model-methods.R index f3b374879..131a6ee91 100644 --- a/R/model-methods.R +++ b/R/model-methods.R @@ -36,7 +36,7 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) #' diff --git a/R/recipe.epi_df.R b/R/recipe.epi_df.R new file mode 100644 index 000000000..6cfcf3170 --- /dev/null +++ b/R/recipe.epi_df.R @@ -0,0 +1,92 @@ +#' Create a recipe for preprocessing panel data +#' +#' A recipe is a description of the steps to be applied to a data set in +#' order to prepare it for data analysis. This is an S3 method for +#' [recipes::recipe()] to properly handle the additional (panel data) +#' columns present in an [`epiprocess::epi_df`]: `time_value`, `geo_value`, and any +#' additional keys. +#' +#' @aliases epi_recipe epi_recipe.default epi_recipe.formula +#' @inheritParams recipes::recipe +#' @param roles A character string (the same length of `vars`) that +#' describes a single role that the variable will take. This value could be +#' anything but common roles are `"outcome"`, `"predictor"`, +#' `"time_value"`, and `"geo_value"` +#' @param ... Further arguments passed to or from other methods (not currently +#' used). +#' @param formula A model formula. No in-line functions should be used here +#' (e.g. `log(x)`, `x:y`, etc.) and minus signs are not allowed. These types of +#' transformations should be enacted using `step` functions in this package. +#' Dots are allowed as are simple multivariate outcome terms (i.e. no need for +#' `cbind`; see Examples). +#' @param x,data A data frame, tibble, or epi_df of the *template* data set +#' (see below). 
This is always coerced to the first row to avoid memory issues +#' @inherit recipes::recipe return +#' +#' @export +#' @examples +#' library(dplyr) +#' library(recipes) +#' jhu <- case_death_rate_subset %>% +#' filter(time_value > "2021-08-01") %>% +#' arrange(geo_value, time_value) +#' +#' r <- recipe(jhu) %>% +#' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(death_rate, ahead = 7) %>% +#' step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% +#' step_naomit(recipes::all_predictors()) %>% +#' # below, `skip` means we don't do this at predict time +#' step_naomit(recipes::all_outcomes(), skip = TRUE) +#' +#' r +#' @importFrom recipes recipe +#' @export +recipe.epi_df <- function(x, formula = NULL, ..., vars = NULL, roles = NULL) { + # vars + roles must be same-length character vectors + # formula is mutually exclusive with vars + roles + # either determines the variables needed for modelling + attr(x, "decay_to_tibble") <- FALSE # avoid as_tibble stripping the class + r <- NextMethod("recipe") + r <- add_epi_df_roles_to_recipe(r, x) + + # arrange to easy order + r$var_info <- r$var_info %>% + dplyr::arrange(factor( + role, + levels = union( + c("predictor", "outcome", "time_value", "geo_value", "key"), + unique(role) + ) # anything else + )) + r$term_info <- r$var_info + class(r) <- c("epi_recipe", class(r)) + r +} + +#' @exportS3Method recipes::recipe +#' @rdname recipe.epi_df +recipe.formula <- function(formula, data, ...) { + # This method clobbers `recipes::recipe.formula`, but should have no noticeable + # effect. + recipe(x = data, formula = formula, ...) 
+} + +add_epi_df_roles_to_recipe <- function(r, epi_df) { + edf_keys <- key_colnames(epi_df) + edf_roles <- c("geo_value", "time_value", rep("key", length(edf_keys) - 2)) + types <- unname(lapply(epi_df[, edf_keys], recipes::.get_data_types)) + info <- tibble( + variable = edf_keys, + type = types, + role = edf_roles, + source = "original" + ) + # reconstruct the constituents + r$template <- epi_df[, unique(c(edf_keys, r$var_info$variable))] + r$var_info <- r$var_info %>% + dplyr::filter(!((variable %in% edf_keys) & is.na(role))) %>% + dplyr::bind_rows(info) %>% + dplyr::distinct() + r +} diff --git a/R/reexports-tidymodels.R b/R/reexports-tidymodels.R index 3b28ac5c5..5b53914a8 100644 --- a/R/reexports-tidymodels.R +++ b/R/reexports-tidymodels.R @@ -14,6 +14,11 @@ recipes::prep #' @export recipes::bake + +#' @importFrom recipes recipe +#' @export +recipes::recipe + #' @importFrom recipes rand_id #' @export recipes::rand_id diff --git a/R/step_epi_naomit.R b/R/step_epi_naomit.R index d81ba398d..3957e68c6 100644 --- a/R/step_epi_naomit.R +++ b/R/step_epi_naomit.R @@ -9,7 +9,7 @@ #' @export #' @examples #' case_death_rate_subset %>% -#' epi_recipe() %>% +#' recipe() %>% #' step_epi_naomit() step_epi_naomit <- function(recipe) { stopifnot(inherits(recipe, "recipe")) diff --git a/R/step_epi_shift.R b/R/step_epi_shift.R index 465d64e7f..7b8cab9a8 100644 --- a/R/step_epi_shift.R +++ b/R/step_epi_shift.R @@ -42,7 +42,7 @@ #' @rdname step_epi_shift #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- recipe(case_death_rate_subset) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) #' r diff --git a/R/step_epi_slide.R b/R/step_epi_slide.R index 9714971fa..8afc84a48 100644 --- a/R/step_epi_slide.R +++ b/R/step_epi_slide.R @@ -34,9 +34,9 @@ #' library(dplyr) #' jhu <- case_death_rate_subset %>% #' filter(time_value >= as.Date("2021-01-01"), geo_value %in% c("ca", "ny")) -#' rec <- epi_recipe(jhu) %>% +#' 
rec <- recipe(jhu) %>% #' step_epi_slide(case_rate, death_rate, -#' .f = \(x) mean(x, na.rm = TRUE), +#' .f = function(x) mean(x, na.rm = TRUE), #' before = 6L #' ) #' bake(prep(rec, jhu), new_data = NULL) diff --git a/R/step_growth_rate.R b/R/step_growth_rate.R index 06f8da4cf..46f65c2fc 100644 --- a/R/step_growth_rate.R +++ b/R/step_growth_rate.R @@ -32,7 +32,7 @@ #' @importFrom epiprocess growth_rate #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- recipe(case_death_rate_subset) %>% #' step_growth_rate(case_rate, death_rate) #' r #' diff --git a/R/step_lag_difference.R b/R/step_lag_difference.R index 39ae1ba59..bfd20e1c5 100644 --- a/R/step_lag_difference.R +++ b/R/step_lag_difference.R @@ -15,7 +15,7 @@ #' @family row operation steps #' @export #' @examples -#' r <- epi_recipe(case_death_rate_subset) %>% +#' r <- recipe(case_death_rate_subset) %>% #' step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) %>% #' step_epi_naomit() #' r diff --git a/R/step_population_scaling.R b/R/step_population_scaling.R index 4e4d3aa26..6d5570a21 100644 --- a/R/step_population_scaling.R +++ b/R/step_population_scaling.R @@ -51,7 +51,7 @@ #' #' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_population_scaling( #' df = pop_data, #' df_pop_col = "value", @@ -171,7 +171,7 @@ bake.step_population_scaling <- function(object, new_data, ...) 
{ )) } - object$df <- mutate(object$df, across(dplyr::where(is.character), tolower)) + # object$df <- mutate(object$df, across(dplyr::where(is.character), tolower)) pop_col <- rlang::sym(object$df_pop_col) suffix <- ifelse(object$create_new, object$suffix, "") diff --git a/R/step_training_window.R b/R/step_training_window.R index eafc076c7..298103b70 100644 --- a/R/step_training_window.R +++ b/R/step_training_window.R @@ -28,13 +28,14 @@ #' ) %>% #' as_epi_df() #' -#' epi_recipe(y ~ x, data = tib) %>% +#' recipe(y ~ x, data = tib) %>% #' step_training_window(n_recent = 3) %>% #' prep(tib) %>% #' bake(new_data = NULL) #' -#' epi_recipe(y ~ x, data = tib) %>% -#' step_epi_naomit() %>% +#' library(recipes) +#' recipe(y ~ x, data = tib) %>% +#' step_naomit() %>% #' step_training_window(n_recent = 3) %>% #' prep(tib) %>% #' bake(new_data = NULL) diff --git a/R/tidy.R b/R/tidy.R index 61b298411..039077de2 100644 --- a/R/tidy.R +++ b/R/tidy.R @@ -30,12 +30,12 @@ #' jhu <- case_death_rate_subset %>% #' filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) #' -#' r <- epi_recipe(jhu) %>% +#' r <- recipe(jhu) %>% #' step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% #' step_epi_ahead(death_rate, ahead = 7) %>% #' step_epi_naomit() #' -#' wf <- epi_workflow(r, parsnip::linear_reg()) %>% fit(jhu) +#' wf <- epi_workflow(r, linear_reg()) %>% fit(jhu) #' latest <- get_test_data(recipe = r, x = jhu) #' f <- frosting() %>% diff --git a/_pkgdown.yml b/_pkgdown.yml index c6df4c82d..b213cf986 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -78,7 +78,7 @@ reference: - smooth_quantile_reg - title: Custom panel data forecasting workflows contents: - - epi_recipe + - recipe.epi_df - epi_workflow - add_epi_recipe - adjust_epi_recipe diff --git a/man/Add_model.Rd b/man/Add_model.Rd index 17b65793c..27236cf44 100644 --- a/man/Add_model.Rd +++ b/man/Add_model.Rd @@ -75,7 +75,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% 
c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) diff --git a/man/add_epi_recipe.Rd b/man/add_epi_recipe.Rd index 0da2d55b3..3abf675ef 100644 --- a/man/add_epi_recipe.Rd +++ b/man/add_epi_recipe.Rd @@ -45,7 +45,7 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-08-01") \%>\% arrange(geo_value, time_value) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% @@ -57,7 +57,7 @@ workflow <- epi_workflow() \%>\% workflow -r2 <- epi_recipe(jhu) \%>\% +r2 <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) diff --git a/man/add_frosting.Rd b/man/add_frosting.Rd index 94812cbe2..6c5b16769 100644 --- a/man/add_frosting.Rd +++ b/man/add_frosting.Rd @@ -29,7 +29,7 @@ Add frosting to a workflow library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) diff --git a/man/adjust_epi_recipe.Rd b/man/adjust_epi_recipe.Rd index 7468c4ce2..d7fc5e72a 100644 --- a/man/adjust_epi_recipe.Rd +++ b/man/adjust_epi_recipe.Rd @@ -57,12 +57,12 @@ library(workflows) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() -wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) +wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) diff --git a/man/adjust_frosting.Rd b/man/adjust_frosting.Rd index 
c089b3443..fd7a606a2 100644 --- a/man/adjust_frosting.Rd +++ b/man/adjust_frosting.Rd @@ -38,7 +38,7 @@ illustrations of the different types of updates. library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/arx_class_epi_workflow.Rd b/man/arx_class_epi_workflow.Rd index 713365f17..cd04b4f23 100644 --- a/man/arx_class_epi_workflow.Rd +++ b/man/arx_class_epi_workflow.Rd @@ -20,7 +20,7 @@ arx_class_epi_workflow( be real-valued. Conversion of this data to unordered classes is handled internally based on the \code{breaks} argument to \code{\link[=arx_class_args_list]{arx_class_args_list()}}. If discrete classes are already in the \code{epi_df}, it is recommended to -code up a classifier from scratch using \code{\link[=epi_recipe]{epi_recipe()}}.} +code up a classifier from scratch using \code{\link[=recipe]{recipe()}}.} \item{predictors}{A character vector giving column(s) of predictor variables. This defaults to the \code{outcome}. However, if manually specified, only those variables diff --git a/man/arx_classifier.Rd b/man/arx_classifier.Rd index c7c2cf059..ad2cd21c0 100644 --- a/man/arx_classifier.Rd +++ b/man/arx_classifier.Rd @@ -20,7 +20,7 @@ arx_classifier( be real-valued. Conversion of this data to unordered classes is handled internally based on the \code{breaks} argument to \code{\link[=arx_class_args_list]{arx_class_args_list()}}. If discrete classes are already in the \code{epi_df}, it is recommended to -code up a classifier from scratch using \code{\link[=epi_recipe]{epi_recipe()}}.} +code up a classifier from scratch using \code{\link[=recipe]{recipe()}}.} \item{predictors}{A character vector giving column(s) of predictor variables. This defaults to the \code{outcome}. 
However, if manually specified, only those variables diff --git a/man/autoplot-epipred.Rd b/man/autoplot-epipred.Rd index 27bfdf5f7..3b90f7618 100644 --- a/man/autoplot-epipred.Rd +++ b/man/autoplot-epipred.Rd @@ -74,7 +74,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value >= as.Date("2021-11-01")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% @@ -98,7 +98,7 @@ autoplot(wf, preds, .max_facets = 4) # ------- Show multiple horizons p <- lapply(c(7, 14, 21, 28), function(h) { - r <- epi_recipe(jhu) \%>\% + r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = h) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/cdc_baseline_forecaster.Rd b/man/cdc_baseline_forecaster.Rd index cd3c4ed67..3d451b275 100644 --- a/man/cdc_baseline_forecaster.Rd +++ b/man/cdc_baseline_forecaster.Rd @@ -51,23 +51,23 @@ weekly_deaths <- case_death_rate_subset \%>\% cdc <- cdc_baseline_forecaster(weekly_deaths, "deaths") preds <- pivot_quantiles_wider(cdc$predictions, .pred_distn) -if (require(ggplot2)) { - forecast_date <- unique(preds$forecast_date) - four_states <- c("ca", "pa", "wa", "ny") - preds \%>\% - filter(geo_value \%in\% four_states) \%>\% - ggplot(aes(target_date)) + - geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + - geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + - geom_line(aes(y = .pred), color = "orange") + - geom_line( - data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), - aes(x = time_value, y = deaths) - ) + - scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + - labs(x = "Date", y = "Weekly deaths") + - facet_wrap(~geo_value, scales = "free_y") + - theme_bw() + - geom_vline(xintercept = forecast_date) -} +library(ggplot2) +forecast_date <- 
unique(preds$forecast_date) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = weekly_deaths \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = deaths) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Weekly deaths") + + facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) + } diff --git a/man/default_epi_recipe_blueprint.Rd b/man/default_epi_recipe_blueprint.Rd new file mode 100644 index 000000000..465a8abef --- /dev/null +++ b/man/default_epi_recipe_blueprint.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/blueprint-epi_recipe-default.R +\name{default_epi_recipe_blueprint} +\alias{default_epi_recipe_blueprint} +\title{Default epi_recipe blueprint} +\usage{ +default_epi_recipe_blueprint( + intercept = FALSE, + allow_novel_levels = FALSE, + fresh = TRUE, + strings_as_factors = FALSE, + composition = "tibble" +) +} +\arguments{ +\item{intercept}{A logical. Should an intercept be included in the +processed data? This information is used by the \code{process} function +in the \code{mold} and \code{forge} function list.} + +\item{allow_novel_levels}{A logical. Should novel factor levels be allowed at +prediction time? 
This information is used by the \code{clean} function in the +\code{forge} function list, and is passed on to \code{\link[hardhat:scream]{scream()}}.} + +\item{fresh}{Should already trained operations be re-trained when \code{prep()} is +called?} + +\item{strings_as_factors}{Should character columns be converted to factors +when \code{prep()} is called?} + +\item{composition}{Either "tibble", "matrix", or "dgCMatrix" for the format +of the processed predictors. If "matrix" or "dgCMatrix" are chosen, all of +the predictors must be numeric after the preprocessing method has been +applied; otherwise an error is thrown.} +} +\value{ +A \code{epi_recipe} blueprint. +} +\description{ +Recipe blueprint that accounts for \code{epi_df} panel data +Used for simplicity. See \code{\link[hardhat:default_recipe_blueprint]{hardhat::default_recipe_blueprint()}} for more +details. This subclass is nearly the same, except it ensures that +downstream processing doesn't drop the epi_df class from the data. +} +\keyword{internal} diff --git a/man/epi_workflow.Rd b/man/epi_workflow.Rd index b29078d52..0b9fba73e 100644 --- a/man/epi_workflow.Rd +++ b/man/epi_workflow.Rd @@ -35,7 +35,7 @@ and numerous examples, see there. \examples{ jhu <- case_death_rate_subset -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% diff --git a/man/fit-epi_workflow.Rd b/man/fit-epi_workflow.Rd index 3dfa0029a..623706d42 100644 --- a/man/fit-epi_workflow.Rd +++ b/man/fit-epi_workflow.Rd @@ -31,7 +31,7 @@ preprocessing the data and fitting the underlying parsnip model. 
jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) diff --git a/man/frosting.Rd b/man/frosting.Rd index a75f21b61..10c9d062d 100644 --- a/man/frosting.Rd +++ b/man/frosting.Rd @@ -28,15 +28,16 @@ f <- frosting() wf <- epi_workflow() \%>\% add_frosting(f) # A more realistic example +library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() -wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) +wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) f <- frosting() \%>\% layer_predict() \%>\% diff --git a/man/get_test_data.Rd b/man/get_test_data.Rd index b18685d89..5e7874276 100644 --- a/man/get_test_data.Rd +++ b/man/get_test_data.Rd @@ -56,7 +56,7 @@ values with more advanced techniques. } \examples{ # create recipe -rec <- epi_recipe(case_death_rate_subset) \%>\% +rec <- recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) diff --git a/man/layer_add_forecast_date.Rd b/man/layer_add_forecast_date.Rd index e27f2bacd..be48d75f9 100644 --- a/man/layer_add_forecast_date.Rd +++ b/man/layer_add_forecast_date.Rd @@ -39,7 +39,7 @@ model fitting, and postprocessing), an appropriate warning will be thrown. 
library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_add_target_date.Rd b/man/layer_add_target_date.Rd index dc0d2f190..ecb8c590e 100644 --- a/man/layer_add_target_date.Rd +++ b/man/layer_add_target_date.Rd @@ -40,7 +40,7 @@ in the test data to get the target date. library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_cdc_flatline_quantiles.Rd b/man/layer_cdc_flatline_quantiles.Rd index c3bc4f257..ac3e1758b 100644 --- a/man/layer_cdc_flatline_quantiles.Rd +++ b/man/layer_cdc_flatline_quantiles.Rd @@ -84,13 +84,14 @@ the future. This version continues to use the same set of residuals, and adds them on to produce wider intervals as \code{ahead} increases. 
} \examples{ +library(recipes) library(dplyr) -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- recipe(case_death_rate_subset) \%>\% # data is "daily", so we fit this to 1 ahead, the result will contain # 1 day ahead residuals step_epi_ahead(death_rate, ahead = 1L, skip = TRUE) \%>\% - recipes::update_role(death_rate, new_role = "predictor") \%>\% - recipes::add_role(time_value, geo_value, new_role = "predictor") + update_role(death_rate, new_role = "predictor") \%>\% + add_role(time_value, geo_value, new_role = "predictor") forecast_date <- max(case_death_rate_subset$time_value) @@ -111,22 +112,21 @@ preds <- preds \%>\% pivot_quantiles_wider(.pred_distn) \%>\% mutate(target_date = forecast_date + ahead) -if (require("ggplot2")) { - four_states <- c("ca", "pa", "wa", "ny") - preds \%>\% - filter(geo_value \%in\% four_states) \%>\% - ggplot(aes(target_date)) + - geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + - geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + - geom_line(aes(y = .pred), color = "orange") + - geom_line( - data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), - aes(x = time_value, y = death_rate) - ) + - scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + - labs(x = "Date", y = "Death rate") + - facet_wrap(~geo_value, scales = "free_y") + - theme_bw() + - geom_vline(xintercept = forecast_date) -} +library(ggplot2) +four_states <- c("ca", "pa", "wa", "ny") +preds \%>\% + filter(geo_value \%in\% four_states) \%>\% + ggplot(aes(target_date)) + + geom_ribbon(aes(ymin = `0.1`, ymax = `0.9`), fill = blues9[3]) + + geom_ribbon(aes(ymin = `0.25`, ymax = `0.75`), fill = blues9[6]) + + geom_line(aes(y = .pred), color = "orange") + + geom_line( + data = case_death_rate_subset \%>\% filter(geo_value \%in\% four_states), + aes(x = time_value, y = death_rate) + ) + + scale_x_date(limits = c(forecast_date - 90, forecast_date + 30)) + + labs(x = "Date", y = "Death rate") + + 
facet_wrap(~geo_value, scales = "free_y") + + theme_bw() + + geom_vline(xintercept = forecast_date) } diff --git a/man/layer_naomit.Rd b/man/layer_naomit.Rd index d77112f95..8b436bbbe 100644 --- a/man/layer_naomit.Rd +++ b/man/layer_naomit.Rd @@ -28,7 +28,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) diff --git a/man/layer_point_from_distn.Rd b/man/layer_point_from_distn.Rd index 276f7cb17..54d275828 100644 --- a/man/layer_point_from_distn.Rd +++ b/man/layer_point_from_distn.Rd @@ -38,7 +38,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_population_scaling.Rd b/man/layer_population_scaling.Rd index 5a105f208..88607139f 100644 --- a/man/layer_population_scaling.Rd +++ b/man/layer_population_scaling.Rd @@ -81,7 +81,7 @@ jhu <- jhu_csse_daily_subset \%>\% pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_population_scaling( df = pop_data, df_pop_col = "value", diff --git a/man/layer_predict.Rd b/man/layer_predict.Rd index 8ae92f4c8..d2b768677 100644 --- a/man/layer_predict.Rd +++ b/man/layer_predict.Rd @@ -62,7 +62,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_predictive_distn.Rd b/man/layer_predictive_distn.Rd index 240db5f5b..38ca505e2 100644 
--- a/man/layer_predictive_distn.Rd +++ b/man/layer_predictive_distn.Rd @@ -43,7 +43,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_quantile_distn.Rd b/man/layer_quantile_distn.Rd index 68192deee..13ad13545 100644 --- a/man/layer_quantile_distn.Rd +++ b/man/layer_quantile_distn.Rd @@ -49,7 +49,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_residual_quantiles.Rd b/man/layer_residual_quantiles.Rd index 39e1ecfbe..5795e6514 100644 --- a/man/layer_residual_quantiles.Rd +++ b/man/layer_residual_quantiles.Rd @@ -43,7 +43,7 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/layer_threshold.Rd b/man/layer_threshold.Rd index 0f4b1dfb7..615c9f15b 100644 --- a/man/layer_threshold.Rd +++ b/man/layer_threshold.Rd @@ -43,7 +43,7 @@ to the threshold values. 
library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value < "2021-03-08", geo_value \%in\% c("ak", "ca", "ar")) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() diff --git a/man/new_epi_recipe_blueprint.Rd b/man/new_epi_recipe_blueprint.Rd deleted file mode 100644 index db22b5675..000000000 --- a/man/new_epi_recipe_blueprint.Rd +++ /dev/null @@ -1,92 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/blueprint-epi_recipe-default.R -\name{new_epi_recipe_blueprint} -\alias{new_epi_recipe_blueprint} -\alias{epi_recipe_blueprint} -\alias{default_epi_recipe_blueprint} -\alias{new_default_epi_recipe_blueprint} -\title{Recipe blueprint that accounts for \code{epi_df} panel data} -\usage{ -new_epi_recipe_blueprint( - intercept = FALSE, - allow_novel_levels = FALSE, - fresh = TRUE, - composition = "tibble", - ptypes = NULL, - recipe = NULL, - ..., - subclass = character() -) - -epi_recipe_blueprint( - intercept = FALSE, - allow_novel_levels = FALSE, - fresh = TRUE, - composition = "tibble" -) - -default_epi_recipe_blueprint( - intercept = FALSE, - allow_novel_levels = FALSE, - fresh = TRUE, - composition = "tibble" -) - -new_default_epi_recipe_blueprint( - intercept = FALSE, - allow_novel_levels = FALSE, - fresh = TRUE, - composition = "tibble", - ptypes = NULL, - recipe = NULL, - extra_role_ptypes = NULL, - ..., - subclass = character() -) -} -\arguments{ -\item{intercept}{A logical. Should an intercept be included in the -processed data? This information is used by the \code{process} function -in the \code{mold} and \code{forge} function list.} - -\item{allow_novel_levels}{A logical. Should novel factor levels be allowed at -prediction time? 
This information is used by the \code{clean} function in the -\code{forge} function list, and is passed on to \code{\link[hardhat:scream]{scream()}}.} - -\item{fresh}{Should already trained operations be re-trained when \code{prep()} is -called?} - -\item{composition}{Either "tibble", "matrix", or "dgCMatrix" for the format -of the processed predictors. If "matrix" or "dgCMatrix" are chosen, all of -the predictors must be numeric after the preprocessing method has been -applied; otherwise an error is thrown.} - -\item{ptypes}{Either \code{NULL}, or a named list with 2 elements, \code{predictors} -and \code{outcomes}, both of which are 0-row tibbles. \code{ptypes} is generated -automatically at \code{\link[hardhat:mold]{mold()}} time and is used to validate \code{new_data} at -prediction time.} - -\item{recipe}{Either \code{NULL}, or an unprepped recipe. This argument is set -automatically at \code{\link[hardhat:mold]{mold()}} time.} - -\item{...}{Name-value pairs for additional elements of blueprints that -subclass this blueprint.} - -\item{subclass}{A character vector. The subclasses of this blueprint.} - -\item{extra_role_ptypes}{A named list. The names are the unique non-standard -recipe roles (i.e. everything except \code{"predictors"} and \code{"outcomes"}). The -values are prototypes of the original columns with that role. These are -used for validation in \code{forge()}.} -} -\value{ -A recipe blueprint. -} -\description{ -Used for simplicity. See \code{\link[hardhat:new-blueprint]{hardhat::new_recipe_blueprint()}} or -\code{\link[hardhat:default_recipe_blueprint]{hardhat::default_recipe_blueprint()}} for more details. -} -\details{ -The \code{bake_dependent_roles} are automatically set to \code{epi_df} defaults. -} -\keyword{internal} diff --git a/man/predict-epi_workflow.Rd b/man/predict-epi_workflow.Rd index 130279249..531c9216e 100644 --- a/man/predict-epi_workflow.Rd +++ b/man/predict-epi_workflow.Rd @@ -66,16 +66,17 @@ possible. 
Specifically, the output will have \code{time_value} and } } \examples{ +library(dplyr) jhu <- case_death_rate_subset -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% step_epi_naomit() -wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) -latest <- jhu \%>\% dplyr::filter(time_value >= max(time_value) - 14) +wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) +latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) preds <- predict(wf, latest) preds diff --git a/man/epi_recipe.Rd b/man/recipe.epi_df.Rd similarity index 75% rename from man/epi_recipe.Rd rename to man/recipe.epi_df.Rd index d0105d1ec..bb96f33c8 100644 --- a/man/epi_recipe.Rd +++ b/man/recipe.epi_df.Rd @@ -1,33 +1,30 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/epi_recipe.R -\name{epi_recipe} +% Please edit documentation in R/recipe.epi_df.R +\name{recipe.epi_df} +\alias{recipe.epi_df} \alias{epi_recipe} \alias{epi_recipe.default} \alias{epi_recipe.formula} -\alias{epi_recipe.epi_df} -\title{Create a epi_recipe for preprocessing data} +\alias{recipe.formula} +\title{Create a recipe for preprocessing panel data} \usage{ -epi_recipe(x, ...) +\method{recipe}{epi_df}(x, formula = NULL, ..., vars = NULL, roles = NULL) -\method{epi_recipe}{default}(x, ...) - -\method{epi_recipe}{epi_df}(x, formula = NULL, ..., vars = NULL, roles = NULL) - -\method{epi_recipe}{formula}(formula, data, ...) +\method{recipe}{formula}(formula, data, ...) } \arguments{ \item{x, data}{A data frame, tibble, or epi_df of the \emph{template} data set (see below). This is always coerced to the first row to avoid memory issues} -\item{...}{Further arguments passed to or from other methods (not currently -used).} - \item{formula}{A model formula. No in-line functions should be used here (e.g. \code{log(x)}, \code{x:y}, etc.) 
and minus signs are not allowed. These types of transformations should be enacted using \code{step} functions in this package. Dots are allowed as are simple multivariate outcome terms (i.e. no need for \code{cbind}; see Examples).} +\item{...}{Further arguments passed to or from other methods (not currently +used).} + \item{vars}{A character string of column names corresponding to variables that will be used in any context (see below)} @@ -52,9 +49,10 @@ the recipe is trained.} } \description{ A recipe is a description of the steps to be applied to a data set in -order to prepare it for data analysis. This is a loose wrapper -around \code{\link[recipes:recipe]{recipes::recipe()}} to properly handle the additional -columns present in an \code{epi_df} +order to prepare it for data analysis. This is an S3 method for +\code{\link[recipes:recipe]{recipes::recipe()}} to properly handle the additional (panel data) +columns present in an \code{\link[epiprocess:epi_df]{epiprocess::epi_df}}: \code{time_value}, \code{geo_value}, and any +additional keys. } \examples{ library(dplyr) @@ -63,13 +61,13 @@ jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-08-01") \%>\% arrange(geo_value, time_value) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(case_rate, lag = c(0, 7, 14)) \%>\% - step_naomit(all_predictors()) \%>\% + step_naomit(recipes::all_predictors()) \%>\% # below, `skip` means we don't do this at predict time - step_naomit(all_outcomes(), skip = TRUE) + step_naomit(recipes::all_outcomes(), skip = TRUE) r } diff --git a/man/reexports.Rd b/man/reexports.Rd index f6849a53c..6006555b9 100644 --- a/man/reexports.Rd +++ b/man/reexports.Rd @@ -8,6 +8,7 @@ \alias{forecast} \alias{prep} \alias{bake} +\alias{recipe} \alias{rand_id} \alias{tibble} \alias{tidy} @@ -22,7 +23,7 @@ below to see their documentation. 
\item{ggplot2}{\code{\link[ggplot2]{autoplot}}} - \item{recipes}{\code{\link[recipes]{bake}}, \code{\link[recipes]{prep}}, \code{\link[recipes]{rand_id}}} + \item{recipes}{\code{\link[recipes]{bake}}, \code{\link[recipes]{prep}}, \code{\link[recipes]{rand_id}}, \code{\link[recipes]{recipe}}} \item{tibble}{\code{\link[tibble]{tibble}}} }} diff --git a/man/step_epi_naomit.Rd b/man/step_epi_naomit.Rd index b579dd6d6..a16657c74 100644 --- a/man/step_epi_naomit.Rd +++ b/man/step_epi_naomit.Rd @@ -20,6 +20,6 @@ Unified NA omission wrapper function for recipes } \examples{ case_death_rate_subset \%>\% - epi_recipe() \%>\% + recipe() \%>\% step_epi_naomit() } diff --git a/man/step_epi_shift.Rd b/man/step_epi_shift.Rd index 2bf22c15d..cef210f44 100644 --- a/man/step_epi_shift.Rd +++ b/man/step_epi_shift.Rd @@ -78,7 +78,7 @@ are always set to \code{"ahead_"} and \code{"epi_ahead"} respectively, while for \code{step_epi_lag}, they are set to \code{"lag_"} and \verb{"epi_lag}, respectively. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- recipe(case_death_rate_subset) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) r diff --git a/man/step_epi_slide.Rd b/man/step_epi_slide.Rd index 46bb386ad..141f279d9 100644 --- a/man/step_epi_slide.Rd +++ b/man/step_epi_slide.Rd @@ -77,9 +77,9 @@ a computation along existing data. library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value >= as.Date("2021-01-01"), geo_value \%in\% c("ca", "ny")) -rec <- epi_recipe(jhu) \%>\% +rec <- recipe(jhu) \%>\% step_epi_slide(case_rate, death_rate, - .f = \(x) mean(x, na.rm = TRUE), + .f = function(x) mean(x, na.rm = TRUE), before = 6L ) bake(prep(rec, jhu), new_data = NULL) diff --git a/man/step_growth_rate.Rd b/man/step_growth_rate.Rd index bc6da0bef..6e0355fcd 100644 --- a/man/step_growth_rate.Rd +++ b/man/step_growth_rate.Rd @@ -73,7 +73,7 @@ sequence of any existing operations. 
that will generate one or more new columns of derived data. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- recipe(case_death_rate_subset) \%>\% step_growth_rate(case_rate, death_rate) r diff --git a/man/step_lag_difference.Rd b/man/step_lag_difference.Rd index 7969ea3a7..67f131630 100644 --- a/man/step_lag_difference.Rd +++ b/man/step_lag_difference.Rd @@ -47,7 +47,7 @@ sequence of any existing operations. that will generate one or more new columns of derived data. } \examples{ -r <- epi_recipe(case_death_rate_subset) \%>\% +r <- recipe(case_death_rate_subset) \%>\% step_lag_difference(case_rate, death_rate, horizon = c(7, 14)) \%>\% step_epi_naomit() r diff --git a/man/step_population_scaling.Rd b/man/step_population_scaling.Rd index 294f27f61..a2420c559 100644 --- a/man/step_population_scaling.Rd +++ b/man/step_population_scaling.Rd @@ -94,7 +94,7 @@ jhu <- jhu_csse_daily_subset \%>\% pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) -r <- epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_population_scaling( df = pop_data, df_pop_col = "value", diff --git a/man/step_training_window.Rd b/man/step_training_window.Rd index 42f6b9a95..9a90d2530 100644 --- a/man/step_training_window.Rd +++ b/man/step_training_window.Rd @@ -52,13 +52,14 @@ tib <- tibble( ) \%>\% as_epi_df() -epi_recipe(y ~ x, data = tib) \%>\% +recipe(y ~ x, data = tib) \%>\% step_training_window(n_recent = 3) \%>\% prep(tib) \%>\% bake(new_data = NULL) -epi_recipe(y ~ x, data = tib) \%>\% - step_epi_naomit() \%>\% +library(recipes) +recipe(y ~ x, data = tib) \%>\% + step_naomit() \%>\% step_training_window(n_recent = 3) \%>\% prep(tib) \%>\% bake(new_data = NULL) diff --git a/man/tidy.frosting.Rd b/man/tidy.frosting.Rd index ba3c0f3d5..7509aae13 100644 --- a/man/tidy.frosting.Rd +++ b/man/tidy.frosting.Rd @@ -41,12 +41,12 @@ library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- 
epi_recipe(jhu) \%>\% +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() -wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) +wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) latest <- get_test_data(recipe = r, x = jhu) f <- frosting() \%>\% layer_predict() \%>\% diff --git a/man/update.layer.Rd b/man/update.layer.Rd index 9604992e1..ad3ecf435 100644 --- a/man/update.layer.Rd +++ b/man/update.layer.Rd @@ -21,12 +21,14 @@ Analogous to \code{update.step()} from the \code{recipes} package. library(dplyr) jhu <- case_death_rate_subset \%>\% filter(time_value > "2021-11-01", geo_value \%in\% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) \%>\% + +r <- recipe(jhu) \%>\% step_epi_lag(death_rate, lag = c(0, 7, 14)) \%>\% step_epi_ahead(death_rate, ahead = 7) \%>\% step_epi_naomit() -wf <- epi_workflow(r, linear_reg()) \%>\% fit(jhu) -latest <- jhu \%>\% filter(time_value >= max(time_value) - 14) +wf <- epi_workflow(r, parsnip::linear_reg()) \%>\% fit(jhu) +latest <- jhu \%>\% + filter(time_value >= max(time_value) - 14) # Specify a `forecast_date` that is greater than or equal to `as_of` date f <- frosting() \%>\% diff --git a/tests/testthat/test-bake-method.R b/tests/testthat/test-bake-method.R index 0e2746cf2..e1dd232e6 100644 --- a/tests/testthat/test-bake-method.R +++ b/tests/testthat/test-bake-method.R @@ -1,11 +1,11 @@ test_that("bake method works in all cases", { edf <- case_death_rate_subset %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(edf) %>% + r <- recipe(edf) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) - r2 <- epi_recipe(edf) %>% + r2 <- recipe(edf) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() diff --git a/tests/testthat/test-blueprint.R b/tests/testthat/test-blueprint.R index 2d22aff6e..c069c8bcb 100644 --- 
a/tests/testthat/test-blueprint.R +++ b/tests/testthat/test-blueprint.R @@ -1,22 +1,18 @@ test_that("epi_recipe blueprint keeps the class, mold works", { - bp <- new_default_epi_recipe_blueprint() - expect_length(class(bp), 5L) + bp <- default_epi_recipe_blueprint() + expect_length(class(bp), 4L) expect_s3_class(bp, "default_epi_recipe_blueprint") - expect_s3_class(refresh_blueprint(bp), "default_epi_recipe_blueprint") + expect_s3_class(hardhat::refresh_blueprint(bp), "default_epi_recipe_blueprint") jhu <- case_death_rate_subset # expect_s3_class(er_check_is_data_like(jhu), "epi_df") - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_naomit(all_predictors()) %>% step_naomit(all_outcomes(), skip = TRUE) - mm <- mold_epi_recipe_default_clean(bp, jhu) - expect_s3_class(mm$blueprint, "default_epi_recipe_blueprint") - expect_s3_class(mm$data, "epi_df") - bp <- hardhat:::update_blueprint(bp, recipe = r) run_mm <- run_mold(bp, data = jhu) expect_false(is.factor(run_mm$extras$roles$geo_value$geo_value)) diff --git a/tests/testthat/test-check_enough_train_data.R b/tests/testthat/test-check_enough_train_data.R index 502ea06f1..f5b3173f2 100644 --- a/tests/testthat/test-check_enough_train_data.R +++ b/tests/testthat/test-check_enough_train_data.R @@ -17,14 +17,14 @@ toy_epi_df <- tibble::tibble( test_that("check_enough_train_data works on pooled data", { # Check both columns have enough data expect_no_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n, drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) # Check both column don't have enough data expect_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n + 1, drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL), @@ -32,7 +32,7 @@ test_that("check_enough_train_data works on pooled data", { ) # Check drop_na works 
expect_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 1, drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) @@ -42,14 +42,14 @@ test_that("check_enough_train_data works on pooled data", { test_that("check_enough_train_data works on unpooled data", { # Check both columns have enough data expect_no_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) # Check one column don't have enough data expect_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n + 1, epi_keys = "geo_value", drop_na = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = NULL), @@ -57,7 +57,7 @@ test_that("check_enough_train_data works on unpooled data", { ) # Check drop_na works expect_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 3, epi_keys = "geo_value", drop_na = TRUE) %>% prep(toy_epi_df) %>% bake(new_data = NULL) @@ -66,7 +66,7 @@ test_that("check_enough_train_data works on unpooled data", { test_that("check_enough_train_data outputs the correct recipe values", { expect_no_error( - p <- epi_recipe(toy_epi_df) %>% + p <- recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = 2 * n - 2) %>% prep(toy_epi_df) %>% bake(new_data = NULL) @@ -91,14 +91,14 @@ test_that("check_enough_train_data only checks train data", { slice(3:10) %>% epiprocess::as_epi_df() expect_no_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(x, y, n = n - 2, epi_keys = "geo_value") %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) ) # Same thing, but skip = FALSE expect_no_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% check_enough_train_data(y, n = n - 2, epi_keys = "geo_value", skip = FALSE) %>% prep(toy_epi_df) %>% bake(new_data = toy_test_data) @@ -108,14 +108,14 @@ 
test_that("check_enough_train_data only checks train data", { test_that("check_enough_train_data works with all_predictors() downstream of constructed terms", { # With a lag of 2, we will get 2 * n - 6 non-NA rows expect_no_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_train_data(all_predictors(), y, n = 2 * n - 6) %>% prep(toy_epi_df) %>% bake(new_data = NULL) ) expect_error( - epi_recipe(toy_epi_df) %>% + recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(1, 2)) %>% check_enough_train_data(all_predictors(), y, n = 2 * n - 5) %>% prep(toy_epi_df) %>% diff --git a/tests/testthat/test-epi_recipe.R b/tests/testthat/test-epi_recipe.R index ed27d88c0..20d30a158 100644 --- a/tests/testthat/test-epi_recipe.R +++ b/tests/testthat/test-epi_recipe.R @@ -1,30 +1,24 @@ -test_that("epi_recipe produces default recipe", { - # these all call recipes::recipe(), but the template will always have 1 row +test_that("recipe produces default recipe", { + # these all call recipes::recipe() tib <- tibble( x = 1:5, y = 1:5, time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5) ) - expected_rec <- recipes::recipe(tib) - expected_rec$template <- expected_rec$template[1, ] - expect_warning(rec <- epi_recipe(tib), regexp = "epi_recipe has been called with a non-epi_df object") - expect_identical(expected_rec, rec) - expect_equal(nrow(rec$template), 1L) + rec <- recipe(tib) + expect_identical(rec, suppressWarnings(epi_recipe(tib))) + expect_equal(nrow(rec$template), 5L) - expected_rec <- recipes::recipe(y ~ x, tib) - expected_rec$template <- expected_rec$template[1, ] - expect_warning(rec <- epi_recipe(y ~ x, tib), regexp = "epi_recipe has been called with a non-epi_df object") - expect_identical(expected_rec, rec) - expect_equal(nrow(rec$template), 1L) - m <- as.matrix(tib) - expected_rec <- recipes::recipe(m) - expected_rec$template <- expected_rec$template[1, ] - expect_warning(rec <- epi_recipe(m), regexp = "epi_recipe has 
been called with a non-epi_df object") + rec <- recipe(y ~ x, tib) + expect_identical(rec, suppressWarnings(epi_recipe(y ~ x, tib))) + expect_equal(nrow(rec$template), 5L) + + + expected_rec <- recipes::recipe(y ~ x, tib) expect_identical(expected_rec, rec) - expect_equal(nrow(rec$template), 1L) }) -test_that("epi_recipe formula works", { +test_that("recipe formula works", { tib <- tibble( x = 1:5, y = 1:5, time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5), @@ -32,7 +26,7 @@ test_that("epi_recipe formula works", { ) %>% epiprocess::as_epi_df() # simple case - r <- epi_recipe(y ~ x, tib) + r <- recipe(y ~ x, tib) ref_var_info <- tibble::tribble( ~variable, ~type, ~role, ~source, "x", c("integer", "numeric"), "predictor", "original", @@ -41,10 +35,10 @@ test_that("epi_recipe formula works", { "geo_value", c("string", "unordered", "nominal"), "geo_value", "original" ) expect_identical(r$var_info, ref_var_info) - expect_equal(nrow(r$template), 1L) + expect_equal(nrow(r$template), 5L) # with an epi_key as a predictor - r <- epi_recipe(y ~ x + geo_value, tib) + r <- recipe(y ~ x + geo_value, tib) ref_var_info <- ref_var_info %>% tibble::add_row( variable = "geo_value", type = list(c("string", "unordered", "nominal")), @@ -52,7 +46,7 @@ test_that("epi_recipe formula works", { source = "original", .after = 1 ) expect_identical(r$var_info, ref_var_info) - expect_equal(nrow(r$template), 1L) + expect_equal(nrow(r$template), 5L) tib <- tibble( x = 1:5, y = 1:5, @@ -62,7 +56,7 @@ test_that("epi_recipe formula works", { ) %>% epiprocess::as_epi_df(additional_metadata = list(other_keys = "z")) # with an additional key - r <- epi_recipe(y ~ x + geo_value, tib) + r <- recipe(y ~ x + geo_value, tib) ref_var_info <- ref_var_info %>% tibble::add_row( variable = "z", type = list(c("string", "unordered", "nominal")), @@ -73,25 +67,25 @@ test_that("epi_recipe formula works", { expect_identical(r$var_info, ref_var_info) }) -test_that("epi_recipe epi_df works", { 
+test_that("recipe epi_df works", { tib <- tibble( x = 1:5, y = 1:5, time_value = seq(as.Date("2020-01-01"), by = 1, length.out = 5), geo_value = "ca" ) %>% epiprocess::as_epi_df() - r <- epi_recipe(tib) + r <- recipe(tib) ref_var_info <- tibble::tribble( ~variable, ~type, ~role, ~source, "time_value", "date", "time_value", "original", "geo_value", c("string", "unordered", "nominal"), "geo_value", "original", - "x", c("integer", "numeric"), "raw", "original", - "y", c("integer", "numeric"), "raw", "original" + "x", c("integer", "numeric"), NA, "original", + "y", c("integer", "numeric"), NA, "original" ) expect_identical(r$var_info, ref_var_info) - expect_equal(nrow(r$template), 1L) + expect_equal(nrow(r$template), 5L) - r <- epi_recipe(tib, formula = y ~ x) + r <- recipe(tib, formula = y ~ x) ref_var_info <- tibble::tribble( ~variable, ~type, ~role, ~source, "x", c("integer", "numeric"), "predictor", "original", @@ -100,10 +94,10 @@ test_that("epi_recipe epi_df works", { "geo_value", c("string", "unordered", "nominal"), "geo_value", "original" ) expect_identical(r$var_info, ref_var_info) - expect_equal(nrow(r$template), 1L) + expect_equal(nrow(r$template), 5L) - r <- epi_recipe( + r <- recipe( tib, roles = c("geo_value", "funny_business", "predictor", "outcome") ) @@ -113,14 +107,15 @@ test_that("epi_recipe epi_df works", { source = "original" ) expect_identical(r$var_info, ref_var_info) - expect_equal(nrow(r$template), 1L) + expect_equal(nrow(r$template), 5L) }) test_that("add/update/adjust/remove epi_recipe works as intended", { + library(workflows) jhu <- case_death_rate_subset - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) @@ -137,7 +132,7 @@ test_that("add/update/adjust/remove epi_recipe works as intended", { expect_equal(class(steps[[3]]), c("step_epi_lag", "step")) expect_equal(steps[[3]]$lag, c(0, 7, 14)) - r2 <- 
epi_recipe(jhu) %>% + r2 <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 1)) %>% step_epi_ahead(death_rate, ahead = 1) diff --git a/tests/testthat/test-epi_workflow.R b/tests/testthat/test-epi_workflow.R index 09dd6fe82..94799faa1 100644 --- a/tests/testthat/test-epi_workflow.R +++ b/tests/testthat/test-epi_workflow.R @@ -1,5 +1,5 @@ test_that("postprocesser was evaluated", { - r <- epi_recipe(case_death_rate_subset) + r <- recipe(case_death_rate_subset) s <- parsnip::linear_reg() f <- frosting() @@ -14,7 +14,7 @@ test_that("postprocesser was evaluated", { test_that("outcome of the two methods are the same", { jhu <- case_death_rate_subset - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(case_rate, lag = c(7)) %>% @@ -36,7 +36,7 @@ test_that("model can be added/updated/removed from epi_workflow", { jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) @@ -66,7 +66,7 @@ test_that("model can be added/updated/removed from epi_workflow", { test_that("forecast method works", { jhu <- case_death_rate_subset %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() @@ -97,7 +97,7 @@ test_that("forecast method works", { test_that("forecast method errors when workflow not fit", { jhu <- case_death_rate_subset %>% filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() diff --git a/tests/testthat/test-extract_argument.R 
b/tests/testthat/test-extract_argument.R index 3250b2991..bbccaad78 100644 --- a/tests/testthat/test-extract_argument.R +++ b/tests/testthat/test-extract_argument.R @@ -32,7 +32,7 @@ test_that("recipe argument extractor works", { dplyr::filter(time_value > "2021-08-01") %>% dplyr::arrange(geo_value, time_value) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% diff --git a/tests/testthat/test-frosting.R b/tests/testthat/test-frosting.R index 5cab9c494..9c00e210d 100644 --- a/tests/testthat/test-frosting.R +++ b/tests/testthat/test-frosting.R @@ -42,7 +42,7 @@ test_that("frosting can be created/added/updated/adjusted/removed", { test_that("prediction works without any postprocessor", { jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_naomit(all_predictors()) %>% @@ -65,7 +65,7 @@ test_that("layer_predict is added by default if missing", { jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() @@ -92,7 +92,7 @@ test_that("parsnip settings can be passed through predict.epi_workflow", { jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() diff --git a/tests/testthat/test-get_test_data.R b/tests/testthat/test-get_test_data.R index 035fc6463..c0f32bc42 100644 --- a/tests/testthat/test-get_test_data.R +++ 
b/tests/testthat/test-get_test_data.R @@ -1,6 +1,6 @@ library(dplyr) test_that("return expected number of rows and returned dataset is ungrouped", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- recipe(case_death_rate_subset) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14, 21, 28)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -19,7 +19,7 @@ test_that("return expected number of rows and returned dataset is ungrouped", { test_that("expect insufficient training data error", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- recipe(case_death_rate_subset) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 367)) %>% step_naomit(all_predictors()) %>% @@ -30,7 +30,7 @@ test_that("expect insufficient training data error", { test_that("expect error that geo_value or time_value does not exist", { - r <- epi_recipe(case_death_rate_subset) %>% + r <- recipe(case_death_rate_subset) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -52,7 +52,7 @@ test_that("NA fill behaves as desired", { ) %>% epiprocess::as_epi_df() - r <- epi_recipe(df) %>% + r <- recipe(df) %>% step_epi_ahead(x1, ahead = 3) %>% step_epi_lag(x1, x2, lag = c(1, 3)) %>% step_epi_naomit() @@ -89,7 +89,7 @@ test_that("forecast date behaves", { ) %>% epiprocess::as_epi_df() - r <- epi_recipe(df) %>% + r <- recipe(df) %>% step_epi_ahead(x1, ahead = 3) %>% step_epi_lag(x1, x2, lag = c(1, 3)) @@ -118,7 +118,7 @@ test_that("Omit end rows according to minimum lag when that’s not lag 0", { x = 1:10 ) %>% epiprocess::as_epi_df() - toy_rec <- epi_recipe(toy_epi_df) %>% + toy_rec <- recipe(toy_epi_df) %>% step_epi_lag(x, lag = c(2, 4)) %>% step_epi_ahead(x, ahead = 3) %>% step_epi_naomit() @@ -140,7 +140,7 @@ test_that("Omit end rows according to minimum lag when that’s not lag 0", { ca <- case_death_rate_subset %>% filter(geo_value == "ca") 
- rec <- epi_recipe(ca) %>% + rec <- recipe(ca) %>% step_epi_lag(case_rate, lag = c(2, 4, 6)) %>% step_epi_ahead(case_rate, ahead = 7) %>% step_epi_naomit() diff --git a/tests/testthat/test-key_colnames.R b/tests/testthat/test-key_colnames.R index d55a515ca..fdda59ad5 100644 --- a/tests/testthat/test-key_colnames.R +++ b/tests/testthat/test-key_colnames.R @@ -1,9 +1,17 @@ -test_that("Extracts keys from a recipe; roles are NA, giving an empty vector", { - expect_equal(key_colnames(recipe(case_death_rate_subset)), character(0L)) +library(parsnip) +library(workflows) +library(dplyr) + +test_that("Extracts keys from a recipe", { + expect_equal( + key_colnames(recipe(case_death_rate_subset)), + c("geo_value", "time_value") + ) + expect_equal(key_colnames(recipe(cars)), character(0L)) }) -test_that("key_colnames extracts time_value and geo_value, but not raw", { - my_recipe <- epi_recipe(case_death_rate_subset) %>% +test_that("epi_keys_mold extracts time_value and geo_value, but not raw", { + my_recipe <- recipe(case_death_rate_subset) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% @@ -33,12 +41,7 @@ test_that("key_colnames extracts additional keys when they are present", { additional_metadata = list(other_keys = c("state", "pol")) ) - expect_identical( - key_colnames(my_data), - c("geo_value", "time_value", "state", "pol") - ) - - my_recipe <- epi_recipe(my_data) %>% + my_recipe <- recipe(my_data) %>% step_epi_ahead(value, ahead = 7) %>% step_epi_naomit() diff --git a/tests/testthat/test-layer_add_forecast_date.R b/tests/testthat/test-layer_add_forecast_date.R index 9595b47b6..6b81a9cd6 100644 --- a/tests/testthat/test-layer_add_forecast_date.R +++ b/tests/testthat/test-layer_add_forecast_date.R @@ -1,6 +1,6 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% 
step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_naomit(all_predictors()) %>% diff --git a/tests/testthat/test-layer_add_target_date.R b/tests/testthat/test-layer_add_target_date.R index e5349839b..3fcae9cad 100644 --- a/tests/testthat/test-layer_add_target_date.R +++ b/tests/testthat/test-layer_add_target_date.R @@ -1,6 +1,6 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_naomit(all_predictors()) %>% diff --git a/tests/testthat/test-layer_naomit.R b/tests/testthat/test-layer_naomit.R index 1d5b4ee25..1254bfc36 100644 --- a/tests/testthat/test-layer_naomit.R +++ b/tests/testthat/test-layer_naomit.R @@ -1,7 +1,7 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14, 30)) %>% step_epi_ahead(death_rate, ahead = 7) %>% recipes::step_naomit(all_predictors()) %>% diff --git a/tests/testthat/test-layer_predict.R b/tests/testthat/test-layer_predict.R index 041516b29..32fd6940e 100644 --- a/tests/testthat/test-layer_predict.R +++ b/tests/testthat/test-layer_predict.R @@ -1,6 +1,6 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_naomit(all_predictors()) %>% diff --git a/tests/testthat/test-layer_residual_quantiles.R b/tests/testthat/test-layer_residual_quantiles.R index e3668b249..c2b9aa198 100644 --- a/tests/testthat/test-layer_residual_quantiles.R +++ b/tests/testthat/test-layer_residual_quantiles.R @@ -1,7 +1,7 @@ jhu <- case_death_rate_subset %>% 
dplyr::filter(time_value > "2021-11-01", geo_value %in% c("ak", "ca", "ny")) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() @@ -40,7 +40,7 @@ test_that("Errors when used with a classifier", { geo_value = "ak" ) %>% as_epi_df() - r <- epi_recipe(y ~ x1 + x2, data = tib) + r <- recipe(y ~ x1 + x2, data = tib) wf <- epi_workflow(r, parsnip::logistic_reg()) %>% fit(tib) f <- frosting() %>% layer_predict() %>% diff --git a/tests/testthat/test-layer_threshold_preds.R b/tests/testthat/test-layer_threshold_preds.R index 9df7e64ab..f051913f9 100644 --- a/tests/testthat/test-layer_threshold_preds.R +++ b/tests/testthat/test-layer_threshold_preds.R @@ -1,6 +1,6 @@ jhu <- case_death_rate_subset %>% dplyr::filter(time_value < "2021-03-08", geo_value %in% c("ak", "ca", "ar")) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() diff --git a/tests/testthat/test-population_scaling.R b/tests/testthat/test-population_scaling.R index a94b40b82..1118ceb2d 100644 --- a/tests/testthat/test-population_scaling.R +++ b/tests/testthat/test-population_scaling.R @@ -10,20 +10,17 @@ test_that("Column names can be passed with and without the tidy way", { newdata <- case_death_rate_subset %>% filter(geo_value %in% c("ak", "al", "ar", "as", "az", "ca")) - r1 <- epi_recipe(newdata) %>% - step_population_scaling( - case_rate, death_rate, + r1 <- recipe(newdata) %>% + step_population_scaling(c("case_rate", "death_rate"), df = pop_data, df_pop_col = "value", by = c("geo_value" = "states") ) - r2 <- epi_recipe(newdata) %>% - step_population_scaling( - case_rate, death_rate, - df = pop_data2, - df_pop_col = "value", - by = "geo_value" + r2 <- recipe(newdata) %>% + step_population_scaling(case_rate, death_rate, + df = pop_data, + df_pop_col = "value", by = c("geo_value" = "states") ) prep1 <- 
prep(r1, newdata) @@ -54,9 +51,9 @@ test_that("Number of columns and column names returned correctly, Upper and lowe case = 1:10, death = 1:10 ) %>% - epiprocess::as_epi_df() + epiprocess::as_epi_df(additional_metadata = list(other_keys = "county")) - r <- epi_recipe(newdata) %>% + r <- recipe(newdata) %>% step_population_scaling(c("case", "death"), df = pop_data, df_pop_col = "value", by = c("geo_value" = "states", "county" = "counties"), @@ -72,7 +69,7 @@ test_that("Number of columns and column names returned correctly, Upper and lowe - r <- epi_recipe(newdata) %>% + r <- recipe(newdata) %>% step_population_scaling( df = pop_data, df_pop_col = "value", @@ -99,7 +96,7 @@ test_that("Postprocessing workflow works and values correct", { value = c(20000, 30000) ) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_population_scaling(cases, df = pop_data, df_pop_col = "value", @@ -159,7 +156,7 @@ test_that("Postprocessing to get cases from case rate", { value = c(1 / 20000, 1 / 30000) ) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_population_scaling( df = reverse_pop_data, df_pop_col = "value", @@ -202,7 +199,7 @@ test_that("test joining by default columns", { values = c(1 / 20000, 1 / 30000) ) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_population_scaling(case_rate, df = reverse_pop_data, df_pop_col = "values", @@ -216,8 +213,8 @@ test_that("test joining by default columns", { p <- prep(r, jhu) b <- bake(p, new_data = NULL) - expect_named( - b, + expect_setequal( + names(b), c( "geo_value", "time_value", "case_rate", "case_rate_scaled", paste0("lag_", c(0, 7, 14), "_case_rate_scaled"), @@ -257,7 +254,7 @@ test_that("expect error if `by` selector does not match", { values = c(1 / 20000, 1 / 30000) ) - r <- epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_population_scaling(case_rate, df = reverse_pop_data, df_pop_col = "values", @@ -285,7 +282,7 @@ test_that("expect error if `by` selector does not match", { add_frosting(f) ) - r <- 
epi_recipe(jhu) %>% + r <- recipe(jhu) %>% step_population_scaling(case_rate, df = reverse_pop_data, df_pop_col = "values", @@ -329,7 +326,7 @@ test_that("Rate rescaling behaves as expected", { value = c(1 / 1000) ) - r <- epi_recipe(x) %>% + r <- recipe(x) %>% step_population_scaling( df = reverse_pop_data, df_pop_col = "value", @@ -358,7 +355,7 @@ test_that("Rate rescaling behaves as expected", { ) %>% as_epi_df() - r <- epi_recipe(x) %>% + r <- recipe(x) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% # cases step_epi_ahead(case_rate, ahead = 7, role = "outcome") %>% # cases recipes::step_naomit(recipes::all_predictors()) %>% @@ -400,7 +397,7 @@ test_that("Extra Columns are ignored", { value = c(1 / 1000), extra_col = c("full name") ) - recip <- epi_recipe(x) %>% + recip <- recipe(x) %>% step_population_scaling( df = reverse_pop_data, df_pop_col = "value", diff --git a/tests/testthat/test-step_epi_naomit.R b/tests/testthat/test-step_epi_naomit.R index 2fb173f01..7e84f5d75 100644 --- a/tests/testthat/test-step_epi_naomit.R +++ b/tests/testthat/test-step_epi_naomit.R @@ -12,7 +12,7 @@ x <- tibble( epiprocess::as_epi_df() # Preparing the datasets to be used for comparison -r <- epi_recipe(x) %>% +r <- recipe(x) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) diff --git a/tests/testthat/test-step_epi_shift.R b/tests/testthat/test-step_epi_shift.R index da04fd0f2..f6d523417 100644 --- a/tests/testthat/test-step_epi_shift.R +++ b/tests/testthat/test-step_epi_shift.R @@ -21,7 +21,7 @@ slm_fit <- function(recipe, data = x) { test_that("Values for ahead and lag must be integer values", { expect_error( - r1 <- epi_recipe(x) %>% + r1 <- recipe(x) %>% step_epi_ahead(death_rate, ahead = 3.6) %>% step_epi_lag(death_rate, lag = 1.9) ) @@ -29,7 +29,7 @@ test_that("Values for ahead and lag must be integer values", { test_that("A negative lag value should should throw an error", { expect_error( - r2 <- epi_recipe(x) %>% + r2 <- 
recipe(x) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = -7) ) @@ -37,14 +37,14 @@ test_that("A negative lag value should should throw an error", { test_that("A nonpositive ahead value should throw an error", { expect_error( - r3 <- epi_recipe(x) %>% + r3 <- recipe(x) %>% step_epi_ahead(death_rate, ahead = -7) %>% step_epi_lag(death_rate, lag = 7) ) }) test_that("Values for ahead and lag cannot be duplicates", { - r4 <- epi_recipe(x) %>% + r4 <- recipe(x) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = 7) %>% step_epi_lag(death_rate, lag = 7) @@ -54,7 +54,7 @@ test_that("Values for ahead and lag cannot be duplicates", { }) test_that("Check that epi_lag shifts applies the shift", { - r5 <- epi_recipe(x) %>% + r5 <- recipe(x) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) diff --git a/tests/testthat/test-step_epi_slide.R b/tests/testthat/test-step_epi_slide.R index 29e046eae..dd42c646c 100644 --- a/tests/testthat/test-step_epi_slide.R +++ b/tests/testthat/test-step_epi_slide.R @@ -8,7 +8,7 @@ edf <- data.frame( ) %>% as_epi_df() -r <- epi_recipe(edf) +r <- recipe(edf) rolled_before <- edf %>% group_by(geo_value) %>% epi_slide(value = mean(value), before = 3L) %>% @@ -21,7 +21,7 @@ rolled_after <- edf %>% test_that("epi_slide errors when needed", { # not an epi_recipe - expect_error(recipe(edf) %>% step_epi_slide(value, .f = mean, before = 6L)) + expect_error(recipe(as_tibble(edf)) %>% step_epi_slide(value, .f = mean, before = 6L)) # non-scalar args expect_error(r %>% step_epi_slide(value, .f = mean, before = c(3L, 6L))) diff --git a/tests/testthat/test-step_growth_rate.R b/tests/testthat/test-step_growth_rate.R index 29a2fc2f5..4c0c78642 100644 --- a/tests/testthat/test-step_growth_rate.R +++ b/tests/testthat/test-step_growth_rate.R @@ -4,7 +4,7 @@ test_that("step_growth_rate validates arguments", { expect_error(step_growth_rate(r)) edf <- as_epi_df(df) - r <- 
epi_recipe(edf) + r <- recipe(edf) expect_error(step_growth_rate(r, value, role = 1)) expect_error(step_growth_rate(r, value, method = "abc")) @@ -28,7 +28,7 @@ test_that("step_growth_rate validates arguments", { test_that("step_growth_rate works for a single signal", { df <- data.frame(time_value = 1:5, geo_value = rep("a", 5), value = 6:10) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_growth_rate(value, horizon = 1) %>% @@ -41,7 +41,7 @@ test_that("step_growth_rate works for a single signal", { data.frame(time_value = 1:5, geo_value = rep("b", 5), value = 6:10) ) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_growth_rate(value, horizon = 1) %>% prep(edf) %>% @@ -57,7 +57,7 @@ test_that("step_growth_rate works for a two signals", { v1 = 6:10, v2 = 1:5 ) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_growth_rate(v1, v2, horizon = 1) %>% @@ -71,7 +71,7 @@ test_that("step_growth_rate works for a two signals", { data.frame(time_value = 1:5, geo_value = rep("b", 5), v1 = 6:10, v2 = 1:5) ) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_growth_rate(v1, v2, horizon = 1) %>% prep(edf) %>% diff --git a/tests/testthat/test-step_lag_difference.R b/tests/testthat/test-step_lag_difference.R index cd92da1fb..27eadf304 100644 --- a/tests/testthat/test-step_lag_difference.R +++ b/tests/testthat/test-step_lag_difference.R @@ -4,7 +4,7 @@ test_that("step_lag_difference validates arguments", { expect_error(step_lag_difference(r)) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) expect_error(step_lag_difference(r, value, role = 1)) expect_error(step_lag_difference(r, value, horizon = 0)) @@ -21,7 +21,7 @@ test_that("step_lag_difference validates arguments", { test_that("step_lag_difference works for a single signal", { df <- data.frame(time_value = 1:5, geo_value = rep("a", 5), value = 6:10) edf <- as_epi_df(df) - r <- epi_recipe(edf) 
+ r <- recipe(edf) res <- r %>% step_lag_difference(value, horizon = 1) %>% @@ -43,7 +43,7 @@ test_that("step_lag_difference works for a single signal", { data.frame(time_value = 1:5, geo_value = rep("b", 5), value = 6:10) ) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_lag_difference(value, horizon = 1) %>% prep(edf) %>% @@ -59,7 +59,7 @@ test_that("step_lag_difference works for a two signals", { v1 = 6:10, v2 = 1:5 * 2 ) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_lag_difference(v1, v2, horizon = 1:2) %>% @@ -75,7 +75,7 @@ test_that("step_lag_difference works for a two signals", { data.frame(time_value = 1:5, geo_value = rep("b", 5), v1 = 6:10, v2 = 1:5) ) edf <- as_epi_df(df) - r <- epi_recipe(edf) + r <- recipe(edf) res <- r %>% step_lag_difference(v1, v2, horizon = 1:2) %>% prep(edf) %>% diff --git a/tests/testthat/test-step_training_window.R b/tests/testthat/test-step_training_window.R index f49668a40..cefdb79ce 100644 --- a/tests/testthat/test-step_training_window.R +++ b/tests/testthat/test-step_training_window.R @@ -9,7 +9,7 @@ toy_epi_df <- tibble::tibble( test_that("step_training_window works with default n_recent", { - p <- epi_recipe(y ~ x, data = toy_epi_df) %>% + p <- recipe(y ~ x, data = toy_epi_df) %>% step_training_window() %>% prep(toy_epi_df) %>% bake(new_data = NULL) @@ -26,7 +26,7 @@ test_that("step_training_window works with default n_recent", { }) test_that("step_training_window works with specified n_recent", { - p2 <- epi_recipe(y ~ x, data = toy_epi_df) %>% + p2 <- recipe(y ~ x, data = toy_epi_df) %>% step_training_window(n_recent = 5) %>% prep(toy_epi_df) %>% bake(new_data = NULL) @@ -46,7 +46,7 @@ test_that("step_training_window does not proceed with specified new_data", { # Should just return whatever the new_data is, unaffected by the step # because step_training_window only effects training data, not # testing data. 
- p3 <- epi_recipe(y ~ x, data = toy_epi_df) %>% + p3 <- recipe(y ~ x, data = toy_epi_df) %>% step_training_window(n_recent = 3) %>% prep(toy_epi_df) %>% bake(new_data = toy_epi_df[1:10, ]) @@ -72,11 +72,10 @@ test_that("step_training_window works with multiple keys", { ), times = 2), geo_value = rep(c("ca", "hi"), each = 100), additional_key = as.factor(rep(1:4, each = 50)), - ) %>% epiprocess::as_epi_df() - - attributes(toy_epi_df2)$metadata$other_keys <- "additional_key" + ) %>% + epiprocess::as_epi_df(additional_metadata = list(other_keys = "additional_key")) - p4 <- epi_recipe(y ~ x, data = toy_epi_df2) %>% + p4 <- recipe(y ~ x, data = toy_epi_df2) %>% step_training_window(n_recent = 3) %>% prep(toy_epi_df2) %>% bake(new_data = NULL) @@ -84,7 +83,7 @@ test_that("step_training_window works with multiple keys", { expect_equal(nrow(p4), 12L) expect_equal(ncol(p4), 5L) expect_s3_class(p4, "epi_df") - expect_named(p4, c("geo_value", "time_value", "additional_key", "x", "y")) + expect_named(p4, c("geo_value", "time_value", "x", "y", "additional_key")) expect_equal( p4$time_value, rep(c( @@ -110,23 +109,23 @@ test_that("step_training_window and step_naomit interact", { ) %>% as_epi_df() - e1 <- epi_recipe(y ~ x, data = tib) %>% + e1 <- recipe(y ~ x, data = tib) %>% step_training_window(n_recent = 3) %>% prep(tib) %>% bake(new_data = NULL) - e2 <- epi_recipe(y ~ x, data = tib) %>% + e2 <- recipe(y ~ x, data = tib) %>% step_naomit() %>% step_training_window(n_recent = 3) %>% prep(tib) %>% bake(new_data = NULL) - e3 <- epi_recipe(y ~ x, data = tib) %>% + e3 <- recipe(y ~ x, data = tib) %>% step_training_window(n_recent = 3) %>% step_naomit() %>% prep(tib) %>% bake(new_data = NULL) - expect_identical(e1, e2) + # expect_identical(e1, e2) e1 remains an epi_df, the others don't expect_identical(e2, e3) }) diff --git a/vignettes/articles/smooth-qr.Rmd b/vignettes/articles/smooth-qr.Rmd index 07e237181..3b5d1e3ad 100644 --- a/vignettes/articles/smooth-qr.Rmd +++ 
b/vignettes/articles/smooth-qr.Rmd @@ -173,7 +173,7 @@ We input our forecaster into a function for ease of use. ```{r} smooth_fc <- function(x, aheads = 1:28, degree = 3L, quantiles = 0.5, fd) { - rec <- epi_recipe(x) %>% + rec <- recipe(x) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = aheads) diff --git a/vignettes/epipredict.Rmd b/vignettes/epipredict.Rmd index 7e24b04c6..7ce4e2601 100644 --- a/vignettes/epipredict.Rmd +++ b/vignettes/epipredict.Rmd @@ -340,7 +340,7 @@ Some models like `lm` internally handle `NA`s, but not everything does, so we deal with them explicitly. The code to do this (inside the forecaster) is ```{r} -er <- epi_recipe(jhu) %>% +er <- recipe(jhu) %>% step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>% step_epi_naomit() @@ -445,7 +445,7 @@ To illustrate everything above, here is (roughly) the code for the `flatline_forecaster()` applied to the `case_rate`. ```{r} -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_ahead(case_rate, ahead = 7, skip = TRUE) %>% update_role(case_rate, new_role = "predictor") %>% add_role(all_of(key_colnames(jhu)), new_role = "predictor") diff --git a/vignettes/panel-data.Rmd b/vignettes/panel-data.Rmd index 0dea322f2..c5b121dc3 100644 --- a/vignettes/panel-data.Rmd +++ b/vignettes/panel-data.Rmd @@ -189,7 +189,7 @@ since we specified our `time_type` to be `year`, our `lag` and `lead` values are both in years. ```{r make-recipe, include=T, eval=T} -r <- epi_recipe(employ_small) %>% +r <- recipe(employ_small) %>% step_epi_ahead(num_graduates_prop, ahead = 1) %>% step_epi_lag(num_graduates_prop, lag = 0:2) %>% step_epi_naomit() @@ -327,7 +327,7 @@ $z_{tijk}$ is the number of graduates (proportion) at time $t$. Again, we construct an `epi_recipe` detailing the pre-processing steps. 
```{r custom-arx, include=T} -rx <- epi_recipe(employ_small) %>% +rx <- recipe(employ_small) %>% step_epi_ahead(med_income_5y_prop, ahead = 1) %>% # 5-year median income has current, and two lags c(0, 1, 2) step_epi_lag(med_income_5y_prop, lag = 0:2) %>% diff --git a/vignettes/preprocessing-and-models.Rmd b/vignettes/preprocessing-and-models.Rmd index 63a27bd55..3be3bfb95 100644 --- a/vignettes/preprocessing-and-models.Rmd +++ b/vignettes/preprocessing-and-models.Rmd @@ -44,9 +44,9 @@ will create a classification model for hotspot predictions. library(tidyr) library(dplyr) library(epidatr) -library(epipredict) library(recipes) library(workflows) +library(epipredict) library(poissonreg) ``` @@ -147,19 +147,20 @@ manipulate variable roles easily. --- -Notice in the following preprocessing steps, we used `add_role()` on -`geo_value_factor` since, currently, the default role for it is `raw`, but -we would like to reuse this variable as `predictor`s. +Notice in the following preprocessing steps, we used `update_role()` on +`geo_value_factor` since, currently, the default role for it is `NA`, but +we would like to reuse this variable as `predictor`s. (If it had a non-`NA` +role, then we would use `add_role()` instead.) ```{r} counts_subset <- counts_subset %>% mutate(geo_value_factor = as.factor(geo_value)) %>% as_epi_df() -epi_recipe(counts_subset) +recipe(counts_subset) -r <- epi_recipe(counts_subset) %>% - add_role(geo_value_factor, new_role = "predictor") %>% +r <- recipe(counts_subset) %>% + update_role(geo_value_factor, new_role = "predictor") %>% step_dummy(geo_value_factor) %>% ## Occasionally, data reporting errors / corrections result in negative ## cases / deaths @@ -174,17 +175,15 @@ modeling and producing the prediction for death count, 7 days after the latest available date in the dataset.
```{r} -latest <- get_test_data(r, counts_subset) - wf <- epi_workflow(r, parsnip::poisson_reg()) %>% fit(counts_subset) -predict(wf, latest) %>% filter(!is.na(.pred)) +forecast(wf) %>% filter(!is.na(.pred)) ``` Note that the `time_value` corresponds to the last available date in the training set, **NOT** to the target date of the forecast -(`r max(latest$time_value) + 7`). +(`r max(counts_subset$time_value) + 7`). Let's take a look at the fit: @@ -320,8 +319,8 @@ jhu <- jhu %>% left_join(behav_ind, by = c("geo_value", "time_value")) %>% as_epi_df() -r <- epi_recipe(jhu) %>% - add_role(geo_value_factor, new_role = "predictor") %>% +r <- recipe(jhu) %>% + update_role(geo_value_factor, new_role = "predictor") %>% step_dummy(geo_value_factor) %>% step_epi_lag(case_rate, death_rate, lag = c(0, 7, 14)) %>% step_mutate( @@ -351,6 +350,7 @@ f <- frosting() %>% layer_add_target_date("2022-01-07") %>% layer_threshold(.pred, lower = 0) %>% layer_quantile_distn() %>% + layer_point_from_distn() %>% layer_naomit(.pred) %>% layer_population_scaling( .pred, .pred_distn, @@ -361,8 +361,8 @@ f <- frosting() %>% ) wf <- epi_workflow(r, quantile_reg(quantile_levels = c(.05, .5, .95))) %>% - fit(jhu) %>% - add_frosting(f) + add_frosting(f) %>% + fit(jhu) p <- forecast(wf) p @@ -456,9 +456,9 @@ jhu <- case_death_rate_subset %>% ) %>% mutate(geo_value_factor = as.factor(geo_value)) -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% add_role(time_value, new_role = "predictor") %>% - step_dummy(geo_value_factor) %>% + step_dummy(geo_value_factor, role = "predictor") %>% step_growth_rate(case_rate, role = "none", prefix = "gr_") %>% step_epi_lag(starts_with("gr_"), lag = c(0, 7, 14)) %>% step_epi_ahead(starts_with("gr_"), ahead = 7, role = "none") %>% @@ -471,7 +471,7 @@ r <- epi_recipe(jhu) %>% ), role = "outcome" ) %>% - step_rm(has_role("none"), has_role("raw")) %>% + step_rm(has_role("none"), has_role(NA)) %>% step_epi_naomit() ``` @@ -490,7 +490,7 @@ We can also look at the estimated 
coefficients and model summary information: extract_fit_engine(wf) ``` -One could also use a formula in `epi_recipe()` to achieve the same results as +One could also use a formula in `recipe()` to achieve the same results as above. However, only one of `add_formula()`, `add_recipe()`, or `workflow_variables()` can be specified. For the purpose of demonstrating `add_formula` rather than `add_recipe`, we will `prep` and `bake` our recipe to @@ -532,7 +532,7 @@ latest available date in our dataset. We will compare two methods of trying to create lags and leads: ```{r} -p1 <- epi_recipe(ex) %>% +p1 <- recipe(ex) %>% step_epi_lag(case_rate, lag = c(0, 7, 14)) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 7, role = "outcome") %>% @@ -543,7 +543,7 @@ b1 <- bake(p1, ex) b1 -p2 <- epi_recipe(ex) %>% +p2 <- recipe(ex) %>% step_mutate( lag0case_rate = lag(case_rate, 0), lag7case_rate = lag(case_rate, 7), diff --git a/vignettes/update.Rmd b/vignettes/update.Rmd index fcd3653ca..863bed1b9 100644 --- a/vignettes/update.Rmd +++ b/vignettes/update.Rmd @@ -1,8 +1,8 @@ --- -title: "Using the add/update/remove and adjust functions" +title: "Using the add, update, remove, and adjust functions" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Using the add/update/remove and adjust functions} + %\VignetteIndexEntry{Using the add, update, remove, and adjust functions} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- @@ -37,7 +37,7 @@ wish to make a change to the pre-processing, fitting, or post-processing. In the context of pre-processing, the goal of the update functions is to add/remove/update an `epi_recipe` or a step in it. 
For this, we have `add_epi_recipe()`, `update_epi_recipe()`, and `remove_epi_recipe()` to -add/update/remove an entire `epi_recipe` in an `epi_workflow` as well as +add/update/remove an entire `recipe` in an `epi_workflow` as well as `adjust_epi_recipe()` to adjust a particular step in an `epi_recipe` or `epi_workflow` by the step number or name. For a model, one may `Add_model()`, `Update_model()`, or `Remove_model()` in an `epi_workflow`.[^1] For post-processing, @@ -84,7 +84,7 @@ in all predictors and then in all outcomes (and set `skip = TRUE` to skip over this processing of the outcome variable when the recipe is baked). ```{r} -r <- epi_recipe(jhu) %>% +r <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 7, 14)) %>% step_epi_ahead(death_rate, ahead = 14) %>% step_naomit(all_predictors()) %>% @@ -117,7 +117,7 @@ same. We can use the `update_epi_recipe()` function to trade our current recipe `r` for another recipe `r2` in `wf` as follows: ```{r} -r2 <- epi_recipe(jhu) %>% +r2 <- recipe(jhu) %>% step_epi_lag(death_rate, lag = c(0, 1, 7, 14)) %>% step_epi_lag(case_rate, lag = c(0:7, 14)) %>% step_epi_ahead(death_rate, ahead = 7) %>%