From 577a866d213b59bde59522f6de7c36d4bb91ae3b Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 13:41:15 -0700 Subject: [PATCH 1/7] feat: add step_/layer_ epi_YeoJohnson --- DESCRIPTION | 1 + NAMESPACE | 8 + R/layer_yeo_johnson.R | 249 +++++++++++++++++++ R/step_yeo_johnson.R | 397 ++++++++++++++++++++++++++++++ man/layer_epi_YeoJohnson.Rd | 63 +++++ man/step_adjust_latency.Rd | 8 +- man/step_epi_YeoJohnson.Rd | 118 +++++++++ tests/testthat/test-yeo-johnson.R | 141 +++++++++++ 8 files changed, 981 insertions(+), 4 deletions(-) create mode 100644 R/layer_yeo_johnson.R create mode 100644 R/step_yeo_johnson.R create mode 100644 man/layer_epi_YeoJohnson.Rd create mode 100644 man/step_epi_YeoJohnson.Rd create mode 100644 tests/testthat/test-yeo-johnson.R diff --git a/DESCRIPTION b/DESCRIPTION index 81a35b30..e3594a50 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,6 +42,7 @@ Imports: recipes (>= 1.0.4), rlang (>= 1.1.0), stats, + stringr, tibble, tidyr, tidyselect, diff --git a/NAMESPACE b/NAMESPACE index c2fa9494..351530de 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,7 @@ S3method(bake,check_enough_data) S3method(bake,epi_recipe) S3method(bake,step_adjust_latency) S3method(bake,step_climate) +S3method(bake,step_epi_YeoJohnson) S3method(bake,step_epi_ahead) S3method(bake,step_epi_lag) S3method(bake,step_epi_slide) @@ -55,6 +56,7 @@ S3method(prep,check_enough_data) S3method(prep,epi_recipe) S3method(prep,step_adjust_latency) S3method(prep,step_climate) +S3method(prep,step_epi_YeoJohnson) S3method(prep,step_epi_ahead) S3method(prep,step_epi_lag) S3method(prep,step_epi_slide) @@ -76,6 +78,7 @@ S3method(print,flatline) S3method(print,frosting) S3method(print,layer_add_forecast_date) S3method(print,layer_add_target_date) +S3method(print,layer_epi_YeoJohnson) S3method(print,layer_naomit) S3method(print,layer_point_from_distn) S3method(print,layer_population_scaling) @@ -86,6 +89,7 @@ S3method(print,layer_threshold) S3method(print,layer_unnest) 
S3method(print,step_adjust_latency) S3method(print,step_climate) +S3method(print,step_epi_YeoJohnson) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) S3method(print,step_epi_slide) @@ -101,6 +105,7 @@ S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_add_forecast_date) S3method(slather,layer_add_target_date) S3method(slather,layer_cdc_flatline_quantiles) +S3method(slather,layer_epi_YeoJohnson) S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) @@ -114,6 +119,7 @@ S3method(snap,quantile_pred) S3method(tidy,check_enough_data) S3method(tidy,frosting) S3method(tidy,layer) +S3method(tidy,step_epi_YeoJohnson) S3method(update,layer) S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) @@ -176,6 +182,7 @@ export(layer) export(layer_add_forecast_date) export(layer_add_target_date) export(layer_cdc_flatline_quantiles) +export(layer_epi_YeoJohnson) export(layer_naomit) export(layer_point_from_distn) export(layer_population_scaling) @@ -207,6 +214,7 @@ export(smooth_quantile_reg) export(snap) export(step_adjust_latency) export(step_climate) +export(step_epi_YeoJohnson) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R new file mode 100644 index 00000000..826bd23d --- /dev/null +++ b/R/layer_yeo_johnson.R @@ -0,0 +1,249 @@ +#' Unnormalizing transformation +#' +#' Will undo a step_epi_YeoJohnson transformation. +#' +#' @param frosting a `frosting` postprocessor. The layer will be added to the +#' sequence of operations for this frosting. +#' @param lambdas Internal. A data frame of lambda values to be used for +#' inverting the transformation. +#' @param ... One or more selector functions to scale variables +#' for this step. See [recipes::selections()] for more details. +#' @param by A (possibly named) character vector of variables to join by.
+#' @param id a random id string
+#'
+#' @return an updated `frosting` postprocessor
+#' @export
+#' @examples
+#' library(dplyr)
+#' jhu <- epidatasets::cases_deaths_subset %>%
+#'   filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
+#'   select(geo_value, time_value, cases)
+#'
+#' # Create a recipe with a Yeo-Johnson transformation.
+#' r <- epi_recipe(jhu) %>%
+#'   step_epi_YeoJohnson(cases) %>%
+#'   step_epi_lag(cases, lag = 0) %>%
+#'   step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
+#'   step_epi_naomit()
+#'
+#' # Create a frosting layer that will undo the Yeo-Johnson transformation.
+#' f <- frosting() %>%
+#'   layer_predict() %>%
+#'   layer_epi_YeoJohnson(.pred)
+#'
+#' # Create a workflow and fit it.
+#' wf <- epi_workflow(r, linear_reg()) %>%
+#'   fit(jhu) %>%
+#'   add_frosting(f)
+#'
+#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation.
+#' forecast(wf)
+#' # Compare to the original data.
+#' jhu %>% filter(time_value == "2021-12-31")
+#' forecast(wf)
+layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) {
+  checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE)
+
+  add_layer(
+    frosting,
+    layer_epi_YeoJohnson_new(
+      lambdas = lambdas,
+      by = by,
+      terms = dplyr::enquos(...),
+      id = id
+    )
+  )
+}
+
+layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) {
+  layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id)
+}
+
+#' @export
+#' @importFrom workflows extract_preprocessor
+slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
+  rlang::check_dots_empty()
+
+  # Use lambdas given to the layer, else those stored on the recipe's step.
+  lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow)
+
+  # If `by` is not specified, infer it from the key columns of the lambdas.
+  if (is.null(object$by)) {
+    # Assume `layer_predict` has calculated the prediction keys and other
+    # layers don't change the prediction key colnames:
+    prediction_key_colnames <- names(components$keys)
+    lhs_potential_keys <- prediction_key_colnames
+    rhs_potential_keys <- colnames(select(lambdas, -starts_with(".lambda_")))
+    object$by <- intersect(lhs_potential_keys, rhs_potential_keys)
+    suggested_min_keys <- setdiff(lhs_potential_keys, "time_value")
+    if (!all(suggested_min_keys %in% object$by)) {
+      cli_warn(
+        c(
+          "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
+          but {?wasn't/weren't} found in the `lambdas` table.",
+          "i" = "Defaulting to join by {object$by}",
+          ">" = "Double-check whether column names on the `lambdas` table match those expected in your predictions",
+          ">" = "Consider using lambdas estimated with breakdowns by {suggested_min_keys}",
+          ">" = "Manually specify `by =` to silence"
+        ),
+        class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys"
+      )
+    }
+  }
+
+  # Fallback for the join columns; a no-op once `by` was given or inferred above.
+  object$by <- object$by %||%
+    intersect(
+      epi_keys_only(components$predictions),
+      colnames(select(lambdas, -starts_with(".lambda_")))
+    )
+  joinby <- list(x = names(object$by) %||% object$by, y = object$by)
+  hardhat::validate_column_names(components$predictions, joinby$x)
+  hardhat::validate_column_names(lambdas, joinby$y)
+
+  # Join the lambdas; every prediction row must match exactly one lambda row.
+  components$predictions <- inner_join(
+    components$predictions,
+    lambdas,
+    by = object$by,
+    relationship = "many-to-one",
+    unmatched = c("error", "drop")
+  )
+
+  exprs <- rlang::expr(c(!!!object$terms))
+  pos <- tidyselect::eval_select(exprs, components$predictions)
+  col_names <- names(pos)
+
+  # The `object$terms` is where the user specifies the columns they want to
+  # untransform. We need to match the outcomes with their lambda columns in our
+  # parameter table and then apply the inverse transformation.
+  if (identical(col_names, ".pred")) {
+    # In this case, we don't get a hint for the outcome column name, so we need
+    # to infer it from the mold.
+    if (length(components$mold$outcomes) > 1) {
+      cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env())
+    }
+    # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
+    # We want to extract the cases part.
+    outcome_cols <- names(components$mold$outcomes) %>%
+      stringr::str_match("ahead_\\d+_(.*)") %>%
+      magrittr::extract(, 2)
+
+    components$predictions <- components$predictions %>%
+      rowwise() %>%
+      mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols))))
+  } else if (identical(col_names, character(0))) {
+    # Wish I could suggest `all_outcomes()` here, but currently it's the same as
+    # not specifying any terms. I don't want to spend time with dealing with
+    # this case until someone asks for it.
+    cli::cli_abort(
+      "Not specifying columns to layer Yeo-Johnson is not implemented.
+      If you had a single outcome, you can use `.pred` as a column name.
+      If you had multiple outcomes, you'll need to specify them like
+      `.pred_ahead_1_`, `.pred_ahead_7_`, etc.
+      ",
+      call = rlang::caller_env()
+    )
+  } else {
+    # In this case, we assume that the user has specified the columns they want
+    # transformed here. We then need to determine the lambda columns for each of
+    # these columns. That is, we need to convert a vector of column names like
+    # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
+    # c(".lambda_case_rate", ".lambda_case_rate") (lambdas do not depend on ahead).
+    original_outcome_cols <- stringr::str_match(col_names, "^\\.pred_ahead_\\d+_(.*)")[, 2]
+    outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2]
+    if (any(original_outcome_cols %nin% outcomes_wout_ahead)) {
+      cli_abort(
+        "All columns specified in `...` must be outcome columns.
+        They must be of the form `.pred_ahead_1_`, `.pred_ahead_7_`, etc.
+        ",
+        call = rlang::caller_env()
+      )
+    }
+
+    for (i in seq_along(col_names)) {
+      col <- col_names[i]
+      lambda_col <- paste0(".lambda_", original_outcome_cols[i])
+      components$predictions <- components$predictions %>%
+        rowwise() %>%
+        mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col)))
+    }
+  }
+
+  # Remove the lambda columns and drop the rowwise grouping added above.
+  components$predictions <- components$predictions %>%
+    select(-starts_with(".lambda_")) %>%
+    ungroup()
+  components
+}
+
+#' @export
+print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) {
+  title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
+  print_layer(x$terms, title = title, width = width)
+}
+
+# Inverse Yeo-Johnson transformation
+#
+# Inverse of `yj_transform` in step_yeo_johnson.R. Vectorized in x, but not in
+# lambda (callers apply it rowwise). An NA lambda returns x unchanged.
+yj_inverse <- function(x, lambda, eps = 0.001) {
+  if (is.na(lambda)) {
+    return(x)
+  }
+  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
+    x <- unlist(x, use.names = FALSE)
+  } else {
+    if (!is.vector(x)) {
+      x <- as.vector(x)
+    }
+  }
+
+  dat_neg <- x < 0
+  ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+  not_neg <- ind_neg[["not"]]
+  is_neg <- ind_neg[["is"]]
+
+  nn_inv_trans <- function(x, lambda) {
+    if (abs(lambda) < eps) {
+      # log(x + 1)
+      exp(x) - 1
+    } else {
+      # ((x + 1)^lambda - 1) / lambda
+      (lambda * x + 1)^(1 / lambda) - 1
+    }
+  }
+
+  ng_inv_trans <- function(x, lambda) {
+    if (abs(lambda - 2) < eps) {
+      # -log(-x + 1)
+      -(exp(-x) - 1)
+    } else {
+      # -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
+      -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1)
+    }
+  }
+
+  if (length(not_neg) > 0) {
+    x[not_neg] <- nn_inv_trans(x[not_neg], lambda)
+  }
+
+  if (length(is_neg) > 0) {
+    x[is_neg] <- ng_inv_trans(x[is_neg], lambda)
+  }
+  x
+}
+
+get_lambdas_in_layer <- function(workflow) {
+  this_recipe <- hardhat::extract_recipe(workflow)
+  if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) {
+    cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env())
+  }
+  for (step in this_recipe$steps) {
+    if (inherits(step, "step_epi_YeoJohnson")) {
+      lambdas <- step$lambdas
+      break
+    }
+  }
+  lambdas
+}
diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R
new file mode 100644
index 00000000..3a5fdce6
--- /dev/null
+++ b/R/step_yeo_johnson.R
@@ -0,0 +1,397 @@
+#' Yeo-Johnson transformation
+#'
+#' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will
+#' transform data using a Yeo-Johnson transformation. This fork works with panel
+#' data and is meant for epidata.
+#'
+#' @param recipe A recipe object. The step will be added to the
+#'   sequence of operations for this recipe.
+#' @param ... One or more selector functions to choose variables
+#'   for this step. See [recipes::selections()] for more details.
+#' @param role Stored on the step but not otherwise used here: the selected
+#'   columns are transformed in place and no new variables are created.
+#' @param trained A logical for whether the selectors in `...`
+#'   have been resolved by [prep()].
+#' @param lambdas Internal. A tibble with one row per key group (keys other than
+#'   `time_value`) and one `.lambda_*` column per variable. `NULL` until [prep()].
+#' @param na_lambda_fill A numeric value to fill in for any
+#'   groups where the lambda cannot be estimated.
+#' @param limits A length 2 numeric vector defining the range to
+#'   compute the transformation parameter lambda.
+#' @param num_unique An integer where data that have fewer than this
+#'   many unique values will not be evaluated for a transformation.
+#' @param na_rm A logical indicating whether missing values should be
+#'   removed.
+#' @param skip A logical. Should the step be skipped when the recipe is
+#'   baked by [bake()]. On the `training` data, the step will always be
+#'   conducted (even if `skip = TRUE`).
+#' @param id A unique identifier for the step
+#' @template step-return
+#' @family individual transformation steps
+#' @export
+#' @details The Yeo-Johnson transformation is a variance-stabilizing
+#'   transformation, similar to the Box-Cox but does not require the input
+#'   variables to be strictly positive. In the package, the partial
+#'   log-likelihood function is directly optimized within a reasonable set of
+#'   transformation values (which can be changed by the user). The optimization
+#'   finds a lambda parameter for each group in the data that maximizes this
+#'   log-likelihood.
+#'
+#' This transformation is typically done on the outcome variable
+#' using the residuals for a statistical model (such as ordinary
+#' least squares). Here, a simple null model (intercept only) is
+#' used to apply the transformation to the *predictor*
+#' variables individually. This can have the effect of making the
+#' variable distributions more symmetric.
+#'
+#' If the transformation parameters are estimated to be very
+#' close to the bounds, or if a group has fewer than `num_unique` distinct
+#' values, the lambda for that group is set to `na_lambda_fill`.
+#'
+#' # Tidying
+#'
+#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with
+#' columns `terms`, `value`, and `id`:
+#'
+#' \describe{
+#'   \item{terms}{character, the selectors or variables selected}
+#'   \item{value}{numeric, the lambda estimate}
+#'   \item{id}{character, id of this step}
+#' }
+#'
+#' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power
+#'   transformations to improve normality or symmetry. *Biometrika*.
+#' @examples
+#' jhu <- cases_deaths_subset %>%
+#'   filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>%
+#'   select(geo_value, time_value, cases)
+#' filtered_data <- jhu
+#'
+#' r <- epi_recipe(filtered_data) %>%
+#'   step_epi_YeoJohnson(cases)
+#' # View the recipe
+#' r
+#' # Fit the recipe
+#' tr <- r %>% prep(filtered_data)
+#' # View the lambda values
+#' tr$steps[[1]]$lambdas
+#' # View the transformed data
+#' df <- tr %>% bake(filtered_data)
+#' plot(density(df$cases))
+#' plot(density(filtered_data$cases))
+step_epi_YeoJohnson <- function(
+    recipe,
+    ...,
+    role = "predictor",
+    trained = FALSE,
+    lambdas = NULL,
+    na_lambda_fill = 1 / 4,
+    limits = c(-5, 5),
+    num_unique = 5,
+    na_rm = TRUE,
+    skip = FALSE,
+    id = rand_id("epi_YeoJohnson")
+) {
+  checkmate::assert_numeric(limits, len = 2, any.missing = FALSE)
+  checkmate::assert_numeric(na_lambda_fill, lower = min(limits), upper = max(limits), len = 1)
+  checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1)
+  checkmate::assert_logical(na_rm, len = 1)
+  checkmate::assert_logical(skip, len = 1)
+  add_step(
+    recipe,
+    step_epi_YeoJohnson_new(
+      terms = enquos(...),
+      role = role,
+      trained = trained,
+      lambdas = lambdas,
+      na_lambda_fill = na_lambda_fill,
+      limits = sort(limits)[1:2],
+      num_unique = num_unique,
+      na_rm = na_rm,
+      forecast_date = NULL,
+      metadata = NULL,
+      columns = NULL,
+      skip = skip,
+      id = id
+    )
+  )
+}
+
+step_epi_YeoJohnson_new <- function(
+    terms,
+    role,
+    trained,
+    lambdas,
+    na_lambda_fill,
+    limits,
+    num_unique,
+    na_rm,
+    forecast_date,
+    metadata,
+    columns,
+    skip,
+    id
+) {
+  step(
+    subclass = "epi_YeoJohnson",
+    terms = terms,
+    role = role,
+    trained = trained,
+    lambdas = lambdas,
+    na_lambda_fill = na_lambda_fill,
+    limits = limits,
+    num_unique = num_unique,
+    na_rm = na_rm,
+    forecast_date = forecast_date,
+    metadata = metadata,
+    columns = columns,
+    skip = skip,
+    id = id
+  )
+}
+
+#' @export
+prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) {
+  # Check that the columns selected for transformation are numeric.
+  col_names <- recipes_eval_select(x$terms, training, info)
+  recipes::check_type(training[, col_names], types = c("double", "integer"))
+
+  lambdas <- get_lambdas_yj_table(
+    training,
+    col_names,
+    x$limits,
+    x$num_unique,
+    x$na_lambda_fill,
+    x$na_rm,
+    key_colnames(training, exclude = "time_value")
+  )
+
+  step_epi_YeoJohnson_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    lambdas = lambdas,
+    na_lambda_fill = x$na_lambda_fill,
+    limits = x$limits,
+    num_unique = x$num_unique,
+    na_rm = x$na_rm,
+    forecast_date = attr(training, "metadata")$as_of,
+    metadata = attr(training, "metadata"),
+    columns = col_names,
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+#' @export
+bake.step_epi_YeoJohnson <- function(object, new_data, ...) {
+  # If not an epi_df, make it one assuming the template of training data.
+  # If it is an epi_df, check that the keys match.
+  # Imitating the pattern in step_adjust_latency().
+  if (!inherits(new_data, "epi_df") || is.null(attr(new_data, "metadata")$as_of)) {
+    new_data <- as_epi_df(
+      new_data,
+      as_of = object$forecast_date,
+      other_keys = object$metadata$other_keys %||% character()
+    )
+    attr(new_data, "metadata") <- object$metadata
+  }
+  # Check that every key of the new data is a key column of the lambda table.
+  keys <- key_colnames(new_data, exclude = "time_value")
+  old_keys <- object$lambdas %>%
+    select(-starts_with(".lambda_")) %>%
+    colnames()
+  if (!all(keys %in% old_keys)) {
+    cli::cli_abort(
+      "The keys of the new data do not match the keys of the training data.",
+      call = rlang::caller_env()
+    )
+  }
+  # Check that the columns for transformation are present in new_data.
+  col_names <- object$columns
+  check_new_data(col_names, object, new_data)
+
+  # Transform each column, using the appropriate lambda column per row.
+  # Note that yj_transform() is vectorized in x, but not in lambda.
+  new_data <- left_join(new_data, object$lambdas, by = keys, relationship = "many-to-one")
+  for (col in col_names) {
+    new_data <- new_data %>%
+      rowwise() %>%
+      mutate(!!col := yj_transform(!!sym(col), !!sym(paste0(".lambda_", col))))
+  }
+  # Remove the lambda columns and drop the rowwise grouping added above.
+  new_data %>%
+    select(-starts_with(".lambda_")) %>%
+    ungroup()
+}
+
+#' @export
+print.step_epi_YeoJohnson <- function(x, width = max(20, options()$width - 39), ...) {
+  title <- "Yeo-Johnson transformation (see `lambdas` object for values) on "
+  print_epi_step(x$terms, x$terms, title = title, width = width)
+  invisible(x)
+}
+
+# Compute the lambda values per key group for each column.
+get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lambda_fill, na_rm, epi_keys_checked) {
+  # Estimate the lambda for each column, creating a `.lambda_` column for each.
+  # Note that estimate_yj() operates on a vector.
+  lambdas <- training %>%
+    summarise(
+      across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)),
+      .by = all_of(epi_keys_checked)
+    ) %>%
+    dplyr::rename_with(~ paste0(".lambda_", .x), -all_of(epi_keys_checked))
+
+  # Check for NAs in any of the .lambda_ columns.
+  # EDIT: This warning was too noisy. Keeping code around, in case we want it.
+  # for (col in col_names) {
+  #   if (any(is.na(lambdas[[paste0(".lambda_", col)]]))) {
+  #     cli::cli_warn(
+  #       c(
+  #         x = "Yeo-Johnson lambda could not be estimated for some geos for {col}.",
+  #         i = "Using lambda={na_lambda_fill} in these cases."
+  #       ),
+  #       call = rlang::caller_env()
+  #     )
+  #   }
+  # }
+
+  # Fill in NAs with the default lambda.
+  lambdas %>%
+    mutate(across(starts_with(".lambda_"), \(col) ifelse(is.na(col), na_lambda_fill, col)))
+}
+
+
+### Code below taken from recipes::step_YeoJohnson.
+### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172
+
+# Yeo-Johnson transformation
+#
+# Note that this function is vectorized in x, but not in lambda.
+yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) {
+  if (is.na(lambda)) {
+    return(x)
+  }
+  if (!inherits(x, "tbl_df") || is.data.frame(x)) {
+    x <- unlist(x, use.names = FALSE)
+  } else {
+    if (!is.vector(x)) {
+      x <- as.vector(x)
+    }
+  }
+  # TODO case weights: can we use weights here?
+  if (is.null(ind_neg)) {
+    dat_neg <- x < 0
+    ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+  }
+  not_neg <- ind_neg[["not"]]
+  is_neg <- ind_neg[["is"]]
+
+  nn_trans <- function(x, lambda) {
+    if (abs(lambda) < eps) {
+      log(x + 1)
+    } else {
+      ((x + 1)^lambda - 1) / lambda
+    }
+  }
+
+  ng_trans <- function(x, lambda) {
+    if (abs(lambda - 2) < eps) {
+      -log(-x + 1)
+    } else {
+      -((-x + 1)^(2 - lambda) - 1) / (2 - lambda)
+    }
+  }
+
+  if (length(not_neg) > 0) {
+    x[not_neg] <- nn_trans(x[not_neg], lambda)
+  }
+
+  if (length(is_neg) > 0) {
+    x[is_neg] <- ng_trans(x[is_neg], lambda)
+  }
+  x
+}
+
+## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K.,
+## & Johnson, R. A. (2000). A new family of power transformations
+## to improve normality or symmetry. Biometrika. page 957
+ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) {
+  n <- length(y)
+  y_t <- yj_transform(y, lambda, ind_neg)
+  # EDIT: Unused in the original recipes code.
+  # mu_t <- mean(y_t)
+  var_t <- stats::var(y_t) * (n - 1) / n
+  res <- -.5 * n * log(var_t) + (lambda - 1) * const
+  res
+}
+
+## objective for optimize(): the log-likelihood at `lam` (maximized, not negated)
+yj_obj <- function(lam, dat, ind_neg, const) {
+  ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const)
+}
+
+## estimates lambda for one vector; NA_real_ when it cannot be estimated
+estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, call = caller_env(2)) {
+  na_rows <- which(is.na(dat))
+  if (length(na_rows) > 0) {
+    if (na_rm) {
+      dat <- dat[-na_rows]
+    } else {
+      cli::cli_abort(
+        c(
+          x = "Missing values are not allowed for the YJ transformation.",
+          i = "See {.arg na_rm} option."
+        ),
+        call = call
+      )
+    }
+  }
+
+  eps <- .001
+  if (length(unique(dat)) < num_unique) {
+    return(NA_real_)
+  }
+  dat_neg <- dat < 0
+  ind_neg <- list(is = which(dat_neg), not = which(!dat_neg))
+
+  const <- sum(sign(dat) * log(abs(dat) + 1))
+
+  res <- suppressWarnings(
+    stats::optimize(
+      yj_obj,
+      interval = limits,
+      maximum = TRUE,
+      dat = dat,
+      ind_neg = ind_neg,
+      const = const,
+      tol = .0001
+    )
+  )
+  lam <- res$maximum
+  if (abs(limits[1] - lam) <= eps || abs(limits[2] - lam) <= eps) {
+    lam <- NA_real_
+  }
+  lam
+}
+
+# Adapted from recipes::tidy.step_BoxCox. Once trained, lambdas are a per-group
+# tibble, so its key columns are returned alongside `terms`, `value` and `id`.
+#' @export
+tidy.step_epi_YeoJohnson <- function(x, ...) {
+  if (is_trained(x)) {
+    res <- tidyr::pivot_longer(
+      x$lambdas, starts_with(".lambda_"),
+      names_to = "terms", names_prefix = "^\\.lambda_", values_to = "value"
+    )
+  } else {
+    term_names <- sel2char(x$terms)
+    res <- tibble(
+      terms = term_names,
+      value = na_dbl
+    )
+  }
+  res$id <- x$id
+  res
+}
diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd
new file mode 100644
index 00000000..1ca4d9cc
--- /dev/null
+++ b/man/layer_epi_YeoJohnson.Rd
@@ -0,0 +1,63 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/layer_yeo_johnson.R
+\name{layer_epi_YeoJohnson}
+\alias{layer_epi_YeoJohnson}
+\title{Unnormalizing transformation}
+\usage{
+layer_epi_YeoJohnson(
+  frosting,
+  ...,
+  lambdas = NULL,
+  by = NULL,
+  id = rand_id("epi_YeoJohnson")
+)
+}
+\arguments{
+\item{frosting}{a \code{frosting} postprocessor. The layer will be added to the
+sequence of operations for this frosting.}
+
+\item{...}{One or more selector functions to scale variables
+for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.}
+
+\item{lambdas}{Internal. A data frame of lambda values to be used for
+inverting the transformation.}
+
+\item{by}{A (possibly named) character vector of variables to join by.}
+
+\item{id}{a random id string}
+}
+\value{
+an updated \code{frosting} postprocessor
+}
+\description{
+Will undo a step_epi_YeoJohnson transformation.
+} +\examples{ +library(dplyr) +jhu <- epidatasets::cases_deaths_subset \%>\% + filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% + select(geo_value, time_value, cases) + +# Create a recipe with a Yeo-Johnson transformation. +r <- epi_recipe(jhu) \%>\% + step_epi_YeoJohnson(cases) \%>\% + step_epi_lag(cases, lag = 0) \%>\% + step_epi_ahead(cases, ahead = 0, role = "outcome") \%>\% + step_epi_naomit() + +# Create a frosting layer that will undo the Yeo-Johnson transformation. +f <- frosting() \%>\% + layer_predict() \%>\% + layer_epi_YeoJohnson(.pred) + +# Create a workflow and fit it. +wf <- epi_workflow(r, linear_reg()) \%>\% + fit(jhu) \%>\% + add_frosting(f) + +# Forecast the workflow, which should reverse the Yeo-Johnson transformation. +forecast(wf) +# Compare to the original data. +jhu \%>\% filter(time_value == "2021-12-31") +forecast(wf) +} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9e1bafbd..c904a5ad 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -140,7 +140,7 @@ toy_recipe \%>\% #> * geo_type = state #> * time_type = day #> * as_of = 2015-01-14 -#> +#> #> # A tibble: 8 x 4 #> geo_value time_value a b #> @@ -176,7 +176,7 @@ toy_recipe \%>\% #> * geo_type = state #> * time_type = day #> * as_of = 2015-01-14 -#> +#> #> # A tibble: 21 x 7 #> geo_value time_value a b lag_3_a lag_4_b ahead_1_a #> @@ -224,7 +224,7 @@ toy_recipe \%>\% #> * geo_type = state #> * time_type = day #> * as_of = 2015-01-14 -#> +#> #> # A tibble: 10 x 6 #> geo_value time_value a b lag_0_a ahead_3_a #> @@ -296,7 +296,7 @@ rates_fit } \seealso{ -Other row operation steps: +Other row operation steps: \code{\link{step_epi_lag}()}, \code{\link{step_growth_rate}()}, \code{\link{step_lag_difference}()} diff --git a/man/step_epi_YeoJohnson.Rd b/man/step_epi_YeoJohnson.Rd new file mode 100644 index 00000000..1fa63761 --- /dev/null +++ b/man/step_epi_YeoJohnson.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not 
edit by hand +% Please edit documentation in R/step_yeo_johnson.R +\name{step_epi_YeoJohnson} +\alias{step_epi_YeoJohnson} +\title{Yeo-Johnson transformation} +\usage{ +step_epi_YeoJohnson( + recipe, + ..., + role = "predictor", + trained = FALSE, + lambdas = NULL, + na_lambda_fill = 1/4, + limits = c(-5, 5), + num_unique = 5, + na_rm = TRUE, + skip = FALSE, + id = rand_id("epi_YeoJohnson") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} + +\item{role}{For model terms created by this step, what analysis role should +they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} + +\item{trained}{A logical for whether the selectors in \code{...} +have been resolved by \code{\link[=prep]{prep()}}.} + +\item{lambdas}{Internal. A numeric vector of transformation values. This +is \code{NULL} until computed by \code{\link[=prep]{prep()}}.} + +\item{na_lambda_fill}{A numeric value to fill in for any +geos where the lambda cannot be estimated.} + +\item{limits}{A length 2 numeric vector defining the range to +compute the transformation parameter lambda.} + +\item{num_unique}{An integer where data that have fewer than this +many unique values will not be evaluated for a transformation.} + +\item{na_rm}{A logical indicating whether missing values should be +removed.} + +\item{skip}{A logical. Should the step be skipped when the recipe is +baked by \code{\link[=bake]{bake()}}. On the \code{training} data, the step will always be +conducted (even if \code{skip = TRUE}).} + +\item{id}{A unique identifier for the step} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. 
+} +\description{ +\code{step_epi_YeoJohnson()} creates a \emph{specification} of a recipe step that will +transform data using a Yeo-Johnson transformation. This fork works with panel +data and is meant for epidata. +} +\details{ +The Yeo-Johnson transformation is variance-stabilizing +transformation, similar to the Box-Cox but does not require the input +variables to be strictly positive. In the package, the partial +log-likelihood function is directly optimized within a reasonable set of +transformation values (which can be changed by the user). The optimization +finds a lambda parameter for each group in the data that minimizes the +variance of the transformed data. + +This transformation is typically done on the outcome variable +using the residuals for a statistical model (such as ordinary +least squares). Here, a simple null model (intercept only) is +used to apply the transformation to the \emph{predictor} +variables individually. This can have the effect of making the +variable distributions more symmetric. + +If the transformation parameters are estimated to be very +close to the bounds, or if the optimization fails, a value of +\code{NA} is used and no transformation is applied. 
+} +\section{Tidying}{ +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with +columns \code{terms}, \code{value} , and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected} +\item{value}{numeric, the lambda estimate} +\item{id}{character, id of this step} +} +} + +\examples{ +jhu <- cases_deaths_subset \%>\% + filter(time_value > "2021-01-01", geo_value \%in\% c("ca", "ny")) \%>\% + select(geo_value, time_value, cases) +filtered_data <- jhu + +r <- epi_recipe(filtered_data) \%>\% + step_epi_YeoJohnson(cases) +# View the recipe +r +# Fit the recipe +tr <- r \%>\% prep(filtered_data) +# View the lambda values +tr$steps[[1]]$lambdas +# View the transformed data +df <- tr \%>\% bake(filtered_data) +plot(density(df$cases)) +plot(density(filtered_data$cases)) +} +\references{ +Yeo, I. K., and Johnson, R. A. (2000). A new family of power +transformations to improve normality or symmetry. \emph{Biometrika}. +} +\concept{individual transformation steps} diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R new file mode 100644 index 00000000..124f9648 --- /dev/null +++ b/tests/testthat/test-yeo-johnson.R @@ -0,0 +1,141 @@ +test_that("Yeo-Johnson transformation inverts correctly", { + # Note that the special lambda values of 0 and 2 are covered by the tests + # below. 
+ expect_true( + map_lgl(seq(-5, 5, 0.1), function(lambda) { + map_lgl(seq(-10, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all() + }) %>% + all() + ) +}) + +test_that("Yeo-Johnson steps and layers invert each other", { + jhu <- epidatasets::cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + # Get some lambda values + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + + # Check general lambda values tibble structure + expect_true(".lambda_cases" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_cases)) + # Still works on a tibble + expect_equal( + tr %>% bake(filtered_data %>% as_tibble()), + tr %>% bake(filtered_data) + ) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson(.pred) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + as_tibble() %>% + slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(cases = .pred) + expect_equal(out1, out2) + + # Make sure it works when there are multiple predictors and outcomes + jhu_multi <- epidatasets::covid_case_death_rates_extended %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, case_rate, death_rate) + filtered_data <- jhu_multi + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(case_rate, death_rate) %>% + step_epi_lag(case_rate, death_rate, lag = 0) %>% + step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + + # Check general lambda values tibble structure + expect_true(".lambda_case_rate" 
%in% names(tr$steps[[1]]$lambdas)) + expect_true(".lambda_death_rate" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_case_rate)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_death_rate)) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + as_tibble() %>% + slice_max(time_value, by = geo_value) + # debugonce(slather.layer_epi_YeoJohnson) + out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate) + expect_equal(out1, out2) +}) + +test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { + # Small synthetic grad_employ_dataset version. + filtered_data <- tribble( + ~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y, + "ca", "25-34", "bachelor", 2017, 50000, + "ca", "25-34", "bachelor", 2018, 50500, + "ca", "25-34", "bachelor", 2019, 51000, + "ca", "25-34", "bachelor", 2020, 51500, + "ca", "25-34", "bachelor", 2021, 52000, + "ca", "25-34", "bachelor", 2022, 52500, + "ca", "35-1000", "bachelor", 2017, 3e10, + "ca", "35-1000", "bachelor", 2018, 3e10 + 10, + "ca", "35-1000", "bachelor", 2019, 3e10 + 20, + "ca", "35-1000", "bachelor", 2020, 3e10 + 30, + "ca", "35-1000", "bachelor", 2021, 3e10 + 40, + "ca", "35-1000", "bachelor", 2022, 3e10 + 50, + "ca", "25-34", "master", 2017, 2 * 50000, + "ca", "25-34", "master", 2018, 2 * 50500, + "ca", "25-34", "master", 2019, 2 * 51000, + "ca", "25-34", "master", 2020, 2 * 51500, + "ca", "25-34", "master", 2021, 2 * 52000, + "ca", "25-34", "master", 2022, 2 * 52500, + "ca", "35-1000", "master", 2017, 2 * 3e10, + "ca", "35-1000", "master", 2018, 2 * (3e10 + 10), + "ca", "35-1000", "master", 2019, 2 * (3e10 + 20), + "ca", "35-1000", "master", 2020, 2 * (3e10 + 
30), + "ca", "35-1000", "master", 2021, 2 * (3e10 + 40), + "ca", "35-1000", "master", 2022, 2 * (3e10 + 50) + ) %>% as_epi_df(other_keys = c("age_group", "edu_qual")) + + # Get some lambda values + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(med_income_2y) %>% + step_epi_lag(med_income_2y, lag = 0) %>% + step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + expect_true(".lambda_med_income_2y" %in% names(tr$steps[[1]]$lambdas)) + expect_true("geo_value" %in% names(tr$steps[[1]]$lambdas)) + expect_true("age_group" %in% names(tr$steps[[1]]$lambdas)) + expect_true("edu_qual" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_med_income_2y)) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson(.pred) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + as_tibble() %>% + slice_max(time_value, by = geo_value) %>% + select(geo_value, age_group, time_value, med_income_2y) %>% + arrange(geo_value, age_group, time_value) + out2 <- forecast(wf) %>% + rename(med_income_2y = .pred) %>% + select(geo_value, age_group, time_value, med_income_2y) %>% + arrange(geo_value, age_group, time_value) + expect_equal(out1, out2) +}) From b4b06277e60de9fdd8ea1744018f5410466e44fa Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 31 Mar 2025 13:50:27 -0700 Subject: [PATCH 2/7] feat: add air.toml --- .Rbuildignore | 1 + air.toml | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 air.toml diff --git a/.Rbuildignore b/.Rbuildignore index dc41e622..0bdb211f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -22,3 +22,4 @@ ^.lintr$ ^.venv$ ^inst/templates$ +^air\.toml$ \ No newline at end of file diff --git a/air.toml b/air.toml new file mode 100644 index 00000000..6cb579db --- /dev/null +++ b/air.toml @@ -0,0 +1,2 @@ +[format] +line-width = 
120 From 2d05117e3a9a260a69fc803b78849b8c1026460a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 13:42:08 -0700 Subject: [PATCH 3/7] doc: version bump + news --- NEWS.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index de698ee9..36b0198f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,12 +12,12 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat `data(, package = "epidatasets")`, `epidatasets::` or, after loading the package, the name of the dataset alone (#382). - `step_adjust_latency()` no longer allows empty column selection. -- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`). +- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`). `step_growth_rate()` has lost its `additional_gr_args_list` argument and now has an `na_rm` argument. - Moves `epiprocess` out of depends (#440). No internals have changed, but downstream users may need to add `library(epiprocess)` to existing code. -- Removes dependence on the `distributional` package, replacing the quantiles +- Removes dependence on the `distributional` package, replacing the quantiles with `hardhat::quantile_pred()`. Some associated functions are deprecated with `lifecycle` messages. - Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it @@ -38,6 +38,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Replace `dist_quantiles()` with `hardhat::quantile_pred()` - Allow `quantile()` to threshold to an interval if desired (#434) - `arx_forecaster()` detects if there's enough data to predict +- Add `step_epi_YeoJohnson()` to perform a Yeo-Johnson transformation on the outcome variable. +- Add `layer_epi_YeoJohnson()` to undo a Yeo-Johnson transformation on the outcome variable in a forecast workflow. 
## Bug fixes From 95c50b5766613d0ffd083d304cad3b59c9325cfa Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 31 Mar 2025 13:58:05 -0700 Subject: [PATCH 4/7] Update R/layer_yeo_johnson.R Co-authored-by: Daniel McDonald --- R/layer_yeo_johnson.R | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index 826bd23d..30c62ff1 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -205,13 +205,14 @@ yj_inverse <- function(x, lambda, eps = 0.001) { is_neg <- ind_neg[["is"]] nn_inv_trans <- function(x, lambda) { - if (abs(lambda) < eps) { - # log(x + 1) - exp(x) - 1 - } else { - # ((x + 1)^lambda - 1) / lambda - (lambda * x + 1)^(1 / lambda) - 1 - } + out <- double(length(x)) + sm_lambdas <- abs(lambda) < eps + out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 + x <- x[!sm_lambdas] + lambda <- lambda[!sm_lambdas] + out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 + out + } } ng_inv_trans <- function(x, lambda) { From 6a247437e4d34f7ca7f98bf524d9e2906cb7e8e7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 31 Mar 2025 18:31:06 -0700 Subject: [PATCH 5/7] fix: review tweaks * vectorize in lambda * inheritParams in docs * lambda -> yj_param in many places --- R/layer_yeo_johnson.R | 110 ++++++++++---------- R/step_adjust_latency.R | 1 - R/step_yeo_johnson.R | 167 ++++++++++++++++-------------- man/layer_epi_YeoJohnson.Rd | 4 +- man/step_adjust_latency.Rd | 12 +-- man/step_epi_YeoJohnson.Rd | 15 +-- tests/testthat/test-yeo-johnson.R | 54 +++++----- 7 files changed, 191 insertions(+), 172 deletions(-) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index 30c62ff1..d399717f 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -2,14 +2,10 @@ #' #' Will undo a step_epi_YeoJohnson transformation. #' -#' @param frosting a `frosting` postprocessor. The layer will be added to the -#' sequence of operations for this frosting. -#' @param lambdas Internal. 
A data frame of lambda values to be used for +#' @inheritParams layer_population_scaling +#' @param yj_params Internal. A data frame of parameters to be used for #' inverting the transformation. -#' @param ... One or more selector functions to scale variables -#' for this step. See [recipes::selections()] for more details. #' @param by A (possibly named) character vector of variables to join by. -#' @param id a random id string #' #' @return an updated `frosting` postprocessor #' @export @@ -41,13 +37,13 @@ #' # Compare to the original data. #' jhu %>% filter(time_value == "2021-12-31") #' forecast(wf) -layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { - checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE) +layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { + checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE) add_layer( frosting, layer_epi_YeoJohnson_new( - lambdas = lambdas, + yj_params = yj_params, by = by, terms = dplyr::enquos(...), id = id @@ -55,8 +51,8 @@ layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = ) } -layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) { - layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id) +layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { + layer("epi_YeoJohnson", yj_params = yj_params, by = by, terms = terms, id = id) } #' @export @@ -64,16 +60,18 @@ layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) { slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { rlang::check_dots_empty() - # Get the lambdas from the layer or from the workflow. - lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow) + # TODO: We will error if we don't have a workflow. Write a check later. - # If the by is not specified, try to infer it from the lambdas. 
+ # Get the yj_params from the layer or from the workflow. + yj_params <- object$yj_params %||% get_yj_params_in_layer(workflow) + + # If the by is not specified, try to infer it from the yj_params. if (is.null(object$by)) { # Assume `layer_predict` has calculated the prediction keys and other # layers don't change the prediction key colnames: prediction_key_colnames <- names(components$keys) lhs_potential_keys <- prediction_key_colnames - rhs_potential_keys <- colnames(select(lambdas, -starts_with("lambda_"))) + rhs_potential_keys <- colnames(select(yj_params, -starts_with(".yj_param_"))) object$by <- intersect(lhs_potential_keys, rhs_potential_keys) suggested_min_keys <- setdiff(lhs_potential_keys, "time_value") if (!all(suggested_min_keys %in% object$by)) { @@ -95,16 +93,16 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, object$by <- object$by %||% intersect( epi_keys_only(components$predictions), - colnames(select(lambdas, -starts_with(".lambda_"))) + colnames(select(yj_params, -starts_with(".yj_param_"))) ) joinby <- list(x = names(object$by) %||% object$by, y = object$by) hardhat::validate_column_names(components$predictions, joinby$x) - hardhat::validate_column_names(lambdas, joinby$y) + hardhat::validate_column_names(yj_params, joinby$y) - # Join the lambdas. + # Join the yj_params. components$predictions <- inner_join( components$predictions, - lambdas, + yj_params, by = object$by, relationship = "many-to-one", unmatched = c("error", "drop") @@ -115,7 +113,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, col_names <- names(pos) # The `object$terms` is where the user specifies the columns they want to - # untransform. We need to match the outcomes with their lambda columns in our + # untransform. We need to match the outcomes with their yj_param columns in our # parameter table and then apply the inverse transformation. 
if (identical(col_names, ".pred")) { # In this case, we don't get a hint for the outcome column name, so we need @@ -130,8 +128,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, magrittr::extract(, 2) components$predictions <- components$predictions %>% - rowwise() %>% - mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols)))) + mutate(.pred := yj_inverse(.pred, !!sym(paste0(".yj_param_", outcome_cols)))) } else if (identical(col_names, character(0))) { # Wish I could suggest `all_outcomes()` here, but currently it's the same as # not specifying any terms. I don't want to spend time with dealing with @@ -146,10 +143,10 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ) } else { # In this case, we assume that the user has specified the columns they want - # transformed here. We then need to determine the lambda columns for each of + # transformed here. We then need to determine the yj_param columns for each of # these columns. That is, we need to convert a vector of column names like # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to - # c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate"). + # c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate"). 
original_outcome_cols <- stringr::str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2] if (any(original_outcome_cols %nin% outcomes_wout_ahead)) { @@ -163,34 +160,37 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, for (i in seq_along(col_names)) { col <- col_names[i] - lambda_col <- paste0(".lambda_", original_outcome_cols[i]) + yj_param_col <- paste0(".yj_param_", original_outcome_cols[i]) components$predictions <- components$predictions %>% - rowwise() %>% - mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col))) + mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(yj_param_col))) } } - # Remove the lambda columns. + # Remove the yj_param columns. components$predictions <- components$predictions %>% - select(-any_of(starts_with(".lambda_"))) %>% + select(-any_of(starts_with(".yj_param_"))) %>% ungroup() components } #' @export print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) { - title <- "Yeo-Johnson transformation (see `lambdas` object for values) on " + title <- "Yeo-Johnson transformation (see `yj_params` object for values) on " print_layer(x$terms, title = title, width = width) } # Inverse Yeo-Johnson transformation # -# Inverse of `yj_transform` in step_yeo_johnson.R. Note that this function is -# vectorized in x, but not in lambda. +# Inverse of `yj_transform` in step_yeo_johnson.R. 
yj_inverse <- function(x, lambda, eps = 0.001) { - if (is.na(lambda)) { + if (any(is.na(lambda))) { return(x) } + if (length(x) > 1 && length(lambda) == 1) { + lambda <- rep(lambda, length(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) + } if (!inherits(x, "tbl_df") || is.data.frame(x)) { x <- unlist(x, use.names = FALSE) } else { @@ -199,52 +199,58 @@ yj_inverse <- function(x, lambda, eps = 0.001) { } } - dat_neg <- x < 0 - ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) - not_neg <- ind_neg[["not"]] - is_neg <- ind_neg[["is"]] - nn_inv_trans <- function(x, lambda) { out <- double(length(x)) sm_lambdas <- abs(lambda) < eps - out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 + if (length(sm_lambdas) > 0) { + out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 + } x <- x[!sm_lambdas] lambda <- lambda[!sm_lambdas] - out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 + if (length(x) > 0) { + out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 + } out } - } ng_inv_trans <- function(x, lambda) { - if (abs(lambda - 2) < eps) { - # -log(-x + 1) - -(exp(-x) - 1) - } else { - # -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) - -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + out <- double(length(x)) + near2_lambdas <- abs(lambda - 2) < eps + if (length(near2_lambdas) > 0) { + out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1) + } + x <- x[!near2_lambdas] + lambda <- lambda[!near2_lambdas] + if (length(x) > 0) { + out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) } + out } + dat_neg <- x < 0 + not_neg <- which(!dat_neg) + is_neg <- which(dat_neg) + if (length(not_neg) > 0) { - x[not_neg] <- nn_inv_trans(x[not_neg], lambda) + x[not_neg] <- nn_inv_trans(x[not_neg], lambda[not_neg]) } if (length(is_neg) > 0) { - x[is_neg] <- ng_inv_trans(x[is_neg], lambda) + x[is_neg] <- ng_inv_trans(x[is_neg], lambda[is_neg]) } x } -get_lambdas_in_layer <- function(workflow) { 
+get_yj_params_in_layer <- function(workflow) { this_recipe <- hardhat::extract_recipe(workflow) if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) } for (step in this_recipe$steps) { if (inherits(step, "step_epi_YeoJohnson")) { - lambdas <- step$lambdas + yj_params <- step$yj_params break } } - lambdas + yj_params } diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index ae9db6ef..a0d59bc1 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -272,7 +272,6 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. #' @export #' @importFrom glue glue -#' @importFrom dplyr rowwise prep.step_adjust_latency <- function(x, training, info = NULL, ...) { latency <- x$latency col_names <- recipes::recipes_eval_select(x$terms, training, info) diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index 3a5fdce6..272b034a 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -4,28 +4,19 @@ #' transform data using a Yeo-Johnson transformation. This fork works with panel #' data and is meant for epidata. #' -#' @param recipe A recipe object. The step will be added to the -#' sequence of operations for this recipe. -#' @param ... One or more selector functions to choose variables -#' for this step. See [recipes::selections()] for more details. -#' @param role For model terms created by this step, what analysis role should -#' they be assigned? `lag` is default a predictor while `ahead` is an outcome. +#' @inheritParams step_population_scaling #' @param trained A logical for whether the selectors in `...` #' have been resolved by [prep()]. -#' @param lambdas Internal. A numeric vector of transformation values. This +#' @param yj_params Internal. A numeric vector of transformation values. This #' is `NULL` until computed by [prep()]. 
-#' @param na_lambda_fill A numeric value to fill in for any -#' geos where the lambda cannot be estimated. -#' @param limits A length 2 numeric vector defining the range to -#' compute the transformation parameter lambda. -#' @param num_unique An integer where data that have fewer than this -#' many unique values will not be evaluated for a transformation. -#' @param na_rm A logical indicating whether missing values should be -#' removed. -#' @param skip A logical. Should the step be skipped when the recipe is -#' baked by [bake()]. On the `training` data, the step will always be -#' conducted (even if `skip = TRUE`). -#' @param id A unique identifier for the step +#' @param na_fill A numeric value to fill in for any geos where a Yeo-Johnson +#' parameter cannot be estimated. +#' @param limits A length 2 numeric vector defining the range to compute the +#' transformation parameter. +#' @param num_unique An integer where data that have fewer than this many unique +#' values will not be evaluated for a transformation. +#' @param na_rm A logical indicating whether missing values should be removed +#' before estimating the transformation parameter. 
#' @template step-return #' @family individual transformation steps #' @export @@ -73,8 +64,8 @@ #' r #' # Fit the recipe #' tr <- r %>% prep(filtered_data) -#' # View the lambda values -#' tr$steps[[1]]$lambdas +#' # View the parameter values +#' tr$steps[[1]]$yj_params #' # View the transformed data #' df <- tr %>% bake(filtered_data) #' plot(density(df$cases)) @@ -84,8 +75,8 @@ step_epi_YeoJohnson <- function( ..., role = "predictor", trained = FALSE, - lambdas = NULL, - na_lambda_fill = 1 / 4, + yj_params = NULL, + na_fill = 1 / 4, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, @@ -93,7 +84,7 @@ step_epi_YeoJohnson <- function( id = rand_id("epi_YeoJohnson") ) { checkmate::assert_numeric(limits, len = 2) - checkmate::assert_numeric(na_lambda_fill, lower = min(limits), upper = max(limits), len = 1) + checkmate::assert_numeric(na_fill, lower = min(limits), upper = max(limits), len = 1) checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1) checkmate::assert_logical(na_rm, len = 1) checkmate::assert_logical(skip, len = 1) @@ -103,8 +94,8 @@ step_epi_YeoJohnson <- function( terms = enquos(...), role = role, trained = trained, - lambdas = lambdas, - na_lambda_fill = na_lambda_fill, + yj_params = yj_params, + na_fill = na_fill, limits = sort(limits)[1:2], num_unique = num_unique, na_rm = na_rm, @@ -121,8 +112,8 @@ step_epi_YeoJohnson_new <- function( terms, role, trained, - lambdas, - na_lambda_fill, + yj_params, + na_fill, limits, num_unique, na_rm, @@ -137,8 +128,8 @@ step_epi_YeoJohnson_new <- function( terms = terms, role = role, trained = trained, - lambdas = lambdas, - na_lambda_fill = na_lambda_fill, + yj_params = yj_params, + na_fill = na_fill, limits = limits, num_unique = num_unique, na_rm = na_rm, @@ -156,12 +147,12 @@ prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) 
{ col_names <- recipes_eval_select(x$terms, training, info) recipes::check_type(training[, col_names], types = c("double", "integer")) - lambdas <- get_lambdas_yj_table( + yj_params <- compute_yj_params( training, col_names, x$limits, x$num_unique, - x$na_lambda_fill, + x$na_fill, x$na_rm, key_colnames(training, exclude = "time_value") ) @@ -170,8 +161,8 @@ prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) { terms = x$terms, role = x$role, trained = TRUE, - lambdas = lambdas, - na_lambda_fill = x$na_lambda_fill, + yj_params = yj_params, + na_fill = x$na_fill, limits = x$limits, num_unique = x$num_unique, na_rm = x$na_rm, @@ -196,14 +187,10 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { ) attr(new_data, "metadata") <- object$metadata } - # Check that the keys match. - keys <- key_colnames(new_data, exclude = "time_value") - old_keys <- object$lambdas %>% - select(-starts_with(".lambda_")) %>% - colnames() - if (!all(keys %in% old_keys)) { + # Check that the columns for transformation are present in new_data. + if (!all(object$columns %in% colnames(new_data))) { cli::cli_abort( - "The keys of the new data do not match the keys of the training data.", + "The columns for transformation are not present in the new data.", call = rlang::caller_fn() ) } @@ -211,68 +198,82 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { col_names <- object$columns check_new_data(col_names, object, new_data) - # Transform each column, using the appropriate lambda column per row. - # Note that yj_transform() is vectorized in x, but not in lambda. - new_data <- left_join(new_data, object$lambdas, by = keys) + # Check that the keys match. 
+ check <- hardhat::check_column_names(new_data, object$yj_params %>% select(-starts_with(".yj_param_")) %>% colnames()) + if (!check$ok) { + cli_abort(c( + "Some variables used for training are not available in {.arg x}.", + i = "The following required columns are missing: {check$missing_names}" + ), call = rlang::caller_fn()) + } + # Transform each column, using the appropriate yj_param column per row. + new_data <- left_join(new_data, object$yj_params, by = key_colnames(new_data, exclude = "time_value")) for (col in col_names) { new_data <- new_data %>% - rowwise() %>% - mutate(!!col := yj_transform(!!sym(col), !!sym(paste0(".lambda_", col)))) + mutate(!!col := yj_transform(!!sym(col), !!sym(paste0(".yj_param_", col)))) } - # Remove the lambda columns. + # Remove the yj_param columns. new_data %>% - select(-starts_with(".lambda_")) %>% + select(-starts_with(".yj_param_")) %>% ungroup() } #' @export print.step_epi_YeoJohnson <- function(x, width = max(20, options()$width - 39), ...) { - title <- "Yeo-Johnson transformation (see `lambdas` object for values) on " + title <- "Yeo-Johnson transformation (see `yj_params` object for values) on " print_epi_step(x$terms, x$terms, title = title, width = width) invisible(x) } -# Compute the lambda values per group for each column. -get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lambda_fill, na_rm, epi_keys_checked) { - # Estimate the lambda for each column, creating a lambda_ column for each. - # Note that estimate_yj() operates on a vector. - lambdas <- training %>% +# Compute the yj_param values per group for each column. +compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, na_rm, epi_keys_checked) { + # Estimate the yj_param for each column, creating a .yj_param_ column for + # each. Note that estimate_yj() operates on each column. 
+ yj_params <- training %>% summarise( across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)), .by = all_of(epi_keys_checked) ) %>% - dplyr::rename_with(~ paste0(".lambda_", .x), -all_of(epi_keys_checked)) + dplyr::rename_with(~ paste0(".yj_param_", .x), -all_of(epi_keys_checked)) - # Check for NAs in any of the lambda_ columns. + # Check for NAs in any of the yj_param_ columns. # EDIT: This warning was too noisy. Keeping code around, in case we want it. # for (col in col_names) { - # if (any(is.na(values[[paste0("lambda_", col)]]))) { + # if (any(is.na(values[[paste0(".yj_param_", col)]]))) { # cli::cli_warn( # c( - # x = "Yeo-Johnson lambda could not be estimated for some geos for {col}.", - # i = "Using lambda={x$na_lambda_fill} in these cases." + # x = "Yeo-Johnson parameter could not be estimated for some geos for {col}.", + # i = "Using parameter={x$na_fill} in these cases." # ), # call = rlang::caller_fn() # ) # } # } - # Fill in NAs with the default lambda. - lambdas %>% - mutate(across(starts_with(".lambda_"), \(col) ifelse(is.na(col), na_lambda_fill, col))) + # Fill in NAs with the default yj_param. + yj_params %>% + mutate(across(starts_with(".yj_param_"), \(col) ifelse(is.na(col), na_fill, col))) } ### Code below taken from recipes::step_YeoJohnson. +### We keep "lambda" here, but above we renamed it to "yj_param". +### Modified yj_transform() to be vectorized in lambda. ### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 # Yeo-Johnson transformation -# -# Note that this function is vectorized in x, but not in lambda. 
yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { - if (is.na(lambda)) { + if (any(is.na(lambda))) { return(x) } + if (length(x) > 1 && length(lambda) == 1) { + lambda <- rep(lambda, length(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort( + "Length of `x` must be equal to length of `lambda` or lambda must be a scalar.", + call = rlang::caller_fn() + ) + } if (!inherits(x, "tbl_df") || is.data.frame(x)) { x <- unlist(x, use.names = FALSE) } else { @@ -289,27 +290,39 @@ yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { is_neg <- ind_neg[["is"]] nn_trans <- function(x, lambda) { - if (abs(lambda) < eps) { - log(x + 1) - } else { - ((x + 1)^lambda - 1) / lambda + out <- double(length(x)) + sm_lambdas <- abs(lambda) < eps + if (length(sm_lambdas) > 0) { + out[sm_lambdas] <- log(x[sm_lambdas] + 1) } + x <- x[!sm_lambdas] + lambda <- lambda[!sm_lambdas] + if (length(x) > 0) { + out[!sm_lambdas] <- ((x + 1)^lambda - 1) / lambda + } + out } ng_trans <- function(x, lambda) { - if (abs(lambda - 2) < eps) { - -log(-x + 1) - } else { - -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) + out <- double(length(x)) + near2_lambdas <- abs(lambda - 2) < eps + if (length(near2_lambdas) > 0) { + out[near2_lambdas] <- -log(-x[near2_lambdas] + 1) + } + x <- x[!near2_lambdas] + lambda <- lambda[!near2_lambdas] + if (length(x) > 0) { + out[!near2_lambdas] <- -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) } + out } if (length(not_neg) > 0) { - x[not_neg] <- nn_trans(x[not_neg], lambda) + x[not_neg] <- nn_trans(x[not_neg], lambda[not_neg]) } if (length(is_neg) > 0) { - x[is_neg] <- ng_trans(x[is_neg], lambda) + x[is_neg] <- ng_trans(x[is_neg], lambda[is_neg]) } x } @@ -382,8 +395,8 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca tidy.step_epi_YeoJohnson <- function(x, ...) 
{ if (is_trained(x)) { res <- tibble( - terms = names(x$lambdas), - value = unname(x$lambdas) + terms = names(x$yj_params), + value = unname(x$yj_params) ) } else { term_names <- sel2char(x$terms) diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd index 1ca4d9cc..53520b4e 100644 --- a/man/layer_epi_YeoJohnson.Rd +++ b/man/layer_epi_YeoJohnson.Rd @@ -7,7 +7,7 @@ layer_epi_YeoJohnson( frosting, ..., - lambdas = NULL, + yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson") ) @@ -19,7 +19,7 @@ sequence of operations for this frosting.} \item{...}{One or more selector functions to scale variables for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} -\item{lambdas}{Internal. A data frame of lambda values to be used for +\item{yj_params}{Internal. A data frame of parameters to be used for inverting the transformation.} \item{by}{A (possibly named) character vector of variables to join by.} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index c904a5ad..258b72a6 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -140,7 +140,7 @@ toy_recipe \%>\% #> * geo_type = state #> * time_type = day #> * as_of = 2015-01-14 -#> +#> #> # A tibble: 8 x 4 #> geo_value time_value a b #> @@ -176,7 +176,7 @@ toy_recipe \%>\% #> * geo_type = state #> * time_type = day #> * as_of = 2015-01-14 -#> +#> #> # A tibble: 21 x 7 #> geo_value time_value a b lag_3_a lag_4_b ahead_1_a #> @@ -224,7 +224,7 @@ toy_recipe \%>\% #> * geo_type = state #> * time_type = day #> * as_of = 2015-01-14 -#> +#> #> # A tibble: 10 x 6 #> geo_value time_value a b lag_0_a ahead_3_a #> @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with -#> modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous +#> `step_epi_lag`s won't work with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as @@ -296,7 +296,7 @@ rates_fit } \seealso{ -Other row operation steps: +Other row operation steps: \code{\link{step_epi_lag}()}, \code{\link{step_growth_rate}()}, \code{\link{step_lag_difference}()} diff --git a/man/step_epi_YeoJohnson.Rd b/man/step_epi_YeoJohnson.Rd index 1fa63761..ffde5579 100644 --- a/man/step_epi_YeoJohnson.Rd +++ b/man/step_epi_YeoJohnson.Rd @@ -9,7 +9,7 @@ step_epi_YeoJohnson( ..., role = "predictor", trained = FALSE, - lambdas = NULL, + yj_params = NULL, na_lambda_fill = 1/4, limits = c(-5, 5), num_unique = 5, @@ -31,7 +31,7 @@ they be assigned? \code{lag} is default a predictor while \code{ahead} is an out \item{trained}{A logical for whether the selectors in \code{...} have been resolved by \code{\link[=prep]{prep()}}.} -\item{lambdas}{Internal. A numeric vector of transformation values. This +\item{yj_params}{Internal. A numeric vector of transformation values. This is \code{NULL} until computed by \code{\link[=prep]{prep()}}.} \item{na_lambda_fill}{A numeric value to fill in for any @@ -46,9 +46,12 @@ many unique values will not be evaluated for a transformation.} \item{na_rm}{A logical indicating whether missing values should be removed.} -\item{skip}{A logical. Should the step be skipped when the recipe is -baked by \code{\link[=bake]{bake()}}. On the \code{training} data, the step will always be -conducted (even if \code{skip = TRUE}).} +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). 
+Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} \item{id}{A unique identifier for the step} } @@ -105,7 +108,7 @@ r # Fit the recipe tr <- r \%>\% prep(filtered_data) # View the lambda values -tr$steps[[1]]$lambdas +tr$steps[[1]]$yj_params # View the transformed data df <- tr \%>\% bake(filtered_data) plot(density(df$cases)) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 124f9648..48d28f10 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -1,11 +1,17 @@ test_that("Yeo-Johnson transformation inverts correctly", { + # Vectorized x and scalar lambda work + lambdas <- seq(-5, 5, 0.1) + x <- seq(-10, 10, 0.1) + expect_true( + map_lgl(lambdas, \(lambda) sum(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5) %>% + all() + ) # Note that the special lambda values of 0 and 2 are covered by the tests # below. + # Vectorized x and lambda both work + x <- seq(-5, 5, 0.1) expect_true( - map_lgl(seq(-5, 5, 0.1), function(lambda) { - map_lgl(seq(-10, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all() - }) %>% - all() + sum(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 ) }) @@ -15,7 +21,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { select(geo_value, time_value, cases) filtered_data <- jhu - # Get some lambda values + # Get some yj_param values r <- epi_recipe(filtered_data) %>% step_epi_YeoJohnson(cases) %>% step_epi_lag(cases, lag = 0) %>% @@ -23,14 +29,9 @@ test_that("Yeo-Johnson steps and layers invert each other", { step_epi_naomit() tr <- r %>% prep(filtered_data) - # Check general lambda values tibble structure - expect_true(".lambda_cases" %in% names(tr$steps[[1]]$lambdas)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_cases)) - # Still works on a tibble - expect_equal( - tr %>% bake(filtered_data %>% as_tibble()), - tr %>% 
bake(filtered_data) - ) + # Check general yj_param values tibble structure + expect_true(".yj_param_cases" %in% names(tr$steps[[1]]$yj_params)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_cases)) # Make sure that the inverse transformation works f <- frosting() %>% @@ -40,7 +41,6 @@ test_that("Yeo-Johnson steps and layers invert each other", { fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - as_tibble() %>% slice_max(time_value, by = geo_value) out2 <- forecast(wf) %>% rename(cases = .pred) expect_equal(out1, out2) @@ -57,11 +57,11 @@ test_that("Yeo-Johnson steps and layers invert each other", { step_epi_naomit() tr <- r %>% prep(filtered_data) - # Check general lambda values tibble structure - expect_true(".lambda_case_rate" %in% names(tr$steps[[1]]$lambdas)) - expect_true(".lambda_death_rate" %in% names(tr$steps[[1]]$lambdas)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_case_rate)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_death_rate)) + # Check general yj_param values tibble structure + expect_true(".yj_param_case_rate" %in% names(tr$steps[[1]]$yj_params)) + expect_true(".yj_param_death_rate" %in% names(tr$steps[[1]]$yj_params)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_case_rate)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_death_rate)) # Make sure that the inverse transformation works f <- frosting() %>% @@ -71,15 +71,14 @@ test_that("Yeo-Johnson steps and layers invert each other", { fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - as_tibble() %>% slice_max(time_value, by = geo_value) - # debugonce(slather.layer_epi_YeoJohnson) out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate) expect_equal(out1, out2) }) test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { # Small synthetic grad_employ_dataset version. 
+ # fmt: skip filtered_data <- tribble( ~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y, "ca", "25-34", "bachelor", 2017, 50000, @@ -108,18 +107,18 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr "ca", "35-1000", "master", 2022, 2 * (3e10 + 50) ) %>% as_epi_df(other_keys = c("age_group", "edu_qual")) - # Get some lambda values + # Get some yj_param values r <- epi_recipe(filtered_data) %>% step_epi_YeoJohnson(med_income_2y) %>% step_epi_lag(med_income_2y, lag = 0) %>% step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>% step_epi_naomit() tr <- r %>% prep(filtered_data) - expect_true(".lambda_med_income_2y" %in% names(tr$steps[[1]]$lambdas)) - expect_true("geo_value" %in% names(tr$steps[[1]]$lambdas)) - expect_true("age_group" %in% names(tr$steps[[1]]$lambdas)) - expect_true("edu_qual" %in% names(tr$steps[[1]]$lambdas)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_med_income_2y)) + expect_true(".yj_param_med_income_2y" %in% names(tr$steps[[1]]$yj_params)) + expect_true("geo_value" %in% names(tr$steps[[1]]$yj_params)) + expect_true("age_group" %in% names(tr$steps[[1]]$yj_params)) + expect_true("edu_qual" %in% names(tr$steps[[1]]$yj_params)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_med_income_2y)) # Make sure that the inverse transformation works f <- frosting() %>% @@ -129,7 +128,6 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - as_tibble() %>% slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value) From fd76c3678abcdd60e85448b77d1d9741af858df9 Mon Sep 17 00:00:00 2001 From: dsweber2 Date: Thu, 3 Apr 2025 18:05:24 -0500 Subject: [PATCH 6/7] fix lambda(s), styling --- R/step_yeo_johnson.R | 50 +++++++++++++++---------------- man/step_adjust_latency.Rd | 4 +-- man/step_epi_YeoJohnson.Rd | 20 
++++++------- tests/testthat/test-yeo-johnson.R | 2 +- 4 files changed, 37 insertions(+), 39 deletions(-) diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index 272b034a..4e429528 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -71,18 +71,17 @@ #' plot(density(df$cases)) #' plot(density(filtered_data$cases)) step_epi_YeoJohnson <- function( - recipe, - ..., - role = "predictor", - trained = FALSE, - yj_params = NULL, - na_fill = 1 / 4, - limits = c(-5, 5), - num_unique = 5, - na_rm = TRUE, - skip = FALSE, - id = rand_id("epi_YeoJohnson") -) { + recipe, + ..., + role = "predictor", + trained = FALSE, + yj_params = NULL, + na_fill = 1 / 4, + limits = c(-5, 5), + num_unique = 5, + na_rm = TRUE, + skip = FALSE, + id = rand_id("epi_YeoJohnson")) { checkmate::assert_numeric(limits, len = 2) checkmate::assert_numeric(na_fill, lower = min(limits), upper = max(limits), len = 1) checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1) @@ -109,20 +108,19 @@ step_epi_YeoJohnson <- function( } step_epi_YeoJohnson_new <- function( - terms, - role, - trained, - yj_params, - na_fill, - limits, - num_unique, - na_rm, - forecast_date, - metadata, - columns, - skip, - id -) { + terms, + role, + trained, + yj_params, + na_fill, + limits, + num_unique, + na_rm, + forecast_date, + metadata, + columns, + skip, + id) { step( subclass = "epi_YeoJohnson", terms = terms, diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 258b72a6..9e1bafbd 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous -#> `step_epi_lag`s won't work with modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with +#> modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/man/step_epi_YeoJohnson.Rd b/man/step_epi_YeoJohnson.Rd index ffde5579..cfe85169 100644 --- a/man/step_epi_YeoJohnson.Rd +++ b/man/step_epi_YeoJohnson.Rd @@ -10,7 +10,7 @@ step_epi_YeoJohnson( role = "predictor", trained = FALSE, yj_params = NULL, - na_lambda_fill = 1/4, + na_fill = 1/4, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, @@ -34,17 +34,17 @@ have been resolved by \code{\link[=prep]{prep()}}.} \item{yj_params}{Internal. A numeric vector of transformation values. This is \code{NULL} until computed by \code{\link[=prep]{prep()}}.} -\item{na_lambda_fill}{A numeric value to fill in for any -geos where the lambda cannot be estimated.} +\item{na_fill}{A numeric value to fill in for any geos where a Yeo-Johnson +parameter cannot be estimated.} -\item{limits}{A length 2 numeric vector defining the range to -compute the transformation parameter lambda.} +\item{limits}{A length 2 numeric vector defining the range to compute the +transformation parameter.} -\item{num_unique}{An integer where data that have fewer than this -many unique values will not be evaluated for a transformation.} +\item{num_unique}{An integer where data that have fewer than this many unique +values will not be evaluated for a transformation.} -\item{na_rm}{A logical indicating whether missing values should be -removed.} +\item{na_rm}{A logical indicating whether missing values should be removed +before estimating the transformation parameter.} \item{skip}{A logical. Should the step be skipped when the recipe is baked by \code{\link[=bake]{bake()}}? 
While all operations are baked @@ -107,7 +107,7 @@ r <- epi_recipe(filtered_data) \%>\% r # Fit the recipe tr <- r \%>\% prep(filtered_data) -# View the lambda values +# View the parameter values tr$steps[[1]]$yj_params # View the transformed data df <- tr \%>\% bake(filtered_data) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 48d28f10..9ae82151 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -11,7 +11,7 @@ test_that("Yeo-Johnson transformation inverts correctly", { # Vectorized x and lambda both work x <- seq(-5, 5, 0.1) expect_true( - sum(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 + sum(abs(yj_inverse(yj_transform(x, lambdas), lambdas) - x)) < 1e-5 ) }) From 47dce2f4aac64f0f6a1957c1f48119311eeccacc Mon Sep 17 00:00:00 2001 From: David Weber Date: Thu, 10 Apr 2025 16:50:58 -0500 Subject: [PATCH 7/7] extend to quantile_dist, exclude multi-output (#458) * extend to quantile_dist, exclude multi-output * Drop by specification and infer from the epi_df * lint+test: test coverage, handle na lambda case, lint * fix: quantile_pred arithmetic * fix: rlang calls --------- Co-authored-by: Dmitry Shemetov --- NAMESPACE | 2 + R/layer_yeo_johnson.R | 236 ++++++++++----------------- R/quantile_pred-methods.R | 20 ++- R/step_yeo_johnson.R | 127 +++++++------- man/epipredict-vctrs.Rd | 13 ++ man/get_params_in_layer.Rd | 23 +++ man/layer_epi_YeoJohnson.Rd | 20 ++- man/step_adjust_latency.Rd | 4 +- tests/testthat/_snaps/yeo-johnson.md | 16 ++ tests/testthat/test-quantile_pred.R | 25 +++ tests/testthat/test-yeo-johnson.R | 70 +++++++- 11 files changed, 324 insertions(+), 232 deletions(-) create mode 100644 man/epipredict-vctrs.Rd create mode 100644 man/get_params_in_layer.Rd create mode 100644 tests/testthat/_snaps/yeo-johnson.md diff --git a/NAMESPACE b/NAMESPACE index 351530de..053913c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -124,6 +124,7 @@ S3method(update,layer) 
S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) S3method(vec_arith.quantile_pred,numeric) +S3method(vec_arith.quantile_pred,quantile_pred) S3method(vec_math,quantile_pred) S3method(vec_proxy_equal,quantile_pred) S3method(weighted_interval_score,quantile_pred) @@ -235,6 +236,7 @@ import(epidatasets) import(epiprocess) import(parsnip) import(recipes) +import(vctrs) importFrom(checkmate,assert_class) importFrom(checkmate,assert_numeric) importFrom(checkmate,test_character) diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R index d399717f..8d63b7f6 100644 --- a/R/layer_yeo_johnson.R +++ b/R/layer_yeo_johnson.R @@ -1,11 +1,21 @@ #' Unormalizing transformation #' -#' Will undo a step_epi_YeoJohnson transformation. +#' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +#' are using this step on a column that will eventually become the outcome +#' variable, you should make sure that the original name of that column is a +#' subset of the outcome variable name. `ahead_7_cases` when `cases` is +#' transformed will work well, while `ahead_7` will not. #' #' @inheritParams layer_population_scaling -#' @param yj_params Internal. A data frame of parameters to be used for -#' inverting the transformation. -#' @param by A (possibly named) character vector of variables to join by. +#' @param yj_params A data frame of parameters to be used for inverting the +#' transformation. Typically set automatically. If you have done multiple +#' transformations such that the outcome variable name no longer contains the +#' column that this step transforms, then you should manually specify this to +#' be the parameters fit in the corresponding `step_epi_YeoJohnson`. For an +#' example where you wouldn't need to set this, if your output is +#' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with +#' other columns), then you wouldn't need to set this. 
However if you have +#' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step. #' #' @return an updated `frosting` postprocessor #' @export @@ -37,22 +47,21 @@ #' # Compare to the original data. #' jhu %>% filter(time_value == "2021-12-31") #' forecast(wf) -layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { +layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, id = rand_id("epi_YeoJohnson")) { checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE) add_layer( frosting, layer_epi_YeoJohnson_new( yj_params = yj_params, - by = by, terms = dplyr::enquos(...), id = id ) ) } -layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { - layer("epi_YeoJohnson", yj_params = yj_params, by = by, terms = terms, id = id) +layer_epi_YeoJohnson_new <- function(yj_params, terms, id) { + layer("epi_YeoJohnson", yj_params = yj_params, terms = terms, id = id) } #' @export @@ -60,42 +69,14 @@ layer_epi_YeoJohnson_new <- function(yj_params, by, terms, id) { slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { rlang::check_dots_empty() - # TODO: We will error if we don't have a workflow. Write a check later. - - # Get the yj_params from the layer or from the workflow. - yj_params <- object$yj_params %||% get_yj_params_in_layer(workflow) - - # If the by is not specified, try to infer it from the yj_params. 
- if (is.null(object$by)) { - # Assume `layer_predict` has calculated the prediction keys and other - # layers don't change the prediction key colnames: - prediction_key_colnames <- names(components$keys) - lhs_potential_keys <- prediction_key_colnames - rhs_potential_keys <- colnames(select(yj_params, -starts_with(".yj_param_"))) - object$by <- intersect(lhs_potential_keys, rhs_potential_keys) - suggested_min_keys <- setdiff(lhs_potential_keys, "time_value") - if (!all(suggested_min_keys %in% object$by)) { - cli_warn( - c( - "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, - but {?wasn't/weren't} found in the population `df`.", - "i" = "Defaulting to join by {object$by}", - ">" = "Double-check whether column names on the population `df` match those expected in your predictions", - ">" = "Consider using population data with breakdowns by {suggested_min_keys}", - ">" = "Manually specify `by =` to silence" - ), - class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" - ) - } - } + # get the yj_params from the layer or from the workflow. + yj_params <- + object$yj_params %||% + get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params") # Establish the join columns. - object$by <- object$by %||% - intersect( - epi_keys_only(components$predictions), - colnames(select(yj_params, -starts_with(".yj_param_"))) - ) - joinby <- list(x = names(object$by) %||% object$by, y = object$by) + join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort() + joinby <- list(x = join_by_columns, y = join_by_columns) hardhat::validate_column_names(components$predictions, joinby$x) hardhat::validate_column_names(yj_params, joinby$y) @@ -115,55 +96,15 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # The `object$terms` is where the user specifies the columns they want to # untransform. 
We need to match the outcomes with their yj_param columns in our # parameter table and then apply the inverse transformation. - if (identical(col_names, ".pred")) { - # In this case, we don't get a hint for the outcome column name, so we need - # to infer it from the mold. - if (length(components$mold$outcomes) > 1) { - cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env()) - } - # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc. - # We want to extract the cases part. - outcome_cols <- names(components$mold$outcomes) %>% - stringr::str_match("ahead_\\d+_(.*)") %>% - magrittr::extract(, 2) - + if (length(col_names) == 0) { + # not specified by the user, so just modify everything starting with `.pred` components$predictions <- components$predictions %>% - mutate(.pred := yj_inverse(.pred, !!sym(paste0(".yj_param_", outcome_cols)))) - } else if (identical(col_names, character(0))) { - # Wish I could suggest `all_outcomes()` here, but currently it's the same as - # not specifying any terms. I don't want to spend time with dealing with - # this case until someone asks for it. - cli::cli_abort( - "Not specifying columns to layer Yeo-Johnson is not implemented. - If you had a single outcome, you can use `.pred` as a column name. - If you had multiple outcomes, you'll need to specify them like - `.pred_ahead_1_`, `.pred_ahead_7_`, etc. - ", - call = rlang::caller_env() - ) + mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>% + select(-.lambda) } else { - # In this case, we assume that the user has specified the columns they want - # transformed here. We then need to determine the yj_param columns for each of - # these columns. That is, we need to convert a vector of column names like - # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to - # c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate"). 
- original_outcome_cols <- stringr::str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] - outcomes_wout_ahead <- stringr::str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[, 2] - if (any(original_outcome_cols %nin% outcomes_wout_ahead)) { - cli_abort( - "All columns specified in `...` must be outcome columns. - They must be of the form `.pred_ahead_1_`, `.pred_ahead_7_`, etc. - ", - call = rlang::caller_env() - ) - } - - for (i in seq_along(col_names)) { - col <- col_names[i] - yj_param_col <- paste0(".yj_param_", original_outcome_cols[i]) - components$predictions <- components$predictions %>% - mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(yj_param_col))) - } + components$predictions <- components$predictions %>% + mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>% + select(-.lambda) } # Remove the yj_param columns. @@ -182,75 +123,72 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), # Inverse Yeo-Johnson transformation # # Inverse of `yj_transform` in step_yeo_johnson.R. 
-yj_inverse <- function(x, lambda, eps = 0.001) { +yj_inverse <- function(x_in, lambda, eps = 0.001) { if (any(is.na(lambda))) { - return(x) - } - if (length(x) > 1 && length(lambda) == 1) { - lambda <- rep(lambda, length(x)) - } else if (length(x) != length(lambda)) { - cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_fn()) - } - if (!inherits(x, "tbl_df") || is.data.frame(x)) { - x <- unlist(x, use.names = FALSE) - } else { - if (!is.vector(x)) { - x <- as.vector(x) - } - } - - nn_inv_trans <- function(x, lambda) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- exp(x[sm_lambdas]) - 1 - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- (lambda * x + 1)^(1 / lambda) - 1 - } - out + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) } - - ng_inv_trans <- function(x, lambda) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -(exp(-x[near2_lambdas]) - 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) - } - out - } - - dat_neg <- x < 0 - not_neg <- which(!dat_neg) - is_neg <- which(dat_neg) - - if (length(not_neg) > 0) { - x[not_neg] <- nn_inv_trans(x[not_neg], lambda[not_neg]) - } - - if (length(is_neg) > 0) { - x[is_neg] <- ng_inv_trans(x[is_neg], lambda[is_neg]) + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] + inv_x <- ifelse( + x < 0, + # negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -(exp(-x) - 1), + -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + ), + # non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + (exp(x) - 1), + (lambda * x + 1)^(1 / lambda) - 1 + ) + ) + if (x_in %>% 
inherits("quantile_pred")) { + inv_x <- inv_x %>% quantile_pred(x_in %@% "quantile_levels") } - x + inv_x } -get_yj_params_in_layer <- function(workflow) { + +#' get the parameters used in the initial step +#' +#' @param workflow the workflow to extract the parameters from +#' @param step_name the name of the step to look for, as recognized by `detect_step` +#' @param param_name the parameter to pull out of the step +#' @keywords internal +get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") { + full_step_name <- glue::glue("step_{step_name}") this_recipe <- hardhat::extract_recipe(workflow) - if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { - cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) + if (!(this_recipe %>% recipes::detect_step(step_name))) { + cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call()) + } + outcomes <- + workflows::extract_recipe(workflow)$term_info %>% + filter(role == "outcome") %>% + pull(variable) + if (length(outcomes) > 1) { + cli_abort( + "`layer_{step_name}` doesn't support multiple output columns. 
+ This workflow produces {outcomes} as output columns.", + call = rlang::caller_call(), + class = "epipredict__layer_yeo_johnson_multi_outcome_error" + ) } for (step in this_recipe$steps) { - if (inherits(step, "step_epi_YeoJohnson")) { - yj_params <- step$yj_params + # if it's a `step_name` step that also transforms a column that is a subset + # of the output column name + is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes)) + if (inherits(step, full_step_name) && any(is_outcome_subset)) { + params <- step[[param_name]] %>% + select( + key_colnames(workflow$original_data, exclude = "time_value"), + contains(step$columns[is_outcome_subset]) + ) %>% + rename(.lambda = contains(step$columns)) break } } - yj_params + params } diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R index 293fad90..56e8fcf0 100644 --- a/R/quantile_pred-methods.R +++ b/R/quantile_pred-methods.R @@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) { dplyr::select(-.row) } - # quantiles by treating quantile_pred like a distribution ----------------- @@ -287,6 +286,12 @@ vec_math.quantile_pred <- function(.fn, .x, ...) { quantile_pred(.fn(.x), quantile_levels) } +#' Internal vctrs methods +#' +#' @import vctrs +#' @keywords internal +#' @name epipredict-vctrs + #' @importFrom vctrs vec_arith vec_arith.numeric #' @export #' @method vec_arith quantile_pred @@ -294,6 +299,19 @@ vec_arith.quantile_pred <- function(op, x, y, ...) { UseMethod("vec_arith.quantile_pred", y) } + +#' @export +#' @method vec_arith.quantile_pred quantile_pred +vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) { + all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels")) + op_fn <- getExportedValue("base", op) + # Interpolate/extrapolate to the same quantiles + x <- quantile.quantile_pred(x, all_quantiles) + y <- quantile.quantile_pred(y, all_quantiles) + out <- op_fn(x, y, ...) 
+ quantile_pred(out, all_quantiles) +} + #' @export #' @method vec_arith.quantile_pred numeric vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R index 4e429528..ff4016cd 100644 --- a/R/step_yeo_johnson.R +++ b/R/step_yeo_johnson.R @@ -188,8 +188,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { # Check that the columns for transformation are present in new_data. if (!all(object$columns %in% colnames(new_data))) { cli::cli_abort( - "The columns for transformation are not present in the new data.", - call = rlang::caller_fn() + "The columns for transformation are not present in the new data." ) } # Check that the columns for transformation are present in new_data. @@ -202,7 +201,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { cli_abort(c( "Some variables used for training are not available in {.arg x}.", i = "The following required columns are missing: {check$missing_names}" - ), call = rlang::caller_fn()) + )) } # Transform each column, using the appropriate yj_param column per row. new_data <- left_join(new_data, object$yj_params, by = key_colnames(new_data, exclude = "time_value")) @@ -243,7 +242,7 @@ compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, # x = "Yeo-Johnson parameter could not be estimated for some geos for {col}.", # i = "Using parameter={x$na_fill} in these cases." # ), - # call = rlang::caller_fn() + # call = rlang::caller_call() # ) # } # } @@ -254,75 +253,71 @@ compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, } -### Code below taken from recipes::step_YeoJohnson. -### We keep "lambda" here, but above we renamed it to "yj_param". -### Modified yj_transform() to be vectorized in lambda. 
-### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 - -# Yeo-Johnson transformation -yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { - if (any(is.na(lambda))) { - return(x) - } - if (length(x) > 1 && length(lambda) == 1) { - lambda <- rep(lambda, length(x)) - } else if (length(x) != length(lambda)) { - cli::cli_abort( - "Length of `x` must be equal to length of `lambda` or lambda must be a scalar.", - call = rlang::caller_fn() - ) - } - if (!inherits(x, "tbl_df") || is.data.frame(x)) { - x <- unlist(x, use.names = FALSE) +yj_input_type_management <- function(x_in, lambda) { + if (x_in %>% inherits("quantile_pred")) { + x <- as.matrix(x_in) + if (length(lambda) == 1) { + lambda <- lambda %>% + rep(prod(dim(x))) %>% + matrix(dim(x)) + } else if (length(x_in) == length(lambda)) { + lambda <- lambda %>% + rep(dim(x)[[2]]) %>% + matrix(dim(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) + } + } else if (!inherits(x_in, "tbl_df") || is.data.frame(x_in)) { + x <- unlist(x_in, use.names = FALSE) } else { - if (!is.vector(x)) { - x <- as.vector(x) + if (!is.vector(x_in)) { + x <- as.vector(x_in) + } else { + x <- x_in } } - # TODO case weights: can we use weights here? 
- if (is.null(ind_neg)) { - dat_neg <- x < 0 - ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) - } - not_neg <- ind_neg[["not"]] - is_neg <- ind_neg[["is"]] - nn_trans <- function(x, lambda) { - out <- double(length(x)) - sm_lambdas <- abs(lambda) < eps - if (length(sm_lambdas) > 0) { - out[sm_lambdas] <- log(x[sm_lambdas] + 1) - } - x <- x[!sm_lambdas] - lambda <- lambda[!sm_lambdas] - if (length(x) > 0) { - out[!sm_lambdas] <- ((x + 1)^lambda - 1) / lambda - } - out + # these only apply if x_in isn't a quantile distribution + if (length(x) > 1 && length(lambda) == 1) { + lambda <- rep(lambda, length(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) } - - ng_trans <- function(x, lambda) { - out <- double(length(x)) - near2_lambdas <- abs(lambda - 2) < eps - if (length(near2_lambdas) > 0) { - out[near2_lambdas] <- -log(-x[near2_lambdas] + 1) - } - x <- x[!near2_lambdas] - lambda <- lambda[!near2_lambdas] - if (length(x) > 0) { - out[!near2_lambdas] <- -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) - } - out + list(x, lambda) +} +### Code below taken from recipes::step_YeoJohnson. +### We keep "lambda" here, but above we renamed it to "yj_param". +### Modified yj_transform() to be vectorized in lambda. Also modified to work on distributions. 
+### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 +# Yeo-Johnson transformation +yj_transform <- function(x_in, lambda, ind_neg = NULL, eps = 0.001) { + if (any(is.na(lambda))) { + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) } + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] - if (length(not_neg) > 0) { - x[not_neg] <- nn_trans(x[not_neg], lambda[not_neg]) - } + transformed <- ifelse( + x < 0, + # for negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -log(abs(x) + 1), + -((abs(x) + 1)^(2 - lambda) - 1) / (2 - lambda) + ), + # for non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + log(abs(x) + 1), + ((abs(x) + 1)^lambda - 1) / lambda + ) + ) - if (length(is_neg) > 0) { - x[is_neg] <- ng_trans(x[is_neg], lambda[is_neg]) + if (x_in %>% inherits("quantile_pred")) { + transformed <- transformed %>% quantile_pred(x_in %@% "quantile_levels") } - x + transformed } ## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K., @@ -344,7 +339,7 @@ yj_obj <- function(lam, dat, ind_neg, const) { } ## estimates the values -estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, call = caller_env(2)) { +estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE) { na_rows <- which(is.na(dat)) if (length(na_rows) > 0) { if (na_rm) { @@ -355,7 +350,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca x = "Missing values are not allowed for the YJ transformation.", i = "See {.arg na_rm} option." 
), - call = call + call = rlang::caller_call(n = 2) ) } } diff --git a/man/epipredict-vctrs.Rd b/man/epipredict-vctrs.Rd new file mode 100644 index 00000000..a4dabbfa --- /dev/null +++ b/man/epipredict-vctrs.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantile_pred-methods.R +\name{epipredict-vctrs} +\alias{epipredict-vctrs} +\alias{vec_arith.quantile_pred} +\title{Internal vctrs methods} +\usage{ +\method{vec_arith}{quantile_pred}(op, x, y, ...) +} +\description{ +Internal vctrs methods +} +\keyword{internal} diff --git a/man/get_params_in_layer.Rd b/man/get_params_in_layer.Rd new file mode 100644 index 00000000..1d6c98ef --- /dev/null +++ b/man/get_params_in_layer.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_yeo_johnson.R +\name{get_params_in_layer} +\alias{get_params_in_layer} +\title{get the parameters used in the initial step} +\usage{ +get_params_in_layer( + workflow, + step_name = "epi_YeoJohnson", + param_name = "yj_params" +) +} +\arguments{ +\item{workflow}{the workflow to extract the parameters from} + +\item{step_name}{the name of the step to look for, as recognized by \code{detect_step}} + +\item{param_name}{the parameter to pull out of the step} +} +\description{ +get the parameters used in the initial step +} +\keyword{internal} diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd index 53520b4e..0c35ee74 100644 --- a/man/layer_epi_YeoJohnson.Rd +++ b/man/layer_epi_YeoJohnson.Rd @@ -8,7 +8,6 @@ layer_epi_YeoJohnson( frosting, ..., yj_params = NULL, - by = NULL, id = rand_id("epi_YeoJohnson") ) } @@ -19,10 +18,15 @@ sequence of operations for this frosting.} \item{...}{One or more selector functions to scale variables for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} -\item{yj_params}{Internal. 
A data frame of parameters to be used for -inverting the transformation.} - -\item{by}{A (possibly named) character vector of variables to join by.} +\item{yj_params}{A data frame of parameters to be used for inverting the +transformation. Typically set automatically. If you have done multiple +transformations such that the outcome variable name no longer contains the +column that this step transforms, then you should manually specify this to +be the parameters fit in the corresponding \code{step_epi_YeoJohnson}. For an +example where you wouldn't need to set this, if your output is +\code{ahead_7_cases} and \code{step_epi_YeoJohnson} transformed cases (possibly with +other columns), then you wouldn't need to set this. However if you have +renamed your output column to \code{diff_7}, then you will need to extract the \code{yj_params} from the step.} \item{id}{a random id string} } @@ -30,7 +34,11 @@ inverting the transformation.} an updated \code{frosting} postprocessor } \description{ -Will undo a step_epi_YeoJohnson transformation. +Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +are using this step on a column that will eventually become the outcome +variable, you should make sure that the original name of that column is a +subset of the outcome variable name. \code{ahead_7_cases} when \code{cases} is +transformed will work well, while \code{ahead_7} will not. } \examples{ library(dplyr) diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9e1bafbd..685f806b 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{<div class="sourceCode r">
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with -#> modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work +#> with modified data. }\if{html}{\out{</div>
}} If you create columns that you then apply lags to (such as diff --git a/tests/testthat/_snaps/yeo-johnson.md b/tests/testthat/_snaps/yeo-johnson.md new file mode 100644 index 00000000..b3a42c24 --- /dev/null +++ b/tests/testthat/_snaps/yeo-johnson.md @@ -0,0 +1,16 @@ +# Yeo-Johnson transformation inverts correctly + + Code + yj_transform(x, c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + +--- + + Code + yj_transform(list(1, 2), c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + diff --git a/tests/testthat/test-quantile_pred.R b/tests/testthat/test-quantile_pred.R index 70d7c71a..f5097c1d 100644 --- a/tests/testthat/test-quantile_pred.R +++ b/tests/testthat/test-quantile_pred.R @@ -81,6 +81,7 @@ test_that("unary math works on quantiles", { }) test_that("arithmetic works on quantiles", { + # Quantile and numeric arithmetic works dstn <- hardhat::quantile_pred( matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), 1:4 / 5 @@ -100,4 +101,28 @@ test_that("arithmetic works on quantiles", { expect_identical((1 / 4) * dstn, dstn2) expect_snapshot(error = TRUE, sum(dstn)) + + # Quantile and quantile arithmetic works + val <- c(1:4, 8:11) + dstn3 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + dstn4 <- hardhat::quantile_pred( + matrix(val + 2 * val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + expect_identical(dstn3 + (2 * dstn3), dstn4) + + # Extrapolate when quantile_levels are not the same + val <- c(1:4, 8:11) + dstn5 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.1, 0.25, 0.5, 0.75) + ) + dstn6 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.25, 0.5, 0.75, 0.9) + ) + expect_identical((dstn5 + dstn6) %@% "quantile_levels", c(0.1, 0.25, 0.5, 0.75, 0.9)) }) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 9ae82151..c0bd12bd 100644 --- a/tests/testthat/test-yeo-johnson.R +++ 
b/tests/testthat/test-yeo-johnson.R @@ -13,6 +13,29 @@ test_that("Yeo-Johnson transformation inverts correctly", { expect_true( sum(abs(yj_inverse(yj_transform(x, lambdas), lambdas) - x)) < 1e-5 ) + + # also works on quantile distributions + x <- quantile_pred(matrix(c(-5, 1, 3, 0, 0.1, 0.5), nrow = 2, byrow = TRUE), c(0.01, 0.5, 0.7)) + x_back <- map( + lambdas, + \(lambda) mean(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 + ) + expect_true(all(unlist(x_back))) + + # Get coverage on yj_input_type_management + # Breaks on bad length of lambda + expect_snapshot(error = TRUE, + yj_transform(x, c(1, 2, 3)) + ) + expect_snapshot(error = TRUE, + yj_transform(list(1, 2), c(1, 2, 3)) + ) + expect_true( + identical( + yj_input_type_management(list(1, 2, 3), c(1, 2, 3)), + list(c(1, 2, 3), c(1, 2, 3)) + ) + ) }) test_that("Yeo-Johnson steps and layers invert each other", { @@ -36,16 +59,16 @@ test_that("Yeo-Johnson steps and layers invert each other", { # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred) + layer_epi_YeoJohnson() wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) + dplyr::slice_max(time_value, by = geo_value) out2 <- forecast(wf) %>% rename(cases = .pred) expect_equal(out1, out2) - # Make sure it works when there are multiple predictors and outcomes + # Make sure it works when there are multiple predictors jhu_multi <- epidatasets::covid_case_death_rates_extended %>% filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% select(geo_value, time_value, case_rate, death_rate) @@ -53,7 +76,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { r <- epi_recipe(filtered_data) %>% step_epi_YeoJohnson(case_rate, death_rate) %>% step_epi_lag(case_rate, death_rate, lag = 0) %>% - step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>% + 
step_epi_ahead(case_rate, ahead = 0, role = "outcome") %>% step_epi_naomit() tr <- r %>% prep(filtered_data) @@ -66,13 +89,43 @@ test_that("Yeo-Johnson steps and layers invert each other", { # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate) + layer_epi_YeoJohnson() + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + select(-death_rate) %>% + dplyr::slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(case_rate = .pred) + expect_equal(out1, out2) +}) + +test_that("Yeo-Johnson layers work on quantiles", { + jhu <- epidatasets::cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + + f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles() %>% + layer_epi_YeoJohnson() wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) - out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate) + dplyr::slice_max(time_value, by = geo_value) %>% + rename(.pred = cases) %>% + tidyr::expand_grid(.pred_distn_quantile_level = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)) %>% + mutate(.pred_distn_value = .pred) %>% + select(geo_value, time_value, .pred, .pred_distn_value, .pred_distn_quantile_level) + out2 <- forecast(wf) %>% pivot_quantiles_longer(.pred_distn) %>% as_tibble() expect_equal(out1, out2) }) @@ -123,12 +176,13 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr # Make sure that the inverse transformation works f <- 
frosting() %>% layer_predict() %>% + layer_residual_quantiles() %>% layer_epi_YeoJohnson(.pred) wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) out1 <- filtered_data %>% - slice_max(time_value, by = geo_value) %>% + dplyr::slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value) out2 <- forecast(wf) %>%