From 67c82a9aab5471c8b81990dbe9768ef88bd9c3a8 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Mon, 17 Mar 2025 15:47:55 -0700 Subject: [PATCH 01/20] feat: add yeo-johnson * step and layer work with a single outcome and layer_yj(.pred) * need to work on multiple outcomes case --- R/new_epipredict_steps/layer_yeo_johnson.R | 248 +++++++++++++ R/new_epipredict_steps/step_yeo_johnson.R | 409 +++++++++++++++++++++ _targets.yaml | 6 +- test-yeo-johnson.Rmd | 85 +++++ tests/testthat/test-yeo-johnson.R | 103 ++++++ 5 files changed, 850 insertions(+), 1 deletion(-) create mode 100644 R/new_epipredict_steps/layer_yeo_johnson.R create mode 100644 R/new_epipredict_steps/step_yeo_johnson.R create mode 100644 test-yeo-johnson.Rmd create mode 100644 tests/testthat/test-yeo-johnson.R diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R new file mode 100644 index 00000000..aa6c6feb --- /dev/null +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -0,0 +1,248 @@ +#' Unormalizing transformation +#' +#' Will undo a step_epi_YeoJohnson transformation. +#' +#' @param frosting a `frosting` postprocessor. The layer will be added to the +#' sequence of operations for this frosting. +#' @param ... One or more selector functions to scale variables +#' for this step. See [recipes::selections()] for more details. +#' @param df a data frame that contains the population data to be used for +#' inverting the existing scaling. +#' @param by A (possibly named) character vector of variables to join by. +#' @param id a random id string +#' +#' @return an updated `frosting` postprocessor +#' @export +#' @examples +#' library(dplyr) +#' jhu <- epidatasets::cases_deaths_subset %>% +#' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% +#' select(geo_value, time_value, cases) +#' +#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) +#' +#' r <- epi_recipe(jhu) %>% +#' step_epi_YeoJohnson( +#' df = pop_data, +#' df_pop_col = "value", +#' by = c("geo_value" = "states"), +#' cases, suffix = "_scaled" +#' ) %>% +#' step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>% +#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>% +#' step_epi_naomit() +#' +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_threshold(.pred) %>% +#' layer_naomit(.pred) %>% +#' layer_epi_YeoJohnson(.pred, +#' df = pop_data, +#' by = c("geo_value" = "states"), +#' df_pop_col = "value" +#' ) +#' +#' wf <- epi_workflow(r, linear_reg()) %>% +#' fit(jhu) %>% +#' add_frosting(f) +#' +#' forecast(wf) +layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { + checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE) + + add_layer( + frosting, + layer_epi_YeoJohnson_new( + lambdas = lambdas, + by = by, + terms = dplyr::enquos(...), + id = id + ) + ) +} + +layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) { + layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id) +} + +#' @export +#' @importFrom workflows extract_preprocessor +slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { + rlang::check_dots_empty() + + # Get the lambdas from the layer or from the workflow. + lambdas <- object$lambdas %||% get_lambdas_in_layer(workflow) + + # If the by is not specified, try to infer it from the lambdas. + if (is.null(object$by)) { + # Assume `layer_predict` has calculated the prediction keys and other + # layers don't change the prediction key colnames: + prediction_key_colnames <- names(components$keys) + lhs_potential_keys <- prediction_key_colnames + rhs_potential_keys <- colnames(select(lambdas, -starts_with("lambda_"))) + object$by <- intersect(lhs_potential_keys, rhs_potential_keys) + suggested_min_keys <- setdiff(lhs_potential_keys, "time_value") + if (!all(suggested_min_keys %in% object$by)) { + cli_warn( + c( + "{setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions, + but {?wasn't/weren't} found in the population `df`.", + "i" = "Defaulting to join by {object$by}", + ">" = "Double-check whether column names on the population `df` match those expected in your predictions", + ">" = "Consider using population data with breakdowns by {suggested_min_keys}", + ">" = "Manually specify `by =` to silence" + ), + class = "epipredict__layer_population_scaling__default_by_missing_suggested_keys" + ) + } + } + + # Establish the join columns. + object$by <- object$by %||% + intersect( + epipredict:::epi_keys_only(components$predictions), + colnames(select(lambdas, -starts_with("lambda_"))) + ) + joinby <- list(x = names(object$by) %||% object$by, y = object$by) + hardhat::validate_column_names(components$predictions, joinby$x) + hardhat::validate_column_names(lambdas, joinby$y) + + # Join the lambdas. + components$predictions <- inner_join( + components$predictions, + lambdas, + by = object$by, + relationship = "many-to-one", + unmatched = c("error", "drop") + ) + + # TODO: There are many possibilities here: + # - (a) the terms can be empty, where we should probably default to + # all_outcomes(). + # - (b) explicitly giving all_outcomes(), we end here with terms being empty, + # which doesn't seem right; need to make sure we pull in all the outcome + # columns here. The question is what form should they have? + # - (c) if the user just specifies .pred, then we have to infer the outcome + # from the mold, which is simple enough and the main case I have working. + # - (d) the user might specify outcomes of the form .pred_ahead_1_cases, + # .pred_ahead_7_cases, etc. Is that the right format? Trying those out now + # and getting errors downstream from forecast(). + # Get the columns to transform. + exprs <- rlang::expr(c(!!!object$terms)) + pos <- tidyselect::eval_select(exprs, components$predictions) + col_names <- names(pos) + + # For every column, we need to use the appropriate lambda column, which differs per row. + # Note that yj_inverse() is vectorized. + if (identical(col_names, ".pred")) { + # In this case, we don't get a hint for the outcome column name, so we need to + # infer it from the mold. `outcomes` is a vector of objects like + # ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part. + outcome_cols <- names(components$mold$outcomes) %>% + stringr::str_match("ahead_\\d+_(.*)") %>% + extract(, 2) + + components$predictions <- components$predictions %>% + rowwise() %>% + mutate(.pred := yj_inverse(.pred, !!sym(paste0("lambda_", outcome_cols)))) + } else if (identical(col_names, character(0))) { + # In this case, we should assume the user wants to transform all outcomes. + cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env()) + } else { + # In this case, we assume that the user has specified the columns they want + # transformed here. We then need to determine the lambda columns for each of + # these columns. That is, we need to convert a vector of column names like + # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to + # c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate"). + original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] + if (all(original_outcome_cols %nin% names(components$mold$outcomes))) { + cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env()) + } + + for (i in seq_along(col_names)) { + col <- col_names[i] + lambda_col <- paste0("lambda_", original_outcome_cols[i]) + components$predictions <- components$predictions %>% + rowwise() %>% + mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col))) + } + } + + # Remove the lambda columns. + components$predictions <- components$predictions %>% + select(-any_of(starts_with("lambda_"))) %>% + ungroup() + components +} + +#' @export +print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) { + title <- "Yeo-Johnson transformation (see `lambdas` object for values) on " + epipredict:::print_layer(x$terms, title = title, width = width) +} + +#' Inverse Yeo-Johnson transformation +#' +#' Inverse of `yj_transform` in step_yeo_johnson.R. +#' +#' @keywords internal +yj_inverse <- function(x, lambda, eps = 0.001) { + if (is.na(lambda)) { + return(x) + } + if (!inherits(x, "tbl_df") || is.data.frame(x)) { + x <- unlist(x, use.names = FALSE) + } else { + if (!is.vector(x)) { + x <- as.vector(x) + } + } + + dat_neg <- x < 0 + ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) + not_neg <- ind_neg[["not"]] + is_neg <- ind_neg[["is"]] + + nn_inv_trans <- function(x, lambda) { + if (abs(lambda) < eps) { + # log(x + 1) + exp(x) - 1 + } else { + # ((x + 1)^lambda - 1) / lambda + (lambda * x + 1)^(1 / lambda) - 1 + } + } + + ng_inv_trans <- function(x, lambda) { + if (abs(lambda - 2) < eps) { + # -log(-x + 1) + -(exp(-x) - 1) + } else { + # -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) + -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + } + } + + if (length(not_neg) > 0) { + x[not_neg] <- nn_inv_trans(x[not_neg], lambda) + } + + if (length(is_neg) > 0) { + x[is_neg] <- ng_inv_trans(x[is_neg], lambda) + } + x +} + +get_lambdas_in_layer <- function(workflow) { + this_recipe <- hardhat::extract_recipe(workflow) + if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { + cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) + } + for (step in this_recipe$steps) { + if (inherits(step, "step_epi_YeoJohnson")) { + lambdas <- step$lambdas + break + } + } + lambdas +} diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R new file mode 100644 index 00000000..81e66ba8 --- /dev/null +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -0,0 +1,409 @@ +#' Yeo-Johnson transformation +#' +#' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will +#' transform data using a Yeo-Johnson transformation. This fork works with panel +#' data and is meant for epidata. +#' TODO: Do an edit pass on this docstring. +#' +#' @inheritParams step_center +#' @param lambdas A numeric vector of transformation values. This +#' is `NULL` until computed by [prep()]. +#' @param na_lambda_fill A numeric value to fill in for any +#' geos where the lambda cannot be estimated. +#' @param limits A length 2 numeric vector defining the range to +#' compute the transformation parameter lambda. +#' @param num_unique An integer where data that have less possible +#' values will not be evaluated for a transformation. +#' @param na_rm A logical indicating whether missing values should be +#' removed. +#' @param epi_keys_checked Internal. A character vector of key columns +#' that are expected in the data. +#' @param skip A logical. Should the step be skipped when the recipe is +#' baked by [bake()]. On the `training` data, the step will always be +#' conducted (even if `skip = TRUE`). +#' @template step-return +#' @family individual transformation steps +#' @export +#' @details The Yeo-Johnson transformation is very similar to the +#' Box-Cox but does not require the input variables to be strictly +#' positive. In the package, the partial log-likelihood function is +#' directly optimized within a reasonable set of transformation +#' values (which can be changed by the user). +#' +#' This transformation is typically done on the outcome variable +#' using the residuals for a statistical model (such as ordinary +#' least squares). Here, a simple null model (intercept only) is +#' used to apply the transformation to the *predictor* +#' variables individually. This can have the effect of making the +#' variable distributions more symmetric. +#' +#' If the transformation parameters are estimated to be very +#' closed to the bounds, or if the optimization fails, a value of +#' `NA` is used and no transformation is applied. +#' +#' # Tidying +#' +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with +#' columns `terms`, `value` , and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected} +#' \item{value}{numeric, the lambda estimate} +#' \item{id}{character, id of this step} +#' } +#' +#' @template case-weights-not-supported +#' +#' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power +#' transformations to improve normality or symmetry. *Biometrika*. +#' @examplesIf rlang::is_installed("modeldata") +#' data(biomass, package = "modeldata") +#' +#' biomass_tr <- biomass[biomass$dataset == "Training", ] +#' biomass_te <- biomass[biomass$dataset == "Testing", ] +#' +#' rec <- recipe( +#' HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur, +#' data = biomass_tr +#' ) +#' +#' yj_transform <- step_epi_YeoJohnson(rec, all_numeric()) +#' +#' yj_estimates <- prep(yj_transform, training = biomass_tr) +#' +#' yj_te <- bake(yj_estimates, biomass_te) +#' +#' plot(density(biomass_te$sulfur), main = "before") +#' plot(density(yj_te$sulfur), main = "after") +#' +#' tidy(yj_transform, number = 1) +#' tidy(yj_estimates, number = 1) +step_epi_YeoJohnson <- function( + recipe, + ..., + role = NA, + trained = FALSE, + lambdas = NULL, + na_lambda_fill = 1 / 4, + limits = c(-5, 5), + num_unique = 5, + na_rm = TRUE, + epi_keys_checked = NULL, + skip = FALSE, + id = rand_id("epi_YeoJohnson") +) { + checkmate::assert_numeric(limits, len = 2) + checkmate::assert_numeric(na_lambda_fill, lower = min(limits), upper = max(limits), len = 1) + checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1) + checkmate::assert_logical(na_rm, len = 1) + checkmate::assert_logical(skip, len = 1) + if (is.null(epi_keys_checked)) { + epi_keys_checked <- key_colnames(recipe$template, exclude = "time_value") + } + add_step( + recipe, + step_epi_YeoJohnson_new( + terms = enquos(...), + role = role, + trained = trained, + lambdas = lambdas, + na_lambda_fill = na_lambda_fill, + limits = sort(limits)[1:2], + num_unique = num_unique, + na_rm = na_rm, + epi_keys_checked = epi_keys_checked, + forecast_date = NULL, + metadata = NULL, + columns = NULL, + skip = skip, + id = id + ) + ) +} + +step_epi_YeoJohnson_new <- function( + terms, + role, + trained, + lambdas, + na_lambda_fill, + limits, + num_unique, + na_rm, + epi_keys_checked, + forecast_date, + metadata, + columns, + skip, + id +) { + step( + subclass = "epi_YeoJohnson", + terms = terms, + role = role, + trained = trained, + lambdas = lambdas, + na_lambda_fill = na_lambda_fill, + limits = limits, + num_unique = num_unique, + na_rm = na_rm, + epi_keys_checked = epi_keys_checked, + forecast_date = forecast_date, + metadata = metadata, + columns = columns, + skip = skip, + id = id + ) +} + +#' @export +prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) { + # Check that the columns selected for transformation are numeric. + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], types = c("double", "integer")) + + lambdas <- get_lambdas_yj_table( + training, + col_names, + x$limits, + x$num_unique, + x$na_lambda_fill, + x$na_rm, + x$epi_keys_checked + ) + + step_epi_YeoJohnson_new( + terms = x$terms, + role = x$role, + trained = TRUE, + lambdas = lambdas, + na_lambda_fill = x$na_lambda_fill, + limits = x$limits, + num_unique = x$num_unique, + na_rm = x$na_rm, + epi_keys_checked = x$epi_keys_checked, + forecast_date = attributes(training)$metadata$as_of, + metadata = attributes(training)$metadata, + columns = col_names, + skip = x$skip, + id = x$id + ) +} + +#' @export +bake.step_epi_YeoJohnson <- function(object, new_data, ...) { + # If not an epi_df, make it one assuming the template of training data. + # If it is an epi_df, check that the keys match. + # Imitating the pattern in step_adjust_latency(). + if (!inherits(new_data, "epi_df") || is.null(attributes(new_data)$metadata$as_of)) { + new_data <- as_epi_df( + new_data, + as_of = object$forecast_date, + other_keys = object$metadata$other_keys %||% character() + ) + new_data %@% metadata <- object$metadata + keys <- object$epi_keys_checked + } + # Check that the keys match. + keys <- key_colnames(new_data, exclude = "time_value") + if (!identical(keys, object$epi_keys_checked)) { + cli::cli_abort( + "The keys of the new data do not match the keys of the training data.", + call = rlang::caller_fn() + ) + } + # Check that the columns for transformation are present in new_data. + col_names <- object$columns + check_new_data(col_names, object, new_data) + + # Transform each column, using the appropriate lambda column per row. + # Note that yj_transform() is vectorized. + new_data %<>% left_join(object$lambdas, by = keys) + for (col in col_names) { + new_data <- new_data %>% + rowwise() %>% + mutate(!!col := yj_transform(!!sym(col), !!sym(paste0("lambda_", col)))) + } + # Remove the lambda columns. + new_data %>% + select(-starts_with("lambda_")) %>% + ungroup() +} + +#' @export +print.step_epi_YeoJohnson <- function(x, width = max(20, options()$width - 39), ...) { + title <- "Yeo-Johnson transformation (see `lambdas` object for values) on " + epipredict:::print_epi_step(x$terms, x$terms, title = title, width = width) + invisible(x) +} + +#' Compute the lambda values per group for each column. +#' +#' @keywords internal +#' @rdname recipes-internal +get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lambda_fill, na_rm, epi_keys_checked) { + # Estimate the lambda for each column, creating a lambda_ column for each. + # Note that estimate_yj() operates on a vector. + lambdas <- training %>% + summarise( + across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)), + .by = epi_keys_checked + ) %>% + rename_with(~ paste0("lambda_", .x), -all_of(epi_keys_checked)) + + # Check for NAs in any of the lambda_ columns. + # EDIT: This warning was too noisy. Keeping code around, in case we want it. + # for (col in col_names) { + # if (any(is.na(values[[paste0("lambda_", col)]]))) { + # cli::cli_warn( + # c( + # x = "Yeo-Johnson lambda could not be estimated for some geos for {col}.", + # i = "Using lambda={x$na_lambda_fill} in these cases." + # ), + # call = rlang::caller_fn() + # ) + # } + # } + + # Fill in NAs with the default lambda. + lambdas %>% + mutate(across(starts_with("lambda_"), \(col) ifelse(is.na(col), na_lambda_fill, col))) +} + + +### Code below taken from recipes::step_YeoJohnson. +### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 + +#' Internal Functions +#' +#' @keywords internal +#' @rdname recipes-internal +#' @export +yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { + if (is.na(lambda)) { + return(x) + } + if (!inherits(x, "tbl_df") || is.data.frame(x)) { + x <- unlist(x, use.names = FALSE) + } else { + if (!is.vector(x)) { + x <- as.vector(x) + } + } + # TODO case weights: can we use weights here? + if (is.null(ind_neg)) { + dat_neg <- x < 0 + ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) + } + not_neg <- ind_neg[["not"]] + is_neg <- ind_neg[["is"]] + + nn_trans <- function(x, lambda) { + if (abs(lambda) < eps) { + log(x + 1) + } else { + ((x + 1)^lambda - 1) / lambda + } + } + + ng_trans <- function(x, lambda) { + if (abs(lambda - 2) < eps) { + -log(-x + 1) + } else { + -((-x + 1)^(2 - lambda) - 1) / (2 - lambda) + } + } + + if (length(not_neg) > 0) { + x[not_neg] <- nn_trans(x[not_neg], lambda) + } + + if (length(is_neg) > 0) { + x[is_neg] <- ng_trans(x[is_neg], lambda) + } + x +} + + +## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K., +## & Johnson, R. A. (2000). A new family of power transformations +## to improve normality or symmetry. Biometrika. page 957 +ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) { + n <- length(y) + y_t <- yj_transform(y, lambda, ind_neg) + mu_t <- mean(y_t) + var_t <- var(y_t) * (n - 1) / n + res <- -.5 * n * log(var_t) + (lambda - 1) * const + res +} + +## eliminates missing data and returns -llh +yj_obj <- function(lam, dat, ind_neg, const) { + ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const) +} + +## estimates the values +#' @keywords internal +#' @rdname recipes-internal +#' @export +estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, call = caller_env(2)) { + na_rows <- which(is.na(dat)) + if (length(na_rows) > 0) { + if (na_rm) { + dat <- dat[-na_rows] + } else { + cli::cli_abort( + c( + x = "Missing values are not allowed for the YJ transformation.", + i = "See {.arg na_rm} option." + ), + call = call + ) + } + } + + eps <- .001 + if (length(unique(dat)) < num_unique) { + return(NA) + } + dat_neg <- dat < 0 + ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) + + const <- sum(sign(dat) * log(abs(dat) + 1)) + + res <- optimize( + yj_obj, + interval = limits, + maximum = TRUE, + dat = dat, + ind_neg = ind_neg, + const = const, + tol = .0001 + ) + lam <- res$maximum + if (abs(limits[1] - lam) <= eps | abs(limits[2] - lam) <= eps) { + lam <- NA + } + lam +} + +# Copied from recipes:::tidy.step_BoxCox +# +#' @rdname tidy.recipe +#' @export +tidy.step_epi_YeoJohnson <- function(x, ...) { + if (is_trained(x)) { + res <- tibble( + terms = names(x$lambdas), + value = unname(x$lambdas) + ) + } else { + term_names <- sel2char(x$terms) + res <- tibble( + terms = term_names, + value = na_dbl + ) + } + res$id <- x$id + res +} diff --git a/_targets.yaml b/_targets.yaml index 3f31a6e6..c76259e4 100644 --- a/_targets.yaml +++ b/_targets.yaml @@ -18,4 +18,8 @@ covid_hosp_prod: store: covid_hosp_prod use_crew: yes reporter_make: timestamp - +# test_proj: +# script: scripts/test_proj.R +# store: test_proj +# use_crew: yes +# reporter_make: timestamp diff --git a/test-yeo-johnson.Rmd b/test-yeo-johnson.Rmd new file mode 100644 index 00000000..24c395af --- /dev/null +++ b/test-yeo-johnson.Rmd @@ -0,0 +1,85 @@ +--- +title: "Yeo-Johnson Transformation Testing" +output: + html_document: + self_contained: True +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set( + digits = 3, + comment = "#>", + collapse = TRUE, + cache = FALSE, + dev.args = list(bg = "transparent"), + dpi = 300, + cache.lazy = FALSE, + out.width = "90%", + fig.align = "center", + fig.width = 9, + fig.height = 6 +) +ggplot2::theme_set(ggplot2::theme_bw()) +options( + dplyr.print_min = 6, + dplyr.print_max = 6, + pillar.max_footer_lines = 2, + pillar.min_chars = 15, + stringr.view_n = 6, + pillar.bold = TRUE, + width = 77 +) +suppressPackageStartupMessages(source(here::here("R", "load_all.R"))) +``` + +## Setup and Data Loading + +First, we'll set up the environment and load the necessary data: + +```{r setup-env} +# Simple case with keys = geo_value. +filtered_data <- cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) +``` + +## Yeo-Johnson Transformation + +Let's apply the Yeo-Johnson transformation to our data: + +```{r yeo-johnson-transform} +r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + prep(filtered_data) + +# Display the recipe +r + +# Inspect the lambda values for each state +r$steps[[1]]$lambdas +``` + +## Manual Whitening Comparison + +Now, let's compare the Yeo-Johnson transformation with a manual whitening approach using quarter root scaling: + +```{r manual-whitening} +# Apply the transformation +out1 <- r %>% bake(filtered_data) +out2 <- filtered_data %>% + mutate(cases = (cases + 0.01)^(1 / 4)) + +filtered_data %>% + mutate(cases = log(cases)) %>% + ggplot(aes(time_value, cases)) + + geom_line(color = "blue") + + geom_line(data = out1 %>% mutate(cases = log(cases)), + aes(time_value, cases), color = "green") + + geom_line(data = out2 %>% mutate(cases = log(cases)), + aes(time_value, cases), color = "red") + + facet_wrap(~geo_value, scales = "free_y") + + theme_minimal() + + labs(title = "Yeo-Johnson transformation", x = "Time", y = "Log Cases") +``` diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R new file mode 100644 index 00000000..b45deacf --- /dev/null +++ b/tests/testthat/test-yeo-johnson.R @@ -0,0 +1,103 @@ +suppressPackageStartupMessages(source(here::here("R", "load_all.R"))) + +test_that("Yeo-Johnson transformation inverts correctly", { + expect_true( + map_lgl(seq(-5, 5, 0.1), function(lambda) { + map_lgl(seq(0, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all() + }) %>% + all() + ) +}) + +test_that("Yeo-Johnson steps and layers invert each other", { + jhu <- cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + # Get some lambda values + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + + # Check general lambda values tibble structure + expect_true("lambda_cases" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_cases)) + # Still works on a tibble + expect_equal( + tr %>% bake(filtered_data %>% as_tibble()), + tr %>% bake(filtered_data) + ) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson(.pred) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(cases = .pred) + expect_equal(out1, out2) + + # Make sure it works when there are multiple predictors and outcomes + jhu_multi <- epidatasets::covid_case_death_rates_extended %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, case_rate, death_rate) + filtered_data <- jhu_multi + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(case_rate, death_rate) %>% + step_epi_lag(case_rate, death_rate, lag = 0) %>% + step_epi_ahead(case_rate, death_rate, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + + # Check general lambda values tibble structure + expect_true("lambda_case_rate" %in% names(tr$steps[[1]]$lambdas)) + expect_true("lambda_death_rate" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_case_rate)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_death_rate)) + + # TODO: Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson(.pred_ahead_0_case_rate) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) + # debugonce(slather.layer_epi_YeoJohnson) + out2 <- forecast(wf) %>% rename(case_rate = .pred) + expect_equal(out1, out2) +}) + +test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { + jhu <- cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + # Get some lambda values + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + # Check for fixed lambda values + expect_true(all(near(tr$steps[[1]]$lambdas$lambda_cases, c(0.856, 0.207), tol = 0.001))) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson(.pred) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(cases = .pred) + expect_equal(out1, out2) +}) From 0319acc846dedda0125bc880f68f4443fc6cc843 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 19 Mar 2025 10:29:25 -0700 Subject: [PATCH 02/20] Update R/new_epipredict_steps/step_yeo_johnson.R Co-authored-by: Daniel McDonald --- R/new_epipredict_steps/step_yeo_johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index 81e66ba8..d4ca0765 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -81,7 +81,7 @@ step_epi_YeoJohnson <- function( recipe, ..., - role = NA, + role = "predictor", trained = FALSE, lambdas = NULL, na_lambda_fill = 1 / 4, From 8d029c22f799b443bad1ad06100cb87cc1f93180 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 19 Mar 2025 14:43:31 -0700 Subject: [PATCH 03/20] Update R/new_epipredict_steps/step_yeo_johnson.R Co-authored-by: Daniel McDonald --- R/new_epipredict_steps/step_yeo_johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index d4ca0765..b4276c4f 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -218,7 +218,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { # Transform each column, using the appropriate lambda column per row. # Note that yj_transform() is vectorized. - new_data %<>% left_join(object$lambdas, by = keys) + new_data <- left_join(new_data, object$lambdas, by = keys) for (col in col_names) { new_data <- new_data %>% rowwise() %>% From cb6b431c4c8a901557f4fd2c770e16e9d507075e Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 19 Mar 2025 14:49:20 -0700 Subject: [PATCH 04/20] Update R/new_epipredict_steps/step_yeo_johnson.R Co-authored-by: Daniel McDonald --- R/new_epipredict_steps/step_yeo_johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index b4276c4f..53bd5515 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -160,7 +160,7 @@ step_epi_YeoJohnson_new <- function( prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) { # Check that the columns selected for transformation are numeric. col_names <- recipes_eval_select(x$terms, training, info) - check_type(training[, col_names], types = c("double", "integer")) + recipes::check_type(training[, col_names], types = c("double", "integer")) lambdas <- get_lambdas_yj_table( training, From 415d99a61690b3323d00f437f2ab1db3d9e89c7d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 19 Mar 2025 15:52:29 -0700 Subject: [PATCH 05/20] fix: temp columns lambda_ -> .lambda_ --- R/new_epipredict_steps/layer_yeo_johnson.R | 10 +++++----- R/new_epipredict_steps/step_yeo_johnson.R | 10 +++++----- tests/testthat/test-yeo-johnson.R | 16 +++++++++------- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index aa6c6feb..6eb5f9d9 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -101,7 +101,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, object$by <- object$by %||% intersect( epipredict:::epi_keys_only(components$predictions), - colnames(select(lambdas, -starts_with("lambda_"))) + colnames(select(lambdas, -starts_with(".lambda_"))) ) joinby <- list(x = names(object$by) %||% object$by, y = object$by) hardhat::validate_column_names(components$predictions, joinby$x) @@ -133,7 +133,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, col_names <- names(pos) # For every column, we need to use the appropriate lambda column, which differs per row. - # Note that yj_inverse() is vectorized. + # Note that yj_inverse() is vectorized in x, but not in lambda. if (identical(col_names, ".pred")) { # In this case, we don't get a hint for the outcome column name, so we need to # infer it from the mold. `outcomes` is a vector of objects like @@ -144,7 +144,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, components$predictions <- components$predictions %>% rowwise() %>% - mutate(.pred := yj_inverse(.pred, !!sym(paste0("lambda_", outcome_cols)))) + mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols)))) } else if (identical(col_names, character(0))) { # In this case, we should assume the user wants to transform all outcomes. cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env()) @@ -161,7 +161,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, for (i in seq_along(col_names)) { col <- col_names[i] - lambda_col <- paste0("lambda_", original_outcome_cols[i]) + lambda_col <- paste0(".lambda_", original_outcome_cols[i]) components$predictions <- components$predictions %>% rowwise() %>% mutate(!!sym(col) := yj_inverse(!!sym(col), !!sym(lambda_col))) @@ -170,7 +170,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # Remove the lambda columns. components$predictions <- components$predictions %>% - select(-any_of(starts_with("lambda_"))) %>% + select(-any_of(starts_with(".lambda_"))) %>% ungroup() components } diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index 53bd5515..e9a2633e 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -217,16 +217,16 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { check_new_data(col_names, object, new_data) # Transform each column, using the appropriate lambda column per row. - # Note that yj_transform() is vectorized. + # Note that yj_transform() is vectorized in x, but not in lambda. new_data <- left_join(new_data, object$lambdas, by = keys) for (col in col_names) { new_data <- new_data %>% rowwise() %>% - mutate(!!col := yj_transform(!!sym(col), !!sym(paste0("lambda_", col)))) + mutate(!!col := yj_transform(!!sym(col), !!sym(paste0(".lambda_", col)))) } # Remove the lambda columns. new_data %>% - select(-starts_with("lambda_")) %>% + select(-starts_with(".lambda_")) %>% ungroup() } @@ -249,7 +249,7 @@ get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lam across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)), .by = epi_keys_checked ) %>% - rename_with(~ paste0("lambda_", .x), -all_of(epi_keys_checked)) + rename_with(~ paste0(".lambda_", .x), -all_of(epi_keys_checked)) # Check for NAs in any of the lambda_ columns. # EDIT: This warning was too noisy. Keeping code around, in case we want it. @@ -267,7 +267,7 @@ get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lam # Fill in NAs with the default lambda. lambdas %>% - mutate(across(starts_with("lambda_"), \(col) ifelse(is.na(col), na_lambda_fill, col))) + mutate(across(starts_with(".lambda_"), \(col) ifelse(is.na(col), na_lambda_fill, col))) } diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index b45deacf..1c5d42dd 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -24,8 +24,8 @@ test_that("Yeo-Johnson steps and layers invert each other", { tr <- r %>% prep(filtered_data) # Check general lambda values tibble structure - expect_true("lambda_cases" %in% names(tr$steps[[1]]$lambdas)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_cases)) + expect_true(".lambda_cases" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_cases)) # Still works on a tibble expect_equal( tr %>% bake(filtered_data %>% as_tibble()), @@ -56,12 +56,13 @@ test_that("Yeo-Johnson steps and layers invert each other", { tr <- r %>% prep(filtered_data) # Check general lambda values tibble structure - expect_true("lambda_case_rate" %in% names(tr$steps[[1]]$lambdas)) - expect_true("lambda_death_rate" %in% names(tr$steps[[1]]$lambdas)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_case_rate)) - expect_true(is.numeric(tr$steps[[1]]$lambdas$lambda_death_rate)) + expect_true(".lambda_case_rate" %in% names(tr$steps[[1]]$lambdas)) + expect_true(".lambda_death_rate" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_case_rate)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_death_rate)) # TODO: Make sure that the inverse transformation works + skip("TODO") f <- frosting() %>% layer_predict() %>% layer_epi_YeoJohnson(.pred_ahead_0_case_rate) @@ -75,6 +76,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { }) test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { + skip("TODO") jhu <- cases_deaths_subset %>% filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% select(geo_value, time_value, cases) @@ -88,7 +90,7 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr step_epi_naomit() tr <- r %>% prep(filtered_data) # Check for fixed lambda values - expect_true(all(near(tr$steps[[1]]$lambdas$lambda_cases, c(0.856, 0.207), tol = 0.001))) + expect_true(all(near(tr$steps[[1]]$lambdas$.lambda_cases, c(0.856, 0.207), tol = 0.001))) # Make sure that the inverse transformation works f <- frosting() %>% From ca2df2f47663519c48c5767bc7d00ec146dbed4a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Wed, 19 Mar 2025 16:12:14 -0700 Subject: [PATCH 06/20] fix: remove epi_keys_checked --- R/new_epipredict_steps/step_yeo_johnson.R | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index e9a2633e..384fc272 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -16,8 +16,6 @@ #' values will not be evaluated for a transformation. #' @param na_rm A logical indicating whether missing values should be #' removed. -#' @param epi_keys_checked Internal. A character vector of key columns -#' that are expected in the data. #' @param skip A logical. Should the step be skipped when the recipe is #' baked by [bake()]. On the `training` data, the step will always be #' conducted (even if `skip = TRUE`). @@ -88,7 +86,6 @@ step_epi_YeoJohnson <- function( limits = c(-5, 5), num_unique = 5, na_rm = TRUE, - epi_keys_checked = NULL, skip = FALSE, id = rand_id("epi_YeoJohnson") ) { @@ -97,9 +94,6 @@ step_epi_YeoJohnson <- function( checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1) checkmate::assert_logical(na_rm, len = 1) checkmate::assert_logical(skip, len = 1) - if (is.null(epi_keys_checked)) { - epi_keys_checked <- key_colnames(recipe$template, exclude = "time_value") - } add_step( recipe, step_epi_YeoJohnson_new( @@ -111,7 +105,6 @@ step_epi_YeoJohnson <- function( limits = sort(limits)[1:2], num_unique = num_unique, na_rm = na_rm, - epi_keys_checked = epi_keys_checked, forecast_date = NULL, metadata = NULL, columns = NULL, @@ -130,7 +123,6 @@ step_epi_YeoJohnson_new <- function( limits, num_unique, na_rm, - epi_keys_checked, forecast_date, metadata, columns, @@ -147,7 +139,6 @@ step_epi_YeoJohnson_new <- function( limits = limits, num_unique = num_unique, na_rm = na_rm, - epi_keys_checked = epi_keys_checked, forecast_date = forecast_date, metadata = metadata, columns = columns, @@ -169,7 +160,7 @@ prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) { x$num_unique, x$na_lambda_fill, x$na_rm, - x$epi_keys_checked + key_colnames(training, exclude = "time_value") ) step_epi_YeoJohnson_new( @@ -181,7 +172,6 @@ prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) { limits = x$limits, num_unique = x$num_unique, na_rm = x$na_rm, - epi_keys_checked = x$epi_keys_checked, forecast_date = attributes(training)$metadata$as_of, metadata = attributes(training)$metadata, columns = col_names, @@ -202,11 +192,11 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { other_keys = object$metadata$other_keys %||% character() ) new_data %@% metadata <- object$metadata - keys <- object$epi_keys_checked } # Check that the keys match. keys <- key_colnames(new_data, exclude = "time_value") - if (!identical(keys, object$epi_keys_checked)) { + old_keys <- object$lambdas %>% select(-starts_with(".lambda_")) %>% colnames() + if (!identical(keys, old_keys)) { cli::cli_abort( "The keys of the new data do not match the keys of the training data.", call = rlang::caller_fn() From 17fac6a3b9f57ee4cf0136409ed0cbc2973b5dcc Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 17:56:34 -0700 Subject: [PATCH 07/20] Update R/new_epipredict_steps/step_yeo_johnson.R Co-authored-by: David Weber --- R/new_epipredict_steps/step_yeo_johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index 384fc272..91ecb4d7 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -196,7 +196,7 @@ bake.step_epi_YeoJohnson <- function(object, new_data, ...) { # Check that the keys match. keys <- key_colnames(new_data, exclude = "time_value") old_keys <- object$lambdas %>% select(-starts_with(".lambda_")) %>% colnames() - if (!identical(keys, old_keys)) { + if (!all(keys %in% old_keys)) { cli::cli_abort( "The keys of the new data do not match the keys of the training data.", call = rlang::caller_fn() From a14c932623bae69976330ea2e71663bffc0552af Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:02:09 -0700 Subject: [PATCH 08/20] Update test-yeo-johnson.Rmd Co-authored-by: David Weber --- test-yeo-johnson.Rmd | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/test-yeo-johnson.Rmd b/test-yeo-johnson.Rmd index 24c395af..27379ad7 100644 --- a/test-yeo-johnson.Rmd +++ b/test-yeo-johnson.Rmd @@ -71,15 +71,18 @@ out1 <- r %>% bake(filtered_data) out2 <- filtered_data %>% mutate(cases = (cases + 0.01)^(1 / 4)) -filtered_data %>% - mutate(cases = log(cases)) %>% - ggplot(aes(time_value, cases)) + - geom_line(color = "blue") + - geom_line(data = out1 %>% mutate(cases = log(cases)), - aes(time_value, cases), color = "green") + - geom_line(data = out2 %>% mutate(cases = log(cases)), - aes(time_value, cases), color = "red") + - facet_wrap(~geo_value, scales = "free_y") + +all_together <- rbind( + filtered_data %>% + mutate(name = "raw"), + out1 %>% mutate(name = "yeo-johnson"), + out2 %>% mutate(name = "quarter-root") +) + +all_together %>% + ggplot(aes(time_value, cases, color = name)) + + geom_line() + + facet_grid(~geo_value, scales = "free_y") + theme_minimal() + - labs(title = "Yeo-Johnson transformation", x = "Time", y = "Log Cases") + labs(title = "Yeo-Johnson transformation", x = "Time", y = "Log Cases") + + scale_y_log10() ``` From 66a15efb98e4ff16c60b5c2a16874d72f722c7f3 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:05:08 -0700 Subject: [PATCH 09/20] Update tests/testthat/test-yeo-johnson.R Co-authored-by: David Weber --- tests/testthat/test-yeo-johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 1c5d42dd..07b9ab16 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -3,7 +3,7 @@ suppressPackageStartupMessages(source(here::here("R", "load_all.R"))) test_that("Yeo-Johnson transformation inverts correctly", { expect_true( map_lgl(seq(-5, 5, 0.1), function(lambda) { - map_lgl(seq(0, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all() + map_lgl(seq(-10, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all() }) %>% all() ) From 80c5cd0069e1cb19c1352d88f8382563e714ad1a Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:08:32 -0700 Subject: [PATCH 10/20] Update R/new_epipredict_steps/layer_yeo_johnson.R Co-authored-by: David Weber --- R/new_epipredict_steps/layer_yeo_johnson.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index 6eb5f9d9..0c697f71 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -155,7 +155,8 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to # c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate"). original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] - if (all(original_outcome_cols %nin% names(components$mold$outcomes))) { + outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2] + if (all(original_outcome_cols %nin% outcomes_wout_ahead)) { cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env()) } From cd87b0bc6c969b290061e07df037000e958e23b7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:08:38 -0700 Subject: [PATCH 11/20] Update tests/testthat/test-yeo-johnson.R Co-authored-by: David Weber --- tests/testthat/test-yeo-johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 07b9ab16..03a22a93 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -65,7 +65,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { skip("TODO") f <- frosting() %>% layer_predict() %>% - layer_epi_YeoJohnson(.pred_ahead_0_case_rate) + layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate) wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) From 88cb4753e4d6cac206eeba3996e9d12c0b880c01 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:08:48 -0700 Subject: [PATCH 12/20] Update tests/testthat/test-yeo-johnson.R Co-authored-by: David Weber --- tests/testthat/test-yeo-johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 03a22a93..0980348a 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -71,7 +71,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { add_frosting(f) out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) # debugonce(slather.layer_epi_YeoJohnson) - out2 <- forecast(wf) %>% rename(case_rate = .pred) + out2 <- forecast(wf) %>% rename(case_rate = .pred_ahead_0_case_rate, death_rate = .pred_ahead_0_death_rate) expect_equal(out1, out2) }) From 8f200f39f1c2e5ac5ec840026b623e4315ce472d Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:10:05 -0700 Subject: [PATCH 13/20] merge --- R/new_epipredict_steps/layer_yeo_johnson.R | 2 +- R/new_epipredict_steps/step_yeo_johnson.R | 2 +- tests/testthat/test-yeo-johnson.R | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index 0c697f71..731297cc 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -156,7 +156,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # c("lambda_ahead_1_case_rate", "lambda_ahead_7_case_rate"). original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2] - if (all(original_outcome_cols %nin% outcomes_wout_ahead)) { + if (any(original_outcome_cols %nin% outcomes_wout_ahead)) { cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env()) } diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index 91ecb4d7..1abae34d 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -237,7 +237,7 @@ get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lam lambdas <- training %>% summarise( across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)), - .by = epi_keys_checked + .by = all_of(epi_keys_checked) ) %>% rename_with(~ paste0(".lambda_", .x), -all_of(epi_keys_checked)) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 0980348a..4c490325 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -1,6 +1,8 @@ suppressPackageStartupMessages(source(here::here("R", "load_all.R"))) test_that("Yeo-Johnson transformation inverts correctly", { + # Note that the special lambda values of 0 and 2 are covered by the tests + # below. expect_true( map_lgl(seq(-5, 5, 0.1), function(lambda) { map_lgl(seq(-10, 10, 0.1), \(x) abs(yj_inverse(yj_transform(x, lambda), lambda) - x) < 0.00001) %>% all() From b56559883d824fc2166be09a390611cf0b327bed Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 18:11:07 -0700 Subject: [PATCH 14/20] test: inverse transform with multiple outcomes works --- tests/testthat/test-yeo-johnson.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index 4c490325..ea5653a8 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -63,8 +63,7 @@ test_that("Yeo-Johnson steps and layers invert each other", { expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_case_rate)) expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_death_rate)) - # TODO: Make sure that the inverse transformation works - skip("TODO") + # Make sure that the inverse transformation works f <- frosting() %>% layer_predict() %>% layer_epi_YeoJohnson(.pred_ahead_0_case_rate, .pred_ahead_0_death_rate) From d579ef8a48498e7645cbea1d47e148fd1ba6ac35 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Thu, 20 Mar 2025 19:41:19 -0700 Subject: [PATCH 15/20] ci+test: update r version, remove snap --- .github/workflows/tests.yaml | 3 +++ tests/testthat/_snaps/forecasters-basics.md | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index 5cf55c1b..6dbadacc 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -19,6 +19,9 @@ jobs: fail-fast: false matrix: config: + # The one we use in production. + - { os: ubuntu-latest, r: "renv" } + # See if the latest release works. - { os: ubuntu-latest, r: "release" } env: diff --git a/tests/testthat/_snaps/forecasters-basics.md b/tests/testthat/_snaps/forecasters-basics.md index d82ac804..15216ea7 100644 --- a/tests/testthat/_snaps/forecasters-basics.md +++ b/tests/testthat/_snaps/forecasters-basics.md @@ -17,8 +17,3 @@ ! Can't rename columns that don't exist. x Column `slide_value_case_rate` doesn't exist. -# no_recent_outcome deals with no as_of - - Code - res <- forecaster[[2]](jhu, "case_rate", extra_sources = "death_rate", ahead = 2L) - From 28c8471ab6c46e0534e80f9dcd5d5d4dc28f7a22 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 11:13:21 -0700 Subject: [PATCH 16/20] fix: tests --- R/new_epipredict_steps/layer_yeo_johnson.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index 731297cc..84815697 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -62,7 +62,7 @@ layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = } layer_epi_YeoJohnson_new <- function(lambdas, by, terms, id) { - layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id) + epipredict:::layer("epi_YeoJohnson", lambdas = lambdas, by = by, terms = terms, id = id) } #' @export @@ -140,7 +140,7 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, # ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part. outcome_cols <- names(components$mold$outcomes) %>% stringr::str_match("ahead_\\d+_(.*)") %>% - extract(, 2) + magrittr::extract(, 2) components$predictions <- components$predictions %>% rowwise() %>% From 812d00c978a73097ffc8ee39348a857d4a73ec43 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 13:31:28 -0700 Subject: [PATCH 17/20] doc+fix+test: fix other_keys tests, terms handling, docs pass --- R/new_epipredict_steps/layer_yeo_johnson.R | 85 ++++++++++++---------- R/new_epipredict_steps/step_yeo_johnson.R | 67 ++++++++--------- tests/testthat/test-yeo-johnson.R | 50 ++++++++++--- 3 files changed, 119 insertions(+), 83 deletions(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index 84815697..abd3ae41 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -19,34 +19,28 @@ #' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% #' select(geo_value, time_value, cases) #' -#' pop_data <- data.frame(states = c("ca", "ny"), value = c(20000, 30000)) -#' +#' # Create a recipe with a Yeo-Johnson transformation. #' r <- epi_recipe(jhu) %>% -#' step_epi_YeoJohnson( -#' df = pop_data, -#' df_pop_col = "value", -#' by = c("geo_value" = "states"), -#' cases, suffix = "_scaled" -#' ) %>% -#' step_epi_lag(cases_scaled, lag = c(0, 7, 14)) %>% -#' step_epi_ahead(cases_scaled, ahead = 7, role = "outcome") %>% +#' step_epi_YeoJohnson(cases) %>% +#' step_epi_lag(cases, lag = 0) %>% +#' step_epi_ahead(cases, ahead = 0, role = "outcome") %>% #' step_epi_naomit() #' +#' # Create a frosting layer that will undo the Yeo-Johnson transformation. #' f <- frosting() %>% #' layer_predict() %>% -#' layer_threshold(.pred) %>% -#' layer_naomit(.pred) %>% -#' layer_epi_YeoJohnson(.pred, -#' df = pop_data, -#' by = c("geo_value" = "states"), -#' df_pop_col = "value" -#' ) +#' layer_epi_YeoJohnson(.pred) #' +#' # Create a workflow and fit it. #' wf <- epi_workflow(r, linear_reg()) %>% #' fit(jhu) %>% #' add_frosting(f) #' +#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation. #' forecast(wf) +#' # Compare to the original data. +#' plot(density(jhu$cases)) +#' plot(density(forecast(wf)$cases)) layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE) @@ -116,28 +110,21 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, unmatched = c("error", "drop") ) - # TODO: There are many possibilities here: - # - (a) the terms can be empty, where we should probably default to - # all_outcomes(). - # - (b) explicitly giving all_outcomes(), we end here with terms being empty, - # which doesn't seem right; need to make sure we pull in all the outcome - # columns here. The question is what form should they have? - # - (c) if the user just specifies .pred, then we have to infer the outcome - # from the mold, which is simple enough and the main case I have working. - # - (d) the user might specify outcomes of the form .pred_ahead_1_cases, - # .pred_ahead_7_cases, etc. Is that the right format? Trying those out now - # and getting errors downstream from forecast(). - # Get the columns to transform. exprs <- rlang::expr(c(!!!object$terms)) pos <- tidyselect::eval_select(exprs, components$predictions) col_names <- names(pos) - # For every column, we need to use the appropriate lambda column, which differs per row. - # Note that yj_inverse() is vectorized in x, but not in lambda. + # The `object$terms` is where the user specifies the columns they want to + # untransform. We need to match the outcomes with their lambda columns in our + # parameter table and then apply the inverse transformation. if (identical(col_names, ".pred")) { - # In this case, we don't get a hint for the outcome column name, so we need to - # infer it from the mold. `outcomes` is a vector of objects like - # ahead_1_cases, ahead_7_cases, etc. We want to extract the cases part. + # In this case, we don't get a hint for the outcome column name, so we need + # to infer it from the mold. + if (length(components$mold$outcomes) > 1) { + cli_abort("Only one outcome is allowed when specifying `.pred`.", call = rlang::caller_env()) + } + # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc. + # We want to extract the cases part. outcome_cols <- names(components$mold$outcomes) %>% stringr::str_match("ahead_\\d+_(.*)") %>% magrittr::extract(, 2) @@ -146,8 +133,14 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, rowwise() %>% mutate(.pred := yj_inverse(.pred, !!sym(paste0(".lambda_", outcome_cols)))) } else if (identical(col_names, character(0))) { - # In this case, we should assume the user wants to transform all outcomes. - cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented yet.", call = rlang::caller_env()) + # Wish I could suggest `all_outcomes()` here, but currently it's the same as + # not specifying any terms. I don't want to spend time with dealing with + # this case until someone asks for it. + cli::cli_abort("Not specifying columns to layer Yeo-Johnson is not implemented. + If you had a single outcome, you can use `.pred` as a column name. + If you had multiple outcomes, you'll need to specify them like + `.pred_ahead_1_`, `.pred_ahead_7_`, etc. + ", call = rlang::caller_env()) } else { # In this case, we assume that the user has specified the columns they want # transformed here. We then need to determine the lambda columns for each of @@ -157,7 +150,9 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, original_outcome_cols <- str_match(col_names, ".pred_ahead_\\d+_(.*)")[, 2] outcomes_wout_ahead <- str_match(names(components$mold$outcomes), "ahead_\\d+_(.*)")[,2] if (any(original_outcome_cols %nin% outcomes_wout_ahead)) { - cli_abort("All columns specified in `...` must be outcome columns.", call = rlang::caller_env()) + cli_abort("All columns specified in `...` must be outcome columns. + They must be of the form `.pred_ahead_1_`, `.pred_ahead_7_`, etc. + ", call = rlang::caller_env()) } for (i in seq_along(col_names)) { @@ -184,7 +179,8 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), #' Inverse Yeo-Johnson transformation #' -#' Inverse of `yj_transform` in step_yeo_johnson.R. +#' Inverse of `yj_transform` in step_yeo_johnson.R. Note that this function is +#' vectorized in x, but not in lambda. #' #' @keywords internal yj_inverse <- function(x, lambda, eps = 0.001) { @@ -247,3 +243,16 @@ get_lambdas_in_layer <- function(workflow) { } lambdas } + +get_transformed_cols_in_layer <- function(workflow) { + this_recipe <- hardhat::extract_recipe(workflow) + if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { + cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) + } + for (step in this_recipe$steps) { + if (inherits(step, "step_epi_YeoJohnson")) { + lambdas <- step$lambdas + break + } + } +} diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index 1abae34d..65290628 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -3,17 +3,16 @@ #' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will #' transform data using a Yeo-Johnson transformation. This fork works with panel #' data and is meant for epidata. -#' TODO: Do an edit pass on this docstring. #' #' @inheritParams step_center -#' @param lambdas A numeric vector of transformation values. This +#' @param lambdas Internal. A numeric vector of transformation values. This #' is `NULL` until computed by [prep()]. #' @param na_lambda_fill A numeric value to fill in for any #' geos where the lambda cannot be estimated. #' @param limits A length 2 numeric vector defining the range to #' compute the transformation parameter lambda. -#' @param num_unique An integer where data that have less possible -#' values will not be evaluated for a transformation. +#' @param num_unique An integer where data that have fewer than this +#' many unique values will not be evaluated for a transformation. #' @param na_rm A logical indicating whether missing values should be #' removed. #' @param skip A logical. Should the step be skipped when the recipe is @@ -22,11 +21,13 @@ #' @template step-return #' @family individual transformation steps #' @export -#' @details The Yeo-Johnson transformation is very similar to the -#' Box-Cox but does not require the input variables to be strictly -#' positive. In the package, the partial log-likelihood function is -#' directly optimized within a reasonable set of transformation -#' values (which can be changed by the user). +#' @details The Yeo-Johnson transformation is variance-stabilizing +#' transformation, similar to the Box-Cox but does not require the input +#' variables to be strictly positive. In the package, the partial +#' log-likelihood function is directly optimized within a reasonable set of +#' transformation values (which can be changed by the user). The optimization +#' finds a lambda parameter for each group in the data that minimizes the +#' variance of the transformed data. #' #' This transformation is typically done on the outcome variable #' using the residuals for a statistical model (such as ordinary @@ -36,7 +37,7 @@ #' variable distributions more symmetric. #' #' If the transformation parameters are estimated to be very -#' closed to the bounds, or if the optimization fails, a value of +#' close to the bounds, or if the optimization fails, a value of #' `NA` is used and no transformation is applied. #' #' # Tidying @@ -54,28 +55,24 @@ #' #' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power #' transformations to improve normality or symmetry. *Biometrika*. -#' @examplesIf rlang::is_installed("modeldata") -#' data(biomass, package = "modeldata") +#' @examplesIf +#' jhu <- cases_deaths_subset %>% +#' filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% +#' select(geo_value, time_value, cases) +#' filtered_data <- jhu #' -#' biomass_tr <- biomass[biomass$dataset == "Training", ] -#' biomass_te <- biomass[biomass$dataset == "Testing", ] -#' -#' rec <- recipe( -#' HHV ~ carbon + hydrogen + oxygen + nitrogen + sulfur, -#' data = biomass_tr -#' ) -#' -#' yj_transform <- step_epi_YeoJohnson(rec, all_numeric()) -#' -#' yj_estimates <- prep(yj_transform, training = biomass_tr) -#' -#' yj_te <- bake(yj_estimates, biomass_te) -#' -#' plot(density(biomass_te$sulfur), main = "before") -#' plot(density(yj_te$sulfur), main = "after") -#' -#' tidy(yj_transform, number = 1) -#' tidy(yj_estimates, number = 1) +#' r <- epi_recipe(filtered_data) %>% +#' step_epi_YeoJohnson(cases) +#' # View the recipe +#' r +#' # Fit the recipe +#' tr <- r %>% prep(filtered_data) +#' # View the lambda values +#' tr$steps[[1]]$lambdas +#' # View the transformed data +#' df <- tr %>% bake(filtered_data) +#' plot(density(df$cases)) +#' plot(density(filtered_data$cases)) step_epi_YeoJohnson <- function( recipe, ..., @@ -266,6 +263,8 @@ get_lambdas_yj_table <- function(training, col_names, limits, num_unique, na_lam #' Internal Functions #' +#' Note that this function is vectorized in x, but not in lambda. +#' #' @keywords internal #' @rdname recipes-internal #' @export @@ -314,14 +313,14 @@ yj_transform <- function(x, lambda, ind_neg = NULL, eps = 0.001) { x } - ## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K., ## & Johnson, R. A. (2000). A new family of power transformations ## to improve normality or symmetry. Biometrika. page 957 ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) { n <- length(y) y_t <- yj_transform(y, lambda, ind_neg) - mu_t <- mean(y_t) + # EDIT: Unused in the original recipes code. + # mu_t <- mean(y_t) var_t <- var(y_t) * (n - 1) / n res <- -.5 * n * log(var_t) + (lambda - 1) * const res @@ -361,6 +360,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca const <- sum(sign(dat) * log(abs(dat) + 1)) + suppressWarnings( res <- optimize( yj_obj, interval = limits, @@ -370,6 +370,7 @@ estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE, ca const = const, tol = .0001 ) + ) lam <- res$maximum if (abs(limits[1] - lam) <= eps | abs(limits[2] - lam) <= eps) { lam <- NA diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R index ea5653a8..28ca7b72 100644 --- a/tests/testthat/test-yeo-johnson.R +++ b/tests/testthat/test-yeo-johnson.R @@ -77,21 +77,47 @@ test_that("Yeo-Johnson steps and layers invert each other", { }) test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { - skip("TODO") - jhu <- cases_deaths_subset %>% - filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% - select(geo_value, time_value, cases) - filtered_data <- jhu + # Small synthetic grad_employ_dataset version. + filtered_data <- tribble( + ~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y, + "ca", "25-34", "bachelor", 2017, 50000, + "ca", "25-34", "bachelor", 2018, 50500, + "ca", "25-34", "bachelor", 2019, 51000, + "ca", "25-34", "bachelor", 2020, 51500, + "ca", "25-34", "bachelor", 2021, 52000, + "ca", "25-34", "bachelor", 2022, 52500, + "ca", "35-1000", "bachelor", 2017, 3e10, + "ca", "35-1000", "bachelor", 2018, 3e10 + 10, + "ca", "35-1000", "bachelor", 2019, 3e10 + 20, + "ca", "35-1000", "bachelor", 2020, 3e10 + 30, + "ca", "35-1000", "bachelor", 2021, 3e10 + 40, + "ca", "35-1000", "bachelor", 2022, 3e10 + 50, + "ca", "25-34", "master", 2017, 2 * 50000, + "ca", "25-34", "master", 2018, 2 * 50500, + "ca", "25-34", "master", 2019, 2 * 51000, + "ca", "25-34", "master", 2020, 2 * 51500, + "ca", "25-34", "master", 2021, 2 * 52000, + "ca", "25-34", "master", 2022, 2 * 52500, + "ca", "35-1000", "master", 2017, 2 * 3e10, + "ca", "35-1000", "master", 2018, 2 * (3e10 + 10), + "ca", "35-1000", "master", 2019, 2 * (3e10 + 20), + "ca", "35-1000", "master", 2020, 2 * (3e10 + 30), + "ca", "35-1000", "master", 2021, 2 * (3e10 + 40), + "ca", "35-1000", "master", 2022, 2 * (3e10 + 50) + ) %>% as_epi_df(other_keys = c("age_group", "edu_qual")) # Get some lambda values r <- epi_recipe(filtered_data) %>% - step_epi_YeoJohnson(cases) %>% - step_epi_lag(cases, lag = 0) %>% - step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_YeoJohnson(med_income_2y) %>% + step_epi_lag(med_income_2y, lag = 0) %>% + step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>% step_epi_naomit() tr <- r %>% prep(filtered_data) - # Check for fixed lambda values - expect_true(all(near(tr$steps[[1]]$lambdas$.lambda_cases, c(0.856, 0.207), tol = 0.001))) + expect_true(".lambda_med_income_2y" %in% names(tr$steps[[1]]$lambdas)) + expect_true("geo_value" %in% names(tr$steps[[1]]$lambdas)) + expect_true("age_group" %in% names(tr$steps[[1]]$lambdas)) + expect_true("edu_qual" %in% names(tr$steps[[1]]$lambdas)) + expect_true(is.numeric(tr$steps[[1]]$lambdas$.lambda_med_income_2y)) # Make sure that the inverse transformation works f <- frosting() %>% @@ -100,7 +126,7 @@ test_that("Yeo-Johnson steps and layers invert each other when other_keys are pr wf <- epi_workflow(r, linear_reg()) %>% fit(filtered_data) %>% add_frosting(f) - out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) - out2 <- forecast(wf) %>% rename(cases = .pred) + out1 <- filtered_data %>% as_tibble() %>% slice_max(time_value, by = geo_value) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value) + out2 <- forecast(wf) %>% rename(med_income_2y = .pred) %>% select(geo_value, age_group, time_value, med_income_2y) %>% arrange(geo_value, age_group, time_value) expect_equal(out1, out2) }) From e8eef18a377e3197b5d127da8d2a82fec5f9e4d7 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 13:48:16 -0700 Subject: [PATCH 18/20] Update R/new_epipredict_steps/layer_yeo_johnson.R Co-authored-by: David Weber --- R/new_epipredict_steps/layer_yeo_johnson.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index abd3ae41..1fac3261 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -39,8 +39,8 @@ #' # Forecast the workflow, which should reverse the Yeo-Johnson transformation. #' forecast(wf) #' # Compare to the original data. -#' plot(density(jhu$cases)) -#' plot(density(forecast(wf)$cases)) +#' jhu %>% filter(time_value == "2021-12-31") +#' forecast(wf) layer_epi_YeoJohnson <- function(frosting, ..., lambdas = NULL, by = NULL, id = rand_id("epi_YeoJohnson")) { checkmate::assert_tibble(lambdas, min.rows = 1, null.ok = TRUE) From 9e5748802547430818838707c2cd477baca99df4 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 13:50:52 -0700 Subject: [PATCH 19/20] fix: delete unused function --- R/new_epipredict_steps/layer_yeo_johnson.R | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/R/new_epipredict_steps/layer_yeo_johnson.R b/R/new_epipredict_steps/layer_yeo_johnson.R index 1fac3261..b3ae0bd7 100644 --- a/R/new_epipredict_steps/layer_yeo_johnson.R +++ b/R/new_epipredict_steps/layer_yeo_johnson.R @@ -243,16 +243,3 @@ get_lambdas_in_layer <- function(workflow) { } lambdas } - -get_transformed_cols_in_layer <- function(workflow) { - this_recipe <- hardhat::extract_recipe(workflow) - if (!(this_recipe %>% recipes::detect_step("epi_YeoJohnson"))) { - cli_abort("`layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe.", call = rlang::caller_env()) - } - for (step in this_recipe$steps) { - if (inherits(step, "step_epi_YeoJohnson")) { - lambdas <- step$lambdas - break - } - } -} From 5d5f1a6f1d01fdc2a80145293e781a93a9129e47 Mon Sep 17 00:00:00 2001 From: Dmitry Shemetov Date: Fri, 21 Mar 2025 13:51:48 -0700 Subject: [PATCH 20/20] doc: fix @examples --- R/new_epipredict_steps/step_yeo_johnson.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/new_epipredict_steps/step_yeo_johnson.R b/R/new_epipredict_steps/step_yeo_johnson.R index 65290628..ef83e172 100644 --- a/R/new_epipredict_steps/step_yeo_johnson.R +++ b/R/new_epipredict_steps/step_yeo_johnson.R @@ -55,7 +55,7 @@ #' #' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power #' transformations to improve normality or symmetry. *Biometrika*. -#' @examplesIf +#' @examples #' jhu <- cases_deaths_subset %>% #' filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% #' select(geo_value, time_value, cases)