diff --git a/.Rbuildignore b/.Rbuildignore index dc41e622..0bdb211f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -22,3 +22,4 @@ ^.lintr$ ^.venv$ ^inst/templates$ +^air\.toml$ \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 81a35b30..e3594a50 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,6 +42,7 @@ Imports: recipes (>= 1.0.4), rlang (>= 1.1.0), stats, + stringr, tibble, tidyr, tidyselect, diff --git a/NAMESPACE b/NAMESPACE index c2fa9494..053913c9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,6 +19,7 @@ S3method(bake,check_enough_data) S3method(bake,epi_recipe) S3method(bake,step_adjust_latency) S3method(bake,step_climate) +S3method(bake,step_epi_YeoJohnson) S3method(bake,step_epi_ahead) S3method(bake,step_epi_lag) S3method(bake,step_epi_slide) @@ -55,6 +56,7 @@ S3method(prep,check_enough_data) S3method(prep,epi_recipe) S3method(prep,step_adjust_latency) S3method(prep,step_climate) +S3method(prep,step_epi_YeoJohnson) S3method(prep,step_epi_ahead) S3method(prep,step_epi_lag) S3method(prep,step_epi_slide) @@ -76,6 +78,7 @@ S3method(print,flatline) S3method(print,frosting) S3method(print,layer_add_forecast_date) S3method(print,layer_add_target_date) +S3method(print,layer_epi_YeoJohnson) S3method(print,layer_naomit) S3method(print,layer_point_from_distn) S3method(print,layer_population_scaling) @@ -86,6 +89,7 @@ S3method(print,layer_threshold) S3method(print,layer_unnest) S3method(print,step_adjust_latency) S3method(print,step_climate) +S3method(print,step_epi_YeoJohnson) S3method(print,step_epi_ahead) S3method(print,step_epi_lag) S3method(print,step_epi_slide) @@ -101,6 +105,7 @@ S3method(run_mold,default_epi_recipe_blueprint) S3method(slather,layer_add_forecast_date) S3method(slather,layer_add_target_date) S3method(slather,layer_cdc_flatline_quantiles) +S3method(slather,layer_epi_YeoJohnson) S3method(slather,layer_naomit) S3method(slather,layer_point_from_distn) S3method(slather,layer_population_scaling) @@ -114,10 +119,12 @@ 
S3method(snap,quantile_pred) S3method(tidy,check_enough_data) S3method(tidy,frosting) S3method(tidy,layer) +S3method(tidy,step_epi_YeoJohnson) S3method(update,layer) S3method(vec_arith,quantile_pred) S3method(vec_arith.numeric,quantile_pred) S3method(vec_arith.quantile_pred,numeric) +S3method(vec_arith.quantile_pred,quantile_pred) S3method(vec_math,quantile_pred) S3method(vec_proxy_equal,quantile_pred) S3method(weighted_interval_score,quantile_pred) @@ -176,6 +183,7 @@ export(layer) export(layer_add_forecast_date) export(layer_add_target_date) export(layer_cdc_flatline_quantiles) +export(layer_epi_YeoJohnson) export(layer_naomit) export(layer_point_from_distn) export(layer_population_scaling) @@ -207,6 +215,7 @@ export(smooth_quantile_reg) export(snap) export(step_adjust_latency) export(step_climate) +export(step_epi_YeoJohnson) export(step_epi_ahead) export(step_epi_lag) export(step_epi_naomit) @@ -227,6 +236,7 @@ import(epidatasets) import(epiprocess) import(parsnip) import(recipes) +import(vctrs) importFrom(checkmate,assert_class) importFrom(checkmate,assert_numeric) importFrom(checkmate,test_character) diff --git a/NEWS.md b/NEWS.md index de698ee9..36b0198f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -12,12 +12,12 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat `data(, package = "epidatasets")`, `epidatasets::` or, after loading the package, the name of the dataset alone (#382). - `step_adjust_latency()` no longer allows empty column selection. -- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`). +- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`). `step_growth_rate()` has lost its `additional_gr_args_list` argument and now has an `na_rm` argument. - Moves `epiprocess` out of depends (#440). No internals have changed, but downstream users may need to add `library(epiprocess)` to existing code. 
-- Removes dependence on the `distributional` package, replacing the quantiles +- Removes dependence on the `distributional` package, replacing the quantiles with `hardhat::quantile_pred()`. Some associated functions are deprecated with `lifecycle` messages. - Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it @@ -38,6 +38,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat - Replace `dist_quantiles()` with `hardhat::quantile_pred()` - Allow `quantile()` to threshold to an interval if desired (#434) - `arx_forecaster()` detects if there's enough data to predict +- Add `step_epi_YeoJohnson()` to perform a Yeo-Johnson transformation on the outcome variable. +- Add `layer_epi_YeoJohnson()` to undo a Yeo-Johnson transformation on the outcome variable in a forecast workflow. ## Bug fixes diff --git a/R/layer_yeo_johnson.R b/R/layer_yeo_johnson.R new file mode 100644 index 00000000..8d63b7f6 --- /dev/null +++ b/R/layer_yeo_johnson.R @@ -0,0 +1,194 @@ +#' Unnormalizing transformation +#' +#' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +#' are using this step on a column that will eventually become the outcome +#' variable, you should make sure that the original name of that column is a +#' substring of the outcome variable name. `ahead_7_cases` when `cases` is +#' transformed will work well, while `ahead_7` will not. +#' +#' @inheritParams layer_population_scaling +#' @param yj_params A data frame of parameters to be used for inverting the +#' transformation. Typically set automatically. If you have done multiple +#' transformations such that the outcome variable name no longer contains the +#' column that this step transforms, then you should manually specify this to +#' be the parameters fit in the corresponding `step_epi_YeoJohnson`. 
For an +#' example where you wouldn't need to set this, if your output is +#' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with +#' other columns), then you wouldn't need to set this. However if you have +#' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step. +#' +#' @return an updated `frosting` postprocessor +#' @export +#' @examples +#' library(dplyr) +#' jhu <- epidatasets::cases_deaths_subset %>% +#' filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>% +#' select(geo_value, time_value, cases) +#' +#' # Create a recipe with a Yeo-Johnson transformation. +#' r <- epi_recipe(jhu) %>% +#' step_epi_YeoJohnson(cases) %>% +#' step_epi_lag(cases, lag = 0) %>% +#' step_epi_ahead(cases, ahead = 0, role = "outcome") %>% +#' step_epi_naomit() +#' +#' # Create a frosting layer that will undo the Yeo-Johnson transformation. +#' f <- frosting() %>% +#' layer_predict() %>% +#' layer_epi_YeoJohnson(.pred) +#' +#' # Create a workflow and fit it. +#' wf <- epi_workflow(r, linear_reg()) %>% +#' fit(jhu) %>% +#' add_frosting(f) +#' +#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation. +#' forecast(wf) +#' # Compare to the original data. +#' jhu %>% filter(time_value == "2021-12-31") +#' forecast(wf) +layer_epi_YeoJohnson <- function(frosting, ..., yj_params = NULL, id = rand_id("epi_YeoJohnson")) { + checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE) + + add_layer( + frosting, + layer_epi_YeoJohnson_new( + yj_params = yj_params, + terms = dplyr::enquos(...), + id = id + ) + ) +} + +layer_epi_YeoJohnson_new <- function(yj_params, terms, id) { + layer("epi_YeoJohnson", yj_params = yj_params, terms = terms, id = id) +} + +#' @export +#' @importFrom workflows extract_preprocessor +slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) { + rlang::check_dots_empty() + + # get the yj_params from the layer or from the workflow. 
+ yj_params <- + object$yj_params %||% + get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params") + + # Establish the join columns. + join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort() + joinby <- list(x = join_by_columns, y = join_by_columns) + hardhat::validate_column_names(components$predictions, joinby$x) + hardhat::validate_column_names(yj_params, joinby$y) + + # Join the yj_params. + components$predictions <- inner_join( + components$predictions, + yj_params, + by = object$by, + relationship = "many-to-one", + unmatched = c("error", "drop") + ) + + exprs <- rlang::expr(c(!!!object$terms)) + pos <- tidyselect::eval_select(exprs, components$predictions) + col_names <- names(pos) + + # The `object$terms` is where the user specifies the columns they want to + # untransform. We need to match the outcomes with their yj_param columns in our + # parameter table and then apply the inverse transformation. + if (length(col_names) == 0) { + # not specified by the user, so just modify everything starting with `.pred` + components$predictions <- components$predictions %>% + mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>% + select(-.lambda) + } else { + components$predictions <- components$predictions %>% + mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>% + select(-.lambda) + } + + # Remove the yj_param columns. + components$predictions <- components$predictions %>% + select(-any_of(starts_with(".yj_param_"))) %>% + ungroup() + components +} + +#' @export +print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) { + title <- "Yeo-Johnson transformation (see `yj_params` object for values) on " + print_layer(x$terms, title = title, width = width) +} + +# Inverse Yeo-Johnson transformation +# +# Inverse of `yj_transform` in step_yeo_johnson.R. 
+yj_inverse <- function(x_in, lambda, eps = 0.001) { + if (any(is.na(lambda))) { + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) + } + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] + inv_x <- ifelse( + x < 0, + # negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -(exp(-x) - 1), + -(((lambda - 2) * x + 1)^(1 / (2 - lambda)) - 1) + ), + # non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + (exp(x) - 1), + (lambda * x + 1)^(1 / lambda) - 1 + ) + ) + if (x_in %>% inherits("quantile_pred")) { + inv_x <- inv_x %>% quantile_pred(x_in %@% "quantile_levels") + } + inv_x +} + + +#' get the parameters used in the initial step +#' +#' @param workflow the workflow to extract the parameters from +#' @param step_name the name of the step to look for, as recognized by `detect_step` +#' @param param_name the parameter to pull out of the step +#' @keywords internal +get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") { + full_step_name <- glue::glue("step_{step_name}") + this_recipe <- hardhat::extract_recipe(workflow) + if (!(this_recipe %>% recipes::detect_step(step_name))) { + cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call()) + } + outcomes <- + workflows::extract_recipe(workflow)$term_info %>% + filter(role == "outcome") %>% + pull(variable) + if (length(outcomes) > 1) { + cli_abort( + "`layer_{step_name}` doesn't support multiple output columns. 
+ This workflow produces {outcomes} as output columns.", + call = rlang::caller_call(), + class = "epipredict__layer_yeo_johnson_multi_outcome_error" + ) + } + for (step in this_recipe$steps) { + # if it's a `step_name` step that also transforms a column that is a subset + # of the output column name + is_outcome_subset <- map_lgl(step$columns, ~ grepl(.x, outcomes)) + if (inherits(step, full_step_name) && any(is_outcome_subset)) { + params <- step[[param_name]] %>% + select( + key_colnames(workflow$original_data, exclude = "time_value"), + contains(step$columns[is_outcome_subset]) + ) %>% + rename(.lambda = contains(step$columns)) + break + } + } + params +} diff --git a/R/quantile_pred-methods.R b/R/quantile_pred-methods.R index 293fad90..56e8fcf0 100644 --- a/R/quantile_pred-methods.R +++ b/R/quantile_pred-methods.R @@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) { dplyr::select(-.row) } - # quantiles by treating quantile_pred like a distribution ----------------- @@ -287,6 +286,12 @@ vec_math.quantile_pred <- function(.fn, .x, ...) { quantile_pred(.fn(.x), quantile_levels) } +#' Internal vctrs methods +#' +#' @import vctrs +#' @keywords internal +#' @name epipredict-vctrs + #' @importFrom vctrs vec_arith vec_arith.numeric #' @export #' @method vec_arith quantile_pred @@ -294,6 +299,19 @@ vec_arith.quantile_pred <- function(op, x, y, ...) { UseMethod("vec_arith.quantile_pred", y) } + +#' @export +#' @method vec_arith.quantile_pred quantile_pred +vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) { + all_quantiles <- unique(c(x %@% "quantile_levels", y %@% "quantile_levels")) + op_fn <- getExportedValue("base", op) + # Interpolate/extrapolate to the same quantiles + x <- quantile.quantile_pred(x, all_quantiles) + y <- quantile.quantile_pred(y, all_quantiles) + out <- op_fn(x, y, ...) 
+ quantile_pred(out, all_quantiles) +} + #' @export #' @method vec_arith.quantile_pred numeric vec_arith.quantile_pred.numeric <- function(op, x, y, ...) { diff --git a/R/step_adjust_latency.R b/R/step_adjust_latency.R index ae9db6ef..a0d59bc1 100644 --- a/R/step_adjust_latency.R +++ b/R/step_adjust_latency.R @@ -272,7 +272,6 @@ step_adjust_latency_new <- # lags introduces max(lags) NA's after the max_time_value. #' @export #' @importFrom glue glue -#' @importFrom dplyr rowwise prep.step_adjust_latency <- function(x, training, info = NULL, ...) { latency <- x$latency col_names <- recipes::recipes_eval_select(x$terms, training, info) diff --git a/R/step_yeo_johnson.R b/R/step_yeo_johnson.R new file mode 100644 index 00000000..ff4016cd --- /dev/null +++ b/R/step_yeo_johnson.R @@ -0,0 +1,403 @@ +#' Yeo-Johnson transformation +#' +#' `step_epi_YeoJohnson()` creates a *specification* of a recipe step that will +#' transform data using a Yeo-Johnson transformation. This fork works with panel +#' data and is meant for epidata. +#' +#' @inheritParams step_population_scaling +#' @param trained A logical for whether the selectors in `...` +#' have been resolved by [prep()]. +#' @param yj_params Internal. A numeric vector of transformation values. This +#' is `NULL` until computed by [prep()]. +#' @param na_fill A numeric value to fill in for any geos where a Yeo-Johnson +#' parameter cannot be estimated. +#' @param limits A length 2 numeric vector defining the range to compute the +#' transformation parameter. +#' @param num_unique An integer where data that have fewer than this many unique +#' values will not be evaluated for a transformation. +#' @param na_rm A logical indicating whether missing values should be removed +#' before estimating the transformation parameter. 
+#' @template step-return +#' @family individual transformation steps +#' @export +#' @details The Yeo-Johnson transformation is a variance-stabilizing +#' transformation, similar to the Box-Cox but does not require the input +#' variables to be strictly positive. In the package, the partial +#' log-likelihood function is directly optimized within a reasonable set of +#' transformation values (which can be changed by the user via `limits`). The +#' optimization finds, for each group in the data (each combination of key +#' columns other than `time_value`), the lambda that maximizes this log-likelihood. +#' +#' This transformation is typically done on the outcome variable +#' using the residuals for a statistical model (such as ordinary +#' least squares). Here, a simple null model (intercept only) is +#' used to apply the transformation to the *predictor* +#' variables individually. This can have the effect of making the +#' variable distributions more symmetric. +#' +#' If the estimated parameter for a group is very close to the +#' bounds, or the group has fewer than `num_unique` unique values, +#' the parameter for that group is set to `na_fill`. +#' +#' # Tidying +#' +#' When you [`tidy()`][tidy.recipe()] this step, a tibble is returned with +#' columns `terms`, `value`, and `id`: +#' +#' \describe{ +#' \item{terms}{character, the selectors or variables selected} +#' \item{value}{numeric, the lambda estimate} +#' \item{id}{character, id of this step} +#' } +#' +#' @references Yeo, I. K., and Johnson, R. A. (2000). A new family of power +#' transformations to improve normality or symmetry. *Biometrika*. 
+#' @examples +#' jhu <- cases_deaths_subset %>% +#' filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% +#' select(geo_value, time_value, cases) +#' filtered_data <- jhu +#' +#' r <- epi_recipe(filtered_data) %>% +#' step_epi_YeoJohnson(cases) +#' # View the recipe +#' r +#' # Fit the recipe +#' tr <- r %>% prep(filtered_data) +#' # View the parameter values +#' tr$steps[[1]]$yj_params +#' # View the transformed data +#' df <- tr %>% bake(filtered_data) +#' plot(density(df$cases)) +#' plot(density(filtered_data$cases)) +step_epi_YeoJohnson <- function( + recipe, + ..., + role = "predictor", + trained = FALSE, + yj_params = NULL, + na_fill = 1 / 4, + limits = c(-5, 5), + num_unique = 5, + na_rm = TRUE, + skip = FALSE, + id = rand_id("epi_YeoJohnson")) { + checkmate::assert_numeric(limits, len = 2) + checkmate::assert_numeric(na_fill, lower = min(limits), upper = max(limits), len = 1) + checkmate::assert_numeric(num_unique, lower = 2, upper = Inf, len = 1) + checkmate::assert_logical(na_rm, len = 1) + checkmate::assert_logical(skip, len = 1) + add_step( + recipe, + step_epi_YeoJohnson_new( + terms = enquos(...), + role = role, + trained = trained, + yj_params = yj_params, + na_fill = na_fill, + limits = sort(limits)[1:2], + num_unique = num_unique, + na_rm = na_rm, + forecast_date = NULL, + metadata = NULL, + columns = NULL, + skip = skip, + id = id + ) + ) +} + +step_epi_YeoJohnson_new <- function( + terms, + role, + trained, + yj_params, + na_fill, + limits, + num_unique, + na_rm, + forecast_date, + metadata, + columns, + skip, + id) { + step( + subclass = "epi_YeoJohnson", + terms = terms, + role = role, + trained = trained, + yj_params = yj_params, + na_fill = na_fill, + limits = limits, + num_unique = num_unique, + na_rm = na_rm, + forecast_date = forecast_date, + metadata = metadata, + columns = columns, + skip = skip, + id = id + ) +} + +#' @export +prep.step_epi_YeoJohnson <- function(x, training, info = NULL, ...) 
{ + # Check that the columns selected for transformation are numeric. + col_names <- recipes_eval_select(x$terms, training, info) + recipes::check_type(training[, col_names], types = c("double", "integer")) + + yj_params <- compute_yj_params( + training, + col_names, + x$limits, + x$num_unique, + x$na_fill, + x$na_rm, + key_colnames(training, exclude = "time_value") + ) + + step_epi_YeoJohnson_new( + terms = x$terms, + role = x$role, + trained = TRUE, + yj_params = yj_params, + na_fill = x$na_fill, + limits = x$limits, + num_unique = x$num_unique, + na_rm = x$na_rm, + forecast_date = attr(training, "metadata")$as_of, + metadata = attr(training, "metadata"), + columns = col_names, + skip = x$skip, + id = x$id + ) +} + +#' @export +bake.step_epi_YeoJohnson <- function(object, new_data, ...) { + # If not an epi_df, make it one assuming the template of training data. + # If it is an epi_df, check that the keys match. + # Imitating the pattern in step_adjust_latency(). + if (!inherits(new_data, "epi_df") || is.null(attr(new_data, "metadata")$as_of)) { + new_data <- as_epi_df( + new_data, + as_of = object$forecast_date, + other_keys = object$metadata$other_keys %||% character() + ) + attr(new_data, "metadata") <- object$metadata + } + # Check that the columns for transformation are present in new_data. + if (!all(object$columns %in% colnames(new_data))) { + cli::cli_abort( + "The columns for transformation are not present in the new data." + ) + } + # Check that the columns for transformation are present in new_data. + col_names <- object$columns + check_new_data(col_names, object, new_data) + + # Check that the keys match. 
+ check <- hardhat::check_column_names(new_data, object$yj_params %>% select(-starts_with(".yj_param_")) %>% colnames()) + if (!check$ok) { + cli_abort(c( + "Some variables used for training are not available in {.arg x}.", + i = "The following required columns are missing: {check$missing_names}" + )) + } + # Transform each column, using the appropriate yj_param column per row. + new_data <- left_join(new_data, object$yj_params, by = key_colnames(new_data, exclude = "time_value")) + for (col in col_names) { + new_data <- new_data %>% + mutate(!!col := yj_transform(!!sym(col), !!sym(paste0(".yj_param_", col)))) + } + # Remove the yj_param columns. + new_data %>% + select(-starts_with(".yj_param_")) %>% + ungroup() +} + +#' @export +print.step_epi_YeoJohnson <- function(x, width = max(20, options()$width - 39), ...) { + title <- "Yeo-Johnson transformation (see `yj_params` object for values) on " + print_epi_step(x$terms, x$terms, title = title, width = width) + invisible(x) +} + +# Compute the yj_param values per group for each column. +compute_yj_params <- function(training, col_names, limits, num_unique, na_fill, na_rm, epi_keys_checked) { + # Estimate the yj_param for each column, creating a .yj_param_ column for + # each. Note that estimate_yj() operates on each column. + yj_params <- training %>% + summarise( + across(all_of(col_names), ~ estimate_yj(.x, limits, num_unique, na_rm)), + .by = all_of(epi_keys_checked) + ) %>% + dplyr::rename_with(~ paste0(".yj_param_", .x), -all_of(epi_keys_checked)) + + # Check for NAs in any of the yj_param_ columns. + # EDIT: This warning was too noisy. Keeping code around, in case we want it. + # for (col in col_names) { + # if (any(is.na(values[[paste0(".yj_param_", col)]]))) { + # cli::cli_warn( + # c( + # x = "Yeo-Johnson parameter could not be estimated for some geos for {col}.", + # i = "Using parameter={x$na_fill} in these cases." 
+ # ), + # call = rlang::caller_call() + # ) + # } + # } + + # Fill in NAs with the default yj_param. + yj_params %>% + mutate(across(starts_with(".yj_param_"), \(col) ifelse(is.na(col), na_fill, col))) +} + + +yj_input_type_management <- function(x_in, lambda) { + if (x_in %>% inherits("quantile_pred")) { + x <- as.matrix(x_in) + if (length(lambda) == 1) { + lambda <- lambda %>% + rep(prod(dim(x))) %>% + matrix(dim(x)) + } else if (length(x_in) == length(lambda)) { + lambda <- lambda %>% + rep(dim(x)[[2]]) %>% + matrix(dim(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) + } + } else if (!inherits(x_in, "tbl_df") || is.data.frame(x_in)) { + x <- unlist(x_in, use.names = FALSE) + } else { + if (!is.vector(x_in)) { + x <- as.vector(x_in) + } else { + x <- x_in + } + } + + # these only apply if x_in isn't a quantile distribution + if (length(x) > 1 && length(lambda) == 1) { + lambda <- rep(lambda, length(x)) + } else if (length(x) != length(lambda)) { + cli::cli_abort("Length of `x` must be equal to length of `lambda`.", call = rlang::caller_call(n = 2)) + } + list(x, lambda) +} +### Code below taken from recipes::step_YeoJohnson. +### We keep "lambda" here, but above we renamed it to "yj_param". +### Modified yj_transform() to be vectorized in lambda. Also modified to work on distributions. 
+### https://github.com/tidymodels/recipes/blob/v1.1.1/R/YeoJohnson.R#L172 +# Yeo-Johnson transformation +yj_transform <- function(x_in, lambda, ind_neg = NULL, eps = 0.001) { + if (any(is.na(lambda))) { + cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call()) + } + x_lambda <- yj_input_type_management(x_in, lambda) + x <- x_lambda[[1]] + lambda <- x_lambda[[2]] + + transformed <- ifelse( + x < 0, + # for negative values we test if lambda is ~2 + ifelse( + abs(lambda - 2) < eps, + -log(abs(x) + 1), + -((abs(x) + 1)^(2 - lambda) - 1) / (2 - lambda) + ), + # for non-negative values we test if lambda is ~0 + ifelse( + abs(lambda) < eps, + log(abs(x) + 1), + ((abs(x) + 1)^lambda - 1) / lambda + ) + ) + + if (x_in %>% inherits("quantile_pred")) { + transformed <- transformed %>% quantile_pred(x_in %@% "quantile_levels") + } + transformed +} + +## Helper for the log-likelihood calc for eq 3.1 of Yeo, I. K., +## & Johnson, R. A. (2000). A new family of power transformations +## to improve normality or symmetry. Biometrika. page 957 +ll_yj <- function(lambda, y, ind_neg, const, eps = 0.001) { + n <- length(y) + y_t <- yj_transform(y, lambda, ind_neg) + # EDIT: Unused in the original recipes code. + # mu_t <- mean(y_t) + var_t <- var(y_t) * (n - 1) / n + res <- -.5 * n * log(var_t) + (lambda - 1) * const + res +} + +## eliminates missing data and returns -llh +yj_obj <- function(lam, dat, ind_neg, const) { + ll_yj(lambda = lam, y = dat, ind_neg = ind_neg, const = const) +} + +## estimates the values +estimate_yj <- function(dat, limits = c(-5, 5), num_unique = 5, na_rm = TRUE) { + na_rows <- which(is.na(dat)) + if (length(na_rows) > 0) { + if (na_rm) { + dat <- dat[-na_rows] + } else { + cli::cli_abort( + c( + x = "Missing values are not allowed for the YJ transformation.", + i = "See {.arg na_rm} option." 
+ ), + call = rlang::caller_call(n = 2) + ) + } + } + + eps <- .001 + if (length(unique(dat)) < num_unique) { + return(NA) + } + dat_neg <- dat < 0 + ind_neg <- list(is = which(dat_neg), not = which(!dat_neg)) + + const <- sum(sign(dat) * log(abs(dat) + 1)) + + suppressWarnings( + res <- optimize( + yj_obj, + interval = limits, + maximum = TRUE, + dat = dat, + ind_neg = ind_neg, + const = const, + tol = .0001 + ) + ) + lam <- res$maximum + if (abs(limits[1] - lam) <= eps | abs(limits[2] - lam) <= eps) { + lam <- NA + } + lam +} + +# Copied from recipes::tidy.step_BoxCox +# +#' @export +tidy.step_epi_YeoJohnson <- function(x, ...) { + if (is_trained(x)) { + res <- tibble( + terms = names(x$yj_params), + value = unname(x$yj_params) + ) + } else { + term_names <- sel2char(x$terms) + res <- tibble( + terms = term_names, + value = na_dbl + ) + } + res$id <- x$id + res +} diff --git a/air.toml b/air.toml new file mode 100644 index 00000000..6cb579db --- /dev/null +++ b/air.toml @@ -0,0 +1,2 @@ +[format] +line-width = 120 diff --git a/man/epipredict-vctrs.Rd b/man/epipredict-vctrs.Rd new file mode 100644 index 00000000..a4dabbfa --- /dev/null +++ b/man/epipredict-vctrs.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quantile_pred-methods.R +\name{epipredict-vctrs} +\alias{epipredict-vctrs} +\alias{vec_arith.quantile_pred} +\title{Internal vctrs methods} +\usage{ +\method{vec_arith}{quantile_pred}(op, x, y, ...) 
+} +\description{ +Internal vctrs methods +} +\keyword{internal} diff --git a/man/get_params_in_layer.Rd b/man/get_params_in_layer.Rd new file mode 100644 index 00000000..1d6c98ef --- /dev/null +++ b/man/get_params_in_layer.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_yeo_johnson.R +\name{get_params_in_layer} +\alias{get_params_in_layer} +\title{get the parameters used in the initial step} +\usage{ +get_params_in_layer( + workflow, + step_name = "epi_YeoJohnson", + param_name = "yj_params" +) +} +\arguments{ +\item{workflow}{the workflow to extract the parameters from} + +\item{step_name}{the name of the step to look for, as recognized by \code{detect_step}} + +\item{param_name}{the parameter to pull out of the step} +} +\description{ +get the parameters used in the initial step +} +\keyword{internal} diff --git a/man/layer_epi_YeoJohnson.Rd b/man/layer_epi_YeoJohnson.Rd new file mode 100644 index 00000000..0c35ee74 --- /dev/null +++ b/man/layer_epi_YeoJohnson.Rd @@ -0,0 +1,71 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/layer_yeo_johnson.R +\name{layer_epi_YeoJohnson} +\alias{layer_epi_YeoJohnson} +\title{Unormalizing transformation} +\usage{ +layer_epi_YeoJohnson( + frosting, + ..., + yj_params = NULL, + id = rand_id("epi_YeoJohnson") +) +} +\arguments{ +\item{frosting}{a \code{frosting} postprocessor. The layer will be added to the +sequence of operations for this frosting.} + +\item{...}{One or more selector functions to scale variables +for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} + +\item{yj_params}{A data frame of parameters to be used for inverting the +transformation. Typically set automatically. 
If you have done multiple +transformations such that the outcome variable name no longer contains the +column that this step transforms, then you should manually specify this to +be the parameters fit in the corresponding \code{step_epi_YeoJohnson}. For an +example where you wouldn't need to set this, if your output is +\code{ahead_7_cases} and \code{step_epi_YeoJohnson} transformed cases (possibly with +other columns), then you wouldn't need to set this. However if you have +renamed your output column to \code{diff_7}, then you will need to extract the \code{yj_params} from the step.} + +\item{id}{a random id string} +} +\value{ +an updated \code{frosting} postprocessor +} +\description{ +Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you +are using this step on a column that will eventually become the outcome +variable, you should make sure that the original name of that column is a +subset of the outcome variable name. \code{ahead_7_cases} when \code{cases} is +transformed will work well, while \code{ahead_7} will not. +} +\examples{ +library(dplyr) +jhu <- epidatasets::cases_deaths_subset \%>\% + filter(time_value > "2021-11-01", geo_value \%in\% c("ca", "ny")) \%>\% + select(geo_value, time_value, cases) + +# Create a recipe with a Yeo-Johnson transformation. +r <- epi_recipe(jhu) \%>\% + step_epi_YeoJohnson(cases) \%>\% + step_epi_lag(cases, lag = 0) \%>\% + step_epi_ahead(cases, ahead = 0, role = "outcome") \%>\% + step_epi_naomit() + +# Create a frosting layer that will undo the Yeo-Johnson transformation. +f <- frosting() \%>\% + layer_predict() \%>\% + layer_epi_YeoJohnson(.pred) + +# Create a workflow and fit it. +wf <- epi_workflow(r, linear_reg()) \%>\% + fit(jhu) \%>\% + add_frosting(f) + +# Forecast the workflow, which should reverse the Yeo-Johnson transformation. +forecast(wf) +# Compare to the original data. 
+jhu \%>\% filter(time_value == "2021-12-31") +forecast(wf) +} diff --git a/man/step_adjust_latency.Rd b/man/step_adjust_latency.Rd index 9e1bafbd..685f806b 100644 --- a/man/step_adjust_latency.Rd +++ b/man/step_adjust_latency.Rd @@ -267,8 +267,8 @@ while this will not: \if{html}{\out{
}}\preformatted{toy_recipe <- epi_recipe(toy_df) \%>\% step_epi_lag(a, lag=0) \%>\% step_adjust_latency(a, method = "extend_lags") -#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work with -#> modified data. +#> Warning: If `method` is "extend_lags" or "locf", then the previous `step_epi_lag`s won't work +#> with modified data. }\if{html}{\out{
}} If you create columns that you then apply lags to (such as diff --git a/man/step_epi_YeoJohnson.Rd b/man/step_epi_YeoJohnson.Rd new file mode 100644 index 00000000..cfe85169 --- /dev/null +++ b/man/step_epi_YeoJohnson.Rd @@ -0,0 +1,121 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/step_yeo_johnson.R +\name{step_epi_YeoJohnson} +\alias{step_epi_YeoJohnson} +\title{Yeo-Johnson transformation} +\usage{ +step_epi_YeoJohnson( + recipe, + ..., + role = "predictor", + trained = FALSE, + yj_params = NULL, + na_fill = 1/4, + limits = c(-5, 5), + num_unique = 5, + na_rm = TRUE, + skip = FALSE, + id = rand_id("epi_YeoJohnson") +) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the +sequence of operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[recipes:selections]{recipes::selections()}} for more details.} + +\item{role}{For model terms created by this step, what analysis role should +they be assigned? \code{lag} is default a predictor while \code{ahead} is an outcome.} + +\item{trained}{A logical for whether the selectors in \code{...} +have been resolved by \code{\link[=prep]{prep()}}.} + +\item{yj_params}{Internal. A numeric vector of transformation values. This +is \code{NULL} until computed by \code{\link[=prep]{prep()}}.} + +\item{na_fill}{A numeric value to fill in for any geos where a Yeo-Johnson +parameter cannot be estimated.} + +\item{limits}{A length 2 numeric vector defining the range to compute the +transformation parameter.} + +\item{num_unique}{An integer where data that have fewer than this many unique +values will not be evaluated for a transformation.} + +\item{na_rm}{A logical indicating whether missing values should be removed +before estimating the transformation parameter.} + +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? 
While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). +Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} + +\item{id}{A unique identifier for the step} +} +\value{ +An updated version of \code{recipe} with the new step added to the +sequence of any existing operations. +} +\description{ +\code{step_epi_YeoJohnson()} creates a \emph{specification} of a recipe step that will +transform data using a Yeo-Johnson transformation. This fork works with panel +data and is meant for epidata. +} +\details{ +The Yeo-Johnson transformation is a variance-stabilizing +transformation, similar to the Box-Cox but does not require the input +variables to be strictly positive. In the package, the partial +log-likelihood function is directly optimized within a reasonable set of +transformation values (which can be changed by the user). The optimization +finds a lambda parameter for each group in the data that minimizes the +variance of the transformed data. + +This transformation is typically done on the outcome variable +using the residuals for a statistical model (such as ordinary +least squares). Here, a simple null model (intercept only) is +used to apply the transformation to the \emph{predictor} +variables individually. This can have the effect of making the +variable distributions more symmetric. + +If the transformation parameters are estimated to be very +close to the bounds, or if the optimization fails, a value of +\code{NA} is used and no transformation is applied.
+} +\section{Tidying}{ +When you \code{\link[=tidy.recipe]{tidy()}} this step, a tibble is returned with +columns \code{terms}, \code{value}, and \code{id}: + +\describe{ +\item{terms}{character, the selectors or variables selected} +\item{value}{numeric, the lambda estimate} +\item{id}{character, id of this step} +} +} + +\examples{ +jhu <- cases_deaths_subset \%>\% + filter(time_value > "2021-01-01", geo_value \%in\% c("ca", "ny")) \%>\% + select(geo_value, time_value, cases) +filtered_data <- jhu + +r <- epi_recipe(filtered_data) \%>\% + step_epi_YeoJohnson(cases) +# View the recipe +r +# Fit the recipe +tr <- r \%>\% prep(filtered_data) +# View the parameter values +tr$steps[[1]]$yj_params +# View the transformed data +df <- tr \%>\% bake(filtered_data) +plot(density(df$cases)) +plot(density(filtered_data$cases)) +} +\references{ +Yeo, I. K., and Johnson, R. A. (2000). A new family of power +transformations to improve normality or symmetry. \emph{Biometrika}. +} +\concept{individual transformation steps} diff --git a/tests/testthat/_snaps/yeo-johnson.md b/tests/testthat/_snaps/yeo-johnson.md new file mode 100644 index 00000000..b3a42c24 --- /dev/null +++ b/tests/testthat/_snaps/yeo-johnson.md @@ -0,0 +1,16 @@ +# Yeo-Johnson transformation inverts correctly + + Code + yj_transform(x, c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`. + +--- + + Code + yj_transform(list(1, 2), c(1, 2, 3)) + Condition + Error: + ! Length of `x` must be equal to length of `lambda`.
+ diff --git a/tests/testthat/test-quantile_pred.R b/tests/testthat/test-quantile_pred.R index 70d7c71a..f5097c1d 100644 --- a/tests/testthat/test-quantile_pred.R +++ b/tests/testthat/test-quantile_pred.R @@ -81,6 +81,7 @@ test_that("unary math works on quantiles", { }) test_that("arithmetic works on quantiles", { + # Quantile and numeric arithmetic works dstn <- hardhat::quantile_pred( matrix(c(1:4, 8:11), nrow = 2, byrow = TRUE), 1:4 / 5 @@ -100,4 +101,28 @@ test_that("arithmetic works on quantiles", { expect_identical((1 / 4) * dstn, dstn2) expect_snapshot(error = TRUE, sum(dstn)) + + # Quantile and quantile arithmetic works + val <- c(1:4, 8:11) + dstn3 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + dstn4 <- hardhat::quantile_pred( + matrix(val + 2 * val, nrow = 2, byrow = TRUE), + 1:4 / 5 + ) + expect_identical(dstn3 + (2 * dstn3), dstn4) + + # Extrapolate when quantile_levels are not the same + val <- c(1:4, 8:11) + dstn5 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.1, 0.25, 0.5, 0.75) + ) + dstn6 <- hardhat::quantile_pred( + matrix(val, nrow = 2, byrow = TRUE), + c(0.25, 0.5, 0.75, 0.9) + ) + expect_identical((dstn5 + dstn6) %@% "quantile_levels", c(0.1, 0.25, 0.5, 0.75, 0.9)) }) diff --git a/tests/testthat/test-yeo-johnson.R b/tests/testthat/test-yeo-johnson.R new file mode 100644 index 00000000..c0bd12bd --- /dev/null +++ b/tests/testthat/test-yeo-johnson.R @@ -0,0 +1,193 @@ +test_that("Yeo-Johnson transformation inverts correctly", { + # Vectorized x and scalar lambda work + lambdas <- seq(-5, 5, 0.1) + x <- seq(-10, 10, 0.1) + expect_true( + map_lgl(lambdas, \(lambda) sum(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5) %>% + all() + ) + # Note that the special lambda values of 0 and 2 are covered by the tests + # below. 
+ # Vectorized x and lambda both work + x <- seq(-5, 5, 0.1) + expect_true( + sum(abs(yj_inverse(yj_transform(x, lambdas), lambdas) - x)) < 1e-5 + ) + + # also works on quantile distributions + x <- quantile_pred(matrix(c(-5, 1, 3, 0, 0.1, 0.5), nrow = 2, byrow = TRUE), c(0.01, 0.5, 0.7)) + x_back <- map( + lambdas, + \(lambda) mean(abs(yj_inverse(yj_transform(x, lambda), lambda) - x)) < 1e-5 + ) + expect_true(all(unlist(x_back))) + + # Get coverage on yj_input_type_management + # Breaks on bad length of lambda + expect_snapshot(error = TRUE, + yj_transform(x, c(1, 2, 3)) + ) + expect_snapshot(error = TRUE, + yj_transform(list(1, 2), c(1, 2, 3)) + ) + expect_true( + identical( + yj_input_type_management(list(1, 2, 3), c(1, 2, 3)), + list(c(1, 2, 3), c(1, 2, 3)) + ) + ) +}) + +test_that("Yeo-Johnson steps and layers invert each other", { + jhu <- epidatasets::cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + # Get some yj_param values + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + + # Check general yj_param values tibble structure + expect_true(".yj_param_cases" %in% names(tr$steps[[1]]$yj_params)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_cases)) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson() + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + dplyr::slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(cases = .pred) + expect_equal(out1, out2) + + # Make sure it works when there are multiple predictors + jhu_multi <- epidatasets::covid_case_death_rates_extended %>% + filter(time_value > "2021-01-01", geo_value %in% 
c("ca", "ny")) %>% + select(geo_value, time_value, case_rate, death_rate) + filtered_data <- jhu_multi + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(case_rate, death_rate) %>% + step_epi_lag(case_rate, death_rate, lag = 0) %>% + step_epi_ahead(case_rate, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + + # Check general yj_param values tibble structure + expect_true(".yj_param_case_rate" %in% names(tr$steps[[1]]$yj_params)) + expect_true(".yj_param_death_rate" %in% names(tr$steps[[1]]$yj_params)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_case_rate)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_death_rate)) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_epi_YeoJohnson() + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + select(-death_rate) %>% + dplyr::slice_max(time_value, by = geo_value) + out2 <- forecast(wf) %>% rename(case_rate = .pred) + expect_equal(out1, out2) +}) + +test_that("Yeo-Johnson layers work on quantiles", { + jhu <- epidatasets::cases_deaths_subset %>% + filter(time_value > "2021-01-01", geo_value %in% c("ca", "ny")) %>% + select(geo_value, time_value, cases) + filtered_data <- jhu + + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(cases) %>% + step_epi_lag(cases, lag = 0) %>% + step_epi_ahead(cases, ahead = 0, role = "outcome") %>% + step_epi_naomit() + + f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles() %>% + layer_epi_YeoJohnson() + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + dplyr::slice_max(time_value, by = geo_value) %>% + rename(.pred = cases) %>% + tidyr::expand_grid(.pred_distn_quantile_level = c(0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95)) %>% + mutate(.pred_distn_value = .pred) %>% + select(geo_value, time_value, .pred, .pred_distn_value, 
.pred_distn_quantile_level) + out2 <- forecast(wf) %>% pivot_quantiles_longer(.pred_distn) %>% as_tibble() + expect_equal(out1, out2) +}) + +test_that("Yeo-Johnson steps and layers invert each other when other_keys are present", { + # Small synthetic grad_employ_dataset version. + # fmt: skip + filtered_data <- tribble( + ~geo_value, ~age_group, ~edu_qual, ~time_value, ~med_income_2y, + "ca", "25-34", "bachelor", 2017, 50000, + "ca", "25-34", "bachelor", 2018, 50500, + "ca", "25-34", "bachelor", 2019, 51000, + "ca", "25-34", "bachelor", 2020, 51500, + "ca", "25-34", "bachelor", 2021, 52000, + "ca", "25-34", "bachelor", 2022, 52500, + "ca", "35-1000", "bachelor", 2017, 3e10, + "ca", "35-1000", "bachelor", 2018, 3e10 + 10, + "ca", "35-1000", "bachelor", 2019, 3e10 + 20, + "ca", "35-1000", "bachelor", 2020, 3e10 + 30, + "ca", "35-1000", "bachelor", 2021, 3e10 + 40, + "ca", "35-1000", "bachelor", 2022, 3e10 + 50, + "ca", "25-34", "master", 2017, 2 * 50000, + "ca", "25-34", "master", 2018, 2 * 50500, + "ca", "25-34", "master", 2019, 2 * 51000, + "ca", "25-34", "master", 2020, 2 * 51500, + "ca", "25-34", "master", 2021, 2 * 52000, + "ca", "25-34", "master", 2022, 2 * 52500, + "ca", "35-1000", "master", 2017, 2 * 3e10, + "ca", "35-1000", "master", 2018, 2 * (3e10 + 10), + "ca", "35-1000", "master", 2019, 2 * (3e10 + 20), + "ca", "35-1000", "master", 2020, 2 * (3e10 + 30), + "ca", "35-1000", "master", 2021, 2 * (3e10 + 40), + "ca", "35-1000", "master", 2022, 2 * (3e10 + 50) + ) %>% as_epi_df(other_keys = c("age_group", "edu_qual")) + + # Get some yj_param values + r <- epi_recipe(filtered_data) %>% + step_epi_YeoJohnson(med_income_2y) %>% + step_epi_lag(med_income_2y, lag = 0) %>% + step_epi_ahead(med_income_2y, ahead = 0, role = "outcome") %>% + step_epi_naomit() + tr <- r %>% prep(filtered_data) + expect_true(".yj_param_med_income_2y" %in% names(tr$steps[[1]]$yj_params)) + expect_true("geo_value" %in% names(tr$steps[[1]]$yj_params)) + expect_true("age_group" %in% 
names(tr$steps[[1]]$yj_params)) + expect_true("edu_qual" %in% names(tr$steps[[1]]$yj_params)) + expect_true(is.numeric(tr$steps[[1]]$yj_params$.yj_param_med_income_2y)) + + # Make sure that the inverse transformation works + f <- frosting() %>% + layer_predict() %>% + layer_residual_quantiles() %>% + layer_epi_YeoJohnson(.pred) + wf <- epi_workflow(r, linear_reg()) %>% + fit(filtered_data) %>% + add_frosting(f) + out1 <- filtered_data %>% + dplyr::slice_max(time_value, by = geo_value) %>% + select(geo_value, age_group, time_value, med_income_2y) %>% + arrange(geo_value, age_group, time_value) + out2 <- forecast(wf) %>% + rename(med_income_2y = .pred) %>% + select(geo_value, age_group, time_value, med_income_2y) %>% + arrange(geo_value, age_group, time_value) + expect_equal(out1, out2) +})