Skip to content

feat: add step_/layer_ epi_YeoJohnson #451

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@
^.lintr$
^.venv$
^inst/templates$
^air\.toml$
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Imports:
recipes (>= 1.0.4),
rlang (>= 1.1.0),
stats,
stringr,
tibble,
tidyr,
tidyselect,
Expand Down
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ S3method(bake,check_enough_data)
S3method(bake,epi_recipe)
S3method(bake,step_adjust_latency)
S3method(bake,step_climate)
S3method(bake,step_epi_YeoJohnson)
S3method(bake,step_epi_ahead)
S3method(bake,step_epi_lag)
S3method(bake,step_epi_slide)
Expand Down Expand Up @@ -55,6 +56,7 @@ S3method(prep,check_enough_data)
S3method(prep,epi_recipe)
S3method(prep,step_adjust_latency)
S3method(prep,step_climate)
S3method(prep,step_epi_YeoJohnson)
S3method(prep,step_epi_ahead)
S3method(prep,step_epi_lag)
S3method(prep,step_epi_slide)
Expand All @@ -76,6 +78,7 @@ S3method(print,flatline)
S3method(print,frosting)
S3method(print,layer_add_forecast_date)
S3method(print,layer_add_target_date)
S3method(print,layer_epi_YeoJohnson)
S3method(print,layer_naomit)
S3method(print,layer_point_from_distn)
S3method(print,layer_population_scaling)
Expand All @@ -86,6 +89,7 @@ S3method(print,layer_threshold)
S3method(print,layer_unnest)
S3method(print,step_adjust_latency)
S3method(print,step_climate)
S3method(print,step_epi_YeoJohnson)
S3method(print,step_epi_ahead)
S3method(print,step_epi_lag)
S3method(print,step_epi_slide)
Expand All @@ -101,6 +105,7 @@ S3method(run_mold,default_epi_recipe_blueprint)
S3method(slather,layer_add_forecast_date)
S3method(slather,layer_add_target_date)
S3method(slather,layer_cdc_flatline_quantiles)
S3method(slather,layer_epi_YeoJohnson)
S3method(slather,layer_naomit)
S3method(slather,layer_point_from_distn)
S3method(slather,layer_population_scaling)
Expand All @@ -114,10 +119,12 @@ S3method(snap,quantile_pred)
S3method(tidy,check_enough_data)
S3method(tidy,frosting)
S3method(tidy,layer)
S3method(tidy,step_epi_YeoJohnson)
S3method(update,layer)
S3method(vec_arith,quantile_pred)
S3method(vec_arith.numeric,quantile_pred)
S3method(vec_arith.quantile_pred,numeric)
S3method(vec_arith.quantile_pred,quantile_pred)
S3method(vec_math,quantile_pred)
S3method(vec_proxy_equal,quantile_pred)
S3method(weighted_interval_score,quantile_pred)
Expand Down Expand Up @@ -176,6 +183,7 @@ export(layer)
export(layer_add_forecast_date)
export(layer_add_target_date)
export(layer_cdc_flatline_quantiles)
export(layer_epi_YeoJohnson)
export(layer_naomit)
export(layer_point_from_distn)
export(layer_population_scaling)
Expand Down Expand Up @@ -207,6 +215,7 @@ export(smooth_quantile_reg)
export(snap)
export(step_adjust_latency)
export(step_climate)
export(step_epi_YeoJohnson)
export(step_epi_ahead)
export(step_epi_lag)
export(step_epi_naomit)
Expand All @@ -227,6 +236,7 @@ import(epidatasets)
import(epiprocess)
import(parsnip)
import(recipes)
import(vctrs)
importFrom(checkmate,assert_class)
importFrom(checkmate,assert_numeric)
importFrom(checkmate,test_character)
Expand Down
6 changes: 4 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
`data(<dataset name>, package = "epidatasets")`, `epidatasets::<dataset name>`
or, after loading the package, the name of the dataset alone (#382).
- `step_adjust_latency()` no longer allows empty column selection.
- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`).
- Addresses upstream breaking changes from cmu-delphi/epiprocess#595 (`growth_rate()`).
`step_growth_rate()` has lost its `additional_gr_args_list` argument and now
has an `na_rm` argument.
- Moves `epiprocess` out of depends (#440). No internals have changed, but downstream
users may need to add `library(epiprocess)` to existing code.
- Removes dependence on the `distributional` package, replacing the quantiles
- Removes dependence on the `distributional` package, replacing the quantiles
with `hardhat::quantile_pred()`. Some associated functions are deprecated with
`lifecycle` messages.
- Rename `check_enough_train_data()` to `check_enough_data()`, and generalize it
Expand All @@ -38,6 +38,8 @@ Pre-1.0.0 numbering scheme: 0.x will indicate releases, while 0.0.x will indicat
- Replace `dist_quantiles()` with `hardhat::quantile_pred()`
- Allow `quantile()` to threshold to an interval if desired (#434)
- `arx_forecaster()` detects if there's enough data to predict
- Add `step_epi_YeoJohnson()` to perform a Yeo-Johnson transformation on the outcome variable.
- Add `layer_epi_YeoJohnson()` to undo a Yeo-Johnson transformation on the outcome variable in a forecast workflow.

## Bug fixes

Expand Down
194 changes: 194 additions & 0 deletions R/layer_yeo_johnson.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#' Undo a Yeo-Johnson transformation
#'
#' Reverses the transformation applied by `step_epi_YeoJohnson()`. When the
#' parameters are looked up automatically, the transformed column is matched
#' to the outcome by name: the original column name has to appear inside the
#' outcome variable name. For example, if `cases` was transformed, an outcome
#' called `ahead_7_cases` will be matched, while one called `ahead_7` will
#' not.
#'
#' @inheritParams layer_population_scaling
#' @param yj_params A tibble (with at least one row) of parameters used to
#'   invert the transformation, or `NULL` (the default) to have them taken
#'   from the `step_epi_YeoJohnson` in the workflow's recipe. Automatic lookup
#'   works when the outcome name still contains the transformed column name,
#'   e.g. the outcome is `ahead_7_cases` and `step_epi_YeoJohnson`
#'   transformed `cases` (possibly alongside other columns). If later steps
#'   renamed the outcome so that this no longer holds (say, to `diff_7`),
#'   extract the `yj_params` fitted by the corresponding
#'   `step_epi_YeoJohnson` and pass them here yourself.
#'
#' @return an updated `frosting` postprocessor
#' @export
#' @examples
#' library(dplyr)
#' jhu <- epidatasets::cases_deaths_subset %>%
#'   filter(time_value > "2021-11-01", geo_value %in% c("ca", "ny")) %>%
#'   select(geo_value, time_value, cases)
#'
#' # Create a recipe with a Yeo-Johnson transformation.
#' r <- epi_recipe(jhu) %>%
#'   step_epi_YeoJohnson(cases) %>%
#'   step_epi_lag(cases, lag = 0) %>%
#'   step_epi_ahead(cases, ahead = 0, role = "outcome") %>%
#'   step_epi_naomit()
#'
#' # Create a frosting layer that will undo the Yeo-Johnson transformation.
#' f <- frosting() %>%
#'   layer_predict() %>%
#'   layer_epi_YeoJohnson(.pred)
#'
#' # Create a workflow and fit it.
#' wf <- epi_workflow(r, linear_reg()) %>%
#'   fit(jhu) %>%
#'   add_frosting(f)
#'
#' # Forecast the workflow, which should reverse the Yeo-Johnson transformation.
#' forecast(wf)
#' # Compare to the original data.
#' jhu %>% filter(time_value == "2021-12-31")
#' forecast(wf)
layer_epi_YeoJohnson <- function(frosting,
                                 ...,
                                 yj_params = NULL,
                                 id = rand_id("epi_YeoJohnson")) {
  # `yj_params` is optional, but when given it must be a non-empty tibble.
  checkmate::assert_tibble(yj_params, min.rows = 1, null.ok = TRUE)

  # Capture the column selection unevaluated; it is resolved against the
  # predictions when the layer is slathered.
  selected_terms <- dplyr::enquos(...)
  new_layer <- layer_epi_YeoJohnson_new(
    yj_params = yj_params,
    terms = selected_terms,
    id = id
  )
  add_layer(frosting, new_layer)
}

# Internal constructor: forwards the layer's fields to `layer()` under the
# name "epi_YeoJohnson".
layer_epi_YeoJohnson_new <- function(yj_params, terms, id) {
  layer(
    "epi_YeoJohnson",
    yj_params = yj_params,
    terms = terms,
    id = id
  )
}

#' @export
#' @importFrom workflows extract_preprocessor
slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data, ...) {
  # Applies the inverse Yeo-Johnson transformation to the selected prediction
  # columns of `components$predictions` and returns the updated `components`.
  rlang::check_dots_empty()

  # Prefer parameters stored on the layer; otherwise look them up from the
  # matching step in the workflow's recipe.
  # NOTE(review): the inverse below reads a `.lambda` column.
  # `get_params_in_layer()` produces it, so a user-supplied `yj_params`
  # presumably has to provide it as well -- confirm.
  yj_params <-
    object$yj_params %||%
    get_params_in_layer(workflow, "epi_YeoJohnson", "yj_params")

  # The parameters are matched to predictions on the key columns other than
  # `time_value`; both tables must contain all of them.
  join_by_columns <- key_colnames(new_data, exclude = "time_value") %>% sort()
  hardhat::validate_column_names(components$predictions, join_by_columns)
  hardhat::validate_column_names(yj_params, join_by_columns)

  # Join on exactly the validated key columns. The layer object has no `by`
  # field, so the previous `by = object$by` was always `NULL` and silently
  # fell back to a natural join over every shared column name.
  components$predictions <- inner_join(
    components$predictions,
    yj_params,
    by = join_by_columns,
    relationship = "many-to-one",
    unmatched = c("error", "drop")
  )

  exprs <- rlang::expr(c(!!!object$terms))
  pos <- tidyselect::eval_select(exprs, components$predictions)
  col_names <- names(pos)

  # `object$terms` is where the user specifies the columns they want to
  # untransform. Each selected column is inverted using the `.lambda` column
  # brought in by the join, which is dropped afterwards.
  if (length(col_names) == 0) {
    # Nothing selected by the user: modify everything starting with `.pred`.
    components$predictions <- components$predictions %>%
      mutate(across(starts_with(".pred"), \(.pred) yj_inverse(.pred, .lambda))) %>%
      select(-.lambda)
  } else {
    components$predictions <- components$predictions %>%
      mutate(across(all_of(col_names), \(.pred) yj_inverse(.pred, .lambda))) %>%
      select(-.lambda)
  }

  # Remove any remaining yj_param columns.
  components$predictions <- components$predictions %>%
    select(-any_of(starts_with(".yj_param_"))) %>%
    ungroup()
  components
}

#' @export
print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30), ...) {
  # Delegate to the shared layer printer, listing the layer's terms after a
  # fixed title.
  print_layer(
    x$terms,
    title = "Yeo-Johnson transformation (see `yj_params` object for values) on ",
    width = width
  )
}

# Inverse Yeo-Johnson transformation
#
# Undoes `yj_transform` (see step_yeo_johnson.R). `x_in` holds the transformed
# values, `lambda` the transformation parameter(s), and `eps` the tolerance
# used to decide whether `lambda` is treated as exactly 0 or exactly 2.
# Aborts if any `lambda` is `NA`. A `quantile_pred` input is returned as a
# `quantile_pred` with the same quantile levels.
yj_inverse <- function(x_in, lambda, eps = 0.001) {
  if (anyNA(lambda)) {
    cli::cli_abort("`lambda` cannot be `NA`.", call = rlang::caller_call())
  }

  # Bring the values and the parameters into compatible forms.
  managed <- yj_input_type_management(x_in, lambda)
  vals <- managed[[1]]
  lam <- managed[[2]]

  # Inverse for negative transformed values; lambda ~ 2 is the special case.
  from_negative <- ifelse(
    abs(lam - 2) < eps,
    -(exp(-vals) - 1),
    -(((lam - 2) * vals + 1)^(1 / (2 - lam)) - 1)
  )
  # Inverse for non-negative transformed values; lambda ~ 0 is the special
  # case.
  from_non_negative <- ifelse(
    abs(lam) < eps,
    (exp(vals) - 1),
    (lam * vals + 1)^(1 / lam) - 1
  )
  result <- ifelse(vals < 0, from_negative, from_non_negative)

  # Restore the quantile_pred wrapper if that is what we were given.
  if (inherits(x_in, "quantile_pred")) {
    result <- quantile_pred(result, x_in %@% "quantile_levels")
  }
  result
}


#' Get the parameters fitted by the matching recipe step
#'
#' Searches the workflow's recipe for the first `step_{step_name}` step that
#' transformed a column whose name is contained in the (single) outcome
#' variable name, and returns that step's parameters with the parameter column
#' renamed to `.lambda`, alongside the key columns other than `time_value`.
#'
#' Aborts if the recipe has no such step type, if the recipe has more than one
#' outcome, or if no step of that type transforms a column matching the
#' outcome name.
#'
#' @param workflow the workflow to extract the parameters from
#' @param step_name the name of the step to look for, as recognized by `detect_step`
#' @param param_name the parameter to pull out of the step
#' @keywords internal
get_params_in_layer <- function(workflow, step_name = "epi_YeoJohnson", param_name = "yj_params") {
  full_step_name <- glue::glue("step_{step_name}")
  this_recipe <- hardhat::extract_recipe(workflow)
  if (!(this_recipe %>% recipes::detect_step(step_name))) {
    cli_abort("`layer_{step_name}` requires `step_{step_name}` in the recipe.", call = rlang::caller_call())
  }
  outcomes <-
    this_recipe$term_info %>%
    filter(role == "outcome") %>%
    pull(variable)
  if (length(outcomes) > 1) {
    cli_abort(
      "`layer_{step_name}` doesn't support multiple output columns.
This workflow produces {outcomes} as output columns.",
      call = rlang::caller_call(),
      class = "epipredict__layer_yeo_johnson_multi_outcome_error"
    )
  }
  for (step in this_recipe$steps) {
    # Only steps of the requested type are candidates; check the class first
    # so that unrelated steps' `columns` are never inspected.
    if (!inherits(step, full_step_name)) {
      next
    }
    # Literal substring match (`fixed = TRUE`) so column names containing
    # regex metacharacters such as `.` are not interpreted as patterns.
    # `any()` also makes a recipe without an outcome fall through to the
    # not-found error below instead of failing inside the mapper.
    is_outcome_subset <- vapply(
      step$columns,
      \(column) any(grepl(column, outcomes, fixed = TRUE)),
      logical(1)
    )
    if (any(is_outcome_subset)) {
      matched_columns <- step$columns[is_outcome_subset]
      params <- step[[param_name]] %>%
        select(
          all_of(key_colnames(workflow$original_data, exclude = "time_value")),
          contains(matched_columns)
        ) %>%
        rename(.lambda = contains(matched_columns))
      return(params)
    }
  }
  # No step matched: fail with an actionable message rather than the
  # "object 'params' not found" error the unassigned variable used to cause.
  cli_abort(
    "`layer_{step_name}` could not find a `step_{step_name}` that transforms
a column whose name is contained in the outcome name. Supply `{param_name}`
to the layer manually.",
    call = rlang::caller_call()
  )
}
20 changes: 19 additions & 1 deletion R/quantile_pred-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,6 @@ vec_proxy_equal.quantile_pred <- function(x, ...) {
dplyr::select(-.row)
}


# quantiles by treating quantile_pred like a distribution -----------------


Expand Down Expand Up @@ -287,13 +286,32 @@ vec_math.quantile_pred <- function(.fn, .x, ...) {
quantile_pred(.fn(.x), quantile_levels)
}

#' Internal vctrs methods
#'
#' @import vctrs
#' @keywords internal
#' @name epipredict-vctrs

#' @importFrom vctrs vec_arith vec_arith.numeric
#' @export
#' @method vec_arith quantile_pred
vec_arith.quantile_pred <- function(op, x, y, ...) {
  # Second stage of the dispatch: with `x` already known to be a
  # quantile_pred, select the `vec_arith.quantile_pred.<class>` method from
  # the class of `y`.
  UseMethod("vec_arith.quantile_pred", y)
}


#' @export
#' @method vec_arith.quantile_pred quantile_pred
vec_arith.quantile_pred.quantile_pred <- function(op, x, y, ...) {
  # Arithmetic between two quantile_pred vectors. `op` is the name of the
  # operator, looked up in base; the result is a quantile_pred at the union of
  # both operands' quantile levels.
  #
  # The union is sorted because concatenating the two level sets can leave
  # them out of order (e.g. c(0.1, 0.9) with 0.5), and `quantile_pred()`
  # expects increasing quantile levels.
  all_quantiles <- sort(unique(c(x %@% "quantile_levels", y %@% "quantile_levels")))
  op_fn <- getExportedValue("base", op)
  # Interpolate/extrapolate to the same quantiles
  x <- quantile.quantile_pred(x, all_quantiles)
  y <- quantile.quantile_pred(y, all_quantiles)
  out <- op_fn(x, y, ...)
  quantile_pred(out, all_quantiles)
}

#' @export
#' @method vec_arith.quantile_pred numeric
vec_arith.quantile_pred.numeric <- function(op, x, y, ...) {
Expand Down
1 change: 0 additions & 1 deletion R/step_adjust_latency.R
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,6 @@ step_adjust_latency_new <-
# lags introduces max(lags) NA's after the max_time_value.
#' @export
#' @importFrom glue glue
#' @importFrom dplyr rowwise
prep.step_adjust_latency <- function(x, training, info = NULL, ...) {
latency <- x$latency
col_names <- recipes::recipes_eval_select(x$terms, training, info)
Expand Down
Loading