1
1
# ' Unormalizing transformation
2
2
# '
3
- # ' Will undo a step_epi_YeoJohnson transformation.
3
+ # ' Will undo a step_epi_YeoJohnson transformation. For practical reasons, if you
4
+ # ' are using this step on a column that will eventually become the outcome
5
+ # ' variable, you should make sure that the original name of that column is a
6
+ # ' subset of the outcome variable name. `ahead_7_cases` when `cases` is
7
+ # ' transformed will work well, while `ahead_7` will not.
4
8
# '
5
9
# ' @inheritParams layer_population_scaling
6
- # ' @param yj_params Internal. A data frame of parameters to be used for
7
- # ' inverting the transformation.
8
- # ' @param by A (possibly named) character vector of variables to join by.
10
+ # ' @param yj_params A data frame of parameters to be used for inverting the
11
+ # ' transformation. Typically set automatically. If you have done multiple
12
+ # ' transformations such that the outcome variable name no longer contains the
13
+ # ' column that this step transforms, then you should manually specify this to
14
+ # ' be the parameters fit in the corresponding `step_epi_YeoJohnson`. For an
15
+ # ' example where you wouldn't need to set this, if your output is
16
+ # ' `ahead_7_cases` and `step_epi_YeoJohnson` transformed cases (possibly with
17
+ # ' other columns), then you wouldn't need to set this. However if you have
18
+ # ' renamed your output column to `diff_7`, then you will need to extract the `yj_params` from the step.
9
19
# '
10
20
# ' @return an updated `frosting` postprocessor
11
21
# ' @export
37
47
# ' # Compare to the original data.
38
48
# ' jhu %>% filter(time_value == "2021-12-31")
39
49
# ' forecast(wf)
40
- layer_epi_YeoJohnson <- function (frosting , ... , yj_params = NULL , by = NULL , id = rand_id(" epi_YeoJohnson" )) {
50
+ layer_epi_YeoJohnson <- function (frosting , ... , yj_params = NULL , id = rand_id(" epi_YeoJohnson" )) {
41
51
checkmate :: assert_tibble(yj_params , min.rows = 1 , null.ok = TRUE )
42
52
43
53
add_layer(
44
54
frosting ,
45
55
layer_epi_YeoJohnson_new(
46
56
yj_params = yj_params ,
47
- by = by ,
48
57
terms = dplyr :: enquos(... ),
49
58
id = id
50
59
)
51
60
)
52
61
}
53
62
54
- layer_epi_YeoJohnson_new <- function (yj_params , by , terms , id ) {
55
- layer(" epi_YeoJohnson" , yj_params = yj_params , by = by , terms = terms , id = id )
63
+ layer_epi_YeoJohnson_new <- function (yj_params , terms , id ) {
64
+ layer(" epi_YeoJohnson" , yj_params = yj_params , terms = terms , id = id )
56
65
}
57
66
58
67
# ' @export
59
68
# ' @importFrom workflows extract_preprocessor
60
69
slather.layer_epi_YeoJohnson <- function (object , components , workflow , new_data , ... ) {
61
70
rlang :: check_dots_empty()
62
71
63
- # TODO: We will error if we don't have a workflow. Write a check later.
64
-
65
- # Get the yj_params from the layer or from the workflow.
66
- yj_params <- object $ yj_params %|| % get_yj_params_in_layer(workflow )
67
-
68
- # If the by is not specified, try to infer it from the yj_params.
69
- if (is.null(object $ by )) {
70
- # Assume `layer_predict` has calculated the prediction keys and other
71
- # layers don't change the prediction key colnames:
72
- prediction_key_colnames <- names(components $ keys )
73
- lhs_potential_keys <- prediction_key_colnames
74
- rhs_potential_keys <- colnames(select(yj_params , - starts_with(" .yj_param_" )))
75
- object $ by <- intersect(lhs_potential_keys , rhs_potential_keys )
76
- suggested_min_keys <- setdiff(lhs_potential_keys , " time_value" )
77
- if (! all(suggested_min_keys %in% object $ by )) {
78
- cli_warn(
79
- c(
80
- " {setdiff(suggested_min_keys, object$by)} {?was an/were} epikey column{?s} in the predictions,
81
- but {?wasn't/weren't} found in the population `df`." ,
82
- " i" = " Defaulting to join by {object$by}" ,
83
- " >" = " Double-check whether column names on the population `df` match those expected in your predictions" ,
84
- " >" = " Consider using population data with breakdowns by {suggested_min_keys}" ,
85
- " >" = " Manually specify `by =` to silence"
86
- ),
87
- class = " epipredict__layer_population_scaling__default_by_missing_suggested_keys"
88
- )
89
- }
90
- }
72
+ # get the yj_params from the layer or from the workflow.
73
+ yj_params <-
74
+ object $ yj_params %|| %
75
+ get_params_in_layer(workflow , " epi_YeoJohnson" , " yj_params" )
91
76
92
77
# Establish the join columns.
93
- object $ by <- object $ by %|| %
94
- intersect(
95
- epi_keys_only(components $ predictions ),
96
- colnames(select(yj_params , - starts_with(" .yj_param_" )))
97
- )
98
- joinby <- list (x = names(object $ by ) %|| % object $ by , y = object $ by )
78
+ join_by_columns <- key_colnames(new_data , exclude = " time_value" ) %> % sort()
79
+ joinby <- list (x = join_by_columns , y = join_by_columns )
99
80
hardhat :: validate_column_names(components $ predictions , joinby $ x )
100
81
hardhat :: validate_column_names(yj_params , joinby $ y )
101
82
@@ -115,55 +96,15 @@ slather.layer_epi_YeoJohnson <- function(object, components, workflow, new_data,
115
96
# The `object$terms` is where the user specifies the columns they want to
116
97
# untransform. We need to match the outcomes with their yj_param columns in our
117
98
# parameter table and then apply the inverse transformation.
118
- if (identical(col_names , " .pred" )) {
119
- # In this case, we don't get a hint for the outcome column name, so we need
120
- # to infer it from the mold.
121
- if (length(components $ mold $ outcomes ) > 1 ) {
122
- cli_abort(" Only one outcome is allowed when specifying `.pred`." , call = rlang :: caller_env())
123
- }
124
- # `outcomes` is a vector of objects like ahead_1_cases, ahead_7_cases, etc.
125
- # We want to extract the cases part.
126
- outcome_cols <- names(components $ mold $ outcomes ) %> %
127
- stringr :: str_match(" ahead_\\ d+_(.*)" ) %> %
128
- magrittr :: extract(, 2 )
129
-
99
+ if (length(col_names ) == 0 ) {
100
+ # not specified by the user, so just modify everything starting with `.pred`
130
101
components $ predictions <- components $ predictions %> %
131
- mutate(.pred : = yj_inverse(.pred , !! sym(paste0(" .yj_param_" , outcome_cols ))))
132
- } else if (identical(col_names , character (0 ))) {
133
- # Wish I could suggest `all_outcomes()` here, but currently it's the same as
134
- # not specifying any terms. I don't want to spend time with dealing with
135
- # this case until someone asks for it.
136
- cli :: cli_abort(
137
- " Not specifying columns to layer Yeo-Johnson is not implemented.
138
- If you had a single outcome, you can use `.pred` as a column name.
139
- If you had multiple outcomes, you'll need to specify them like
140
- `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
141
- " ,
142
- call = rlang :: caller_env()
143
- )
102
+ mutate(across(starts_with(" .pred" ), \(.pred ) yj_inverse(.pred , .lambda ))) %> %
103
+ select(- .lambda )
144
104
} else {
145
- # In this case, we assume that the user has specified the columns they want
146
- # transformed here. We then need to determine the yj_param columns for each of
147
- # these columns. That is, we need to convert a vector of column names like
148
- # c(".pred_ahead_1_case_rate", ".pred_ahead_7_case_rate") to
149
- # c(".yj_param_ahead_1_case_rate", ".yj_param_ahead_7_case_rate").
150
- original_outcome_cols <- stringr :: str_match(col_names , " .pred_ahead_\\ d+_(.*)" )[, 2 ]
151
- outcomes_wout_ahead <- stringr :: str_match(names(components $ mold $ outcomes ), " ahead_\\ d+_(.*)" )[, 2 ]
152
- if (any(original_outcome_cols %nin % outcomes_wout_ahead )) {
153
- cli_abort(
154
- " All columns specified in `...` must be outcome columns.
155
- They must be of the form `.pred_ahead_1_<outcome_col>`, `.pred_ahead_7_<outcome_col>`, etc.
156
- " ,
157
- call = rlang :: caller_env()
158
- )
159
- }
160
-
161
- for (i in seq_along(col_names )) {
162
- col <- col_names [i ]
163
- yj_param_col <- paste0(" .yj_param_" , original_outcome_cols [i ])
164
- components $ predictions <- components $ predictions %> %
165
- mutate(!! sym(col ) : = yj_inverse(!! sym(col ), !! sym(yj_param_col )))
166
- }
105
+ components $ predictions <- components $ predictions %> %
106
+ mutate(across(all_of(col_names ), \(.pred ) yj_inverse(.pred , .lambda ))) %> %
107
+ select(- .lambda )
167
108
}
168
109
169
110
# Remove the yj_param columns.
@@ -182,75 +123,72 @@ print.layer_epi_YeoJohnson <- function(x, width = max(20, options()$width - 30),
182
123
# Inverse Yeo-Johnson transformation
183
124
#
184
125
# Inverse of `yj_transform` in step_yeo_johnson.R.
185
- yj_inverse <- function (x , lambda , eps = 0.001 ) {
126
+ yj_inverse <- function (x_in , lambda , eps = 0.001 ) {
186
127
if (any(is.na(lambda ))) {
187
- return (x )
188
- }
189
- if (length(x ) > 1 && length(lambda ) == 1 ) {
190
- lambda <- rep(lambda , length(x ))
191
- } else if (length(x ) != length(lambda )) {
192
- cli :: cli_abort(" Length of `x` must be equal to length of `lambda`." , call = rlang :: caller_fn())
193
- }
194
- if (! inherits(x , " tbl_df" ) || is.data.frame(x )) {
195
- x <- unlist(x , use.names = FALSE )
196
- } else {
197
- if (! is.vector(x )) {
198
- x <- as.vector(x )
199
- }
200
- }
201
-
202
- nn_inv_trans <- function (x , lambda ) {
203
- out <- double(length(x ))
204
- sm_lambdas <- abs(lambda ) < eps
205
- if (length(sm_lambdas ) > 0 ) {
206
- out [sm_lambdas ] <- exp(x [sm_lambdas ]) - 1
207
- }
208
- x <- x [! sm_lambdas ]
209
- lambda <- lambda [! sm_lambdas ]
210
- if (length(x ) > 0 ) {
211
- out [! sm_lambdas ] <- (lambda * x + 1 )^ (1 / lambda ) - 1
212
- }
213
- out
128
+ cli :: cli_abort(" `lambda` cannot be `NA`." , call = rlang :: caller_call())
214
129
}
215
-
216
- ng_inv_trans <- function (x , lambda ) {
217
- out <- double(length(x ))
218
- near2_lambdas <- abs(lambda - 2 ) < eps
219
- if (length(near2_lambdas ) > 0 ) {
220
- out [near2_lambdas ] <- - (exp(- x [near2_lambdas ]) - 1 )
221
- }
222
- x <- x [! near2_lambdas ]
223
- lambda <- lambda [! near2_lambdas ]
224
- if (length(x ) > 0 ) {
225
- out [! near2_lambdas ] <- - (((lambda - 2 ) * x + 1 )^ (1 / (2 - lambda )) - 1 )
226
- }
227
- out
228
- }
229
-
230
- dat_neg <- x < 0
231
- not_neg <- which(! dat_neg )
232
- is_neg <- which(dat_neg )
233
-
234
- if (length(not_neg ) > 0 ) {
235
- x [not_neg ] <- nn_inv_trans(x [not_neg ], lambda [not_neg ])
236
- }
237
-
238
- if (length(is_neg ) > 0 ) {
239
- x [is_neg ] <- ng_inv_trans(x [is_neg ], lambda [is_neg ])
130
+ x_lambda <- yj_input_type_management(x_in , lambda )
131
+ x <- x_lambda [[1 ]]
132
+ lambda <- x_lambda [[2 ]]
133
+ inv_x <- ifelse(
134
+ x < 0 ,
135
+ # negative values we test if lambda is ~2
136
+ ifelse(
137
+ abs(lambda - 2 ) < eps ,
138
+ - (exp(- x ) - 1 ),
139
+ - (((lambda - 2 ) * x + 1 )^ (1 / (2 - lambda )) - 1 )
140
+ ),
141
+ # non-negative values we test if lambda is ~0
142
+ ifelse(
143
+ abs(lambda ) < eps ,
144
+ (exp(x ) - 1 ),
145
+ (lambda * x + 1 )^ (1 / lambda ) - 1
146
+ )
147
+ )
148
+ if (x_in %> % inherits(" quantile_pred" )) {
149
+ inv_x <- inv_x %> % quantile_pred(x_in %@ % " quantile_levels" )
240
150
}
241
- x
151
+ inv_x
242
152
}
243
153
244
- get_yj_params_in_layer <- function (workflow ) {
154
+
155
+ # ' get the parameters used in the initial step
156
+ # '
157
+ # ' @param workflow the workflow to extract the parameters from
158
+ # ' @param step_name the name of the step to look for, as recognized by `detect_step`
159
+ # ' @param param_name the parameter to pull out of the step
160
+ # ' @keywords internal
161
+ get_params_in_layer <- function (workflow , step_name = " epi_YeoJohnson" , param_name = " yj_params" ) {
162
+ full_step_name <- glue :: glue(" step_{step_name}" )
245
163
this_recipe <- hardhat :: extract_recipe(workflow )
246
- if (! (this_recipe %> % recipes :: detect_step(" epi_YeoJohnson" ))) {
247
- cli_abort(" `layer_epi_YeoJohnson` requires `step_epi_YeoJohnson` in the recipe." , call = rlang :: caller_env())
164
+ if (! (this_recipe %> % recipes :: detect_step(step_name ))) {
165
+ cli_abort(" `layer_{step_name}` requires `step_{step_name}` in the recipe." , call = rlang :: caller_call())
166
+ }
167
+ outcomes <-
168
+ workflows :: extract_recipe(workflow )$ term_info %> %
169
+ filter(role == " outcome" ) %> %
170
+ pull(variable )
171
+ if (length(outcomes ) > 1 ) {
172
+ cli_abort(
173
+ " `layer_{step_name}` doesn't support multiple output columns.
174
+ This workflow produces {outcomes} as output columns." ,
175
+ call = rlang :: caller_call(),
176
+ class = " epipredict__layer_yeo_johnson_multi_outcome_error"
177
+ )
248
178
}
249
179
for (step in this_recipe $ steps ) {
250
- if (inherits(step , " step_epi_YeoJohnson" )) {
251
- yj_params <- step $ yj_params
180
+ # if it's a `step_name` step that also transforms a column that is a subset
181
+ # of the output column name
182
+ is_outcome_subset <- map_lgl(step $ columns , ~ grepl(.x , outcomes ))
183
+ if (inherits(step , full_step_name ) && any(is_outcome_subset )) {
184
+ params <- step [[param_name ]] %> %
185
+ select(
186
+ key_colnames(workflow $ original_data , exclude = " time_value" ),
187
+ contains(step $ columns [is_outcome_subset ])
188
+ ) %> %
189
+ rename(.lambda = contains(step $ columns ))
252
190
break
253
191
}
254
192
}
255
- yj_params
193
+ params
256
194
}
0 commit comments