Open
Description
feature-request
It took me a minute to realize that the behavior of `step_cut` is different from that of `cut`.
`step_cut` expects explicit breaks, whereas `cut` will generate intervals when a single integer value is provided in `breaks`.
library('dplyr')
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library("tidymodels")
tidymodels_prefer()

# Two well-separated groups of x values (1-10 and 31-40).
df <- data.frame(x = c(1:10, 31:40), y = 1:20)

# Copy x into x_cut, then cut it while passing a single value as `breaks`.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
rec3 <- recipe(df) %>%
  step_mutate(x_cut = x) %>%
  step_cut(x_cut, breaks = 3, include_outside_range = TRUE) %>%
  prep()

# Inspect what the second step (step_cut) stored after prep().
tidy(rec3, 2)
#> # A tibble: 3 × 3
#> terms value id
#> <chr> <dbl> <chr>
#> 1 x_cut 1 cut_5ea2R
#> 2 x_cut 3 cut_5ea2R
#> 3 x_cut 40 cut_5ea2R
# Apply the prepped recipe, then add the result of cut() on the same x with
# a single integer for `breaks`, for a side-by-side comparison with x_cut.
bake(rec3, new_data = df) %>%
  mutate(
    dplyr_cut = cut(x, breaks = 3, include.lowest = TRUE)
  )
#> # A tibble: 20 × 4
#> x y x_cut dplyr_cut
#> <int> <int> <fct> <fct>
#> 1 1 1 [min,3] [0.961,14]
#> 2 2 2 [min,3] [0.961,14]
#> 3 3 3 [min,3] [0.961,14]
#> 4 4 4 (3,max] [0.961,14]
#> 5 5 5 (3,max] [0.961,14]
#> 6 6 6 (3,max] [0.961,14]
#> 7 7 7 (3,max] [0.961,14]
#> 8 8 8 (3,max] [0.961,14]
#> 9 9 9 (3,max] [0.961,14]
#> 10 10 10 (3,max] [0.961,14]
#> 11 31 11 (3,max] (27,40]
#> 12 32 12 (3,max] (27,40]
#> 13 33 13 (3,max] (27,40]
#> 14 34 14 (3,max] (27,40]
#> 15 35 15 (3,max] (27,40]
#> 16 36 16 (3,max] (27,40]
#> 17 37 17 (3,max] (27,40]
#> 18 38 18 (3,max] (27,40]
#> 19 39 19 (3,max] (27,40]
#> 20 40 20 (3,max] (27,40]
Created on 2024-11-19 with [reprex v2.1.1](https://reprex.tidyverse.org/)
Perhaps `step_cut` could gain an `n_breaks` option. It could also accept a vector or a named list, and simply apply the vector or named list to the list of variables.
# Pseudo-code for step_cut_n_breaks
#
# Divide the observed range of `var` into `n_breaks` equal-width intervals and
# pass the interior cut points on to step_cut().
#
# Arguments:
#   var                    Numeric vector whose range is divided.
#   n_breaks               Number of equal-width intervals to create.
#   include_outside_range  Forwarded unchanged to step_cut().
#
# Returns the result of the step_cut() call.
#
# NOTE(review): the original sketch passed an undefined `force_cuts` to
# step_cut(); `var` is assumed to be what was meant -- confirm.
step_cut_n_breaks <- function(var, n_breaks, include_outside_range) {
  var_min <- min(var)
  var_max <- max(var)

  # Width of one interval. Named `width` so it does not shadow base::diff().
  width <- (var_max - var_min) / n_breaks

  # Interval edges from the minimum to the maximum.
  edges <- seq(var_min, var_max, by = width)

  # Keep only the interior edges: the first and last edges are the observed
  # minimum and maximum, which are not passed as breaks.
  res_seq <- edges[-c(1, length(edges))]

  # Once the breaks have been computed, the existing step_cut functionality
  # can still be used:
  step_cut(
    var,
    breaks = res_seq,
    include_outside_range = include_outside_range
  )
}