Skip to content

Commit

Permalink
Changes, bug fixes and new function.
Browse files Browse the repository at this point in the history
  • Loading branch information
NicChr committed Oct 16, 2024
1 parent 241ddb8 commit c92a545
Show file tree
Hide file tree
Showing 10 changed files with 396 additions and 180 deletions.
3 changes: 0 additions & 3 deletions CRAN-SUBMISSION

This file was deleted.

1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ export(levels_add_na)
export(levels_drop)
export(levels_drop_na)
export(levels_factor)
export(levels_lump)
export(levels_reorder)
export(levels_unused)
export(levels_used)
Expand Down
4 changes: 2 additions & 2 deletions R/dots.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,6 @@ named_list <- function(..., .keep_null = TRUE){
dots
}
# Return the expressions supplied through `...` as strings, one per argument,
# without evaluating them.
#
# `substitute(alist(...))` captures the call `alist(<expr1>, <expr2>, ...)`;
# dropping its first element removes the `alist` symbol and leaves only the
# argument expressions. Each expression is converted with `deparse2`
# (defined elsewhere; presumably a deparse wrapper - verify). The `""`
# template makes `vapply()` error unless every result is a single string,
# and `USE.NAMES = FALSE` drops any argument names from the result.
expr_names <- function(...){
  vapply(substitute(alist(...))[-1L], deparse2, "", USE.NAMES = FALSE)
}
106 changes: 105 additions & 1 deletion R/factors.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#' `levels_add_na()` which adds an explicit `NA` level,
#' `levels_drop_na()` which drops the `NA` level,
#' `levels_drop()` which drops unused factor levels,
#' `levels_lump()` which returns top n levels and lumps all others into the
#' same category,
#' and finally `levels_reorder()` which reorders the levels of `x`
#' based on `y` using the ordered median values of `y` for each level.
#'
Expand All @@ -32,6 +34,11 @@
#' `order_by`.
#' @param decreasing Should the reordered levels be in decreasing order?
#' Default is `FALSE`.
#' @param n Top n number of levels to calculate.
#' @param prop Top proportion of levels to calculate.
#' This is a proportion of the total unique levels in x.
#' @param other_category Name of 'other' category.
#' @param ties Ties method to use. See `?rank`.
#'
#' @details
#' This operates similarly to `collapse::qF()`. \cr
Expand All @@ -45,6 +52,37 @@
#' `factor_(as.POSIXct(1729984360, tz = "Europe/London") + 3600 *(1:5))`
#' produces 5 levels.
#'
#' `levels_lump()` is a cheaper version of `forcats::fct_lump_n()` but returns
#' levels in order of highest frequency to lowest. This can be very useful
#' for plotting.
#'
#'
#' @examples
#' library(cheapr)
#'
#' x <- factor_(sample(letters[sample.int(26, 10)], 100, TRUE), levels = letters)
#' x
#' # Used/unused levels
#'
#' levels_used(x)
#' levels_unused(x)
#'
#' # Drop unused levels
#' levels_drop(x)
#'
#' # Top 3 letters by frequency
#'
#' table(levels_lump(x, 3))
#'
#' # We can use levels_lump to create a generic top n function for non-factors too
#'
#' get_top_n <- function(x, n){
#' f <- levels_lump(factor_(x, order = FALSE), n = n)
#' new_df(value = levels(f),
#' count = tabulate(f))
#' }
#'
#' get_top_n(x, 5)
#' @export
#' @rdname factors
factor_ <- function(
Expand Down Expand Up @@ -198,7 +236,7 @@ levels_drop <- function(x){
if (length(which_used) == n_lvls){
x
} else {
out <- which_used[unclass(x)]
out <- collapse::fmatch(unclass(x), which_used, overid = 2L)
attributes(out) <- attributes(x)
attr(out, "levels") <- levels(x)[which_used]
out
Expand All @@ -220,6 +258,72 @@ levels_reorder <- function(x, order_by, decreasing = FALSE){
factor_(x, levels = ordered_levels)
}
}
# levels_lump <- function(x, n, prop, other_category = "Other",
# ties = c("min", "average", "first", "last", "random", "max")){
# check_is_factor(x)
# if (!missing(n) && !missing(prop)){
# stop("Please supply either n or prop, not both")
# }
# if (!missing(prop)){
# n <- floor(prop * length(levels(x)))
# }
# temp <- unclass(x)
# ties <- match.arg(ties)
# # counts <- collapse::GRPN(x, expand = FALSE)
# counts <- tabulate(x, length(levels(x)))
# if (ties == "min"){
# bound <- sort(counts, decreasing = TRUE)[min(n, length(counts))]
# top <- which_(counts >= bound)
# } else {
# rank <- rank(counts, ties.method = ties)
# top <- which_(rank <= n)
# }
# if (length(top) == length(counts)){
# x
# } else {
# lvls <- levels_factor(x)[top]
# out <- collapse::fmatch(x, lvls, nomatch = length(lvls) + 1L, overid = 2L)
# out[which_na(x)] <- NA
# out_levels <- c(factor_as_character(lvls), other_category)
# attr(out, "levels") <- out_levels
# class(out) <- "factor"
# out
# }
# }
# Keep the top `n` most frequent levels of a factor and lump every other
# level into a single `other_category` level.
#
# Exactly one of `n` (number of levels to keep) or `prop` (proportion of the
# total number of levels to keep, rounded down) must be supplied.
# Kept levels are ordered from most to least frequent, followed by
# `other_category`. If every level is kept, `x` is returned unchanged.
# `NA` values in `x` stay `NA` rather than being lumped.
#' @export
#' @rdname factors
levels_lump <- function(x, n, prop, other_category = "Other",
                        ties = c("min", "average", "first", "last", "random", "max")){
  check_is_factor(x)
  if (!missing(n) && !missing(prop)){
    stop("Please supply either n or prop, not both")
  }
  if (missing(n) && missing(prop)){
    stop("Please supply either n or prop")
  }
  if (!missing(prop)){
    n <- floor(prop * length(levels(x)))
  }
  # Guard against values that would be misused as subscripts below:
  # a negative `n` would act as a negative (exclusion) index and an `NA`
  # would propagate into the comparison against `bound`.
  if (!(is.numeric(n) && length(n) == 1L && !is.na(n) && n >= 0)){
    stop("n (or prop) must be a single non-negative number")
  }
  ties <- match.arg(ties)

  # Frequency of each level, then the levels' positions sorted by
  # decreasing frequency
  counts <- tabulate(x, length(levels(x)))
  o <- order(counts, decreasing = TRUE)
  sorted_counts <- counts[o]

  if (ties == "min"){
    # Keep every level whose count is at least that of the n-th most
    # frequent level, so levels tied with the n-th one are all kept
    bound <- sorted_counts[min(n, length(counts))]
    top <- which_(sorted_counts >= bound)
  } else {
    # Rank on negated counts so that rank 1 is the most frequent level
    ranks <- rank(-sorted_counts, ties.method = ties)
    top <- which_(ranks <= n)
  }

  if (length(top) == length(counts)){
    # Nothing to lump
    x
  } else {
    lvls <- levels_factor(x)[o][top]
    # Values not among the kept levels get the code just past the last kept
    # level, which is the position of `other_category`
    out <- collapse::fmatch(x, lvls, nomatch = length(lvls) + 1L, overid = 2L)
    # Missing values must stay missing instead of falling into 'other'
    out[which_na(x)] <- NA
    out_levels <- c(factor_as_character(lvls), other_category)
    attr(out, "levels") <- out_levels
    class(out) <- "factor"
    out
  }
}
# Generic factor conversion to data representation
factor_as_type <- function(x, type){
check_length(type, 1)
Expand Down
Loading

0 comments on commit c92a545

Please sign in to comment.