diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION deleted file mode 100644 index 724a32c..0000000 --- a/CRAN-SUBMISSION +++ /dev/null @@ -1,3 +0,0 @@ -Version: 0.9.9 -Date: 2024-10-14 08:25:15 UTC -SHA: 07d5eded069746483915cdb0e6f5240527d25c63 diff --git a/NAMESPACE b/NAMESPACE index 56ac1e9..1c8bbbf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -81,6 +81,7 @@ export(levels_add_na) export(levels_drop) export(levels_drop_na) export(levels_factor) +export(levels_lump) export(levels_reorder) export(levels_unused) export(levels_used) diff --git a/R/dots.R b/R/dots.R index 49dd5e0..57b1285 100644 --- a/R/dots.R +++ b/R/dots.R @@ -30,6 +30,6 @@ named_list <- function(..., .keep_null = TRUE){ dots } expr_names <- function(...){ - as.character(substitute(c(...))[-1L]) - # vapply(substitute(alist(...))[-1L], deparse2, "", USE.NAMES = FALSE) + # as.character(substitute(c(...))[-1L]) + vapply(substitute(alist(...))[-1L], deparse2, "", USE.NAMES = FALSE) } diff --git a/R/factors.R b/R/factors.R index 80177e4..302ccd7 100644 --- a/R/factors.R +++ b/R/factors.R @@ -12,6 +12,8 @@ #' `levels_add_na()` which adds an explicit `NA` level, #' `levels_drop_na()` which drops the `NA` level, #' `levels_drop()` which drops unused factor levels, +#' `levels_lump()` which returns top n levels and lumps all others into the +#' same category, #' and finally `levels_reorder()` which reorders the levels of `x` #' based on `y` using the ordered median values of `y` for each level. #' @@ -32,6 +34,11 @@ #' `order_by`. #' @param decreasing Should the reordered levels be in decreasing order? #' Default is `FALSE`. +#' @param n Top n number of levels to calculate. +#' @param prop Top proportion of levels to calculate. +#' This is a proportion of the total unique levels in x. +#' @param other_category Name of 'other' category. +#' @param ties Ties method to use. See `?rank`. #' #' @details #' This operates similarly to `collapse::qF()`. 
\cr @@ -45,6 +52,37 @@ #' `factor_(as.POSIXct(1729984360, tz = "Europe/London") + 3600 *(1:5))` #' produces 5 levels. #' +#' `levels_lump()` is a cheaper version of `forcats::lump_n()` but returns +#' levels in order of highest frequency to lowest. This can be very useful +#' for plotting. +#' +#' +#' @examples +#' library(cheapr) +#' +#' x <- factor_(sample(letters[sample.int(26, 10)], 100, TRUE), levels = letters) +#' x +#' # Used/unused levels +#' +#' levels_used(x) +#' levels_unused(x) +#' +#' # Drop unused levels +#' levels_drop(x) +#' +#' # Top 3 letters by frequency +#' +#' table(levels_lump(x, 3)) +#' +#' # We can use levels_lump to create a generic top n function for non-factors too +#' +#' get_top_n <- function(x, n){ +#' f <- levels_lump(factor_(x, order = FALSE), n = n) +#' new_df(value = levels(f), +#' count = tabulate(f)) +#' } +#' +#' get_top_n(x, 5) #' @export #' @rdname factors factor_ <- function( @@ -198,7 +236,7 @@ levels_drop <- function(x){ if (length(which_used) == n_lvls){ x } else { - out <- which_used[unclass(x)] + out <- collapse::fmatch(unclass(x), which_used, overid = 2L) attributes(out) <- attributes(x) attr(out, "levels") <- levels(x)[which_used] out @@ -220,6 +258,72 @@ levels_reorder <- function(x, order_by, decreasing = FALSE){ factor_(x, levels = ordered_levels) } } +# levels_lump <- function(x, n, prop, other_category = "Other", +# ties = c("min", "average", "first", "last", "random", "max")){ +# check_is_factor(x) +# if (!missing(n) && !missing(prop)){ +# stop("Please supply either n or prop, not both") +# } +# if (!missing(prop)){ +# n <- floor(prop * length(levels(x))) +# } +# temp <- unclass(x) +# ties <- match.arg(ties) +# # counts <- collapse::GRPN(x, expand = FALSE) +# counts <- tabulate(x, length(levels(x))) +# if (ties == "min"){ +# bound <- sort(counts, decreasing = TRUE)[min(n, length(counts))] +# top <- which_(counts >= bound) +# } else { +# rank <- rank(counts, ties.method = ties) +# top <- which_(rank <= n) +# } 
+# if (length(top) == length(counts)){ +# x +# } else { +# lvls <- levels_factor(x)[top] +# out <- collapse::fmatch(x, lvls, nomatch = length(lvls) + 1L, overid = 2L) +# out[which_na(x)] <- NA +# out_levels <- c(factor_as_character(lvls), other_category) +# attr(out, "levels") <- out_levels +# class(out) <- "factor" +# out +# } +# } +#' @export +#' @rdname factors +levels_lump <- function(x, n, prop, other_category = "Other", + ties = c("min", "average", "first", "last", "random", "max")){ + check_is_factor(x) + if (!missing(n) && !missing(prop)){ + stop("Please supply either n or prop, not both") + } + if (!missing(prop)){ + n <- floor(prop * length(levels(x))) + } + ties <- match.arg(ties) + counts <- tabulate(x, length(levels(x))) + o <- order(counts, decreasing = TRUE) + sorted_counts <- counts[o] + if (ties == "min"){ + bound <- sorted_counts[min(n, length(counts))] + top <- which_(sorted_counts >= bound) + } else { + rank <- rank(-sorted_counts, ties.method = ties) + top <- which_(rank <= n) + } + if (length(top) == length(counts)){ + x + } else { + lvls <- levels_factor(x)[o][top] + out <- collapse::fmatch(x, lvls, nomatch = length(lvls) + 1L, overid = 2L) + out[which_na(x)] <- NA + out_levels <- c(factor_as_character(lvls), other_category) + attr(out, "levels") <- out_levels + class(out) <- "factor" + out + } +} # Generic factor conversion to data representation factor_as_type <- function(x, type){ check_length(type, 1) diff --git a/R/overview.R b/R/overview.R index e08eea9..a3f2dcc 100644 --- a/R/overview.R +++ b/R/overview.R @@ -36,71 +36,71 @@ #' options(cheapr.digits = 2) # The default #' @rdname overview #' @export -overview <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ UseMethod("overview") } #' @rdname overview #' @export -overview.default <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.default <- function(x, hist = 
TRUE, digits = getOption("cheapr.digits", 2)){ overview(new_df(x = x), hist = hist, digits = digits) } #' @rdname overview #' @export -overview.logical <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.logical <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ overview(new_df(x = as.logical(x)), hist = hist, digits = digits) } #' @rdname overview #' @export -overview.integer <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.integer <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(new_df(x = as.integer(x)), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.numeric <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.numeric <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(new_df(x = as.numeric(x)), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.integer64 <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.integer64 <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(cpp_int64_to_numeric(x), hist = hist, digits = digits) out$numeric$class <- class(x)[1] out } #' @rdname overview #' @export -overview.character <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.character <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(new_df(x = as.character(x)), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.factor <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.factor <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(new_df(x = as.factor(x)), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.Date <- function(x, 
hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.Date <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(new_df(x = as.Date(x)), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.POSIXt <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.POSIXt <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(new_df(x = as.POSIXct(x)), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.ts <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.ts <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ out <- overview(transform_all(as.data.frame(x), as.numeric), hist = hist, digits = digits) out$time_series <- out$numeric out$numeric <- sset(out$numeric, 0) @@ -112,7 +112,7 @@ overview.ts <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) overview.zoo <- overview.ts #' @rdname overview #' @export -overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ +overview.data.frame <- function(x, hist = TRUE, digits = getOption("cheapr.digits", 2)){ check_is_df(x) N <- nrow(x) num_cols <- ncol(x) @@ -340,75 +340,101 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi } #' @export print.overview <- function(x, max = NULL, ...){ - digits <- x[["print_digits"]] + temp <- unclass(x) + digits <- temp[["print_digits"]] pretty_round <- function(x, decimal_digits = digits, ...){ pretty_num(round(x, digits = decimal_digits), ...) 
} - cat(paste("obs:", x$obs, "\ncols:", x$cols), "\n") - if (nrow(x$logical)){ - x$logical$p_complete <- pretty_round(x$logical$p_complete) + abbr <- function(x, min = 6, left = TRUE){ + abbreviate(x, minlength = min, named = FALSE, + method = if (left) "left.kept" else "both.sides") + } + cat(paste("obs:", temp$obs, "\ncols:", temp$cols), "\n") + if (nrow(temp$logical)){ + temp$logical$p_complete <- pretty_round(temp$logical$p_complete) cat("\n----- Logical -----\n") - print(x$logical) + names(temp$logical) <- abbr(names(temp$logical)) + temp$logical$class <- abbr(temp$logical$class, 5, FALSE) + temp$logical$col <- abbr(temp$logical$col, 10, FALSE) + print(temp$logical) } - if (nrow(x$numeric)){ - x$numeric$p_complete <- pretty_round(x$numeric$p_complete) - x$numeric$mean <- pretty_round(x$numeric$mean) - x$numeric$p0 <- pretty_round(x$numeric$p0) - x$numeric$p25 <- pretty_round(x$numeric$p25) - x$numeric$p50 <- pretty_round(x$numeric$p50) - x$numeric$p75 <- pretty_round(x$numeric$p75) - x$numeric$p100 <- pretty_round(x$numeric$p100) - x$numeric$iqr <- pretty_round(x$numeric$iqr) - x$numeric$sd <- pretty_round(x$numeric$sd) + if (nrow(temp$numeric)){ + temp$numeric$p_complete <- pretty_round(temp$numeric$p_complete) + temp$numeric$mean <- pretty_round(temp$numeric$mean) + temp$numeric$p0 <- pretty_round(temp$numeric$p0) + temp$numeric$p25 <- pretty_round(temp$numeric$p25) + temp$numeric$p50 <- pretty_round(temp$numeric$p50) + temp$numeric$p75 <- pretty_round(temp$numeric$p75) + temp$numeric$p100 <- pretty_round(temp$numeric$p100) + temp$numeric$iqr <- pretty_round(temp$numeric$iqr) + temp$numeric$sd <- pretty_round(temp$numeric$sd) cat("\n----- Numeric -----\n") - print(x$numeric) + names(temp$numeric) <- abbr(names(temp$numeric)) + temp$numeric$class <- abbr(temp$numeric$class, 5, FALSE) + temp$numeric$col <- abbr(temp$numeric$col, 10, FALSE) + print(temp$numeric) } - if (nrow(x$date)){ - x$date$p_complete <- pretty_round(x$date$p_complete) + if 
(nrow(temp$date)){ + temp$date$p_complete <- pretty_round(temp$date$p_complete) cat("\n----- Dates -----\n") - print(x$date) + names(temp$date) <- abbr(names(temp$date)) + temp$date$class <- abbr(temp$date$class, 5, FALSE) + temp$date$col <- abbr(temp$date$col, 10, FALSE) + print(temp$date) } - if (nrow(x$datetime)){ - x$datetime$p_complete <- pretty_round(x$datetime$p_complete) + if (nrow(temp$datetime)){ + temp$datetime$p_complete <- pretty_round(temp$datetime$p_complete) # An overview list contains a 'min' & 'max' variable of date-times # This is UTC because R can't handle a date-time with multiple time-zones # And so we want to print it in local-time - datetime_chr_min <- character(nrow(x$datetime)) - datetime_chr_max <- character(nrow(x$datetime)) - mins <- x[["datetime"]][["min"]] - maxs <- x[["datetime"]][["max"]] - tzones <- x[["datetime"]][["tzone"]] - for (i in seq_len(nrow(x$datetime))){ + datetime_chr_min <- character(nrow(temp$datetime)) + datetime_chr_max <- character(nrow(temp$datetime)) + mins <- temp[["datetime"]][["min"]] + maxs <- temp[["datetime"]][["max"]] + tzones <- temp[["datetime"]][["tzone"]] + for (i in seq_len(nrow(temp$datetime))){ datetime_chr_min[i] <- format(mins[i], tz = tzones[i]) datetime_chr_max[i] <- format(maxs[i], tz = tzones[i]) } - x$datetime$min <- datetime_chr_min - x$datetime$max <- datetime_chr_max + temp$datetime$min <- datetime_chr_min + temp$datetime$max <- datetime_chr_max cat("\n----- Date-Times -----\n") - print(x$datetime) + names(temp$datetime) <- abbr(names(temp$datetime)) + temp$datetime$class <- abbr(temp$datetime$class, 5, FALSE) + temp$datetime$col <- abbr(temp$datetime$col, 10, FALSE) + print(temp$datetime) } - if (nrow(x$time_series)){ - x$time_series$p_complete <- pretty_round(x$time_series$p_complete) - x$time_series$mean <- pretty_round(x$time_series$mean) - x$time_series$p0 <- pretty_round(x$time_series$p0) - x$time_series$p25 <- pretty_round(x$time_series$p25) - x$time_series$p50 <- 
pretty_round(x$time_series$p50) - x$time_series$p75 <- pretty_round(x$time_series$p75) - x$time_series$p100 <- pretty_round(x$time_series$p100) - x$time_series$iqr <- pretty_round(x$time_series$iqr) - x$time_series$sd <- pretty_round(x$time_series$sd) + if (nrow(temp$time_series)){ + temp$time_series$p_complete <- pretty_round(temp$time_series$p_complete) + temp$time_series$mean <- pretty_round(temp$time_series$mean) + temp$time_series$p0 <- pretty_round(temp$time_series$p0) + temp$time_series$p25 <- pretty_round(temp$time_series$p25) + temp$time_series$p50 <- pretty_round(temp$time_series$p50) + temp$time_series$p75 <- pretty_round(temp$time_series$p75) + temp$time_series$p100 <- pretty_round(temp$time_series$p100) + temp$time_series$iqr <- pretty_round(temp$time_series$iqr) + temp$time_series$sd <- pretty_round(temp$time_series$sd) cat("\n----- Time-Series -----\n") - print(x$time_series) + names(temp$time_series) <- abbr(names(temp$time_series)) + temp$time_series$class <- abbr(temp$time_series$class, 5, FALSE) + temp$time_series$col <- abbr(temp$time_series$col, 10, FALSE) + print(temp$time_series) } - if (nrow(x$categorical)){ - x$categorical$p_complete <- pretty_round(x$categorical$p_complete) + if (nrow(temp$categorical)){ + temp$categorical$p_complete <- pretty_round(temp$categorical$p_complete) cat("\n----- Categorical -----\n") - print(x$categorical) + names(temp$categorical) <- abbr(names(temp$categorical)) + temp$categorical$class <- abbr(temp$categorical$class, 5, FALSE) + temp$categorical$col <- abbr(temp$categorical$col, 10, FALSE) + print(temp$categorical) } - if (nrow(x$other)){ - x$other$p_complete <- pretty_round(x$other$p_complete) + if (nrow(temp$other)){ + temp$other$p_complete <- pretty_round(temp$other$p_complete) cat("\n----- Other -----\n") - print(x$other) + names(temp$other) <- abbr(names(temp$other)) + temp$other$class <- abbr(temp$other$class, 5, FALSE) + temp$other$col <- abbr(temp$other$col, 10, FALSE) + print(temp$other) } 
invisible(x) } @@ -455,10 +481,10 @@ pluck_row <- function(data, i = 1){ spark_bar <- function(x){ bars <- intToUtf8(c(9601L, 9602L, 9603L, 9605L, 9606L, 9607L), multiple = TRUE) - bar_codes <- findInterval( - x, vec = seq.int(0, to = 1, length.out = length(bars) + 1L), - rightmost.closed = TRUE, - left.open = FALSE, all.inside = FALSE + bar_codes <- bin( + x, seq.int(0, to = 1, length.out = length(bars) + 1L), + left_closed = TRUE, include_oob = TRUE, + include_endpoint = TRUE ) bar_codes[bar_codes == 0L] <- NA_integer_ out <- bars[bar_codes] @@ -470,7 +496,7 @@ inline_hist <- function(x, n_bins = 5L){ return(" ") } if (is.infinite(max(abs(collapse::frange(x, na.rm = TRUE))))) { - x[is.infinite(x)] <- NA + x[which_(is.infinite(x))] <- NA } if (all_na(x)) { return(" ") @@ -478,8 +504,10 @@ inline_hist <- function(x, n_bins = 5L){ if (allv2(na_rm(x), 0)) { x <- x + 1 } - hist_dt <- tabulate(cut_numeric(x, n_bins, labels = FALSE), - nbins = n_bins) + hist_dt <- tabulate( + bin(x, r_cut_breaks(x, n_bins), left_closed = FALSE), + nbins = n_bins + ) hist_dt <- hist_dt / max(hist_dt) spark_bar(hist_dt) } diff --git a/R/utils.R b/R/utils.R index c2ebba9..4a1c7f7 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,7 +1,7 @@ #' @noRd # Like deparse1 but has a cutoff in case of massive strings -deparse2 <- function(expr, collapse = " ", width.cutoff = 500L, nlines = 5L, ...){ +deparse2 <- function(expr, collapse = " ", width.cutoff = 500L, nlines = 10L, ...){ paste(deparse(expr, width.cutoff, nlines = nlines, ...), collapse = collapse) } @@ -148,3 +148,49 @@ n_dots <- function(...){ # Keep this in-case anyone was using it fill_with_na <- na_insert + +r_cut_breaks <- function(x, n){ + check_length(n, 1) + stopifnot(n >= 2) + breaks <- get_breaks(x, n, pretty = FALSE) + adj <- diff(range(breaks)) * 0.001 + breaks[1] <- breaks[1] - adj + breaks[length(breaks)] <- breaks[length(breaks)] + adj + breaks +} + + +# str_to_factor_size <- function(x){ +# size <- as.double(object.size(x)) +# 
lvls_size <- size - (8 * (length(x) - collapse::fnunique(x))) +# int_size <- 48 + (4 * length(x)) +# (48 * 7) + int_size + lvls_size +# } +# factor_to_str_size <- function(x){ +# lvls_size <- as.double(object.size(levels(x))) +# lvls_size + (8 * (length(x) - length(levels(x)))) +# } + +# vec_compress <- function(x){ +# if (inherits(x, "data.frame")){ +# for (i in seq_along(x)){ +# x[[i]] <- vec_compress(x[[i]]) +# } +# } +# if (is.factor(x)){ +# if (length(levels(x)) >= (length(x) / 2)){ +# x <- factor_as_character(x) +# } +# } else if (is.character(x)){ +# if (collapse::fnunique(x) < (length(x) / 2)){ +# x <- as_factor(x) +# } +# } else if ( +# is.double(x) && !is.object(x) && +# all_integerable(x) && +# cpp_all_whole_numbers(x, tol = sqrt(.Machine$double.eps), na_ignore = TRUE) +# ){ +# storage.mode(x) <- "integer" # This keeps matrix structure intact +# } +# x +# } diff --git a/man/factors.Rd b/man/factors.Rd index b2b8945..b3da87e 100644 --- a/man/factors.Rd +++ b/man/factors.Rd @@ -12,6 +12,7 @@ \alias{levels_drop_na} \alias{levels_drop} \alias{levels_reorder} +\alias{levels_lump} \title{A cheaper version of \code{factor()} along with cheaper utilities} \usage{ factor_( @@ -41,6 +42,14 @@ levels_drop_na(x) levels_drop(x) levels_reorder(x, order_by, decreasing = FALSE) + +levels_lump( + x, + n, + prop, + other_category = "Other", + ties = c("min", "average", "first", "last", "random", "max") +) } \arguments{ \item{x}{A vector.} @@ -65,6 +74,15 @@ Default is \code{TRUE}.} \item{decreasing}{Should the reordered levels be in decreasing order? Default is \code{FALSE}.} + +\item{n}{Top n number of levels to calculate.} + +\item{prop}{Top proportion of levels to calculate. +This is a proportion of the total unique levels in x.} + +\item{other_category}{Name of 'other' category.} + +\item{ties}{Ties method to use. See \code{?rank}.} } \value{ A \code{factor} or \code{character} in the case of \code{levels_used} and \code{levels_unused}. 
@@ -81,6 +99,8 @@ There are some additional utilities, most of which begin with the prefix \code{levels_add_na()} which adds an explicit \code{NA} level, \code{levels_drop_na()} which drops the \code{NA} level, \code{levels_drop()} which drops unused factor levels, +\code{levels_lump()} which returns top n levels and lumps all others into the +same category, and finally \code{levels_reorder()} which reorders the levels of \code{x} based on \code{y} using the ordered median values of \code{y} for each level. } @@ -95,4 +115,35 @@ Using a daylight savings example where the clocks go back: \cr produces 4 levels whereas \cr \code{factor_(as.POSIXct(1729984360, tz = "Europe/London") + 3600 *(1:5))} produces 5 levels. + +\code{levels_lump()} is a cheaper version of \code{forcats::lump_n()} but returns +levels in order of highest frequency to lowest. This can be very useful +for plotting. +} +\examples{ +library(cheapr) + +x <- factor_(sample(letters[sample.int(26, 10)], 100, TRUE), levels = letters) +x +# Used/unused levels + +levels_used(x) +levels_unused(x) + +# Drop unused levels +levels_drop(x) + +# Top 3 letters by frequency + +table(levels_lump(x, 3)) + +# We can use levels_lump to create a generic top n function for non-factors too + +get_top_n <- function(x, n){ +f <- levels_lump(factor_(x, order = FALSE), n = n) +new_df(value = levels(f), + count = tabulate(f)) +} + +get_top_n(x, 5) } diff --git a/man/overview.Rd b/man/overview.Rd index 4e745a0..ee1e726 100644 --- a/man/overview.Rd +++ b/man/overview.Rd @@ -16,31 +16,31 @@ \alias{overview.data.frame} \title{An alternative to \code{summary()} inspired by the skimr package} \usage{ -overview(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +overview(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{default}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{default}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{logical}(x, 
hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{logical}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{integer}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{integer}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{numeric}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{numeric}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{integer64}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{integer64}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{character}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{character}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{factor}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{factor}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{Date}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{Date}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{POSIXt}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{POSIXt}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{ts}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{ts}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{zoo}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{zoo}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) -\method{overview}{data.frame}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) +\method{overview}{data.frame}(x, hist = TRUE, digits = getOption("cheapr.digits", 2)) } \arguments{ \item{x}{A vector or data frame.} diff --git a/src/utils.cpp b/src/utils.cpp index 05027cb..d10e549 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -391,6 +391,10 @@ 
SEXP cpp_rev(SEXP x, bool set){ // break; // } // default: { +// if (is_int64(x)){ +// out = true; +// break; +// } // out = true; // double *p_x = REAL(x); // for (R_xlen_t i = 0; i < n; ++i) { diff --git a/tests/testthat/_snaps/overview.md b/tests/testthat/_snaps/overview.md index 49776e0..b49471a 100644 --- a/tests/testthat/_snaps/overview.md +++ b/tests/testthat/_snaps/overview.md @@ -7,20 +7,20 @@ cols: 6 ----- Numeric ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 - 1 Ozone integer 37 0.76 67 42.13 1 18 31.5 63.25 - 2 Solar.R integer 7 0.95 117 185.93 7 115.75 205 258.75 - 3 Wind numeric 0 1 31 9.96 1.7 7.4 9.7 11.5 - 4 Temp integer 0 1 40 77.88 56 72 79 85 - 5 Month integer 0 1 5 6.99 5 6 7 8 - 6 Day integer 0 1 31 15.8 1 8 16 23 - p100 iqr sd - 1 168 45.25 32.99 - 2 334 143 90.06 - 3 20.7 4.1 3.52 - 4 97 13 9.47 - 5 9 2 1.42 - 6 31 15 8.86 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr + 1 Ozone intgr 37 0.76 67 42.13 1 18 31.5 63.25 168 45.25 + 2 Solar.R intgr 7 0.95 117 185.93 7 115.75 205 258.75 334 143 + 3 Wind numrc 0 1 31 9.96 1.7 7.4 9.7 11.5 20.7 4.1 + 4 Temp intgr 0 1 40 77.88 56 72 79 85 97 13 + 5 Month intgr 0 1 5 6.99 5 6 7 8 9 2 + 6 Day intgr 0 1 31 15.8 1 8 16 23 31 15 + sd + 1 32.99 + 2 90.06 + 3 3.52 + 4 9.47 + 5 1.42 + 6 8.86 --- @@ -31,20 +31,15 @@ cols: 5 ----- Numeric ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 - 1 Sepal.Length numeric 0 1 35 5.84 4.3 5.1 5.8 6.4 7.9 - 2 Sepal.Width numeric 0 1 23 3.06 2 2.8 3 3.3 4.4 - 3 Petal.Length numeric 0 1 43 3.76 1 1.6 4.35 5.1 6.9 - 4 Petal.Width numeric 0 1 22 1.2 0.1 0.3 1.3 1.8 2.5 - iqr sd - 1 1.3 0.83 - 2 0.5 0.44 - 3 3.5 1.77 - 4 1.5 0.76 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr sd + 1 Sepl.Lngth numrc 0 1 35 5.84 4.3 5.1 5.8 6.4 7.9 1.3 0.83 + 2 Sepal.Wdth numrc 0 1 23 3.06 2 2.8 3 3.3 4.4 0.5 0.44 + 3 Petl.Lngth numrc 0 1 43 3.76 1 1.6 4.35 5.1 6.9 3.5 1.77 + 4 Petal.Wdth numrc 0 1 22 1.2 0.1 0.3 1.3 
1.8 2.5 1.5 0.76 ----- Categorical ----- - col class n_missing p_complete n_unique n_levels min max - 1 Species factor 0 1 3 3 setosa virginica + col class n_mssn p_cmpl n_uniq n_lvls min max + 1 Species factr 0 1 3 3 setosa virginica --- @@ -55,25 +50,20 @@ cols: 7 ----- Logical ----- - col class n_missing p_complete n_true n_false p_true - 1 large logical 0 1 24 76 0.24 + col class n_mssn p_cmpl n_true n_fals p_true + 1 large logcl 0 1 24 76 0.24 ----- Numeric ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 - 1 Sepal.Length numeric 0 1 28 5.47 4.3 5 5.4 5.9 - 2 Sepal.Width numeric 0 1 23 3.1 2 2.8 3.05 3.4 - 3 Petal.Length numeric 0 1 28 2.86 1 1.5 2.45 4.32 - 4 Petal.Width numeric 0 1 15 0.79 0.1 0.2 0.8 1.3 - p100 iqr sd - 1 7 0.9 0.64 - 2 4.4 0.6 0.48 - 3 5.1 2.83 1.45 - 4 1.8 1.1 0.57 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr sd + 1 Sepl.Lngth numrc 0 1 28 5.47 4.3 5 5.4 5.9 7 0.9 0.64 + 2 Sepal.Wdth numrc 0 1 23 3.1 2 2.8 3.05 3.4 4.4 0.6 0.48 + 3 Petl.Lngth numrc 0 1 28 2.86 1 1.5 2.45 4.32 5.1 2.83 1.45 + 4 Petal.Wdth numrc 0 1 15 0.79 0.1 0.2 0.8 1.3 1.8 1.1 0.57 ----- Categorical ----- - col class n_missing p_complete n_unique n_levels min max - 1 Species factor 0 1 2 3 setosa versicolor - 2 Species2 character 0 1 2 NA setosa versicolor + col class n_mssn p_cmpl n_uniq n_lvls min max + 1 Species factr 0 1 2 3 setosa versicolor + 2 Species2 chrct 0 1 2 NA setosa versicolor --- @@ -84,15 +74,13 @@ cols: 3 ----- Numeric ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 - 1 breaks numeric 0 1 31 28.15 10 18.25 26 34 70 - iqr sd - 1 15.75 13.2 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr sd + 1 breaks numrc 0 1 31 28.15 10 18.25 26 34 70 15.75 13.2 ----- Categorical ----- - col class n_missing p_complete n_unique n_levels min max - 1 wool factor 0 1 2 2 A B - 2 tension factor 0 1 3 3 L H + col class n_mssn p_cmpl n_uniq n_lvls min max + 1 wool factr 0 1 2 2 A B + 2 tension factr 0 
1 3 3 L H --- @@ -103,16 +91,13 @@ cols: 3 ----- Numeric ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 - 1 len numeric 0 1 43 18.81 4.2 13.07 19.25 25.27 33.9 - 2 dose numeric 0 1 3 1.17 0.5 0.5 1 2 2 - iqr sd - 1 12.2 7.65 - 2 1.5 0.63 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr sd + 1 len numrc 0 1 43 18.81 4.2 13.07 19.25 25.27 33.9 12.2 7.65 + 2 dose numrc 0 1 3 1.17 0.5 0.5 1 2 2 1.5 0.63 ----- Categorical ----- - col class n_missing p_complete n_unique n_levels min max - 1 supp factor 0 1 2 2 OJ VC + col class n_mssn p_cmpl n_uniq n_lvls min max + 1 supp factr 0 1 2 2 OJ VC --- @@ -123,22 +108,22 @@ cols: 3 ----- Time-Series ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 - 1 y ts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - 2 x ts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - 3 z_Series 1 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - 4 z_Series 2 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - 5 z_Series 3 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - 6 z_Series 4 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - 7 z_Series 5 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 - p100 iqr sd - 1 1.59 1.26 1.15 - 2 1.59 1.26 1.15 - 3 1.59 1.26 1.15 - 4 1.59 1.26 1.15 - 5 1.59 1.26 1.15 - 6 1.59 1.26 1.15 - 7 1.59 1.26 1.15 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr + 1 y ts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + 2 x ts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + 3 z_Series 1 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + 4 z_Series 2 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + 5 z_Series 3 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + 6 z_Series 4 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + 7 z_Series 5 mts 0 1 25 0.05 -2.53 -0.44 0.24 0.82 1.59 1.26 + sd hist + 1 1.15 ▂▃▅▆▇ + 2 1.15 ▂▃▅▆▇ + 3 1.15 ▂▃▅▆▇ + 4 1.15 ▂▃▅▆▇ + 5 1.15 ▂▃▅▆▇ + 6 1.15 ▂▃▅▆▇ + 7 1.15 ▂▃▅▆▇ --- @@ -149,18 +134,18 @@ cols: 5 ----- Time-Series ----- - col class n_missing p_complete n_unique mean p0 p25 p50 p75 - 1 Series 1 mts 0 1 5 0.2 
-1.24 -0.36 0.66 0.82 - 2 Series 2 mts 0 1 5 0.76 -0.39 0.77 0.81 1.3 - 3 Series 3 mts 0 1 5 -0.67 -2.53 -1.6 -0.11 0.24 - 4 Series 4 mts 0 1 5 0.21 -0.7 -0.44 0.08 0.53 - 5 Series 5 mts 0 1 5 -0.24 -2.36 -1.23 0.03 0.86 - p100 iqr sd - 1 1.14 1.18 0.98 - 2 1.33 0.53 0.7 - 3 0.64 1.84 1.34 - 4 1.59 0.97 0.91 - 5 1.51 2.09 1.57 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 p100 iqr + 1 Series 1 mts 0 1 5 0.2 -1.24 -0.36 0.66 0.82 1.14 1.18 + 2 Series 2 mts 0 1 5 0.76 -0.39 0.77 0.81 1.3 1.33 0.53 + 3 Series 3 mts 0 1 5 -0.67 -2.53 -1.6 -0.11 0.24 0.64 1.84 + 4 Series 4 mts 0 1 5 0.21 -0.7 -0.44 0.08 0.53 1.59 0.97 + 5 Series 5 mts 0 1 5 -0.24 -2.36 -1.23 0.03 0.86 1.51 2.09 + sd hist + 1 0.98 ▅▅▁▅▇ + 2 0.7 ▅▁▁▇▇ + 3 1.34 ▅▅▁▅▇ + 4 0.91 ▇▅▅▁▅ + 5 1.57 ▅▅▁▅▇ --- @@ -171,14 +156,14 @@ cols: 4 ----- Time-Series ----- - col class n_missing p_complete n_unique mean p0 p25 p50 - 1 DAX mts 0 1 1774 2530.66 1402.34 1744.1 2140.56 - 2 SMI mts 0 1 1725 3376.22 1587.4 2165.62 2796.35 - 3 CAC mts 0 1 1617 2227.83 1611 1875.15 1992.3 - 4 FTSE mts 0 1 1729 3565.64 2281 2843.15 3246.6 - p75 p100 iqr sd - 1 2722.37 6186.09 978.26 1084.79 - 2 3812.43 8412 1646.8 1663.03 - 3 2274.35 4388.5 399.2 580.31 - 4 3993.57 6179 1150.43 976.72 + col class n_mssn p_cmpl n_uniq mean p0 p25 p50 p75 + 1 DAX mts 0 1 1774 2530.66 1402.34 1744.1 2140.56 2722.37 + 2 SMI mts 0 1 1725 3376.22 1587.4 2165.62 2796.35 3812.43 + 3 CAC mts 0 1 1617 2227.83 1611 1875.15 1992.3 2274.35 + 4 FTSE mts 0 1 1729 3565.64 2281 2843.15 3246.6 3993.57 + p100 iqr sd hist + 1 6186.09 978.26 1084.79 ▇▂▂▁▁ + 2 8412 1646.8 1663.03 ▇▃▁▁▁ + 3 4388.5 399.2 580.31 ▇▂▁▁▁ + 4 6179 1150.43 976.72 ▇▇▂▂▂