From e7c647450c7e4d55e83800a0685986ccd597a3fc Mon Sep 17 00:00:00 2001 From: Nick Christofides <118103879+NicChr@users.noreply.github.com> Date: Fri, 22 Mar 2024 15:09:40 +0000 Subject: [PATCH] Updated overview. --- NEWS.md | 4 + R/overview.R | 143 +++++++++++++++++++----------- R/zzz.R | 6 +- man/overview.Rd | 50 ++++++++--- tests/testthat/_snaps/overview.md | 10 +-- 5 files changed, 142 insertions(+), 71 deletions(-) diff --git a/NEWS.md b/NEWS.md index e8aad68..bb2394d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # cheapr (Development version) +* `overview` now always returns an object of class "overview". It also returns +the number of observations instead of rows so that it makes sense +for vector summaries as well as data frame summaries. + * `sequence_` has been optimised and rewritten in C++. It now only checks for integer overflow when both `from` and `by` are integer vectors. diff --git a/R/overview.R b/R/overview.R index b803a7c..8916a22 100644 --- a/R/overview.R +++ b/R/overview.R @@ -5,71 +5,112 @@ #' #' @param x A vector or data frame. #' @param hist Should in-line histograms be returned? Default is `FALSE`. +#' @param digits How many decimal places should the summary statistics be +#' printed as? Default is 2. #' #' @returns -#' `overview(x)` returns a 1-row data frame unless -#' `x` is a data frame, in which case an object of class "overview" is returned, -#' Under the hood this is just a a list of data frames. +#' An object of class "overview". +#' Under the hood this is just a list of data frames. #' Key summary statistics are reported in each data frame. #' +#' @details +#' No rounding of statistics is done except in printing which can be controlled +#' either through the `digits` argument in `overview()`, or by setting the +#' option `options(cheapr.digits)`. \cr +#' To access the underlying data, for example the numeric summary, +#' just use `$numeric`, e.g. `overview(rnorm(30))$numeric`. +#' +#' @examples +#' library(cheapr) +#' overview(iris) +#' +#' # With histograms +#' overview(airquality, hist = TRUE) +#' +#' # Round to 0 decimal places +#' overview(airquality, digits = 0) +#' +#' # We can set an option for all overviews +#' options(cheapr.digits = 1) +#' overview(rnorm(100)) +#' options(cheapr.digits = 2) # The default #' @rdname overview #' @export -overview <- function(x, hist = FALSE){ +overview <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ UseMethod("overview") } #' @rdname overview #' @export -overview.default <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = x)), hist = hist)$other - out +overview.default <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + overview(list_as_df(list(x = x)), hist = hist) + # out <- overview(list_as_df(list(x = x)), hist = hist)$other + # out } #' @rdname overview #' @export -overview.logical <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = as.logical(x))), hist = hist)$logical - out +overview.logical <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + overview(list_as_df(list(x = as.logical(x))), hist = hist) + # out <- overview(list_as_df(list(x = as.logical(x))), hist = hist)$logical + # out } #' @rdname overview #' @export -overview.numeric <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist)$numeric +overview.numeric <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist) + out$cols <- NA_integer_ out + # out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist)$numeric + # out } #' @rdname overview #' @export -overview.character <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = as.character(x))), hist = hist)$categorical +overview.character <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + out <- overview(list_as_df(list(x = as.character(x))), hist = hist) + out$cols <- NA_integer_ out + # out <- overview(list_as_df(list(x = as.character(x))), hist = hist)$categorical + # out } #' @rdname overview #' @export -overview.factor <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = as.factor(x))), hist = hist)$categorical +overview.factor <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + out <- overview(list_as_df(list(x = as.factor(x))), hist = hist) + out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.Date <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = as.Date(x))), hist = hist)$date +overview.Date <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + out <- overview(list_as_df(list(x = as.Date(x))), hist = hist) + out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.POSIXt <- function(x, hist = FALSE){ - out <- overview(list_as_df(list(x = as.POSIXct(x))), hist = hist)$datetime - out[[2]] <- utils::tail(class(x), n = 1) +overview.POSIXt <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + out <- overview(list_as_df(list(x = as.POSIXct(x))), hist = hist) + out$cols <- NA_integer_ out } #' @rdname overview #' @export -overview.ts <- function(x, hist = FALSE){ - out <- overview(transform_all(as.data.frame(x), as.numeric), hist = hist)$numeric - out[[2]] <- utils::tail(class(x), n = 1) +overview.ts <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) + out <- overview(transform_all(as.data.frame(x), as.numeric), hist = hist) + out$numeric$class <- class(x)[1] out } #' @rdname overview #' @export -overview.data.frame <- function(x, hist = FALSE){ +overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ + options(cheapr.digits = digits) check_is_df(x) N <- nrow(x) num_cols <- ncol(x) @@ -273,7 +314,7 @@ overview.data.frame <- function(x, hist = FALSE){ } out <- list( - nrow = N, ncol = num_cols, + obs = N, cols = num_cols, logical = lgl_out, numeric = num_out, date = date_out, @@ -286,14 +327,14 @@ overview.data.frame <- function(x, hist = FALSE){ out } #' @export -print.overview <- function(x, max = NULL, ...){ +print.overview <- function(x, max = NULL, digits = getOption("cheapr.digits", 2), ...){ # max_rows <- getOption("tibble.print_max", 20) # max_cols <- getOption("tibble.width", NULL) # max_extra_cols <- getOption("tibble.max_extra_cols", 100) # options(tibble.print_max = 10) # options(tibble.width = 100) # options(tibble.max_extra_cols = 10) - cat(paste("rows:", x$nrow, "cols:", x$ncol), "\n") + cat(paste("obs:", x$obs, "cols:", x$cols), "\n") # for (data_type in names(x)[-(1:2)]){ # if (nrow(x[[data_type]])){ # cat(paste("\n-----", data_type, "-----\n")) @@ -301,30 +342,30 @@ print.overview <- function(x, max = NULL, ...){ # } # } if (nrow(x$logical)){ - x$logical$p_complete <- pretty_num(round(x$logical$p_complete, 2)) + x$logical$p_complete <- pretty_num(round(x$logical$p_complete, digits)) cat("\n----- Logical -----\n") print(x$logical) } if (nrow(x$numeric)){ - x$numeric$p_complete <- pretty_num(round(x$numeric$p_complete, 2)) - x$numeric$mean <- pretty_num(round(x$numeric$mean, 2)) - x$numeric$p0 <- pretty_num(round(x$numeric$p0, 2)) - x$numeric$p25 <- pretty_num(round(x$numeric$p25, 2)) - x$numeric$p50 <- pretty_num(round(x$numeric$p50, 2)) - x$numeric$p75 <- pretty_num(round(x$numeric$p75, 2)) - x$numeric$p100 <- pretty_num(round(x$numeric$p100, 2)) - x$numeric$iqr <- pretty_num(round(x$numeric$iqr, 2)) - x$numeric$sd <- pretty_num(round(x$numeric$sd, 2)) + x$numeric$p_complete <- pretty_num(round(x$numeric$p_complete, digits)) + x$numeric$mean <- pretty_num(round(x$numeric$mean, digits)) + x$numeric$p0 <- pretty_num(round(x$numeric$p0, digits)) + x$numeric$p25 <- pretty_num(round(x$numeric$p25, digits)) + x$numeric$p50 <- pretty_num(round(x$numeric$p50, digits)) + x$numeric$p75 <- pretty_num(round(x$numeric$p75, digits)) + x$numeric$p100 <- pretty_num(round(x$numeric$p100, digits)) + x$numeric$iqr <- pretty_num(round(x$numeric$iqr, digits)) + x$numeric$sd <- pretty_num(round(x$numeric$sd, digits)) cat("\n----- Numeric -----\n") print(x$numeric) } if (nrow(x$date)){ - x$date$p_complete <- pretty_num(round(x$date$p_complete, 2)) + x$date$p_complete <- pretty_num(round(x$date$p_complete, digits)) cat("\n----- Dates -----\n") print(x$date) } if (nrow(x$datetime)){ - x$datetime$p_complete <- pretty_num(round(x$datetime$p_complete, 2)) + x$datetime$p_complete <- pretty_num(round(x$datetime$p_complete, digits)) # An overview list contains a 'min' & 'max' variable of date-times # This is UTC because R can't handle a date-time with multiple time-zones # And so we want to print it in local-time @@ -343,25 +384,25 @@ print.overview <- function(x, max = NULL, ...){ print(x$datetime) } if (nrow(x$time_series)){ - x$time_series$p_complete <- pretty_num(round(x$time_series$p_complete, 2)) - x$time_series$mean <- pretty_num(round(x$time_series$mean, 2)) - x$time_series$p0 <- pretty_num(round(x$time_series$p0, 2)) - x$time_series$p25 <- pretty_num(round(x$time_series$p25, 2)) - x$time_series$p50 <- pretty_num(round(x$time_series$p50, 2)) - x$time_series$p75 <- pretty_num(round(x$time_series$p75, 2)) - x$time_series$p100 <- pretty_num(round(x$time_series$p100, 2)) - x$time_series$iqr <- pretty_num(round(x$time_series$iqr, 2)) - x$time_series$sd <- pretty_num(round(x$time_series$sd, 2)) + x$time_series$p_complete <- pretty_num(round(x$time_series$p_complete, digits)) + x$time_series$mean <- pretty_num(round(x$time_series$mean, digits)) + x$time_series$p0 <- pretty_num(round(x$time_series$p0, digits)) + x$time_series$p25 <- pretty_num(round(x$time_series$p25, digits)) + x$time_series$p50 <- pretty_num(round(x$time_series$p50, digits)) + x$time_series$p75 <- pretty_num(round(x$time_series$p75, digits)) + x$time_series$p100 <- pretty_num(round(x$time_series$p100, digits)) + x$time_series$iqr <- pretty_num(round(x$time_series$iqr, digits)) + x$time_series$sd <- pretty_num(round(x$time_series$sd, digits)) cat("\n----- Time-Series -----\n") print(x$time_series) } if (nrow(x$categorical)){ - x$categorical$p_complete <- pretty_num(round(x$categorical$p_complete, 2)) + x$categorical$p_complete <- pretty_num(round(x$categorical$p_complete, digits)) cat("\n----- Categorical -----\n") print(x$categorical) } if (nrow(x$other)){ - x$other$p_complete <- pretty_num(round(x$other$p_complete, 2)) + x$other$p_complete <- pretty_num(round(x$other$p_complete, digits)) cat("\n----- Other -----\n") print(x$other) } diff --git a/R/zzz.R b/R/zzz.R index 2107094..0f388bf 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -40,8 +40,10 @@ on_package_load <- function(pkg, expr){ } } .onAttach <- function(...){ - options("cheapr.cores" = getOption("cheapr.cores", 1)) + options("cheapr.cores" = getOption("cheapr.cores", 1), + "cheapr.digits" = getOption("cheapr.digits", 2)) } .onUnload <- function(libname, pkgname){ - options(cheapr.cores = NULL) + options(cheapr.cores = NULL, + cheapr.digits = NULL) } diff --git a/man/overview.Rd b/man/overview.Rd index ccd0256..a114b29 100644 --- a/man/overview.Rd +++ b/man/overview.Rd @@ -13,37 +13,61 @@ \alias{overview.data.frame} \title{An alternative to \code{summary()} inspired by the skimr package} \usage{ -overview(x, hist = FALSE) +overview(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{default}(x, hist = FALSE) +\method{overview}{default}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{logical}(x, hist = FALSE) +\method{overview}{logical}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{numeric}(x, hist = FALSE) +\method{overview}{numeric}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{character}(x, hist = FALSE) +\method{overview}{character}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{factor}(x, hist = FALSE) +\method{overview}{factor}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{Date}(x, hist = FALSE) +\method{overview}{Date}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{POSIXt}(x, hist = FALSE) +\method{overview}{POSIXt}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{ts}(x, hist = FALSE) +\method{overview}{ts}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) -\method{overview}{data.frame}(x, hist = FALSE) +\method{overview}{data.frame}(x, hist = FALSE, digits = getOption("cheapr.digits", 2)) } \arguments{ \item{x}{A vector or data frame.} \item{hist}{Should in-line histograms be returned? Default is \code{FALSE}.} + +\item{digits}{How many decimal places should the summary statistics be +printed as? Default is 2.} } \value{ -\code{overview(x)} returns a 1-row data frame unless -\code{x} is a data frame, in which case an object of class "overview" is returned, -Under the hood this is just a a list of data frames. +An object of class "overview". +Under the hood this is just a list of data frames. Key summary statistics are reported in each data frame. } \description{ A cheaper \code{summary()} function, designed for larger data. } +\details{ +No rounding of statistics is done except in printing which can be controlled +either through the \code{digits} argument in \code{overview()}, or by setting the +option \code{options(cheapr.digits)}. \cr +To access the underlying data, for example the numeric summary, +just use \verb{$numeric}, e.g. \code{overview(rnorm(30))$numeric}. +} +\examples{ +library(cheapr) +overview(iris) + +# With histograms +overview(airquality, hist = TRUE) + +# Round to 0 decimal places +overview(airquality, digits = 0) + +# We can set an option for all overviews +options(cheapr.digits = 1) +overview(rnorm(100)) +options(cheapr.digits = 2) # The default +} diff --git a/tests/testthat/_snaps/overview.md b/tests/testthat/_snaps/overview.md index f3fa27a..b2e8a93 100644 --- a/tests/testthat/_snaps/overview.md +++ b/tests/testthat/_snaps/overview.md @@ -3,7 +3,7 @@ Code overview(airquality, hist = FALSE) Output - rows: 153 cols: 6 + obs: 153 cols: 6 ----- Numeric ----- col class n_missing p_complete n_unique mean p0 p25 p50 p75 @@ -26,7 +26,7 @@ Code overview(iris, hist = FALSE) Output - rows: 150 cols: 5 + obs: 150 cols: 5 ----- Numeric ----- col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 @@ -49,7 +49,7 @@ Code overview(iris2, hist = FALSE) Output - rows: 100 cols: 7 + obs: 100 cols: 7 ----- Logical ----- col class n_missing p_complete n_true n_false p_true @@ -77,7 +77,7 @@ Code overview(warpbreaks, hist = FALSE) Output - rows: 54 cols: 3 + obs: 54 cols: 3 ----- Numeric ----- col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 @@ -95,7 +95,7 @@ Code overview(ToothGrowth, hist = FALSE) Output - rows: 60 cols: 3 + obs: 60 cols: 3 ----- Numeric ----- col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100