Various improvements and bug fixes.

NicChr · Mar 29, 2024 · 55ad90e · 55ad90e
1 parent cb7ce27
commit 55ad90e
Show file tree

Hide file tree

Showing 20 changed files with 676 additions and 304 deletions.
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
@@ -0,0 +1,3 @@
+Version: 0.4.0
+Date: 2024-03-25 13:25:17 UTC
+SHA: cb7ce27331455c0fbe9ca946bc2c2a06c6f936e3
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: cheapr
 Title: Simple Functions to Save Time and Memory
-Version: 0.4.0
+Version: 0.4.0.9000
 Authors@R: 
     person("Nick", "Christofides", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-9743-7342"))

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,5 @@
+# cheapr (Development version)
+
 # cheapr 0.4.0
 
 * New function `sset` to consistently subset data frame rows and vectors in 

diff --git a/R/cpp11.R b/R/cpp11.R
@@ -1,5 +1,21 @@
 # Generated by cpp11: do not edit by hand
 
+cpp_set_rm_attributes <- function(x) {
+  .Call(`_cheapr_cpp_set_rm_attributes`, x)
+}
+
+cpp_set_add_attr <- function(x, which, value) {
+  .Call(`_cheapr_cpp_set_add_attr`, x, which, value)
+}
+
+cpp_set_rm_attr <- function(x, which) {
+  .Call(`_cheapr_cpp_set_rm_attr`, x, which)
+}
+
+cpp_set_attributes <- function(x, attributes, add) {
+  .Call(`_cheapr_cpp_set_attributes`, x, attributes, add)
+}
+
 cpp_gcd2 <- function(x, y, tol, na_rm) {
   .Call(`_cheapr_cpp_gcd2`, x, y, tol, na_rm)
 }
@@ -104,6 +120,10 @@ cpp_lead_sequence <- function(size, k, partial) {
   .Call(`_cheapr_cpp_lead_sequence`, size, k, partial)
 }
 
+cpp_sset <- function(x, indices) {
+  .Call(`_cheapr_cpp_sset`, x, indices)
+}
+
 cpp_vec_length <- function(x) {
   .Call(`_cheapr_cpp_vec_length`, x)
 }
@@ -128,14 +148,6 @@ cpp_list_as_df <- function(x) {
   .Call(`_cheapr_cpp_list_as_df`, x)
 }
 
-cpp_set_rm_attributes <- function(x) {
-  .Call(`_cheapr_cpp_set_rm_attributes`, x)
-}
-
-cpp_set_copy_attributes <- function(target, source, attrs) {
-  .Call(`_cheapr_cpp_set_copy_attributes`, target, source, attrs)
-}
-
 cpp_which_ <- function(x, invert) {
   .Call(`_cheapr_cpp_which_`, x, invert)
 }
diff --git a/R/extras.R b/R/extras.R
@@ -140,7 +140,7 @@ enframe_ <- function(x, name = "name", value = "value"){
     out <- list(x_nms, x)
     names(out) <- c(name, value)
   }
-  attr(out, "class") <- c("tbl_df", "tbl", "data.frame")
+  class(out) <- c("tbl_df", "tbl", "data.frame")
   attr(out, "row.names") <- .set_row_names(length(x))
   out
 }
@@ -160,15 +160,12 @@ deframe_ <- function(x){
 #' @export
 #' @rdname extras
 na_rm <- function(x){
-  if (is.data.frame(x)){
-    stop("x must be a vector")
-  }
   n_na <- num_na(x, recursive = TRUE)
   if (n_na == unlisted_length(x)){
-    x[0L]
+    sset(x, 0L)
   } else if (n_na == 0){
     x
   } else {
-    x[which_not_na(x)]
+    sset(x, which_not_na(x))
   }
 }
diff --git a/R/factors.R b/R/factors.R
@@ -46,13 +46,17 @@ factor_ <- function(x = integer(), levels = NULL, order = TRUE,
   }
   if (na_exclude && any_na(lvls)){
     if (order && is.null(levels)){
-      lvls <- lvls[seq_len(length(lvls) - 1L)]
+      lvls <- sset(lvls, seq_len(cpp_vec_length(lvls) - 1L))
     } else {
-      lvls <- lvls[which_not_na(lvls)]
+      lvls <- na_rm(lvls)
     }
   }
   out <- collapse::fmatch(x, lvls, overid = 2L)
-  fct_lvls <- as.character(lvls)
+  if (inherits(lvls, "data.frame")){
+    fct_lvls <- do.call(paste, c(lvls, list(sep = "_")))
+  } else {
+    fct_lvls <- as.character(lvls)
+  }
   if (inherits(x, "POSIXt") && collapse::any_duplicated(fct_lvls)){
     fct_lvls <- paste(fct_lvls, as.POSIXlt(lvls)$zone)
   }

diff --git a/R/nas.R b/R/nas.R
@@ -22,6 +22,7 @@
 #' To find rows with any empty values,
 #' use `which_(row_any_na(df))`. \cr
 #' To find empty rows use `which_(row_all_na(df))` or `which_na(df)`.
+#' To drop empty rows use `na_rm(df)` or `sset(df, which_(row_all_na(df), invert = TRUE))`.
 #'
 #' ### `is_na`
 #' `is_na` is an S3 generic function. It will internally fall back on
@@ -93,7 +94,9 @@ is_na.default <- function(x){
 #' @rdname is_na
 #' @export
 is_na.POSIXlt <- function(x){
-  row_any_na(list_as_df(unclass(x)[1:8]))
+  row_any_na(list_as_df(do.call(recycle, unclass(x)[
+    c("sec", "min", "hour", "mday",
+      "mon", "year", "wday", "yday")])))
 }
 #' @rdname is_na
 #' @export

diff --git a/R/overview.R b/R/overview.R
@@ -37,65 +37,57 @@
 #' @rdname overview
 #' @export
 overview <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
- UseMethod("overview")
+  UseMethod("overview")
 }
 #' @rdname overview
 #' @export
 overview.default <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  overview(list_as_df(list(x = x)), hist = hist)
+  overview(list_as_df(list(x = x)), hist = hist, digits = digits)
 }
 #' @rdname overview
 #' @export
 overview.logical <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  overview(list_as_df(list(x = as.logical(x))), hist = hist)
+  overview(list_as_df(list(x = as.logical(x))), hist = hist, digits = digits)
 }
 #' @rdname overview
 #' @export
 overview.numeric <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist)
+  out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist, digits = digits)
   out$cols <- NA_integer_
   out
 }
 #' @rdname overview
 #' @export
 overview.character <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  out <- overview(list_as_df(list(x = as.character(x))), hist = hist)
+  out <- overview(list_as_df(list(x = as.character(x))), hist = hist, digits = digits)
   out$cols <- NA_integer_
   out
 }
 #' @rdname overview
 #' @export
 overview.factor <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  out <- overview(list_as_df(list(x = as.factor(x))), hist = hist)
+  out <- overview(list_as_df(list(x = as.factor(x))), hist = hist, digits = digits)
   out$cols <- NA_integer_
   out
 }
 #' @rdname overview
 #' @export
 overview.Date <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  out <- overview(list_as_df(list(x = as.Date(x))), hist = hist)
+  out <- overview(list_as_df(list(x = as.Date(x))), hist = hist, digits = digits)
   out$cols <- NA_integer_
   out
 }
 #' @rdname overview
 #' @export
 overview.POSIXt <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  out <- overview(list_as_df(list(x = as.POSIXct(x))), hist = hist)
+  out <- overview(list_as_df(list(x = as.POSIXct(x))), hist = hist, digits = digits)
   out$cols <- NA_integer_
   out
 }
 #' @rdname overview
 #' @export
 overview.ts <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
-  out <- overview(transform_all(as.data.frame(x), as.numeric), hist = hist)
+  out <- overview(transform_all(as.data.frame(x), as.numeric), hist = hist, digits = digits)
   out$time_series <- out$numeric
   out$numeric <- sset(out$numeric, 0)
   out$time_series$class <- class(x)[1]
@@ -107,13 +99,12 @@ overview.zoo <- overview.ts
 #' @rdname overview
 #' @export
 overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){
-  options(cheapr.digits = digits)
   check_is_df(x)
   N <- nrow(x)
   num_cols <- ncol(x)
   skim_df <- x
   data_nms <- names(skim_df)
-  col_classes <- vapply(skim_df, function(x) utils::tail(class(x), n = 1), "")
+  col_classes <- vapply(skim_df, function(x) sset(class(x), length(class(x))), "")
   out <- list_as_df(enframe_(col_classes, name = "col", value = "class"))
   chr_vars <- data_nms[vapply(skim_df, is.character, FALSE,
                               USE.NAMES = FALSE)]
@@ -251,7 +242,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi
   if (N > 0L && length(which_ts) > 0) {
     ts_overviews <- new_list(nrow(ts_out))
     for (i in seq_along(ts_overviews)){
-      ts_overviews[[i]] <- overview(ts_data[[ts_out[["col"]][i]]], hist = hist)$time_series
+      ts_overviews[[i]] <- overview(ts_data[[ts_out[["col"]][i]]], hist = hist, digits = digits)$time_series
       if (length(attr(ts_overviews[[i]], "row.names")) > 1){
         ts_overviews[[i]][["col"]] <- paste0(ts_out[["col"]][i], "_",
                                              ts_overviews[[i]][["col"]])
@@ -314,6 +305,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi
 
   out <- list(
     obs = N, cols = num_cols,
+    print_digits = digits,
     logical = lgl_out,
     numeric = num_out,
     date = date_out,
@@ -326,45 +318,37 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi
   out
 }
 #' @export
-print.overview <- function(x, max = NULL, digits = getOption("cheapr.digits", 2), ...){
-  # max_rows <- getOption("tibble.print_max", 20)
-  # max_cols <- getOption("tibble.width", NULL)
-  # max_extra_cols <- getOption("tibble.max_extra_cols", 100)
-  # options(tibble.print_max = 10)
-  # options(tibble.width = 100)
-  # options(tibble.max_extra_cols = 10)
+print.overview <- function(x, max = NULL, ...){
+  digits <- x[["print_digits"]]
+  pretty_round <- function(x, decimal_digits = digits, ...){
+    pretty_num(round(x, digits = decimal_digits), ...)
+  }
   cat(paste("obs:", x$obs, "\ncols:", x$cols), "\n")
-  # for (data_type in names(x)[-(1:2)]){
-  #   if (nrow(x[[data_type]])){
-  #     cat(paste("\n-----", data_type, "-----\n"))
-  #     print(x[[data_type]])
-  #   }
-  # }
   if (nrow(x$logical)){
-    x$logical$p_complete <- pretty_num(round(x$logical$p_complete, digits))
+    x$logical$p_complete <- pretty_round(x$logical$p_complete)
     cat("\n----- Logical -----\n")
     print(x$logical)
   }
   if (nrow(x$numeric)){
-    x$numeric$p_complete <- pretty_num(round(x$numeric$p_complete, digits))
-    x$numeric$mean <- pretty_num(round(x$numeric$mean, digits))
-    x$numeric$p0 <- pretty_num(round(x$numeric$p0, digits))
-    x$numeric$p25 <- pretty_num(round(x$numeric$p25, digits))
-    x$numeric$p50 <- pretty_num(round(x$numeric$p50, digits))
-    x$numeric$p75 <- pretty_num(round(x$numeric$p75, digits))
-    x$numeric$p100 <- pretty_num(round(x$numeric$p100, digits))
-    x$numeric$iqr <- pretty_num(round(x$numeric$iqr, digits))
-    x$numeric$sd <- pretty_num(round(x$numeric$sd, digits))
+    x$numeric$p_complete <- pretty_round(x$numeric$p_complete)
+    x$numeric$mean <- pretty_round(x$numeric$mean)
+    x$numeric$p0 <- pretty_round(x$numeric$p0)
+    x$numeric$p25 <- pretty_round(x$numeric$p25)
+    x$numeric$p50 <- pretty_round(x$numeric$p50)
+    x$numeric$p75 <- pretty_round(x$numeric$p75)
+    x$numeric$p100 <- pretty_round(x$numeric$p100)
+    x$numeric$iqr <- pretty_round(x$numeric$iqr)
+    x$numeric$sd <- pretty_round(x$numeric$sd)
     cat("\n----- Numeric -----\n")
     print(x$numeric)
   }
   if (nrow(x$date)){
-    x$date$p_complete <- pretty_num(round(x$date$p_complete, digits))
+    x$date$p_complete <- pretty_round(x$date$p_complete)
     cat("\n----- Dates -----\n")
     print(x$date)
   }
   if (nrow(x$datetime)){
-    x$datetime$p_complete <- pretty_num(round(x$datetime$p_complete, digits))
+    x$datetime$p_complete <- pretty_round(x$datetime$p_complete)
     # An overview list contains a 'min' & 'max' variable of date-times
     # This is UTC because R can't handle a date-time with multiple time-zones
     # And so we want to print it in local-time
@@ -383,33 +367,31 @@ print.overview <- function(x, max = NULL, digits = getOption("cheapr.digits", 2)
     print(x$datetime)
   }
   if (nrow(x$time_series)){
-    x$time_series$p_complete <- pretty_num(round(x$time_series$p_complete, digits))
-    x$time_series$mean <- pretty_num(round(x$time_series$mean, digits))
-    x$time_series$p0 <- pretty_num(round(x$time_series$p0, digits))
-    x$time_series$p25 <- pretty_num(round(x$time_series$p25, digits))
-    x$time_series$p50 <- pretty_num(round(x$time_series$p50, digits))
-    x$time_series$p75 <- pretty_num(round(x$time_series$p75, digits))
-    x$time_series$p100 <- pretty_num(round(x$time_series$p100, digits))
-    x$time_series$iqr <- pretty_num(round(x$time_series$iqr, digits))
-    x$time_series$sd <- pretty_num(round(x$time_series$sd, digits))
+    x$time_series$p_complete <- pretty_round(x$time_series$p_complete)
+    x$time_series$mean <- pretty_round(x$time_series$mean)
+    x$time_series$p0 <- pretty_round(x$time_series$p0)
+    x$time_series$p25 <- pretty_round(x$time_series$p25)
+    x$time_series$p50 <- pretty_round(x$time_series$p50)
+    x$time_series$p75 <- pretty_round(x$time_series$p75)
+    x$time_series$p100 <- pretty_round(x$time_series$p100)
+    x$time_series$iqr <- pretty_round(x$time_series$iqr)
+    x$time_series$sd <- pretty_round(x$time_series$sd)
     cat("\n----- Time-Series -----\n")
     print(x$time_series)
   }
   if (nrow(x$categorical)){
-    x$categorical$p_complete <- pretty_num(round(x$categorical$p_complete, digits))
+    x$categorical$p_complete <- pretty_round(x$categorical$p_complete)
     cat("\n----- Categorical -----\n")
     print(x$categorical)
   }
   if (nrow(x$other)){
-    x$other$p_complete <- pretty_num(round(x$other$p_complete, digits))
+    x$other$p_complete <- pretty_round(x$other$p_complete)
     cat("\n----- Other -----\n")
     print(x$other)
   }
-  # options(tibble.print_max = max_rows)
-  # options(tibble.width = max_cols)
-  # options(tibble.max_extra_cols = max_extra_cols)
   invisible(x)
 }
+
 ### Helpers
 
 n_unique <- function(x, na_rm = FALSE){
@@ -419,13 +401,16 @@ n_unique <- function(x, na_rm = FALSE){
   }
   out
 }
-prop_complete <- function(x, recursive = TRUE){
+prop_missing <- function(x, recursive = TRUE){
   if (recursive){
     N <- unlisted_length(x)
   } else {
-    N <- length(x)
+    N <- cpp_vec_length(x)
   }
-  1 - (num_na(x, recursive = recursive) / N)
+  num_na(x, recursive = recursive) / N
+}
+prop_complete <- function(x, recursive = TRUE){
+  1 - prop_missing(x, recursive = recursive)
 }
 transform_all <- function(data, .fn){
   for (col in names(data)){
@@ -435,14 +420,14 @@ transform_all <- function(data, .fn){
 }
 summarise_all <- function(data, .fn, size = 1){
   out <- sset(data, seq_len(size))
-  attr(out, "row.names") <- .set_row_names(size)
+  out <- cpp_set_add_attr(out, "row.names", .set_row_names(size))
   for (col in names(out)){
     out[[col]] <- .fn(data[[col]])
   }
   out
 }
 pluck_row <- function(data, i = 1){
-  unlist(data[i, ], recursive = FALSE)
+  unlist(sset(data, i), recursive = FALSE)
 }
 
 # Taken from skimr::skim with modifications