From 55ad90e60b057cfd015f0a2c6b80d9309ecac9a4 Mon Sep 17 00:00:00 2001 From: Nick Christofides <118103879+NicChr@users.noreply.github.com> Date: Fri, 29 Mar 2024 12:25:57 +0000 Subject: [PATCH] Various improvements and bug fixes. --- CRAN-SUBMISSION | 3 + DESCRIPTION | 2 +- NEWS.md | 2 + R/cpp11.R | 28 +++-- R/extras.R | 9 +- R/factors.R | 10 +- R/nas.R | 5 +- R/overview.R | 113 +++++++++----------- R/sset.R | 70 +++++++++---- R/utils.R | 22 ++++ README.Rmd | 4 +- README.md | 118 ++++++++++----------- man/is_na.Rd | 1 + man/sset.Rd | 9 +- src/attrs.cpp | 84 +++++++++++++++ src/cheapr_cpp.h | 1 + src/cpp11.cpp | 54 +++++++--- src/sset.cpp | 260 +++++++++++++++++++++++++++++++++++++++++++++++ src/utils.cpp | 130 +++--------------------- src/which.cpp | 55 ++++++++-- 20 files changed, 676 insertions(+), 304 deletions(-) create mode 100644 CRAN-SUBMISSION create mode 100644 src/attrs.cpp create mode 100644 src/sset.cpp diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 0000000..198d805 --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 0.4.0 +Date: 2024-03-25 13:25:17 UTC +SHA: cb7ce27331455c0fbe9ca946bc2c2a06c6f936e3 diff --git a/DESCRIPTION b/DESCRIPTION index 87105a6..53419fb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: cheapr Title: Simple Functions to Save Time and Memory -Version: 0.4.0 +Version: 0.4.0.9000 Authors@R: person("Nick", "Christofides", , "nick.christofides.r@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-9743-7342")) diff --git a/NEWS.md b/NEWS.md index 6629e7d..51ae405 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,5 @@ +# cheapr (Development version) + # cheapr 0.4.0 * New function `sset` to consistently subset data frame rows and vectors in diff --git a/R/cpp11.R b/R/cpp11.R index c9019c3..4b5f9fb 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -1,5 +1,21 @@ # Generated by cpp11: do not edit by hand +cpp_set_rm_attributes <- function(x) { + .Call(`_cheapr_cpp_set_rm_attributes`, 
x) +} + +cpp_set_add_attr <- function(x, which, value) { + .Call(`_cheapr_cpp_set_add_attr`, x, which, value) +} + +cpp_set_rm_attr <- function(x, which) { + .Call(`_cheapr_cpp_set_rm_attr`, x, which) +} + +cpp_set_attributes <- function(x, attributes, add) { + .Call(`_cheapr_cpp_set_attributes`, x, attributes, add) +} + cpp_gcd2 <- function(x, y, tol, na_rm) { .Call(`_cheapr_cpp_gcd2`, x, y, tol, na_rm) } @@ -104,6 +120,10 @@ cpp_lead_sequence <- function(size, k, partial) { .Call(`_cheapr_cpp_lead_sequence`, size, k, partial) } +cpp_sset <- function(x, indices) { + .Call(`_cheapr_cpp_sset`, x, indices) +} + cpp_vec_length <- function(x) { .Call(`_cheapr_cpp_vec_length`, x) } @@ -128,14 +148,6 @@ cpp_list_as_df <- function(x) { .Call(`_cheapr_cpp_list_as_df`, x) } -cpp_set_rm_attributes <- function(x) { - .Call(`_cheapr_cpp_set_rm_attributes`, x) -} - -cpp_set_copy_attributes <- function(target, source, attrs) { - .Call(`_cheapr_cpp_set_copy_attributes`, target, source, attrs) -} - cpp_which_ <- function(x, invert) { .Call(`_cheapr_cpp_which_`, x, invert) } diff --git a/R/extras.R b/R/extras.R index 6c4a274..72ec5e7 100644 --- a/R/extras.R +++ b/R/extras.R @@ -140,7 +140,7 @@ enframe_ <- function(x, name = "name", value = "value"){ out <- list(x_nms, x) names(out) <- c(name, value) } - attr(out, "class") <- c("tbl_df", "tbl", "data.frame") + class(out) <- c("tbl_df", "tbl", "data.frame") attr(out, "row.names") <- .set_row_names(length(x)) out } @@ -160,15 +160,12 @@ deframe_ <- function(x){ #' @export #' @rdname extras na_rm <- function(x){ - if (is.data.frame(x)){ - stop("x must be a vector") - } n_na <- num_na(x, recursive = TRUE) if (n_na == unlisted_length(x)){ - x[0L] + sset(x, 0L) } else if (n_na == 0){ x } else { - x[which_not_na(x)] + sset(x, which_not_na(x)) } } diff --git a/R/factors.R b/R/factors.R index 364055c..c6916a2 100644 --- a/R/factors.R +++ b/R/factors.R @@ -46,13 +46,17 @@ factor_ <- function(x = integer(), levels = NULL, order = TRUE, } if 
(na_exclude && any_na(lvls)){ if (order && is.null(levels)){ - lvls <- lvls[seq_len(length(lvls) - 1L)] + lvls <- sset(lvls, seq_len(cpp_vec_length(lvls) - 1L)) } else { - lvls <- lvls[which_not_na(lvls)] + lvls <- na_rm(lvls) } } out <- collapse::fmatch(x, lvls, overid = 2L) - fct_lvls <- as.character(lvls) + if (inherits(lvls, "data.frame")){ + fct_lvls <- do.call(paste, c(lvls, list(sep = "_"))) + } else { + fct_lvls <- as.character(lvls) + } if (inherits(x, "POSIXt") && collapse::any_duplicated(fct_lvls)){ fct_lvls <- paste(fct_lvls, as.POSIXlt(lvls)$zone) } diff --git a/R/nas.R b/R/nas.R index 292de20..b041e8b 100644 --- a/R/nas.R +++ b/R/nas.R @@ -22,6 +22,7 @@ #' To find rows with any empty values, #' use `which_(row_any_na(df))`. \cr #' To find empty rows use `which_(row_all_na(df))` or `which_na(df)`. +#' To drop empty rows use `na_rm(df)` or `sset(df, which_(row_all_na(df), TRUE))`. #' #' ### `is_na` #' `is_na` Is an S3 generic function. It will internally fall back on @@ -93,7 +94,9 @@ is_na.default <- function(x){ #' @rdname is_na #' @export is_na.POSIXlt <- function(x){ - row_any_na(list_as_df(unclass(x)[1:8])) + row_any_na(list_as_df(do.call(recycle, unclass(x)[ + c("sec", "min", "hour", "mday", + "mon", "year", "wday", "yday")]))) } #' @rdname is_na #' @export diff --git a/R/overview.R b/R/overview.R index 8f6c449..a964f06 100644 --- a/R/overview.R +++ b/R/overview.R @@ -37,65 +37,57 @@ #' @rdname overview #' @export overview <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - UseMethod("overview") + UseMethod("overview") } #' @rdname overview #' @export overview.default <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - overview(list_as_df(list(x = x)), hist = hist) + overview(list_as_df(list(x = x)), hist = hist, digits = digits) } #' @rdname overview #' @export overview.logical <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = 
digits) - overview(list_as_df(list(x = as.logical(x))), hist = hist) + overview(list_as_df(list(x = as.logical(x))), hist = hist, digits = digits) } #' @rdname overview #' @export overview.numeric <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist) + out <- overview(list_as_df(list(x = as.numeric(x))), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export overview.character <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - out <- overview(list_as_df(list(x = as.character(x))), hist = hist) + out <- overview(list_as_df(list(x = as.character(x))), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export overview.factor <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - out <- overview(list_as_df(list(x = as.factor(x))), hist = hist) + out <- overview(list_as_df(list(x = as.factor(x))), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export overview.Date <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - out <- overview(list_as_df(list(x = as.Date(x))), hist = hist) + out <- overview(list_as_df(list(x = as.Date(x))), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export overview.POSIXt <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - out <- overview(list_as_df(list(x = as.POSIXct(x))), hist = hist) + out <- overview(list_as_df(list(x = as.POSIXct(x))), hist = hist, digits = digits) out$cols <- NA_integer_ out } #' @rdname overview #' @export overview.ts <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) - out <- 
overview(transform_all(as.data.frame(x), as.numeric), hist = hist) + out <- overview(transform_all(as.data.frame(x), as.numeric), hist = hist, digits = digits) out$time_series <- out$numeric out$numeric <- sset(out$numeric, 0) out$time_series$class <- class(x)[1] @@ -107,13 +99,12 @@ overview.zoo <- overview.ts #' @rdname overview #' @export overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digits", 2)){ - options(cheapr.digits = digits) check_is_df(x) N <- nrow(x) num_cols <- ncol(x) skim_df <- x data_nms <- names(skim_df) - col_classes <- vapply(skim_df, function(x) utils::tail(class(x), n = 1), "") + col_classes <- vapply(skim_df, function(x) sset(class(x), length(class(x))), "") out <- list_as_df(enframe_(col_classes, name = "col", value = "class")) chr_vars <- data_nms[vapply(skim_df, is.character, FALSE, USE.NAMES = FALSE)] @@ -251,7 +242,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi if (N > 0L && length(which_ts) > 0) { ts_overviews <- new_list(nrow(ts_out)) for (i in seq_along(ts_overviews)){ - ts_overviews[[i]] <- overview(ts_data[[ts_out[["col"]][i]]], hist = hist)$time_series + ts_overviews[[i]] <- overview(ts_data[[ts_out[["col"]][i]]], hist = hist, digits = digits)$time_series if (length(attr(ts_overviews[[i]], "row.names")) > 1){ ts_overviews[[i]][["col"]] <- paste0(ts_out[["col"]][i], "_", ts_overviews[[i]][["col"]]) @@ -314,6 +305,7 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi out <- list( obs = N, cols = num_cols, + print_digits = digits, logical = lgl_out, numeric = num_out, date = date_out, @@ -326,45 +318,37 @@ overview.data.frame <- function(x, hist = FALSE, digits = getOption("cheapr.digi out } #' @export -print.overview <- function(x, max = NULL, digits = getOption("cheapr.digits", 2), ...){ - # max_rows <- getOption("tibble.print_max", 20) - # max_cols <- getOption("tibble.width", NULL) - # max_extra_cols <- 
getOption("tibble.max_extra_cols", 100) - # options(tibble.print_max = 10) - # options(tibble.width = 100) - # options(tibble.max_extra_cols = 10) +print.overview <- function(x, max = NULL, ...){ + digits <- x[["print_digits"]] + pretty_round <- function(x, decimal_digits = digits, ...){ + pretty_num(round(x, digits = decimal_digits), ...) + } cat(paste("obs:", x$obs, "\ncols:", x$cols), "\n") - # for (data_type in names(x)[-(1:2)]){ - # if (nrow(x[[data_type]])){ - # cat(paste("\n-----", data_type, "-----\n")) - # print(x[[data_type]]) - # } - # } if (nrow(x$logical)){ - x$logical$p_complete <- pretty_num(round(x$logical$p_complete, digits)) + x$logical$p_complete <- pretty_round(x$logical$p_complete) cat("\n----- Logical -----\n") print(x$logical) } if (nrow(x$numeric)){ - x$numeric$p_complete <- pretty_num(round(x$numeric$p_complete, digits)) - x$numeric$mean <- pretty_num(round(x$numeric$mean, digits)) - x$numeric$p0 <- pretty_num(round(x$numeric$p0, digits)) - x$numeric$p25 <- pretty_num(round(x$numeric$p25, digits)) - x$numeric$p50 <- pretty_num(round(x$numeric$p50, digits)) - x$numeric$p75 <- pretty_num(round(x$numeric$p75, digits)) - x$numeric$p100 <- pretty_num(round(x$numeric$p100, digits)) - x$numeric$iqr <- pretty_num(round(x$numeric$iqr, digits)) - x$numeric$sd <- pretty_num(round(x$numeric$sd, digits)) + x$numeric$p_complete <- pretty_round(x$numeric$p_complete) + x$numeric$mean <- pretty_round(x$numeric$mean) + x$numeric$p0 <- pretty_round(x$numeric$p0) + x$numeric$p25 <- pretty_round(x$numeric$p25) + x$numeric$p50 <- pretty_round(x$numeric$p50) + x$numeric$p75 <- pretty_round(x$numeric$p75) + x$numeric$p100 <- pretty_round(x$numeric$p100) + x$numeric$iqr <- pretty_round(x$numeric$iqr) + x$numeric$sd <- pretty_round(x$numeric$sd) cat("\n----- Numeric -----\n") print(x$numeric) } if (nrow(x$date)){ - x$date$p_complete <- pretty_num(round(x$date$p_complete, digits)) + x$date$p_complete <- pretty_round(x$date$p_complete) cat("\n----- Dates -----\n") 
print(x$date) } if (nrow(x$datetime)){ - x$datetime$p_complete <- pretty_num(round(x$datetime$p_complete, digits)) + x$datetime$p_complete <- pretty_round(x$datetime$p_complete) # An overview list contains a 'min' & 'max' variable of date-times # This is UTC because R can't handle a date-time with multiple time-zones # And so we want to print it in local-time @@ -383,33 +367,31 @@ print.overview <- function(x, max = NULL, digits = getOption("cheapr.digits", 2) print(x$datetime) } if (nrow(x$time_series)){ - x$time_series$p_complete <- pretty_num(round(x$time_series$p_complete, digits)) - x$time_series$mean <- pretty_num(round(x$time_series$mean, digits)) - x$time_series$p0 <- pretty_num(round(x$time_series$p0, digits)) - x$time_series$p25 <- pretty_num(round(x$time_series$p25, digits)) - x$time_series$p50 <- pretty_num(round(x$time_series$p50, digits)) - x$time_series$p75 <- pretty_num(round(x$time_series$p75, digits)) - x$time_series$p100 <- pretty_num(round(x$time_series$p100, digits)) - x$time_series$iqr <- pretty_num(round(x$time_series$iqr, digits)) - x$time_series$sd <- pretty_num(round(x$time_series$sd, digits)) + x$time_series$p_complete <- pretty_round(x$time_series$p_complete) + x$time_series$mean <- pretty_round(x$time_series$mean) + x$time_series$p0 <- pretty_round(x$time_series$p0) + x$time_series$p25 <- pretty_round(x$time_series$p25) + x$time_series$p50 <- pretty_round(x$time_series$p50) + x$time_series$p75 <- pretty_round(x$time_series$p75) + x$time_series$p100 <- pretty_round(x$time_series$p100) + x$time_series$iqr <- pretty_round(x$time_series$iqr) + x$time_series$sd <- pretty_round(x$time_series$sd) cat("\n----- Time-Series -----\n") print(x$time_series) } if (nrow(x$categorical)){ - x$categorical$p_complete <- pretty_num(round(x$categorical$p_complete, digits)) + x$categorical$p_complete <- pretty_round(x$categorical$p_complete) cat("\n----- Categorical -----\n") print(x$categorical) } if (nrow(x$other)){ - x$other$p_complete <- 
pretty_num(round(x$other$p_complete, digits)) + x$other$p_complete <- pretty_round(x$other$p_complete) cat("\n----- Other -----\n") print(x$other) } - # options(tibble.print_max = max_rows) - # options(tibble.width = max_cols) - # options(tibble.max_extra_cols = max_extra_cols) invisible(x) } + ### Helpers n_unique <- function(x, na_rm = FALSE){ @@ -419,13 +401,16 @@ n_unique <- function(x, na_rm = FALSE){ } out } -prop_complete <- function(x, recursive = TRUE){ +prop_missing <- function(x, recursive = TRUE){ if (recursive){ N <- unlisted_length(x) } else { - N <- length(x) + N <- cpp_vec_length(x) } - 1 - (num_na(x, recursive = recursive) / N) + num_na(x, recursive = recursive) / N +} +prop_complete <- function(x, recursive = TRUE){ + 1 - prop_missing(x, recursive = recursive) } transform_all <- function(data, .fn){ for (col in names(data)){ @@ -435,14 +420,14 @@ transform_all <- function(data, .fn){ } summarise_all <- function(data, .fn, size = 1){ out <- sset(data, seq_len(size)) - attr(out, "row.names") <- .set_row_names(size) + out <- cpp_set_add_attr(out, "row.names", .set_row_names(size)) for (col in names(out)){ out[[col]] <- .fn(data[[col]]) } out } pluck_row <- function(data, i = 1){ - unlist(data[i, ], recursive = FALSE) + unlist(sset(data, i), recursive = FALSE) } # Taken from skimr::skim with modifications diff --git a/R/sset.R b/R/sset.R index decc829..8024c0d 100644 --- a/R/sset.R +++ b/R/sset.R @@ -15,6 +15,13 @@ #' You can either write methods for `sset` or `[`. \cr #' `sset` will fall back on using `[` when no suitable method is found. #' +#' To get into more detail, using `sset()` on a data frame, a new +#' list is always allocated through `cheapr:::cpp_new_list()`. +#' For data.tables, if `i` is missing, then a deep copy is made. +#' When `i` is a logical vector, it is not recycled, so it is good practice to +#' make sure the logical vector +#' matches the length of x, or if x has rows, the number of rows of x. 
+#' #' @examples #' library(cheapr) #' library(bench) @@ -50,7 +57,6 @@ sset <- function(x, ...){ #' @export sset.default <- function(x, i, ...){ if (!missing(i) && is.logical(i)){ - # check_length(i, length(x)) i <- which_(i) } x[i, ...] @@ -69,35 +75,52 @@ sset.tbl_df <- function(x, i, j = seq_along(x), ...){ } #' @rdname sset #' @export -sset.POSIXlt <- function(x, i, ...){ - out <- df_subset(list_as_df(x), i) - cpp_set_copy_attributes( - cpp_set_rm_attributes(out), x, names(attributes(x)) - ) +sset.POSIXlt <- function(x, i, j, ...){ + missingi <- missing(i) + missingj <- missing(j) + if (n_unique(lengths_(unclass(x))) > 1){ + out <- balancePOSIXlt(x, fill.only = FALSE, classed = FALSE) + } else { + out <- unclass(x) + } + if (missingj){ + j <- seq_along(out) + } + out <- df_subset(list_as_df(out), i, j) + cpp_set_rm_attr(out, "row.names") + if (missingj){ + cpp_set_add_attr(out, "class", class(x)) + } + cpp_set_add_attr(out, "tzone", attr(x, "tzone")) + cpp_set_add_attr(out, "balanced", TRUE) } #' @rdname sset #' @export sset.data.table <- function(x, i, j = seq_along(x), ...){ - # This is to ensure that a copy is made basically - # More efficient to use data.table::copy() - if (missing(i)){ - i <- seq_len(nrow(x)) - } out <- df_subset(x, i, j) - cpp_set_copy_attributes( - out, x, c("class", ".internal.selfref") - ) + cpp_set_attributes(out, list(class = class(x), + .internal.selfref = attributes(x)[[".internal.selfref"]]), + add = TRUE) + dt_alloc <- tryCatch(get("setalloccol", + asNamespace("data.table"), + inherits = FALSE), + error = function(e) return(".r.error")) + # Reserve sufficient space as data.table::truelength(out) at this point is 0 + if (is.character(dt_alloc) && length(dt_alloc) == 1 && dt_alloc == ".r.error"){ + out <- collapse::qDT(out) + } else { + dt_alloc(out, n = getOption("datatable.alloccol", 1024L)) + } + out } #' @rdname sset #' @export sset.sf <- function(x, i, j = seq_along(x), ...){ out <- df_subset(x, i, j) - source_nms <- 
names(attributes(x)) - invisible( - cpp_set_copy_attributes(out, x, setdiff_(source_nms, c("names", "row.names", "class"))) - ) - class(out) <- class(x) - out + source_attrs <- attributes(x) + source_nms <- names(source_attrs) + attrs_to_keep <- source_attrs[setdiff_(source_nms, c("names", "row.names"))] + cpp_set_attributes(out, attrs_to_keep, add = TRUE) } df_select <- function(x, j){ if (is.logical(j)){ @@ -111,8 +134,7 @@ df_select <- function(x, j){ out <- cpp_list_rm_null(unclass(x)[j]) attrs[["names"]] <- attr(out, "names") attrs[["row.names"]] <- .row_names_info(x, type = 0L) - attributes(out) <- attrs - out + cpp_set_attributes(out, attrs, add = FALSE) } # Efficient data frame subset @@ -147,3 +169,7 @@ df_subset <- function(x, i, j = seq_along(x)){ } out } +# Turn negative indices to positives +neg_indices_to_pos <- function(n, exclude){ + which_not_in(seq_len(n), abs(exclude)) +} diff --git a/R/utils.R b/R/utils.R index 4ff02fb..6bcd785 100644 --- a/R/utils.R +++ b/R/utils.R @@ -86,6 +86,28 @@ tzone <- function(x){ } } +# Recycle arguments +recycle <- function (..., length = NULL){ + out <- cpp_list_rm_null(list(...)) + lens <- lengths_(out) + uniq_lens <- collapse::fnunique(lens) + if (is.null(length)) { + if (length(lens)) { + N <- max(lens) + } + else { + N <- 0L + } + } + else { + N <- length + } + N <- N * (!collapse::anyv(lens, 0L)) + recycle <- which_(lens != N) + out[recycle] <- lapply(out[recycle], rep_len, N) + out +} + # safe_unique <- function(x, ...){ # out <- tryCatch(collapse::funique(x, ...), error = function(e) return(".r.error")) # if (length(out) == 1 && out == ".r.error"){ diff --git a/README.Rmd b/README.Rmd index 2abf40b..728e22a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -239,11 +239,11 @@ x <- sample(seq(-10^3, 10^3, 0.01)) y <- do.call(paste0, expand.grid(letters, letters, letters, letters)) mark(cheapr_factor = factor_(x), base_factor = factor(x)) -mark(base_factor = factor_(x, order = FALSE), +mark(cheapr_factor = factor_(x, 
order = FALSE), base_factor = factor(x, levels = unique(x))) mark(cheapr_factor = factor_(y), base_factor = factor(y)) -mark(base_factor = factor_(y, order = FALSE), +mark(cheapr_factor = factor_(y, order = FALSE), base_factor = factor(y, levels = unique(y))) ``` diff --git a/README.md b/README.md index 05c12dd..f3690bf 100644 --- a/README.md +++ b/README.md @@ -52,14 +52,14 @@ mark(na_locf(x), vec_fill_missing(x, direction = "down")) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 "na_locf(x)" 919.1µs 937µs 1041. 0B 0 -#> 2 "vec_fill_missing(x, direction… 2.63ms 2.79ms 354. 11.4MB 117. +#> 1 "na_locf(x)" 841.5µs 862.2µs 1085. 0B 0 +#> 2 "vec_fill_missing(x, direction… 2.64ms 2.82ms 352. 11.4MB 120. mark(na_locf(x), vec_fill_missing(x, direction = "down")) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 "na_locf(x)" 918.4µs 927.9µs 1069. 0B 0 -#> 2 "vec_fill_missing(x, direction… 2.62ms 2.74ms 361. 11.4MB 185. +#> 1 "na_locf(x)" 841.5µs 852.8µs 1162. 0B 0 +#> 2 "vec_fill_missing(x, direction… 2.57ms 2.81ms 349. 11.4MB 226. ``` All the `NA` handling functions in cheapr can make use of multiple cores @@ -71,16 +71,16 @@ mark(num_na(x), sum(is.na(x))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 num_na(x) 916µs 926.8µs 982. 0B 0 -#> 2 sum(is.na(x)) 893µs 1.07ms 940. 3.81MB 84.9 +#> 1 num_na(x) 839µs 854.1µs 1150. 0B 0 +#> 2 sum(is.na(x)) 930µs 1.06ms 929. 3.81MB 84.5 # 4 cores options(cheapr.cores = 4) mark(num_na(x), sum(is.na(x))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 num_na(x) 252µs 318µs 3026. 0B 0 -#> 2 sum(is.na(x)) 910µs 1.07ms 922. 3.81MB 83.5 +#> 1 num_na(x) 239µs 300.9µs 3059. 0B 0 +#> 2 sum(is.na(x)) 934µs 1.06ms 919. 
3.81MB 76.6 ``` ## Efficient NA counts by row/col @@ -93,16 +93,16 @@ mark(row_na_counts(m), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 row_na_counts(m) 1.29ms 3.35ms 303. 12.9KB 0 -#> 2 rowSums(is.na(m)) 2.73ms 2.87ms 347. 3.82MB 35.0 +#> 1 row_na_counts(m) 1.33ms 3.3ms 308. 12.9KB 0 +#> 2 rowSums(is.na(m)) 2.76ms 2.87ms 344. 3.82MB 31.3 # Number of NA values by col mark(col_na_counts(m), colSums(is.na(m))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 col_na_counts(m) 680.5µs 779.8µs 1245. 12.9KB 0 -#> 2 colSums(is.na(m)) 1.92ms 2.07ms 482. 3.82MB 49.3 +#> 1 col_na_counts(m) 690.6µs 839µs 1169. 12.9KB 0 +#> 2 colSums(is.na(m)) 1.93ms 2.05ms 485. 3.82MB 43.6 ``` `is_na` is a multi-threaded alternative to `is.na` @@ -114,8 +114,8 @@ mark(is.na(x), is_na(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 is.na(x) 1.03ms 1.07ms 908. 3.81MB 146. -#> 2 is_na(x) 533.9µs 625.6µs 1561. 3.82MB 223. +#> 1 is.na(x) 1.04ms 1.1ms 886. 3.81MB 127. +#> 2 is_na(x) 537.3µs 674.5µs 1423. 3.82MB 187. ### posixlt method is much faster hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600), @@ -128,8 +128,8 @@ mark(is.na(hours), is_na(hours)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 is.na(hours) 1.22s 1.22s 0.818 61.05MB 1.64 -#> 2 is_na(hours) 5.05ms 5.62ms 173. 3.83MB 7.94 +#> 1 is.na(hours) 1.17s 1.17s 0.852 61MB 0.852 +#> 2 is_na(hours) 5.15ms 5.68ms 162. 
9.8MB 9.91 ``` It differs in 2 regards: @@ -184,11 +184,11 @@ overview(df, hist = TRUE) #> #> ----- Numeric ----- #> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 -#> 1 x integer 0 1 100 50.51 1 25 51 76 100 -#> 2 z numeric 0 1 10000000 0 -5.47 -0.68 0 0.67 5.01 +#> 1 x integer 0 1 100 50.49 1 25 50 76 100 +#> 2 z numeric 0 1 10000000 0 -5.12 -0.67 0 0.67 5.32 #> iqr sd hist #> 1 51 28.87 ▇▇▇▇▇ -#> 2 1.35 1 ▁▁▇▂▁ +#> 2 1.35 1 ▁▂▇▂▁ #> #> ----- Categorical ----- #> col class n_missing p_complete n_unique n_levels min max @@ -199,7 +199,7 @@ mark(overview(df)) #> # A tibble: 1 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 overview(df) 961ms 961ms 1.04 76.3MB 1.04 +#> 1 overview(df) 1.01s 1.01s 0.991 76.3MB 0.991 ``` ## Cheaper and consistent subsetting with `sset` @@ -234,9 +234,9 @@ mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y]) #> # A tibble: 3 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 sset(x, x %in_% y) 93.7µs 115µs 7856. 90.8KB 2.06 -#> 2 sset(x, x %in% y) 163.7µs 239µs 3774. 285.5KB 6.52 -#> 3 x[x %in% y] 132.9µs 207µs 4571. 324.6KB 4.88 +#> 1 sset(x, x %in_% y) 92.6µs 117µs 7823. 83.3KB 2.07 +#> 2 sset(x, x %in% y) 150.3µs 231µs 3802. 285.4KB 4.38 +#> 3 x[x %in% y] 128.8µs 207µs 4518. 324.5KB 6.96 ``` ## Greatest common divisor and smallest common multiple @@ -257,13 +257,13 @@ mark(gcd(x)) #> # A tibble: 1 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 gcd(x) 1.2µs 1.3µs 592287. 0B 0 +#> 1 gcd(x) 1.2µs 1.5µs 544817. 0B 0 x <- seq(0, 10^6, 0.5) mark(gcd(x)) #> # A tibble: 1 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 gcd(x) 55.2ms 55.6ms 17.9 0B 0 +#> 1 gcd(x) 48ms 49.2ms 20.3 0B 0 ``` ## Creating many sequences @@ -352,32 +352,32 @@ mark(cheapr_which = which_(x), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 2.81ms 3.12ms 307. 3.81MB 6.40 -#> 2 base_which 1.12ms 1.21ms 806. 
7.63MB 34.3 +#> 1 cheapr_which 2.84ms 3.35ms 293. 3.81MB 6.42 +#> 2 base_which 1.13ms 1.32ms 716. 7.63MB 33.7 x <- rep(FALSE, 10^6) mark(cheapr_which = which_(x), base_which = which(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 206µs 266µs 3438. 0B 0 -#> 2 base_which 457µs 468µs 2021. 3.81MB 38.0 +#> 1 cheapr_which 368µs 467µs 1869. 0B 0 +#> 2 base_which 456µs 470µs 1905. 3.81MB 37.9 x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06)) mark(cheapr_which = which_(x), base_which = which(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 1.67ms 1.91ms 501. 1.91MB 4.15 -#> 2 base_which 1.02ms 1.09ms 859. 7.63MB 35.6 +#> 1 cheapr_which 1.83ms 2.17ms 447. 1.91MB 4.18 +#> 2 base_which 1.02ms 1.15ms 804. 7.63MB 33.0 x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06)) mark(cheapr_which = which_(x), base_which = which(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 3.69ms 3.79ms 261. 3.81MB 4.20 -#> 2 base_which 1.37ms 1.45ms 673. 9.54MB 37.3 +#> 1 cheapr_which 3.85ms 4.07ms 236. 3.81MB 4.22 +#> 2 base_which 1.35ms 1.47ms 656. 9.54MB 35.7 x <- sample(c(TRUE, FALSE), 10^6, TRUE) x[sample.int(10^6, 10^4)] <- NA mark(cheapr_which = which_(x), @@ -385,8 +385,8 @@ mark(cheapr_which = which_(x), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 2.38ms 2.47ms 398. 1.89MB 4.14 -#> 2 base_which 3.32ms 3.36ms 297. 5.71MB 8.87 +#> 1 cheapr_which 2.44ms 2.56ms 384. 1.89MB 4.17 +#> 2 base_which 3.32ms 3.36ms 294. 
5.7MB 11.1 ``` ### factor @@ -400,15 +400,15 @@ mark(cheapr_factor = factor_(x), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_factor 10ms 10.4ms 94.9 4.59MB 2.11 -#> 2 base_factor 506ms 506.3ms 1.98 27.84MB 0 -mark(base_factor = factor_(x, order = FALSE), +#> 1 cheapr_factor 9.78ms 10.2ms 90.2 4.59MB 2.15 +#> 2 base_factor 553.04ms 553ms 1.81 27.84MB 0 +mark(cheapr_factor = factor_(x, order = FALSE), base_factor = factor(x, levels = unique(x))) #> # A tibble: 2 × 6 -#> expression min median `itr/sec` mem_alloc `gc/sec` -#> -#> 1 base_factor 5.62ms 6.03ms 165. 1.53MB 2.14 -#> 2 base_factor 796.2ms 796.2ms 1.26 22.79MB 0 +#> expression min median `itr/sec` mem_alloc `gc/sec` +#> +#> 1 cheapr_factor 5.52ms 6.02ms 153. 1.53MB 2.16 +#> 2 base_factor 870.24ms 870.24ms 1.15 22.79MB 0 mark(cheapr_factor = factor_(y), base_factor = factor(y)) #> Warning: Some expressions had a GC in every iteration; so filtering is @@ -416,15 +416,15 @@ mark(cheapr_factor = factor_(y), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_factor 202.08ms 205.92ms 4.87 5.23MB 0 -#> 2 base_factor 2.84s 2.84s 0.352 54.35MB 0.352 -mark(base_factor = factor_(y, order = FALSE), +#> 1 cheapr_factor 216.5ms 221.7ms 4.54 5.23MB 0 +#> 2 base_factor 2.9s 2.9s 0.345 54.35MB 0.345 +mark(cheapr_factor = factor_(y, order = FALSE), base_factor = factor(y, levels = unique(y))) #> # A tibble: 2 × 6 -#> expression min median `itr/sec` mem_alloc `gc/sec` -#> -#> 1 base_factor 7.21ms 8.05ms 124. 3.49MB 4.29 -#> 2 base_factor 47.29ms 47.83ms 20.7 39.89MB 5.18 +#> expression min median `itr/sec` mem_alloc `gc/sec` +#> +#> 1 cheapr_factor 7.63ms 8.44ms 118. 
3.49MB 2.10 +#> 2 base_factor 47.25ms 51.19ms 19.7 39.89MB 5.64 ``` ### intersect & setdiff @@ -438,15 +438,15 @@ mark(cheapr_intersect = intersect_(x, y, dups = FALSE), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_intersect 3.16ms 3.35ms 294. 1.18MB 2.11 -#> 2 base_intersect 4.29ms 4.55ms 218. 5.16MB 7.34 +#> 1 cheapr_intersect 2.97ms 3.36ms 295. 1.18MB 2.10 +#> 2 base_intersect 4.38ms 4.64ms 212. 5.16MB 7.22 mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE), base_setdiff = setdiff(x, y)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_setdiff 3.38ms 3.6ms 276. 1.77MB 2.13 -#> 2 base_setdiff 4.71ms 4.85ms 205. 5.71MB 7.41 +#> 1 cheapr_setdiff 3.37ms 3.73ms 265. 1.76MB 2.11 +#> 2 base_setdiff 4.78ms 4.98ms 199. 5.71MB 7.29 ``` ### `%in_%` and `%!in_%` @@ -457,15 +457,15 @@ mark(cheapr = x %in_% y, #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr 1.92ms 2.01ms 491. 781.34KB 2.12 -#> 2 base 2.58ms 2.78ms 358. 2.53MB 4.47 +#> 1 cheapr 1.82ms 2.05ms 482. 781.34KB 2.10 +#> 2 base 2.57ms 2.85ms 329. 2.53MB 7.16 mark(cheapr = x %!in_% y, base = !x %in% y) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr 1.83ms 2ms 496. 787.85KB 2.14 -#> 2 base 2.66ms 2.92ms 339. 2.91MB 4.49 +#> 1 cheapr 1.88ms 2.03ms 484. 787.85KB 2.14 +#> 2 base 2.74ms 2.98ms 333. 2.91MB 4.47 ``` ### cut.default @@ -479,6 +479,6 @@ mark(cheapr_cut = cut_numeric(x, b), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_cut 130ms 130ms 7.67 38.1MB 2.56 -#> 2 base_cut 402ms 402ms 2.49 267.1MB 2.49 +#> 1 cheapr_cut 130ms 131ms 7.65 38.1MB 2.55 +#> 2 base_cut 503ms 503ms 1.99 267.1MB 0 ``` diff --git a/man/is_na.Rd b/man/is_na.Rd index c41788c..b0996ff 100644 --- a/man/is_na.Rd +++ b/man/is_na.Rd @@ -85,6 +85,7 @@ To replicate \code{complete.cases(x)}, use \code{!row_any_na(x)}. 
\cr To find rows with any empty values, use \code{which_(row_any_na(df))}. \cr To find empty rows use \code{which_(row_all_na(df))} or \code{which_na(df)}. +To drop empty rows use \code{na_rm(df)} or \code{sset(df, which_(row_all_na(df), TRUE))}. } \subsection{\code{is_na}}{ diff --git a/man/sset.Rd b/man/sset.Rd index b860361..6dd09c9 100644 --- a/man/sset.Rd +++ b/man/sset.Rd @@ -15,7 +15,7 @@ sset(x, ...) \method{sset}{tbl_df}(x, i, j = seq_along(x), ...) -\method{sset}{POSIXlt}(x, i, ...) +\method{sset}{POSIXlt}(x, i, j, ...) \method{sset}{data.table}(x, i, j = seq_along(x), ...) @@ -39,6 +39,13 @@ enhanced data frames like tibbles, data.tables and sf. \code{sset} is an S3 generic. You can either write methods for \code{sset} or \code{[}. \cr \code{sset} will fall back on using \code{[} when no suitable method is found. + +To get into more detail, using \code{sset()} on a data frame, a new +list is always allocated through \code{cheapr:::cpp_new_list()}. +For data.tables, if \code{i} is missing, then a deep copy is made. +When \code{i} is a logical vector, it is not recycled, so it is good practice to +make sure the logical vector +matches the length of x, or if x has rows, the number of rows of x. 
} \examples{ library(cheapr)
diff --git a/src/attrs.cpp b/src/attrs.cpp
new file mode 100644
index 0000000..34367cd
--- /dev/null
+++ b/src/attrs.cpp
@@ -0,0 +1,99 @@
+#include "cheapr_cpp.h"
+#include <cpp11.hpp>
+#include <Rinternals.h>
+
+// Adding and removing attributes in-place
+// There is a check to ensure that attributes are copied when they are the same
+// object as x
+
+// Intern the first element of `which` as a symbol.
+// Errors if `which` is not a character vector with at least one element,
+// instead of reading an element that does not exist.
+static SEXP attr_symbol(SEXP which){
+  if (TYPEOF(which) != STRSXP || Rf_length(which) < 1){
+    Rf_error("which must be a character vector of length >= 1");
+  }
+  return Rf_installChar(STRING_ELT(which, 0));
+}
+
+// Remove all attributes of x in-place and return x.
+// The attribute names are collected before any removal because every
+// Rf_setAttrib(x, name, R_NilValue) call edits the attribute list being read.
+
+[[cpp11::register]]
+SEXP cpp_set_rm_attributes(SEXP x){
+  SEXP names = Rf_protect(Rf_getAttrib(ATTRIB(x), R_NamesSymbol));
+  int n = Rf_length(names);
+  for (int i = 0; i < n; ++i){
+    SEXP attrib_nm = Rf_protect(Rf_installChar(STRING_ELT(names, i)));
+    Rf_setAttrib(x, attrib_nm, R_NilValue);
+    // Release inside the loop so the protect stack does not grow with n
+    Rf_unprotect(1);
+  }
+  Rf_unprotect(1);
+  return x;
+}
+
+// Add attribute onto existing attributes
+// Sets attribute `which[0]` of x to `value` in-place and returns x.
+// If `value` is x itself it is duplicated first, so x never becomes its
+// own attribute.
+
+[[cpp11::register]]
+SEXP cpp_set_add_attr(SEXP x, SEXP which, SEXP value) {
+  int n_protect = 0;
+  SEXP attr_sym = Rf_protect(attr_symbol(which));
+  ++n_protect;
+  // Compare the pointers directly: same object <=> same address
+  if (x == value){
+    value = Rf_protect(Rf_duplicate(value));
+    ++n_protect;
+  }
+  Rf_setAttrib(x, attr_sym, value);
+  Rf_unprotect(n_protect);
+  return x;
+}
+
+// Remove attribute `which[0]` from x in-place and return x.
+
+[[cpp11::register]]
+SEXP cpp_set_rm_attr(SEXP x, SEXP which) {
+  SEXP attr_sym = Rf_protect(attr_symbol(which));
+  Rf_setAttrib(x, attr_sym, R_NilValue);
+  Rf_unprotect(1);
+  return x;
+}
+
+// Set attributes of x in-place, when add = F, attrs of x are first removed
+// `attributes` must be a named list; it is validated before x is touched,
+// so a bad argument leaves x unchanged.
+
+[[cpp11::register]]
+SEXP cpp_set_attributes(SEXP x, SEXP attributes, bool add) {
+  SEXP names = Rf_protect(Rf_getAttrib(attributes, R_NamesSymbol));
+  if (!Rf_isVectorList(attributes) || Rf_isNull(names)){
+    Rf_unprotect(1);
+    Rf_error("attributes must be a named list");
+  }
+  if (!add){
+    cpp_set_rm_attributes(x);
+  }
+  const SEXP *p_attributes = VECTOR_PTR_RO(attributes);
+  int n = Rf_length(attributes);
+  for (int i = 0; i < n; ++i){
+    int n_protect = 0;
+    SEXP attr_nm = Rf_protect(Rf_installChar(STRING_ELT(names, i)));
+    ++n_protect;
+    SEXP value = p_attributes[i];
+    // Never attach x to itself: use a copy when the value is x
+    if (x == value){
+      value = Rf_protect(Rf_duplicate(value));
+      ++n_protect;
+    }
+    Rf_setAttrib(x, attr_nm, value);
+    // Balance the protect stack per iteration rather than letting it grow
+    Rf_unprotect(n_protect);
+  }
+  Rf_unprotect(1);
+  return x;
+}
diff --git a/src/cheapr_cpp.h b/src/cheapr_cpp.h index 5c7ec80..ef4f548 100644 --- a/src/cheapr_cpp.h +++ b/src/cheapr_cpp.h @@ -42,5 +42,6 @@ R_xlen_t cpp_df_nrow(SEXP x); R_xlen_t cpp_unnested_length(SEXP x); SEXP xlen_to_r(R_xlen_t x); R_xlen_t cpp_vec_length(SEXP x); +SEXP cpp_obj_address(SEXP x); #endif diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 53a4555..b74aee1 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -5,6 +5,34 @@ #include "cpp11/declarations.hpp" #include +// attrs.cpp +SEXP cpp_set_rm_attributes(SEXP x); +extern "C" SEXP _cheapr_cpp_set_rm_attributes(SEXP x) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_set_rm_attributes(cpp11::as_cpp>(x))); + END_CPP11 +} +// attrs.cpp +SEXP cpp_set_add_attr(SEXP x, SEXP which, SEXP value); +extern "C" SEXP _cheapr_cpp_set_add_attr(SEXP x, SEXP which, SEXP value) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_set_add_attr(cpp11::as_cpp>(x), cpp11::as_cpp>(which), cpp11::as_cpp>(value))); + END_CPP11 +} +// attrs.cpp +SEXP cpp_set_rm_attr(SEXP x, SEXP which); +extern "C" SEXP _cheapr_cpp_set_rm_attr(SEXP x, SEXP which) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_set_rm_attr(cpp11::as_cpp>(x), cpp11::as_cpp>(which))); + END_CPP11 +} +// attrs.cpp +SEXP cpp_set_attributes(SEXP x, SEXP attributes, bool add); +extern "C" SEXP _cheapr_cpp_set_attributes(SEXP x, SEXP attributes, SEXP add) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_set_attributes(cpp11::as_cpp>(x), cpp11::as_cpp>(attributes), cpp11::as_cpp>(add))); +
END_CPP11 +} // gcd.cpp double cpp_gcd2(double x, double y, double tol, bool na_rm); extern "C" SEXP _cheapr_cpp_gcd2(SEXP x, SEXP y, SEXP tol, SEXP na_rm) { @@ -187,6 +215,13 @@ extern "C" SEXP _cheapr_cpp_lead_sequence(SEXP size, SEXP k, SEXP partial) { return cpp11::as_sexp(cpp_lead_sequence(cpp11::as_cpp>(size), cpp11::as_cpp>(k), cpp11::as_cpp>(partial))); END_CPP11 } +// sset.cpp +SEXP cpp_sset(SEXP x, SEXP indices); +extern "C" SEXP _cheapr_cpp_sset(SEXP x, SEXP indices) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_sset(cpp11::as_cpp>(x), cpp11::as_cpp>(indices))); + END_CPP11 +} // utils.cpp R_xlen_t cpp_vec_length(SEXP x); extern "C" SEXP _cheapr_cpp_vec_length(SEXP x) { @@ -229,20 +264,6 @@ extern "C" SEXP _cheapr_cpp_list_as_df(SEXP x) { return cpp11::as_sexp(cpp_list_as_df(cpp11::as_cpp>(x))); END_CPP11 } -// utils.cpp -SEXP cpp_set_rm_attributes(SEXP x); -extern "C" SEXP _cheapr_cpp_set_rm_attributes(SEXP x) { - BEGIN_CPP11 - return cpp11::as_sexp(cpp_set_rm_attributes(cpp11::as_cpp>(x))); - END_CPP11 -} -// utils.cpp -SEXP cpp_set_copy_attributes(SEXP target, SEXP source, SEXP attrs); -extern "C" SEXP _cheapr_cpp_set_copy_attributes(SEXP target, SEXP source, SEXP attrs) { - BEGIN_CPP11 - return cpp11::as_sexp(cpp_set_copy_attributes(cpp11::as_cpp>(target), cpp11::as_cpp>(source), cpp11::as_cpp>(attrs))); - END_CPP11 -} // which.cpp SEXP cpp_which_(SEXP x, bool invert); extern "C" SEXP _cheapr_cpp_which_(SEXP x, SEXP invert) { @@ -281,8 +302,11 @@ static const R_CallMethodDef CallEntries[] = { {"_cheapr_cpp_r_unnested_length", (DL_FUNC) &_cheapr_cpp_r_unnested_length, 1}, {"_cheapr_cpp_row_na_counts", (DL_FUNC) &_cheapr_cpp_row_na_counts, 1}, {"_cheapr_cpp_sequence", (DL_FUNC) &_cheapr_cpp_sequence, 3}, - {"_cheapr_cpp_set_copy_attributes", (DL_FUNC) &_cheapr_cpp_set_copy_attributes, 3}, + {"_cheapr_cpp_set_add_attr", (DL_FUNC) &_cheapr_cpp_set_add_attr, 3}, + {"_cheapr_cpp_set_attributes", (DL_FUNC) &_cheapr_cpp_set_attributes, 3}, + 
{"_cheapr_cpp_set_rm_attr", (DL_FUNC) &_cheapr_cpp_set_rm_attr, 2}, {"_cheapr_cpp_set_rm_attributes", (DL_FUNC) &_cheapr_cpp_set_rm_attributes, 1}, + {"_cheapr_cpp_sset", (DL_FUNC) &_cheapr_cpp_sset, 2}, {"_cheapr_cpp_vec_length", (DL_FUNC) &_cheapr_cpp_vec_length, 1}, {"_cheapr_cpp_which_", (DL_FUNC) &_cheapr_cpp_which_, 2}, {"_cheapr_cpp_which_na", (DL_FUNC) &_cheapr_cpp_which_na, 1},
diff --git a/src/sset.cpp b/src/sset.cpp
new file mode 100644
index 0000000..859cf0c
--- /dev/null
+++ b/src/sset.cpp
@@ -0,0 +1,209 @@
+#include "cheapr_cpp.h"
+#include <cpp11.hpp>
+#include <Rinternals.h>
+#include <climits>
+// #include <vector>
+// using namespace cpp11;
+
+// Copy the elements of p_x picked out by the 1-based indices pi into p_out.
+// simple_sset = true promises that every index lies in [1, xn], so a plain
+// (optionally multi-threaded) gather is enough.
+// Otherwise zeros are skipped and NA or out-of-bounds indices give na_value.
+
+template <typename T>
+static void sset_fill(const T *p_x, T *p_out, const int *pi, R_xlen_t n,
+                      R_xlen_t xn, bool simple_sset, bool do_parallel,
+                      int n_cores, T na_value){
+  if (simple_sset){
+    if (do_parallel){
+#pragma omp parallel for simd num_threads(n_cores)
+      for (R_xlen_t i = 0; i < n; ++i){
+        p_out[i] = p_x[pi[i] - 1];
+      }
+    } else {
+#pragma omp simd
+      for (R_xlen_t i = 0; i < n; ++i){
+        p_out[i] = p_x[pi[i] - 1];
+      }
+    }
+  } else {
+    R_xlen_t k = 0;
+    for (R_xlen_t i = 0; i < n; ++i){
+      if (pi[i] == 0) continue;
+      // pi[i] > 0 also rejects NA_INTEGER, which R stores as INT_MIN
+      p_out[k++] = (pi[i] > 0 && pi[i] <= xn) ? p_x[pi[i] - 1] : na_value;
+    }
+  }
+}
+
+// Subset x by the integer vector indices (1-based) and return a new vector.
+//  * zeros are dropped
+//  * NA and too-large indices give NA (0 for raw vectors, NULL for lists)
+//  * an all-negative index vector is passed to the package's R function
+//    neg_indices_to_pos() and its result is used as the positive index
+//  * negative indices cannot be combined with zero, positive or NA indices
+// Attributes of x are not copied to the result.
+
+[[cpp11::register]]
+SEXP cpp_sset(SEXP x, SEXP indices){
+  int *pi = INTEGER(indices);
+  R_xlen_t xn = Rf_xlength(x);
+  R_xlen_t n = Rf_xlength(indices);
+  int n_protections = 0;
+  R_xlen_t zero_count = 0;
+  R_xlen_t pos_count = 0;
+  R_xlen_t oob_count = 0;
+  R_xlen_t na_count = 0;
+  R_xlen_t out_size;
+  bool do_parallel = n >= 10000;
+  int n_cores = do_parallel ? num_cores() : 1;
+  do_parallel = do_parallel && n_cores > 1;
+
+  // Counting the number of:
+  // Zeroes
+  // Out-of-bounds (too large) indices
+  // Positive indices
+  // NA indices
+  // From this we can also work out the number of negatives
+
+  if (do_parallel){
+#pragma omp parallel for simd num_threads(n_cores) reduction(+:zero_count,pos_count,oob_count,na_count)
+    for (R_xlen_t j = 0; j < n; ++j){
+      zero_count += (pi[j] == 0);
+      pos_count += (pi[j] > 0);
+      oob_count += (pi[j] > xn);
+      na_count += (pi[j] == NA_INTEGER);
+    }
+  } else {
+#pragma omp simd reduction(+:zero_count,pos_count,oob_count,na_count)
+    for (R_xlen_t j = 0; j < n; ++j){
+      zero_count += (pi[j] == 0);
+      pos_count += (pi[j] > 0);
+      oob_count += (pi[j] > xn);
+      na_count += (pi[j] == NA_INTEGER);
+    }
+  }
+  // A count, not a flag: keep it as an integer type
+  R_xlen_t neg_count = n - pos_count - zero_count - na_count;
+  if ( (pos_count + zero_count) > 0 && neg_count > 0){
+    Rf_error("Cannot mix positive and negative indices");
+  }
+  if (na_count > 0 && neg_count > 0){
+    Rf_error("Cannot mix NA and negative indices");
+  }
+  // pos_count == n already rules out zero, NA and negative indices
+  bool simple_sset = pos_count == n && oob_count == 0;
+
+  // Convert negative index vector to positive
+
+  if (neg_count > 0){
+    // The length is handed to R as an integer, so it must fit in one
+    if (xn > INT_MAX){
+      Rf_error("Negative indices are not supported for long vectors");
+    }
+    SEXP indices2 = Rf_protect(cpp11::package("cheapr")["neg_indices_to_pos"](static_cast<int>(xn), indices));
+    ++n_protections;
+    pi = INTEGER(indices2);
+    out_size = Rf_xlength(indices2);
+    n = out_size;
+    simple_sset = true;
+  } else {
+    out_size = n - zero_count;
+  }
+
+  // Every branch leaves the result in out; a single exit below then
+  // releases all protections, including indices2 when x is NULL
+  SEXP out = R_NilValue;
+  switch ( TYPEOF(x) ){
+  case NILSXP: {
+    break;
+  }
+  case LGLSXP: {
+    out = Rf_protect(Rf_allocVector(LGLSXP, out_size));
+    ++n_protections;
+    sset_fill<int>(LOGICAL(x), LOGICAL(out), pi, n, xn,
+                   simple_sset, do_parallel, n_cores, NA_LOGICAL);
+    break;
+  }
+  case INTSXP: {
+    out = Rf_protect(Rf_allocVector(INTSXP, out_size));
+    ++n_protections;
+    sset_fill<int>(INTEGER(x), INTEGER(out), pi, n, xn,
+                   simple_sset, do_parallel, n_cores, NA_INTEGER);
+    break;
+  }
+  case REALSXP: {
+    out = Rf_protect(Rf_allocVector(REALSXP, out_size));
+    ++n_protections;
+    sset_fill<double>(REAL(x), REAL(out), pi, n, xn,
+                      simple_sset, do_parallel, n_cores, NA_REAL);
+    break;
+  }
+  case STRSXP: {
+    out = Rf_protect(Rf_allocVector(STRSXP, out_size));
+    ++n_protections;
+    R_xlen_t k = 0;
+    for (R_xlen_t i = 0; i < n; ++i){
+      if (pi[i] == 0) continue;
+      SET_STRING_ELT(out, k++,
+                     (pi[i] > 0 && pi[i] <= xn) ? STRING_ELT(x, pi[i] - 1) : NA_STRING);
+    }
+    break;
+  }
+  case RAWSXP: {
+    out = Rf_protect(Rf_allocVector(RAWSXP, out_size));
+    ++n_protections;
+    // Raw vectors have no NA, 0 stands in for missing elements
+    sset_fill<Rbyte>(RAW(x), RAW(out), pi, n, xn,
+                     simple_sset, do_parallel, n_cores, 0);
+    break;
+  }
+  case VECSXP: {
+    const SEXP *p_x = VECTOR_PTR_RO(x);
+    out = Rf_protect(Rf_allocVector(VECSXP, out_size));
+    ++n_protections;
+    R_xlen_t k = 0;
+    for (R_xlen_t i = 0; i < n; ++i){
+      if (pi[i] == 0) continue;
+      SET_VECTOR_ELT(out, k++,
+                     (pi[i] > 0 && pi[i] <= xn) ? p_x[pi[i] - 1] : R_NilValue);
+    }
+    break;
+  }
+  default: {
+    Rf_unprotect(n_protections);
+    Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(x)));
+  }
+  }
+  Rf_unprotect(n_protections);
+  return out;
+}
+
+
+// A subset method using c++ vectors
+
+// list cpp_sset(SEXP x, integers i){
+//   int xn = Rf_xlength(x);
+//   int n = i.size();
+//   switch ( TYPEOF(x) ){
+//   case INTSXP: {
+//     std::vector<int> out;
+//     int *p_x = INTEGER(x);
+//     out.reserve(n);
+//     for (int j = 0; j < n; ++j){
+//       if (i[j] > 0 && i[j] <= xn){
+//         int val = p_x[i[j] - 1];
+//         out.push_back(val);
+//       } else {
+//         out.push_back(NA_INTEGER);
+//       }
+//     }
+//     return writable::list({
+//       "out"_nm = out
+//     });
+//   }
+//   default: {
+//     Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(x)));
+//   }
+//   }
+// }
diff --git a/src/utils.cpp b/src/utils.cpp index eba1da1..2876bb2 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -108,29 +108,34 @@ SEXP cpp_list_rm_null(SEXP l) { int n_keep = n - n_null; int whichj = 0; int j = 0; + + // Which list elements should we keep?
+ SEXP keep = Rf_protect(Rf_allocVector(INTSXP, n_keep)); int *p_keep = INTEGER(keep); while (whichj < n_keep){ - p_keep[whichj] = j + 1; - whichj += (p_l[j] != R_NilValue); - ++j; + p_keep[whichj] = j; + whichj += (p_l[j++] != R_NilValue); } + + // Subset on both the list and names of the list + SEXP out = Rf_protect(Rf_allocVector(VECSXP, n_keep)); - SEXP names = Rf_protect(Rf_duplicate(Rf_getAttrib(l, R_NamesSymbol))); + SEXP names = Rf_protect(Rf_getAttrib(l, R_NamesSymbol)); bool has_names = !Rf_isNull(names); if (has_names){ SEXP *p_names = STRING_PTR(names); SEXP out_names = Rf_protect(Rf_allocVector(STRSXP, n_keep)); for (int k = 0; k < n_keep; ++k) { - SET_STRING_ELT(out_names, k, p_names[p_keep[k] - 1]); - SET_VECTOR_ELT(out, k, p_l[p_keep[k] - 1]); + SET_STRING_ELT(out_names, k, p_names[p_keep[k]]); + SET_VECTOR_ELT(out, k, p_l[p_keep[k]]); } Rf_setAttrib(out, R_NamesSymbol, out_names); Rf_unprotect(5); return out; } else { for (int k = 0; k < n_keep; ++k) { - SET_VECTOR_ELT(out, k, p_l[p_keep[k] - 1]); + SET_VECTOR_ELT(out, k, p_l[p_keep[k]]); } Rf_unprotect(4); return out; @@ -165,115 +170,12 @@ SEXP cpp_list_as_df(SEXP x) { } } -// Remove attributes in-place - -[[cpp11::register]] -SEXP cpp_set_rm_attributes(SEXP x){ - SEXP attrs = Rf_protect(cpp11::package("base")["attributes"](x)); - SEXP names = Rf_protect(Rf_getAttrib(attrs, R_NamesSymbol)); - int n = Rf_length(attrs); - for (int i = 0; i < n; ++i){ - SEXP attrib_nm = Rf_protect(Rf_install(CHAR(STRING_ELT(names, i)))); - Rf_setAttrib(x, attrib_nm, R_NilValue); - } - Rf_unprotect(n + 2); - return x; +SEXP cpp_obj_address(SEXP x) { + static char buf[1000]; + snprintf(buf, 1000, "%p", (void*) x); + return Rf_mkChar(buf); } -// Copy specified attributes (character vector of names) -// from source to target (by reference) -// Use with extreme care as it modifies target in-place -// If you use it, make absolutely sure that target is not pointed to by other -// objects as it will modify the attributes of 
those objects too - -[[cpp11::register]] -SEXP cpp_set_copy_attributes(SEXP target, SEXP source, SEXP attrs){ - SEXP *p_attrs = STRING_PTR(attrs); - int n_attrs = Rf_length(attrs); - for (int i = 0; i < n_attrs; ++i){ - SEXP attrib_nm = Rf_protect(Rf_install(CHAR(p_attrs[i]))); - Rf_setAttrib(target, attrib_nm, Rf_getAttrib(source, attrib_nm)); - } - Rf_unprotect(n_attrs); - return target; -} - -// SEXP cpp_unlist(SEXP x, SEXP ptype) { -// if (!Rf_isVectorList(x)){ -// Rf_error("x must be a list"); -// } -// int n_protections = 0; -// R_xlen_t n = Rf_xlength(x); -// R_xlen_t N = cpp_unnested_length(x); -// R_xlen_t m; -// R_xlen_t k = 0; -// const SEXP *p_x = VECTOR_PTR_RO(x); -// switch ( TYPEOF(ptype) ){ -// case LGLSXP: { -// ++n_protections; -// SEXP out = Rf_protect(Rf_allocVector(LGLSXP, N)); -// int *p_out = LOGICAL(out); -// for (R_xlen_t i = 0; i < n; ++i) { -// m = Rf_xlength(p_x[i]); -// int *p_xj = LOGICAL(p_x[i]); -// for (R_xlen_t j = 0; j < m; ++j) { -// p_out[k] = p_xj[j]; -// ++k; -// } -// } -// Rf_unprotect(n_protections); -// return out; -// } -// case INTSXP: { -// ++n_protections; -// SEXP out = Rf_protect(Rf_allocVector(INTSXP, N)); -// int *p_out = INTEGER(out); -// for (R_xlen_t i = 0; i < n; ++i) { -// m = Rf_xlength(p_x[i]); -// int *p_xj = INTEGER(p_x[i]); -// for (R_xlen_t j = 0; j < m; ++j) { -// p_out[k] = p_xj[j]; -// ++k; -// } -// } -// Rf_unprotect(n_protections); -// return out; -// } -// case REALSXP: { -// ++n_protections; -// SEXP out = Rf_protect(Rf_allocVector(REALSXP, N)); -// double *p_out = REAL(out); -// for (R_xlen_t i = 0; i < n; ++i) { -// m = Rf_xlength(p_x[i]); -// double *p_xj = REAL(p_x[i]); -// for (R_xlen_t j = 0; j < m; ++j) { -// p_out[k] = p_xj[j]; -// ++k; -// } -// } -// Rf_unprotect(n_protections); -// return out; -// } -// case STRSXP: { -// ++n_protections; -// SEXP out = Rf_protect(Rf_allocVector(STRSXP, N)); -// for (R_xlen_t i = 0; i < n; ++i) { -// m = Rf_xlength(p_x[i]); -// SEXP *p_xj = 
STRING_PTR(p_x[i]); -// for (R_xlen_t j = 0; j < m; ++j) { -// SET_STRING_ELT(out, k, p_xj[j]); -// ++k; -// } -// } -// Rf_unprotect(n_protections); -// return out; -// } -// default: { -// Rf_error("%s cannot handle an object of type %s", __func__, Rf_type2char(TYPEOF(ptype))); -// } -// } -// } - // Potentially useful for rolling calculations // Computes the rolling number of true values in a given // series of consecutive true values diff --git a/src/which.cpp b/src/which.cpp index 1743137..7ca642a 100644 --- a/src/which.cpp +++ b/src/which.cpp @@ -43,8 +43,7 @@ SEXP cpp_which_(SEXP x, bool invert){ R_xlen_t i = 0; while (whichi < out_size){ p_out[whichi] = i + 1; - whichi += !(p_x[i] == TRUE); - ++i; + whichi += (p_x[i++] != TRUE); } Rf_unprotect(1); return out; @@ -57,8 +56,7 @@ SEXP cpp_which_(SEXP x, bool invert){ int i = 0; while (whichi < out_size){ p_out[whichi] = i + 1; - whichi += !(p_x[i] == TRUE); - ++i; + whichi += (p_x[i++] != TRUE); } Rf_unprotect(1); return out; @@ -72,8 +70,7 @@ SEXP cpp_which_(SEXP x, bool invert){ R_xlen_t i = 0; while (whichi < size){ p_out[whichi] = i + 1; - whichi += (p_x[i] == TRUE); - ++i; + whichi += (p_x[i++] == TRUE); } Rf_unprotect(1); return out; @@ -85,11 +82,53 @@ SEXP cpp_which_(SEXP x, bool invert){ int i = 0; while (whichi < size){ p_out[whichi] = i + 1; - whichi += (p_x[i] == TRUE); - ++i; + whichi += (p_x[i++] == TRUE); } Rf_unprotect(1); return out; } } } + +// 2 more which() alternatives +// list cpp_which2(SEXP x){ +// int n = Rf_xlength(x); +// int *p_x = LOGICAL(x); +// // std::vector out; +// // out.reserve(n); +// // for (int i = 0; i < n; ++i){ +// // if (p_x[i] == TRUE){ +// // out.push_back(i + 1); +// // } +// // } +// int k = 0; +// std::vector out(n); +// for (int i = 0; i < n; ++i){ +// if (p_x[i] == TRUE){ +// out[k++] = i + 1; +// } else { +// out.pop_back(); +// } +// } +// return writable::list({ +// "out"_nm = out +// }); +// } +// +// SEXP cpp_which3(SEXP x){ +// int n = Rf_xlength(x); +// 
int *p_x = LOGICAL(x); +// int size = 0; +// int j; +// for (j = 0; j < n; ++j) size += (p_x[j] == TRUE); +// SEXP out = Rf_protect(Rf_allocVector(INTSXP, size)); +// int *p_out = INTEGER(out); +// int k = 0; +// for (int i = 0; i < j; ++i){ +// if (p_x[i] == TRUE){ +// p_out[k++] = i + 1; +// } +// } +// Rf_unprotect(1); +// return out; +// }