From 1626ec4e3dd5bf61785801a7501b0bcda6fd955f Mon Sep 17 00:00:00 2001 From: Nick Christofides <118103879+NicChr@users.noreply.github.com> Date: Thu, 31 Oct 2024 10:10:32 +0000 Subject: [PATCH] Updates. --- NAMESPACE | 3 +- R/cpp11.R | 12 ++- R/extras.R | 202 ++++++++++++++++++------------------------ R/if_else.R | 122 +++++++++++++++++++++++++ man/cheapr_if_else.Rd | 26 ++++++ man/extras.Rd | 70 ++++++++++++++- src/cpp11.cpp | 24 ++++- src/utils.cpp | 123 ++++++++++++++++--------- src/which.cpp | 82 +++++++++++++++++ 9 files changed, 496 insertions(+), 168 deletions(-) create mode 100644 R/if_else.R create mode 100644 man/cheapr_if_else.Rd diff --git a/NAMESPACE b/NAMESPACE index 3d307f5..4fb431b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -19,7 +19,6 @@ S3method(collapse::fsum,integer64) S3method(collapse::funique,POSIXlt) S3method(collapse::funique,vctrs_rcrd) S3method(collapse::fvar,integer64) -S3method(cut,integer64) S3method(get_breaks,integer) S3method(get_breaks,integer64) S3method(get_breaks,numeric) @@ -57,6 +56,7 @@ export(any_na) export(as_discrete) export(as_factor) export(bin) +export(cheapr_if_else) export(cheapr_rev) export(cheapr_var) export(col_all_na) @@ -140,4 +140,5 @@ export(which_na) export(which_not_na) export(which_val) export(window_sequence) +export(with_local_seed) useDynLib(cheapr, .registration = TRUE) diff --git a/R/cpp11.R b/R/cpp11.R index c5a966e..ed0ddc2 100644 --- a/R/cpp11.R +++ b/R/cpp11.R @@ -264,8 +264,12 @@ cpp_rev <- function(x, set) { .Call(`_cheapr_cpp_rev`, x, set) } -cpp_if_else <- function(condition, yes, no) { - .Call(`_cheapr_cpp_if_else`, condition, yes, no) +cpp_if_else <- function(condition, yes, no, na) { + .Call(`_cheapr_cpp_if_else`, condition, yes, no, na) +} + +cpp_lgl_count <- function(x) { + .Call(`_cheapr_cpp_lgl_count`, x) } cpp_which_ <- function(x, invert) { @@ -283,3 +287,7 @@ cpp_which_na <- function(x) { cpp_which_not_na <- function(x) { .Call(`_cheapr_cpp_which_not_na`, x) } + +cpp_lgl_locs <- function(x, n_true, n_false, include_true, include_false, include_na) { + .Call(`_cheapr_cpp_lgl_locs`, x, n_true, n_false, include_true, include_false, include_na) +} diff --git a/R/extras.R b/R/extras.R index b347e54..bc58230 100644 --- a/R/extras.R +++ b/R/extras.R @@ -22,6 +22,11 @@ #' randomly into your vector. #' @param na.rm Should `NA` values be ignored in `cheapr_var()` Default is #' `TRUE`. +#' @param expr Expression that will be evaluated with a local seed that +#' is independent and has absolutely no effect on the global RNG state. +#' @param .seed A local seed to set which is only used inside +#' `with_local_seed()`. After the execution of the expression the original +#' seed is reset. #' #' @returns #' `enframe()_` converts a vector to a data frame. \cr @@ -44,7 +49,61 @@ #' for matrices in which it matches `length()`. #' `cheapr_var` returns the variance of a numeric vector. #' No coercion happens for integer vectors and so is very cheap. \cr -#' `cheapr_rev` is a much cheaper version of `rev()`. +#' `cheapr_rev` is a much cheaper version of `rev()`. \cr +#' `with_local_seed` offers no speed improvements but is extremely handy +#' in executing random number based expressions like `rnorm()` without +#' affecting the global RNG state. It allows you to run these expressions in a +#' sort of independent 'container' and with an optional seed for that +#' 'container' for reproducibility. +#' The rationale for including this in 'cheapr' is that it can reduce the need +#' to set many seed values, +#' especially for multiple output comparisons of RNG expressions. +#' Another way of thinking about it is that `with_local_seed()` is a helper +#' that allows you to write reproducible code without side-effects, which +#' traditionally cannot be avoided when calling `set.seed()` directly. +#' +#' @examples +#' library(cheapr) +#' +#' # Using `with_local_seed()` +#' +#' # The below 2 statements are equivalent +#' +#' # Statement 1 +#' set.seed(123456789) +#' res <- rnorm(10) +#' +#' # Statement 2 +#' res2 <- with_local_seed(rnorm(10), .seed = 123456789) +#' +#' # They are the same +#' identical(res, res2) +#' +#' # As an example we can see that the RNG is unaffected by generating +#' # random uniform deviates in batches between calls to `with_local_seed()` +#' # and comparing to the first result +#' +#' set.seed(123456789) +#' batch1 <- rnorm(2) +#' +#' with_local_seed(runif(10)) +#' batch2 <- rnorm(2) +#' with_local_seed(runif(10)) +#' batch3 <- rnorm(1) +#' with_local_seed(runif(10)) +#' batch4 <- rnorm(5) +#' +#' # Combining the batches produces the same result +#' # therefore `with_local_seed` did not interrupt the rng sequence +#' identical(c(batch1, batch2, batch3, batch4), res) +#' +#' # It can be useful in multiple comparisons +#' out1 <- with_local_seed(rnorm(5)) +#' out2 <- with_local_seed(rnorm(5)) +#' out3 <- with_local_seed(rnorm(5)) +#' +#' identical(out1, out2) +#' identical(out1, out3) #' #' @rdname extras #' @export @@ -66,6 +125,7 @@ intersect_ <- function(x, y, dups = TRUE){ #' @export cut_numeric <- function(x, breaks, labels = NULL, include.lowest = FALSE, right = TRUE, dig.lab = 3L, ordered_result = FALSE, ...){ + .Deprecated(old = "cut_numeric", new = "as_discrete") if (!is.numeric(x)) stop("'x' must be numeric") if (length(breaks) == 1L) { @@ -125,14 +185,6 @@ cut_numeric <- function(x, breaks, labels = NULL, include.lowest = FALSE, } #' @rdname extras #' @export -cut.integer64 <- function(x, ...){ - - ## Would be nice if cut() accepted a formatting function - ## As large int64 are printed with sci notation - cut_numeric(cpp_int64_to_numeric(x), ...) -} -#' @rdname extras -#' @export `%in_%` <- function(x, table){ collapse::fmatch(x, table, overid = 2L, nomatch = 0L) > 0L } @@ -231,8 +283,7 @@ cheapr_var <- function(x, na.rm = TRUE){ #' @rdname extras #' @export cheapr_rev <- function(x){ - # If x is a simple vector, use cpp_rev - if (!is.object(x) && is.atomic(x)){ + if (is_base_atomic(x)){ .Call(`_cheapr_cpp_rev`, x, FALSE) } else { n <- vector_length(x) @@ -242,49 +293,22 @@ cheapr_rev <- function(x){ cheapr_sd <- function(x, na.rm = TRUE){ sqrt(cheapr_var(x, na.rm = na.rm)) } - -# head_ <- function(x, n = 1L){ -# check_length(n, 1L) -# N <- cpp_vec_length(x) -# if (n >= 0) { -# size <- min(n, N) -# } -# else { -# size <- max(0L, N + n) -# } -# sset(x, seq_len(size)) -# } -# tail_ <- function (x, n = 1L){ -# check_length(n, 1L) -# N <- cpp_vec_length(x) -# if (n >= 0) { -# size <- min(n, N) -# } -# else { -# size <- max(0L, N + n) -# } -# sset(x, seq.int(from = N - size + 1L, by = 1L, length.out = size)) -# } -# with_seed <- function (expr, .seed = NULL, ...){ -# old <- globalenv()[[".Random.seed"]] -# if (is.null(old)) { -# set.seed(NULL) -# old <- globalenv()[[".Random.seed"]] -# } -# if (!is.null(.seed)) { -# set.seed(.seed, ...) -# } -# on.exit({ -# assign(".Random.seed", old, envir = globalenv()) -# }) -# eval(expr, envir = parent.frame()) -# } -duplicated_ <- function(x, .all = FALSE){ - groups <- collapse::group(x, starts = !.all, group.sizes = TRUE) - sizes <- attr(groups, "group.sizes") - out <- (sizes > 1L)[groups] - out[attr(groups, "starts")] <- FALSE - out +#' @rdname extras +#' @export +with_local_seed <- function (expr, .seed = NULL, ...){ + global_env <- base::globalenv + old <- global_env()[[".Random.seed"]] + if (is.null(old)) { + set.seed(NULL) + old <- global_env()[[".Random.seed"]] + } + if (!is.null(.seed)) { + set.seed(.seed, ...) + } + on.exit({ + assign(".Random.seed", old, envir = global_env()) + }, add = TRUE) + eval(expr, envir = parent.frame()) } cast <- function(x, template){ @@ -297,69 +321,13 @@ cast <- function(x, template){ } } -cheapr_if_else <- function(condition, true, false){ - if (!is.logical(condition)){ - stop("condition must be a logical vector") - } - if (length(true) != 1 && length(true) != length(condition)){ - stop("`length(true)` must be 1 or `length(condition)`") - } - if (length(false) != 1 && length(false) != length(condition)){ - stop("`length(false)` must be 1 or `length(condition)`") - } - - if (is.factor(true) || is.factor(false)){ - template <- combine_factors(true[1L], false[1L])[0L] - } else { - template <- c(true[1L], false[1L])[0L] - } - - true <- cast(true, template) - false <- cast(false, template) - - if (is_base_atomic(true) && is_base_atomic(true)){ - return(`mostattributes<-`( - cpp_if_else(condition, true, false), - attributes(template) - )) - } - - # Catch-all method - - if (val_count(condition, TRUE) == length(condition)){ - if (length(true) == 1){ - return(rep(true, length(condition))) - } else { - return(true) - } - } - - if (val_count(condition, FALSE) == length(condition)){ - if (length(false) == 1){ - return(rep(false, length(condition))) - } else { - return(false) - } - } - - out <- rep(template, length.out = length(condition)) - - true_locs <- which_val(condition, TRUE) - false_locs <- which_val(condition, FALSE) - - if (length(true) == 1){ - out[true_locs] <- true - } else { - out[true_locs] <- true[true_locs] - } - if (length(false) == 1){ - out[false_locs] <- false - } else { - out[false_locs] <- false[false_locs] - } - out -} - +# duplicated_ <- function(x, .all = FALSE){ +# groups <- collapse::group(x, starts = !.all, group.sizes = TRUE) +# sizes <- attr(groups, "group.sizes") +# out <- (sizes > 1L)[groups] +# out[attr(groups, "starts")] <- FALSE +# out +# } # duplicates <- function(x, .all = FALSE, .count = FALSE){ # groups <- collapse::group(x, starts = !.all, group.sizes = TRUE) # sizes <- attr(groups, "group.sizes") diff --git a/R/if_else.R b/R/if_else.R new file mode 100644 index 0000000..fce550a --- /dev/null +++ b/R/if_else.R @@ -0,0 +1,122 @@ +#' Cheaper version of `ifelse()` +#' +#' @param condition [logical] A condition which will be used to +#' evaluate the if else operation. +#' @param true Value(s) to replace `TRUE` instances. +#' @param false Value(s) to replace `FALSE` instances. +#' @param default Catch-all value(s) to replace all other instances, +#' where `is.na(condition)`. +#' +#' @returns +#' A vector the same length as condition, +#' using a common type between `true`, `false` and `default`. +#' +#' @export +cheapr_if_else <- function(condition, true, false, default = false[NA_integer_]){ + + if (!is.logical(condition)){ + stop("condition must be a logical vector") + } + if (length(true) != 1 && length(true) != length(condition)){ + stop("`length(true)` must be 1 or `length(condition)`") + } + if (length(false) != 1 && length(false) != length(condition)){ + stop("`length(false)` must be 1 or `length(condition)`") + } + if (length(default) != 1 && length(default) != length(condition)){ + stop("`length(default)` must be 1 or `length(condition)`") + } + + if (is.factor(true) || is.factor(false) || is.factor(default)){ + template <- combine_factors(true[1L], false[1L], default[1L])[0L] + } else { + template <- c(true[1L], false[1L], default[1L])[0L] + } + + true <- cast(true, template) + false <- cast(false, template) + default <- cast(default, template) + + if (is_base_atomic(true) && is_base_atomic(false) && is_base_atomic(default)){ + return(`mostattributes<-`( + cpp_if_else(condition, true, false, default), + attributes(template) + )) + } + + # Catch-all method + + lgl_val_counts <- cpp_lgl_count(condition) + n_true <- lgl_val_counts["true"] + n_false <- lgl_val_counts["false"] + n_default <- lgl_val_counts["na"] + + if (n_true == length(condition)){ + if (length(true) == 1){ + return(rep(true, length(condition))) + } else { + return(true) + } + } + + if (n_false == length(condition)){ + if (length(false) == 1){ + return(rep(false, length(condition))) + } else { + return(false) + } + } + + if (n_default == length(condition)){ + if (length(default) == 1){ + return(rep(default, length(condition))) + } else { + return(default) + } + } + + # if (length(default) == 1 && is.na(default)){ + # out <- rep(template, length.out = length(condition)) + # } else if (length(default) == length(condition)){ + # out <- default + # } else { + # out <- rep(default, length.out = length(condition)) + # } + + # if (length(default) == length(condition)){ + # out <- default + # } else { + # out <- rep(default, length.out = length(condition)) + # } + + # The else part is most likely to be most prominent + if (length(false) == length(condition)){ + out <- false + } else { + out <- rep(false, length.out = length(condition)) + } + + lgl_locs <- cpp_lgl_locs(condition, n_true = n_true, n_false = n_false, + include_true = TRUE, include_false = FALSE, + include_na = TRUE) + true_locs <- lgl_locs[["true"]] + # false_locs <- lgl_locs[["false"]] + default_locs <- lgl_locs[["na"]] + + if (length(true) == 1){ + out[true_locs] <- true + } else { + out[true_locs] <- true[true_locs] + } + # if (length(false) == 1){ + # out[false_locs] <- false + # } else { + # out[false_locs] <- false[false_locs] + # } + if (length(default) == 1){ + out[default_locs] <- default + } else { + out[default_locs] <- default[default_locs] + } + out +} diff --git a/man/cheapr_if_else.Rd b/man/cheapr_if_else.Rd new file mode 100644 index 0000000..8adf812 --- /dev/null +++ b/man/cheapr_if_else.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/if_else.R +\name{cheapr_if_else} +\alias{cheapr_if_else} +\title{Cheaper version of \code{ifelse()}} +\usage{ +cheapr_if_else(condition, true, false, default = false[NA_integer_]) +} +\arguments{ +\item{condition}{\link{logical} A condition which will be used to +evaluate the if else operation.} + +\item{true}{Value(s) to replace \code{TRUE} instances.} + +\item{false}{Value(s) to replace \code{FALSE} instances.} + +\item{default}{Catch-all value(s) to replace all other instances, +where \code{is.na(condition)}.} +} +\value{ +A vector the same length as condition, +using a common type between \code{true}, \code{false} and \code{default}. +} +\description{ +Cheaper version of \code{ifelse()} +} diff --git a/man/extras.Rd b/man/extras.Rd index eb332cc..9eeaaff 100644 --- a/man/extras.Rd +++ b/man/extras.Rd @@ -4,7 +4,6 @@ \alias{setdiff_} \alias{intersect_} \alias{cut_numeric} -\alias{cut.integer64} \alias{\%in_\%} \alias{\%!in_\%} \alias{enframe_} @@ -15,6 +14,7 @@ \alias{vector_length} \alias{cheapr_var} \alias{cheapr_rev} +\alias{with_local_seed} \title{Extra utilities} \usage{ setdiff_(x, y, dups = TRUE) @@ -32,8 +32,6 @@ cut_numeric( ... ) -\method{cut}{integer64}(x, ...) - x \%in_\% table x \%!in_\% table @@ -53,6 +51,8 @@ vector_length(x) cheapr_var(x, na.rm = TRUE) cheapr_rev(x) + +with_local_seed(expr, .seed = NULL, ...) } \arguments{ \item{x}{A vector or data frame.} @@ -95,6 +95,13 @@ randomly into your vector.} \item{na.rm}{Should \code{NA} values be ignored in \code{cheapr_var()} Default is \code{TRUE}.} + +\item{expr}{Expression that will be evaluated with a local seed that +is independent and has absolutely no effect on the global RNG state.} + +\item{.seed}{A local seed to set which is only used inside +\code{with_local_seed()}. After the execution of the expression the original +seed is reset.} } \value{ \verb{enframe()_} converts a vector to a data frame. \cr @@ -117,8 +124,63 @@ Useful for generating missing data. \cr for matrices in which it matches \code{length()}. \code{cheapr_var} returns the variance of a numeric vector. No coercion happens for integer vectors and so is very cheap. \cr -\code{cheapr_rev} is a much cheaper version of \code{rev()}. +\code{cheapr_rev} is a much cheaper version of \code{rev()}. \cr +\code{with_local_seed} offers no speed improvements but is extremely handy +in executing random number based expressions like \code{rnorm()} without +affecting the global RNG state. It allows you to run these expressions in a +sort of independent 'container' and with an optional seed for that +'container' for reproducibility. +The rationale for including this in 'cheapr' is that it can reduce the need +to set many seed values, +especially for multiple output comparisons of RNG expressions. +Another way of thinking about it is that \code{with_local_seed()} is a helper +that allows you to write reproducible code without side-effects, which +traditionally cannot be avoided when calling \code{set.seed()} directly. } \description{ Extra utilities } +\examples{ +library(cheapr) + +# Using `with_local_seed()` + +# The below 2 statements are equivalent + +# Statement 1 +set.seed(123456789) +res <- rnorm(10) + +# Statement 2 +res2 <- with_local_seed(rnorm(10), .seed = 123456789) + +# They are the same +identical(res, res2) + +# As an example we can see that the RNG is unaffected by generating +# random uniform deviates in batches between calls to `with_local_seed()` +# and comparing to the first result + +set.seed(123456789) +batch1 <- rnorm(2) + +with_local_seed(runif(10)) +batch2 <- rnorm(2) +with_local_seed(runif(10)) +batch3 <- rnorm(1) +with_local_seed(runif(10)) +batch4 <- rnorm(5) + +# Combining the batches produces the same result +# therefore `with_local_seed` did not interrupt the rng sequence +identical(c(batch1, batch2, batch3, batch4), res) + +# It can be useful in multiple comparisons +out1 <- with_local_seed(rnorm(5)) +out2 <- with_local_seed(rnorm(5)) +out3 <- with_local_seed(rnorm(5)) + +identical(out1, out2) +identical(out1, out3) + +} diff --git a/src/cpp11.cpp b/src/cpp11.cpp index 51765fc..2120bc3 100644 --- a/src/cpp11.cpp +++ b/src/cpp11.cpp @@ -468,10 +468,17 @@ extern "C" SEXP _cheapr_cpp_rev(SEXP x, SEXP set) { END_CPP11 } // utils.cpp -SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no); -extern "C" SEXP _cheapr_cpp_if_else(SEXP condition, SEXP yes, SEXP no) { +SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no, SEXP na); +extern "C" SEXP _cheapr_cpp_if_else(SEXP condition, SEXP yes, SEXP no, SEXP na) { BEGIN_CPP11 - return cpp11::as_sexp(cpp_if_else(cpp11::as_cpp>(condition), cpp11::as_cpp>(yes), cpp11::as_cpp>(no))); + return cpp11::as_sexp(cpp_if_else(cpp11::as_cpp>(condition), cpp11::as_cpp>(yes), cpp11::as_cpp>(no), cpp11::as_cpp>(na))); + END_CPP11 +} +// utils.cpp +SEXP cpp_lgl_count(SEXP x); +extern "C" SEXP _cheapr_cpp_lgl_count(SEXP x) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_lgl_count(cpp11::as_cpp>(x))); END_CPP11 } // which.cpp @@ -502,6 +509,13 @@ extern "C" SEXP _cheapr_cpp_which_not_na(SEXP x) { return cpp11::as_sexp(cpp_which_not_na(cpp11::as_cpp>(x))); END_CPP11 } +// which.cpp +SEXP cpp_lgl_locs(SEXP x, R_xlen_t n_true, R_xlen_t n_false, bool include_true, bool include_false, bool include_na); +extern "C" SEXP _cheapr_cpp_lgl_locs(SEXP x, SEXP n_true, SEXP n_false, SEXP include_true, SEXP include_false, SEXP include_na) { + BEGIN_CPP11 + return cpp11::as_sexp(cpp_lgl_locs(cpp11::as_cpp>(x), cpp11::as_cpp>(n_true), cpp11::as_cpp>(n_false), cpp11::as_cpp>(include_true), cpp11::as_cpp>(include_false), cpp11::as_cpp>(include_na))); + END_CPP11 +} extern "C" { static const R_CallMethodDef CallEntries[] = { @@ -521,7 +535,7 @@ static const R_CallMethodDef CallEntries[] = { {"_cheapr_cpp_gcd", (DL_FUNC) &_cheapr_cpp_gcd, 5}, {"_cheapr_cpp_gcd2", (DL_FUNC) &_cheapr_cpp_gcd2, 4}, {"_cheapr_cpp_gcd2_vectorised", (DL_FUNC) &_cheapr_cpp_gcd2_vectorised, 4}, - {"_cheapr_cpp_if_else", (DL_FUNC) &_cheapr_cpp_if_else, 3}, + {"_cheapr_cpp_if_else", (DL_FUNC) &_cheapr_cpp_if_else, 4}, {"_cheapr_cpp_int64_to_double", (DL_FUNC) &_cheapr_cpp_int64_to_double, 1}, {"_cheapr_cpp_int64_to_int", (DL_FUNC) &_cheapr_cpp_int64_to_int, 1}, {"_cheapr_cpp_int64_to_numeric", (DL_FUNC) &_cheapr_cpp_int64_to_numeric, 1}, @@ -535,6 +549,8 @@ static const R_CallMethodDef CallEntries[] = { {"_cheapr_cpp_lcm2_vectorised", (DL_FUNC) &_cheapr_cpp_lcm2_vectorised, 4}, {"_cheapr_cpp_lead_sequence", (DL_FUNC) &_cheapr_cpp_lead_sequence, 3}, {"_cheapr_cpp_lengths", (DL_FUNC) &_cheapr_cpp_lengths, 2}, + {"_cheapr_cpp_lgl_count", (DL_FUNC) &_cheapr_cpp_lgl_count, 1}, + {"_cheapr_cpp_lgl_locs", (DL_FUNC) &_cheapr_cpp_lgl_locs, 6}, {"_cheapr_cpp_list_as_df", (DL_FUNC) &_cheapr_cpp_list_as_df, 1}, {"_cheapr_cpp_matrix_col_na_counts", (DL_FUNC) &_cheapr_cpp_matrix_col_na_counts, 1}, {"_cheapr_cpp_matrix_row_na_counts", (DL_FUNC) &_cheapr_cpp_matrix_row_na_counts, 1}, diff --git a/src/utils.cpp b/src/utils.cpp index 959222b..13136c5 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -381,7 +381,7 @@ SEXP cpp_rev(SEXP x, bool set){ } [[cpp11::register]] -SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ +SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no, SEXP na){ int NP = 0; // count num protections if (TYPEOF(condition) != LGLSXP){ Rf_error("condition must be a logical vector"); @@ -389,9 +389,13 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ if (TYPEOF(yes) != TYPEOF(no)){ Rf_error("`typeof(yes)` must match `typeof(no)`"); } + if (TYPEOF(yes) != TYPEOF(na)){ + Rf_error("`typeof(yes)` must match `typeof(na)`"); + } R_xlen_t n = Rf_xlength(condition); R_xlen_t yes_size = Rf_xlength(yes); R_xlen_t no_size = Rf_xlength(no); + R_xlen_t na_size = Rf_xlength(na); if (yes_size != 1 && yes_size != n){ Rf_error("`length(yes)` must be 1 or `length(condition)`"); @@ -399,9 +403,13 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ if (no_size != 1 && no_size != n){ Rf_error("`length(no)` must be 1 or `length(condition)`"); } + if (na_size != 1 && na_size != n){ + Rf_error("`length(na)` must be 1 or `length(condition)`"); + } bool yes_scalar = yes_size == 1; bool no_scalar = no_size == 1; + bool na_scalar = na_size == 1; int *p_x = LOGICAL(condition); SEXP out = Rf_protect(Rf_allocVector(TYPEOF(yes), n)); ++NP; @@ -415,8 +423,8 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ int *p_out = INTEGER(out); int *p_yes = INTEGER(yes); int *p_no = INTEGER(no); + int *p_na = INTEGER(na); - int na_val = NA_INTEGER; for (R_xlen_t i = 0; i < n; ++i){ switch(p_x[i]){ case true: { @@ -428,7 +436,7 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ break; } default: { - p_out[i] = na_val; + p_out[i] = p_na[na_scalar ? 0 : i]; break; } } @@ -441,8 +449,8 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ double *p_out = REAL(out); double *p_yes = REAL(yes); double *p_no = REAL(no); + double *p_na = REAL(na); - double na_val = NA_REAL; for (R_xlen_t i = 0; i < n; ++i){ switch(p_x[i]){ case true: { @@ -454,20 +462,18 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ break; } default: { - p_out[i] = na_val; + p_out[i] = p_na[na_scalar ? 0 : i]; break; } } - // p_out[i] = p_x[i] == NA_LOGICAL ? na_val : p_x[i] ? - // p_yes[yes_scalar ? 0 : i] : p_no[no_scalar ? 0 : i]; } break; } case STRSXP: { const SEXP *p_yes = STRING_PTR_RO(yes); const SEXP *p_no = STRING_PTR_RO(no); + const SEXP *p_na = STRING_PTR_RO(na); - SEXP na_val = NA_STRING; for (R_xlen_t i = 0; i < n; ++i){ switch(p_x[i]){ case true: { @@ -479,65 +485,53 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ break; } default: { - SET_STRING_ELT(out, i, na_val); + SET_STRING_ELT(out, i, p_na[na_scalar ? 0 : i]); break; } } - // SET_STRING_ELT(out, i, ( - // p_x[i] == NA_LOGICAL ? na_val : p_x[i] ? - // p_yes[yes_scalar ? 0 : i] : p_no[no_scalar ? 0 : i] - // )); } break; } case CPLXSXP: { Rcomplex *p_yes = COMPLEX(yes); Rcomplex *p_no = COMPLEX(no); - - SEXP na_cplx = Rf_protect(Rf_allocVector(CPLXSXP, 1)); ++NP; - Rcomplex *p_na_cplx = COMPLEX(na_cplx); - p_na_cplx[0].r = NA_REAL; - p_na_cplx[0].i = NA_REAL; - Rcomplex na_val = Rf_asComplex(na_cplx); + Rcomplex *p_na = COMPLEX(na); for (R_xlen_t i = 0; i < n; ++i){ - SET_COMPLEX_ELT(out, i, ( - p_x[i] == NA_LOGICAL ? na_val : p_x[i] ? - p_yes[yes_scalar ? 0 : i] : p_no[no_scalar ? 0 : i] - )); + switch(p_x[i]){ + case true: { + SET_COMPLEX_ELT(out, i, p_yes[yes_scalar ? 0 : i]); + break; + } + case false: { + SET_COMPLEX_ELT(out, i, p_no[no_scalar ? 0 : i]); + break; + } + default: { + SET_COMPLEX_ELT(out, i, p_na[na_scalar ? 0 : i]); + break; + } + } } break; } case RAWSXP: { Rbyte *p_yes = RAW(yes); Rbyte *p_no = RAW(no); - - SEXP na_raw = Rf_protect(Rf_coerceVector(Rf_ScalarReal(0), RAWSXP)); ++NP; - Rbyte na_val = RAW(na_raw)[0]; - - for (R_xlen_t i = 0; i < n; ++i){ - SET_RAW_ELT(out, i, ( - p_x[i] == NA_LOGICAL ? na_val : p_x[i] ? - p_yes[yes_scalar ? 0 : i] : p_no[no_scalar ? 0 : i] - )); - } - break; - } - case VECSXP: { - const SEXP *p_yes = VECTOR_PTR_RO(yes); - const SEXP *p_no = VECTOR_PTR_RO(no); + Rbyte *p_na = RAW(na); for (R_xlen_t i = 0; i < n; ++i){ switch(p_x[i]){ case true: { - SET_VECTOR_ELT(out, i, p_yes[yes_scalar ? 0 : i]); + SET_RAW_ELT(out, i, p_yes[yes_scalar ? 0 : i]); break; } case false: { - SET_VECTOR_ELT(out, i, p_no[no_scalar ? 0 : i]); + SET_RAW_ELT(out, i, p_no[no_scalar ? 0 : i]); break; } default: { + SET_RAW_ELT(out, i, p_na[na_scalar ? 0 : i]); break; } } @@ -553,6 +547,55 @@ SEXP cpp_if_else(SEXP condition, SEXP yes, SEXP no){ return out; } +// Counts number of true, false and NAs in a logical vector in one pass + +[[cpp11::register]] +SEXP cpp_lgl_count(SEXP x){ + R_xlen_t n = Rf_xlength(x); + int n_cores = n >= CHEAPR_OMP_THRESHOLD ? num_cores() : 1; + + int *p_x = LOGICAL(x); + + R_xlen_t i; + R_xlen_t ntrue = 0, nfalse = 0; + + if (n_cores > 1){ +#pragma omp parallel for simd num_threads(n_cores) reduction(+:ntrue, nfalse) + for (i = 0; i < n; ++i){ + ntrue += p_x[i] == TRUE; + nfalse += p_x[i] == FALSE; + } + } else { + OMP_FOR_SIMD + for (i = 0; i < n; ++i){ + ntrue += p_x[i] == TRUE; + nfalse += p_x[i] == FALSE; + } + } + R_xlen_t nna = n - ntrue - nfalse; + + SEXP out = Rf_protect(Rf_allocVector(n > integer_max_ ? REALSXP : INTSXP, 3)); + SEXP names = Rf_protect(Rf_allocVector(STRSXP, 3)); + SET_STRING_ELT(names, 0, Rf_mkChar("true")); + SET_STRING_ELT(names, 1, Rf_mkChar("false")); + SET_STRING_ELT(names, 2, Rf_mkChar("na")); + + if (n > integer_max_){ + SET_REAL_ELT(out, 0, (double) ntrue); + SET_REAL_ELT(out, 1, (double) nfalse); + SET_REAL_ELT(out, 2, (double) nna); + } else { + SET_INTEGER_ELT(out, 0, (int) ntrue); + SET_INTEGER_ELT(out, 1, (int) nfalse); + SET_INTEGER_ELT(out, 2, (int) nna); + } + + Rf_setAttrib(out, R_NamesSymbol, names); + + Rf_unprotect(2); + return out; +} + // SEXP cpp_c(SEXP x){ // if (!Rf_isVectorList(x)){ // Rf_error("x must be a list of vectors"); diff --git a/src/which.cpp b/src/which.cpp index 5745f86..04f01ef 100644 --- a/src/which.cpp +++ b/src/which.cpp @@ -530,6 +530,88 @@ SEXP cpp_which_not_na(SEXP x){ } } +// Return the locations of T, F, and NA in one pass +// Must provide the correct num of T and F as args + +[[cpp11::register]] +SEXP cpp_lgl_locs(SEXP x, R_xlen_t n_true, R_xlen_t n_false, + bool include_true, bool include_false, bool include_na){ + R_xlen_t n = Rf_xlength(x); + int *p_x = LOGICAL(x); + + if (n > integer_max_){ + SEXP true_locs = Rf_protect(Rf_allocVector(REALSXP, include_true ? n_true : 0)); + SEXP false_locs = Rf_protect(Rf_allocVector(REALSXP, include_false ? n_false : 0)); + SEXP na_locs = Rf_protect(Rf_allocVector(REALSXP, include_na ? (n - n_true - n_false) : 0)); + + double *p_true = REAL(true_locs); + double *p_false = REAL(false_locs); + double *p_na = REAL(na_locs); + + R_xlen_t k1 = 0; + R_xlen_t k2 = 0; + R_xlen_t k3 = 0; + + for (R_xlen_t i = 0; i < n; ++i){ + if (include_true && p_x[i] == TRUE){ + p_true[k1++] = i + 1; + } else if (include_false && p_x[i] == FALSE){ + p_false[k2++] = i + 1; + } else if (include_na && p_x[i] == NA_LOGICAL){ + p_na[k3++] = i + 1; + } + } + SEXP out = Rf_protect(Rf_allocVector(VECSXP, 3)); + SET_VECTOR_ELT(out, 0, true_locs); + SET_VECTOR_ELT(out, 1, false_locs); + SET_VECTOR_ELT(out, 2, na_locs); + + SEXP names = Rf_protect(Rf_allocVector(STRSXP, 3)); + SET_STRING_ELT(names, 0, Rf_mkChar("true")); + SET_STRING_ELT(names, 1, Rf_mkChar("false")); + SET_STRING_ELT(names, 2, Rf_mkChar("na")); + Rf_setAttrib(out, R_NamesSymbol, names); + + Rf_unprotect(5); + return out; + } else { + SEXP true_locs = Rf_protect(Rf_allocVector(INTSXP, include_true ? n_true : 0)); + SEXP false_locs = Rf_protect(Rf_allocVector(INTSXP, include_false ? n_false : 0)); + SEXP na_locs = Rf_protect(Rf_allocVector(INTSXP, include_na ? (n - n_true - n_false) : 0)); + + int *p_true = INTEGER(true_locs); + int *p_false = INTEGER(false_locs); + int *p_na = INTEGER(na_locs); + + int k1 = 0; + int k2 = 0; + int k3 = 0; + + for (int i = 0; i < n; ++i){ + if (include_true && p_x[i] == TRUE){ + p_true[k1++] = i + 1; + } else if (include_false && p_x[i] == FALSE){ + p_false[k2++] = i + 1; + } else if (include_na && p_x[i] == NA_LOGICAL){ + p_na[k3++] = i + 1; + } + } + SEXP out = Rf_protect(Rf_allocVector(VECSXP, 3)); + SET_VECTOR_ELT(out, 0, true_locs); + SET_VECTOR_ELT(out, 1, false_locs); + SET_VECTOR_ELT(out, 2, na_locs); + + SEXP names = Rf_protect(Rf_allocVector(STRSXP, 3)); + SET_STRING_ELT(names, 0, Rf_mkChar("true")); + SET_STRING_ELT(names, 1, Rf_mkChar("false")); + SET_STRING_ELT(names, 2, Rf_mkChar("na")); + Rf_setAttrib(out, R_NamesSymbol, names); + + Rf_unprotect(5); + return out; + } +} + // 2 more which() alternatives // list cpp_which2(SEXP x){