From 27a9b41df581f912922f3c1b8f29bb889c105da8 Mon Sep 17 00:00:00 2001 From: Nick Christofides <118103879+NicChr@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:52:56 +0100 Subject: [PATCH] Further updates. --- R/scalars.R | 3 + R/sset.R | 4 + README.Rmd | 22 ++++ README.md | 152 +++++++++++++----------- src/sset.cpp | 230 +++---------------------------------- src/which.cpp | 66 ++++++----- tests/testthat/test-sset.R | 39 +------ 7 files changed, 173 insertions(+), 343 deletions(-) diff --git a/R/scalars.R b/R/scalars.R index 3628493..20eac2d 100644 --- a/R/scalars.R +++ b/R/scalars.R @@ -29,3 +29,6 @@ val_rm <- function(x, value){ sset(x, cpp_which_val(x, value, invert = TRUE)) } } +which_val <- function(x, value, invert = FALSE){ + .Call(`_cheapr_cpp_which_val`, x, value, invert) +} diff --git a/R/sset.R b/R/sset.R index 9652ea4..45d4604 100644 --- a/R/sset.R +++ b/R/sset.R @@ -67,6 +67,7 @@ sset <- function(x, ...){ #' @export sset.default <- function(x, i, ...){ if (!missing(i) && is.logical(i)){ + check_length(i, length(x)) i <- which_(i) } # The below line will handle a special but common @@ -105,6 +106,7 @@ sset.Date <- function(x, i, ...){ # out <- sset.default(unclass(x), i, ...) # set_attr(out, "class", oldClass(x)) if (!missing(i) && is.logical(i)){ + check_length(i, length(x)) i <- which_(i) } if (!missing(i) && @@ -127,6 +129,7 @@ sset.Date <- function(x, i, ...){ #' @export sset.POSIXct <- function(x, i, ...){ if (!missing(i) && is.logical(i)){ + check_length(i, length(x)) i <- which_(i) } if (!missing(i) && @@ -149,6 +152,7 @@ sset.POSIXct <- function(x, i, ...){ #' @export sset.factor <- function(x, i, ...){ if (!missing(i) && is.logical(i)){ + check_length(i, length(x)) i <- which_(i) } if (!missing(i) && diff --git a/README.Rmd b/README.Rmd index a733e35..456b869 100644 --- a/README.Rmd +++ b/README.Rmd @@ -166,6 +166,28 @@ mark(sset(df, -10^4:0), check = FALSE) # The only difference is the row names ``` +The biggest difference between `sset` and `[` is the way logical vectors are handled. +The two main differences when `i` is a logical vector are: + +* `NA` values are ignored, only the locations of `TRUE` values are used. +* `i` must be the same length as `x` and is not recycled. + + +```{r,error=TRUE} +# Examples with NAs +x <- c(1, 5, NA, NA, -5) +x[x > 0] +sset(x, x > 0) + +# Example with length(i) < length(x) +sset(x, TRUE) + +# This is equivalent +x[TRUE] +# to.. +sset(x) +``` + ## Greatest common divisor and smallest common multiple diff --git a/README.md b/README.md index fdf927a..7b02a8e 100644 --- a/README.md +++ b/README.md @@ -52,14 +52,14 @@ mark(na_locf(x), vec_fill_missing(x, direction = "down")) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 "na_locf(x)" 841.5µs 851.15µs 1152. 0B 0 -#> 2 "vec_fill_missing(x, direction… 2.59ms 2.79ms 353. 11.4MB 117. +#> 1 "na_locf(x)" 841.1µs 854.1µs 1144. 0B 0 +#> 2 "vec_fill_missing(x, direction… 2.67ms 2.79ms 352. 11.4MB 117. mark(na_locf(x), vec_fill_missing(x, direction = "down")) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 "na_locf(x)" 841.7µs 851.5µs 1164. 0B 0 -#> 2 "vec_fill_missing(x, direction… 2.62ms 2.77ms 353. 11.4MB 174. +#> 1 "na_locf(x)" 841µs 855.9µs 1130. 0B 0 +#> 2 "vec_fill_missing(x, direction… 2.57ms 2.79ms 353. 11.4MB 203. ``` All the `NA` handling functions in cheapr can make use of multiple cores @@ -71,16 +71,16 @@ mark(num_na(x), sum(is.na(x))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 num_na(x) 839µs 844.6µs 1178. 0B 0 -#> 2 sum(is.na(x)) 975µs 1.08ms 909. 3.81MB 83.1 +#> 1 num_na(x) 838µs 848.7µs 1149. 0B 0 +#> 2 sum(is.na(x)) 974µs 1.07ms 917. 3.81MB 82.1 # 4 cores options(cheapr.cores = 4) mark(num_na(x), sum(is.na(x))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 num_na(x) 234µs 297.5µs 3099. 0B 0 -#> 2 sum(is.na(x)) 992µs 1.08ms 885. 3.81MB 80.2 +#> 1 num_na(x) 239µs 300.1µs 3054. 0B 0 +#> 2 sum(is.na(x)) 967µs 1.07ms 913. 3.81MB 83.5 ``` ## Efficient NA counts by row/col @@ -93,16 +93,16 @@ mark(row_na_counts(m), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 row_na_counts(m) 1.52ms 3.29ms 319. 12.9KB 0 -#> 2 rowSums(is.na(m)) 2.8ms 2.89ms 338. 3.82MB 34.6 +#> 1 row_na_counts(m) 1.3ms 3.14ms 331. 12.9KB 0 +#> 2 rowSums(is.na(m)) 2.79ms 2.89ms 344. 3.82MB 34.4 # Number of NA values by col mark(col_na_counts(m), colSums(is.na(m))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 col_na_counts(m) 684.1µs 799.8µs 1222. 12.9KB 0 -#> 2 colSums(is.na(m)) 1.99ms 2.06ms 479. 3.82MB 47.4 +#> 1 col_na_counts(m) 683.3µs 801.7µs 1253. 12.9KB 0 +#> 2 colSums(is.na(m)) 1.97ms 2.07ms 480. 3.82MB 45.6 ``` `is_na` is a multi-threaded alternative to `is.na` @@ -114,8 +114,8 @@ mark(is.na(x), is_na(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 is.na(x) 1.05ms 1.09ms 902. 3.81MB 83.6 -#> 2 is_na(x) 573.7µs 702.2µs 1323. 3.82MB 121. +#> 1 is.na(x) 1.04ms 1.09ms 870. 3.81MB 131. +#> 2 is_na(x) 527.8µs 619.6µs 1565. 3.82MB 227. ### posixlt method is much faster hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600), @@ -123,13 +123,11 @@ hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600), hours[sample.int(10^6, 10^5)] <- NA mark(is.na(hours), is_na(hours)) -#> Warning: Some expressions had a GC in every iteration; so filtering is -#> disabled. #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 is.na(hours) 1.17s 1.17s 0.854 61MB 0.854 -#> 2 is_na(hours) 5.24ms 5.7ms 163. 9.8MB 9.93 +#> 1 is.na(hours) 1.18s 1.18s 0.846 61MB 0 +#> 2 is_na(hours) 5.2ms 5.84ms 171. 9.8MB 8.76 ``` It differs in 2 regards: @@ -183,12 +181,12 @@ overview(df, hist = TRUE) #> cols: 3 #> #> ----- Numeric ----- -#> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 -#> 1 x integer 0 1 100 50.49 1 25 50 75 100 -#> 2 z numeric 0 1 10000000 0 -5.03 -0.67 0 0.67 5.35 -#> iqr sd hist -#> 1 50 28.87 ▇▇▇▇▇ -#> 2 1.35 1 ▁▂▇▁▁ +#> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 iqr +#> 1 x integer 0 1 100 50.5 1 25 50 75 100 50 +#> 2 z numeric 0 1 10000000 0 -5.39 -0.67 0 0.67 5.23 1.35 +#> sd hist +#> 1 28.86 ▇▇▇▇▇ +#> 2 1 ▁▂▇▂▁ #> #> ----- Categorical ----- #> col class n_missing p_complete n_unique n_levels min max @@ -199,7 +197,7 @@ mark(overview(df)) #> # A tibble: 1 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 overview(df) 922ms 922ms 1.08 76.3MB 1.08 +#> 1 overview(df) 903ms 903ms 1.11 76.3MB 1.11 ``` ## Cheaper and consistent subsetting with `sset` @@ -234,9 +232,9 @@ mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y]) #> # A tibble: 3 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 sset(x, x %in_% y) 90.6µs 114µs 8443. 83.1KB 2.06 -#> 2 sset(x, x %in% y) 151.2µs 220µs 4454. 285.3KB 6.74 -#> 3 x[x %in% y] 158.4µs 205µs 4826. 324.4KB 6.78 +#> 1 sset(x, x %in_% y) 95.2µs 115µs 8203. 88.3KB 2.06 +#> 2 sset(x, x %in% y) 155.5µs 229µs 4281. 285.4KB 6.66 +#> 3 x[x %in% y] 139.3µs 212µs 4428. 324.5KB 6.75 ``` `sset` uses an internal range-based subset when `i` is an ALTREP integer @@ -247,8 +245,8 @@ mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE]) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 sset(df, 0:10^5) 365.3µs 451.8µs 2148. 1.53MB 15.2 -#> 2 df[0:10^5, , drop = FALSE] 6.83ms 7.1ms 141. 4.82MB 4.33 +#> 1 sset(df, 0:10^5) 370.6µs 439.4µs 2190. 1.53MB 17.2 +#> 2 df[0:10^5, , drop = FALSE] 6.77ms 7.08ms 138. 4.83MB 2.06 ``` It also accepts negative indexes @@ -262,8 +260,34 @@ mark(sset(df, -10^4:0), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 sset(df, -10^4:0) 21.9ms 28.5ms 29.4 152MB 20.2 -#> 2 df[-10^4:0, , drop = FALSE] 653.8ms 653.8ms 1.53 776MB 7.65 +#> 1 sset(df, -10^4:0) 21.7ms 27.2ms 33.0 152MB 19.4 +#> 2 df[-10^4:0, , drop = FALSE] 562.7ms 562.7ms 1.78 776MB 5.33 +``` + +The biggest difference between `sset` and `[` is the way logical vectors +are handled. The two main differences when `i` is a logical vector are: + +- `NA` values are ignored, only the locations of `TRUE` values are used. +- `i` must be the same length as `x` and is not recycled. + +``` r +# Examples with NAs +x <- c(1, 5, NA, NA, -5) +x[x > 0] +#> [1] 1 5 NA NA +sset(x, x > 0) +#> [1] 1 5 + +# Example with length(i) < length(x) +sset(x, TRUE) +#> Error in check_length(i, length(x)): i must have length 5 + +# This is equivalent +x[TRUE] +#> [1] 1 5 NA NA -5 +# to.. +sset(x) +#> [1] 1 5 NA NA -5 ``` ## Greatest common divisor and smallest common multiple @@ -284,13 +308,13 @@ mark(gcd(x)) #> # A tibble: 1 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 gcd(x) 1.2µs 1.3µs 667249. 0B 0 +#> 1 gcd(x) 1.2µs 1.3µs 687153. 0B 0 x <- seq(0, 10^6, 0.5) mark(gcd(x)) #> # A tibble: 1 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 gcd(x) 46.1ms 46.2ms 21.6 0B 0 +#> 1 gcd(x) 52.2ms 52.3ms 19.1 0B 0 ``` ## Creating many sequences @@ -379,32 +403,32 @@ mark(cheapr_which = which_(x), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 2.5ms 2.72ms 353. 3.81MB 2.06 -#> 2 base_which 1.15ms 1.24ms 747. 7.63MB 11.9 +#> 1 cheapr_which 2.56ms 2.76ms 347. 3.81MB 4.16 +#> 2 base_which 1.15ms 1.23ms 770. 7.63MB 24.7 x <- rep(FALSE, 10^6) mark(cheapr_which = which_(x), base_which = which(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 214µs 263µs 3667. 0B 0 -#> 2 base_which 457µs 461µs 2141. 3.81MB 17.7 +#> 1 cheapr_which 208µs 271µs 3316. 0B 0 +#> 2 base_which 456µs 472µs 2092. 3.81MB 33.6 x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06)) mark(cheapr_which = which_(x), base_which = which(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 1.45ms 1.63ms 595. 1.91MB 4.19 -#> 2 base_which 1.02ms 1.08ms 911. 7.63MB 13.5 +#> 1 cheapr_which 1.5ms 1.69ms 565. 1.91MB 4.16 +#> 2 base_which 1.02ms 1.08ms 869. 7.63MB 28.6 x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06)) mark(cheapr_which = which_(x), base_which = which(x)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 3.39ms 3.53ms 273. 3.81MB 4.30 -#> 2 base_which 1.38ms 1.55ms 622. 9.54MB 11.5 +#> 1 cheapr_which 3.43ms 3.52ms 279. 3.81MB 4.16 +#> 2 base_which 1.37ms 1.48ms 616. 9.54MB 30.2 x <- sample(c(TRUE, FALSE), 10^6, TRUE) x[sample.int(10^6, 10^4)] <- NA mark(cheapr_which = which_(x), @@ -412,8 +436,8 @@ mark(cheapr_which = which_(x), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_which 2.09ms 2.21ms 438. 1.89MB 0 -#> 2 base_which 3.33ms 3.38ms 295. 5.7MB 4.24 +#> 1 cheapr_which 2.16ms 2.27ms 432. 1.89MB 2.04 +#> 2 base_which 3.33ms 3.38ms 293. 5.7MB 8.68 ``` ### factor @@ -427,29 +451,31 @@ mark(cheapr_factor = factor_(x), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_factor 9.88ms 10.4ms 95.5 4.59MB 0 -#> 2 base_factor 523.52ms 523.5ms 1.91 27.84MB 0 +#> 1 cheapr_factor 9.88ms 10.4ms 94.4 4.59MB 0 +#> 2 base_factor 507.47ms 507.5ms 1.97 27.84MB 0 mark(cheapr_factor = factor_(x, order = FALSE), base_factor = factor(x, levels = unique(x))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_factor 5.6ms 5.92ms 167. 1.53MB 0 -#> 2 base_factor 800.8ms 800.85ms 1.25 22.79MB 0 +#> 1 cheapr_factor 5.55ms 5.93ms 167. 1.53MB 0 +#> 2 base_factor 806.6ms 806.6ms 1.24 22.79MB 0 mark(cheapr_factor = factor_(y), base_factor = factor(y)) +#> Warning: Some expressions had a GC in every iteration; so filtering is +#> disabled. #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_factor 202.93ms 203.91ms 4.89 5.23MB 0 -#> 2 base_factor 2.84s 2.84s 0.352 54.35MB 0 +#> 1 cheapr_factor 244.52ms 255.22ms 3.92 5.23MB 0 +#> 2 base_factor 3.06s 3.06s 0.327 54.35MB 0.327 mark(cheapr_factor = factor_(y, order = FALSE), base_factor = factor(y, levels = unique(y))) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_factor 7.06ms 8.07ms 121. 3.49MB 0 -#> 2 base_factor 44.53ms 48.78ms 20.5 39.89MB 2.28 +#> 1 cheapr_factor 7.77ms 8.17ms 123. 3.49MB 0 +#> 2 base_factor 48.35ms 49.82ms 19.2 39.89MB 2.13 ``` ### intersect & setdiff @@ -463,15 +489,15 @@ mark(cheapr_intersect = intersect_(x, y, dups = FALSE), #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_intersect 3.16ms 3.43ms 290. 1.18MB 0 -#> 2 base_intersect 4.42ms 4.61ms 212. 5.16MB 2.21 +#> 1 cheapr_intersect 3.11ms 3.42ms 287. 1.18MB 0 +#> 2 base_intersect 4.39ms 4.64ms 212. 5.16MB 2.18 mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE), base_setdiff = setdiff(x, y)) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_setdiff 3.4ms 3.73ms 266. 1.76MB 0 -#> 2 base_setdiff 4.66ms 5ms 195. 5.71MB 2.22 +#> 1 cheapr_setdiff 3.49ms 3.65ms 271. 1.76MB 0 +#> 2 base_setdiff 4.78ms 4.97ms 197. 5.71MB 2.22 ``` ### `%in_%` and `%!in_%` @@ -482,15 +508,15 @@ mark(cheapr = x %in_% y, #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr 1.96ms 2.05ms 481. 781.34KB 0 -#> 2 base 2.69ms 2.82ms 345. 2.53MB 2.25 +#> 1 cheapr 1.99ms 2.06ms 477. 781.34KB 2.11 +#> 2 base 2.68ms 2.81ms 352. 2.53MB 0 mark(cheapr = x %!in_% y, base = !x %in% y) #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr 1.95ms 2.09ms 476. 787.85KB 0 -#> 2 base 2.86ms 2.98ms 324. 2.91MB 2.23 +#> 1 cheapr 1.92ms 2.05ms 484. 787.85KB 2.19 +#> 2 base 2.87ms 2.97ms 331. 2.91MB 0 ``` ### cut.default @@ -501,11 +527,9 @@ x <- rnorm(10^7) b <- seq(0, max(x), 0.2) mark(cheapr_cut = cut_numeric(x, b), base_cut = cut(x, b)) -#> Warning: Some expressions had a GC in every iteration; so filtering is -#> disabled. #> # A tibble: 2 × 6 #> expression min median `itr/sec` mem_alloc `gc/sec` #> -#> 1 cheapr_cut 132ms 133ms 7.50 38.1MB 0 -#> 2 base_cut 438ms 486ms 2.06 267.1MB 2.06 +#> 1 cheapr_cut 131ms 131ms 7.66 38.1MB 3.83 +#> 2 base_cut 403ms 403ms 2.48 267.1MB 2.48 ``` diff --git a/src/sset.cpp b/src/sset.cpp index fbece8e..5190ab6 100644 --- a/src/sset.cpp +++ b/src/sset.cpp @@ -649,14 +649,10 @@ SEXP cpp_sset_df(SEXP x, SEXP indices){ SEXP out = Rf_protect(Rf_allocVector(VECSXP, ncols)); ++n_protections; // SEXP *p_out = VECTOR_PTR(out); - // Counting the number of: - // Zeroes - // Out-of-bounds indices - // Positive indices - // From this we can also work out the number of negatives // If indices is a special type of ALTREP compact int sequence, we can // Use a range-based subset instead + if (is_alt_compact_seq(indices)){ // ALTREP integer sequence method @@ -702,7 +698,13 @@ SEXP cpp_sset_df(SEXP x, SEXP indices){ out_size = get_alt_final_sset_size(xn, from, to, by); } else { int *pi = INTEGER(indices); - // Usual method + + // Counting the number of: + // Zeroes + // Out-of-bounds indices + // Positive indices + // NA indices + // From this we can also work out the number of negatives if (do_parallel){ #pragma omp parallel for simd num_threads(n_cores) reduction(+:zero_count,pos_count,oob_count,na_count) @@ -726,9 +728,16 @@ SEXP cpp_sset_df(SEXP x, SEXP indices){ (neg_count > 0 && na_count > 0)){ Rf_error("Cannot mix positive and negative indices"); } + // Should a simplified sset method be used? + bool simple_sset = zero_count == 0 && oob_count == 0 && na_count == 0 && pos_count == n; + + // Final length of output + out_size = na_count + pos_count; - // Index vector is clean, we can use fast subset + + // If Index vector is clean we can use fast subset + if (simple_sset){ for (int j = 0; j < ncols; ++j){ SEXP df_var = Rf_protect(p_x[j]); @@ -753,7 +762,9 @@ SEXP cpp_sset_df(SEXP x, SEXP indices){ } Rf_unprotect(1); } + // Negative indexing + } else if (neg_count > 0){ SEXP indices2 = Rf_protect(cpp11::package("cheapr")["neg_indices_to_pos"](indices, xn)); ++n_protections; @@ -843,211 +854,6 @@ SEXP cpp_sset_df(SEXP x, SEXP indices){ return out; } -// SEXP cpp_sset_df(SEXP x, SEXP indices){ -// int xn = cpp_df_nrow(x); -// int ncols = Rf_length(x); -// int n = Rf_length(indices); -// int n_protections = 0; -// int zero_count = 0; -// int pos_count = 0; -// int oob_count = 0; -// int na_count = 0; -// int out_size; -// bool do_parallel = n >= 10000; -// int n_cores = do_parallel ? num_cores() : 1; -// cpp11::function cheapr_sset = cpp11::package("cheapr")["sset"]; -// const SEXP *p_x = VECTOR_PTR_RO(x); -// SEXP out = Rf_protect(Rf_allocVector(VECSXP, ncols)); -// ++n_protections; -// // SEXP *p_out = VECTOR_PTR(out); -// // Counting the number of: -// // Zeroes -// // Out-of-bounds indices -// // Positive indices -// // From this we can also work out the number of negatives -// -// // If indices is a special type of ALTREP compact int sequence, we can -// // Use a range-based subset instead -// if (is_alt_compact_seq(indices)){ -// -// // ALTREP integer sequence method -// -// SEXP seq_data = Rf_protect(alt_compact_seq_data(indices)); -// ++n_protections; -// R_xlen_t from = REAL(seq_data)[0]; -// R_xlen_t to = REAL(seq_data)[1]; -// R_xlen_t by = REAL(seq_data)[2]; -// for (int j = 0; j < ncols; ++j){ -// SEXP df_var = Rf_protect(p_x[j]); -// if (!Rf_isObject(df_var) || -// Rf_inherits(df_var, "Date") || -// Rf_inherits(df_var, "POSIXct") || -// Rf_inherits(df_var, "factor")){ -// SEXP list_var = Rf_protect(cpp_sset_range(df_var, from, to, by)); -// Rf_copyMostAttrib(df_var, list_var); -// int has_names = Rf_getAttrib(df_var, R_NamesSymbol) != R_NilValue; -// if (has_names){ -// SEXP new_names = Rf_protect(cpp_sset_range( -// Rf_getAttrib(df_var, R_NamesSymbol), from, to, by) -// ); -// Rf_setAttrib(list_var, R_NamesSymbol, new_names); -// } -// SET_VECTOR_ELT(out, j, list_var); -// -// // We un-protect below the original and new df variables, as well as names -// // Once they are added to the data frame, they are protected -// // If we didn't do this we would easily reach the protection stack limit -// // of 10,000 -// Rf_unprotect(1 + has_names); -// } else { -// SET_VECTOR_ELT(out, j, cheapr_sset(df_var, indices)); -// } -// Rf_unprotect(1); -// } -// out_size = get_alt_final_sset_size(xn, from, to, by); -// } else { -// int *pi = INTEGER(indices); -// // Usual method -// -// if (do_parallel){ -// #pragma omp parallel for simd num_threads(n_cores) reduction(+:zero_count,pos_count,oob_count,na_count) -// for (int j = 0; j < n; ++j){ -// zero_count += (pi[j] == 0); -// pos_count += (pi[j] > 0); -// oob_count += (std::fabs(pi[j]) > xn); -// na_count += (pi[j] == NA_INTEGER); -// } -// } else { -// OMP_FOR_SIMD -// for (int j = 0; j < n; ++j){ -// zero_count += (pi[j] == 0); -// pos_count += (pi[j] > 0); -// oob_count += (std::fabs(pi[j]) > xn); -// na_count += (pi[j] == NA_INTEGER); -// } -// } -// int neg_count = n - pos_count - zero_count - na_count; -// if ( (pos_count > 0 && neg_count > 0) || -// (neg_count > 0 && na_count > 0)){ -// Rf_error("Cannot mix positive and negative indices"); -// } -// bool simple_sset = zero_count == 0 && oob_count == 0 && na_count == 0 && pos_count == n; -// out_size = na_count + pos_count; -// // Index vector is clean, we can use fast subset -// if (simple_sset){ -// for (int j = 0; j < ncols; ++j){ -// SEXP df_var = Rf_protect(p_x[j]); -// if (!Rf_isObject(df_var) || -// Rf_inherits(df_var, "Date") || -// Rf_inherits(df_var, "POSIXct") || -// Rf_inherits(df_var, "factor")){ -// SEXP list_var = Rf_protect(cpp_sset_unsafe(df_var, pi, out_size, n_cores)); -// Rf_copyMostAttrib(df_var, list_var); -// int has_names = Rf_getAttrib(df_var, R_NamesSymbol) != R_NilValue; -// if (has_names){ -// SEXP new_names = Rf_protect(cpp_sset_unsafe( -// Rf_getAttrib(df_var, R_NamesSymbol), pi, out_size, n_cores -// )); -// Rf_setAttrib(list_var, R_NamesSymbol, new_names); -// } -// SET_VECTOR_ELT(out, j, list_var); -// Rf_unprotect(1 + has_names); -// } else { -// SET_VECTOR_ELT(out, j, cheapr_sset(df_var, indices)); -// } -// Rf_unprotect(1); -// } -// // Negative indexing -// } else if (neg_count > 0){ -// SEXP indices2 = Rf_protect(cpp11::package("cheapr")["neg_indices_to_pos"](indices, xn)); -// ++n_protections; -// out_size = Rf_length(indices2); -// int *pi2 = INTEGER(indices2); -// for (int j = 0; j < ncols; ++j){ -// SEXP df_var = Rf_protect(p_x[j]); -// if (!Rf_isObject(df_var) || -// Rf_inherits(df_var, "Date") || -// Rf_inherits(df_var, "POSIXct") || -// Rf_inherits(df_var, "factor")){ -// SEXP list_var = Rf_protect(cpp_sset_unsafe(df_var, pi2, out_size, n_cores)); -// Rf_copyMostAttrib(df_var, list_var); -// int has_names = Rf_getAttrib(df_var, R_NamesSymbol) != R_NilValue; -// if (has_names){ -// SEXP new_names = Rf_protect(cpp_sset_unsafe( -// Rf_getAttrib(df_var, R_NamesSymbol), pi2, out_size, n_cores -// )); -// Rf_setAttrib(list_var, R_NamesSymbol, new_names); -// } -// SET_VECTOR_ELT(out, j, list_var); -// Rf_unprotect(1 + has_names); -// } else { -// SET_VECTOR_ELT(out, j, cheapr_sset(df_var, indices2)); -// } -// Rf_unprotect(1); -// } -// // If index vector is clean except for existence of zeroes -// } else if (zero_count > 0 && oob_count == 0 && na_count == 0){ -// SEXP r_zero = Rf_protect(Rf_ScalarInteger(0)); -// ++n_protections; -// SEXP indices2 = Rf_protect(cpp11::package("cheapr")["val_rm"](indices, r_zero)); -// ++n_protections; -// int *pi2 = INTEGER(indices2); -// for (int j = 0; j < ncols; ++j){ -// SEXP df_var = Rf_protect(p_x[j]); -// if (!Rf_isObject(df_var) || -// Rf_inherits(df_var, "Date") || -// Rf_inherits(df_var, "POSIXct") || -// Rf_inherits(df_var, "factor")){ -// SEXP list_var = Rf_protect(cpp_sset_unsafe(df_var, pi2, out_size, n_cores)); -// Rf_copyMostAttrib(df_var, list_var); -// int has_names = Rf_getAttrib(df_var, R_NamesSymbol) != R_NilValue; -// if (has_names){ -// SEXP new_names = Rf_protect(cpp_sset_unsafe( -// Rf_getAttrib(df_var, R_NamesSymbol), pi2, out_size, n_cores -// )); -// Rf_setAttrib(list_var, R_NamesSymbol, new_names); -// } -// SET_VECTOR_ELT(out, j, list_var); -// Rf_unprotect(1 + has_names); -// } else { -// SET_VECTOR_ELT(out, j, cheapr_sset(df_var, indices2)); -// } -// Rf_unprotect(1); -// } -// } else { -// for (int j = 0; j < ncols; ++j){ -// SEXP df_var = Rf_protect(p_x[j]); -// SET_VECTOR_ELT(out, j, cheapr_sset(df_var, indices)); -// Rf_unprotect(1); -// } -// } -// } -// SEXP names = Rf_protect(Rf_duplicate(Rf_getAttrib(x, R_NamesSymbol))); -// ++n_protections; -// Rf_setAttrib(out, R_NamesSymbol, names); -// -// // list to data frame object -// SEXP df_str = Rf_protect(Rf_ScalarString(Rf_mkChar("data.frame"))); -// ++n_protections; -// if (out_size > 0){ -// SEXP row_names = Rf_protect(Rf_allocVector(INTSXP, 2)); -// ++n_protections; -// INTEGER(row_names)[0] = NA_INTEGER; -// INTEGER(row_names)[1] = -out_size; -// Rf_setAttrib(out, R_RowNamesSymbol, row_names); -// } else { -// SEXP row_names = Rf_protect(Rf_allocVector(INTSXP, 0)); -// ++n_protections; -// Rf_setAttrib(out, R_RowNamesSymbol, row_names); -// } -// Rf_classgets(out, df_str); -// // Basically cpp_list_as_df() creates a shallow copy and we don't want that -// // Rf_protect(out = cpp_list_as_df(out)); -// // ++n_protections; -// Rf_unprotect(n_protections); -// return out; -// } - // SEXP cpp_sset(SEXP x, SEXP indices){ // if (!Rf_isObject(x) && Rf_isNull(Rf_getAttrib(x, R_NamesSymbol)) && is_alt_compact_seq(indices)){ // SEXP int_seq_data = Rf_protect(Rf_coerceVector(alt_data1(indices), INTSXP)); diff --git a/src/which.cpp b/src/which.cpp index f0e23bf..f712c8d 100644 --- a/src/which.cpp +++ b/src/which.cpp @@ -86,7 +86,8 @@ SEXP cpp_which_(SEXP x, bool invert){ SEXP cpp_which_val(SEXP x, SEXP value, bool invert){ int n_protections = 0; R_xlen_t n = Rf_xlength(x); - if (cpp_vec_length(value) != 1){ + bool is_long = (n > integer_max_); + if (Rf_length(value) != 1){ Rf_error("value must be a vector of length 1"); } SEXP val_is_na = Rf_protect(cpp_is_na(value)); @@ -99,6 +100,18 @@ SEXP cpp_which_val(SEXP x, SEXP value, bool invert){ return cpp_which_na(x); } } +#define WHICH_VAL(_val_) \ + if (invert){ \ + while (whichi < out_size){ \ + p_out[whichi] = i + 1; \ + whichi += (p_x[i++] != _val_); \ + } \ + } else { \ + while (whichi < out_size){ \ + p_out[whichi] = i + 1; \ + whichi += (p_x[i++] == _val_); \ + } \ + } R_xlen_t n_vals = scalar_count(x, value, false); R_xlen_t out_size = invert ? n - n_vals : n_vals; R_xlen_t whichi = 0; @@ -106,68 +119,53 @@ SEXP cpp_which_val(SEXP x, SEXP value, bool invert){ switch ( TYPEOF(x) ){ case LGLSXP: case INTSXP: { - SEXP out = Rf_protect(Rf_allocVector(INTSXP, out_size)); + SEXP out = Rf_protect(Rf_allocVector(is_long ? REALSXP : INTSXP, out_size)); ++n_protections; - int *p_out = INTEGER(out); Rf_protect(value = Rf_coerceVector(value, INTSXP)); ++n_protections; int val = Rf_asInteger(value); int *p_x = INTEGER(x); - if (invert){ - while (whichi < out_size){ - p_out[whichi] = i + 1; - whichi += !(p_x[i++] == val); - } + if (is_long){ + double *p_out = REAL(out); + WHICH_VAL(val); } else { - while (whichi < out_size){ - p_out[whichi] = i + 1; - whichi += (p_x[i++] == val); - } + int *p_out = INTEGER(out); + WHICH_VAL(val); } Rf_unprotect(n_protections); return out; } case REALSXP: { - SEXP out = Rf_protect(Rf_allocVector(INTSXP, out_size)); + SEXP out = Rf_protect(Rf_allocVector(is_long ? REALSXP : INTSXP, out_size)); ++n_protections; - int *p_out = INTEGER(out); Rf_protect(value = Rf_coerceVector(value, REALSXP)); ++n_protections; double val = Rf_asReal(value); double *p_x = REAL(x); - if (invert){ - while (whichi < out_size){ - p_out[whichi] = i + 1; - whichi += !(p_x[i++] == val); - } + if (is_long){ + double *p_out = REAL(out); + WHICH_VAL(val); } else { - while (whichi < out_size){ - p_out[whichi] = i + 1; - whichi += (p_x[i++] == val); - } + int *p_out = INTEGER(out); + WHICH_VAL(val); } Rf_unprotect(n_protections); return out; } case STRSXP: { - SEXP out = Rf_protect(Rf_allocVector(INTSXP, out_size)); + SEXP out = Rf_protect(Rf_allocVector(is_long ? REALSXP : INTSXP, out_size)); ++n_protections; - int *p_out = INTEGER(out); Rf_protect(value = Rf_coerceVector(value, STRSXP)); ++n_protections; SEXP val = Rf_protect(Rf_asChar(value)); ++n_protections; SEXP *p_x = STRING_PTR(x); - if (invert){ - while (whichi < out_size){ - p_out[whichi] = i + 1; - whichi += !(p_x[i++] == val); - } + if (is_long){ + double *p_out = REAL(out); + WHICH_VAL(val); } else { - while (whichi < out_size){ - p_out[whichi] = i + 1; - whichi += (p_x[i++] == val); - } + int *p_out = INTEGER(out); + WHICH_VAL(val); } Rf_unprotect(n_protections); return out; diff --git a/tests/testthat/test-sset.R b/tests/testthat/test-sset.R index 972c76a..5c11e60 100644 --- a/tests/testthat/test-sset.R +++ b/tests/testthat/test-sset.R @@ -4,7 +4,7 @@ test_that("subsetting", { a <- rnorm(10^3) a[sample.int(10^3, 10)] <- Inf a[sample.int(10^3, 10)] <- -Inf - a[sample.int(10^3, 10)] <- NaN + # a[sample.int(10^3, 10)] <- NaN b <- sample(-100:100, 10^3, TRUE) c <- sample(letters, 10^3, TRUE) d <- complex(real = rnorm(10^3), @@ -51,19 +51,17 @@ test_that("subsetting", { i23 <- 2000:3000 i24 <- -1:-1000 i25 <- -111:-1000 + i26 <- b >= 0 - # i7 <- NA # This doesn't match - # i8 <- NA_integer_ - - objs_to_test <- letters[1:10] - ind_to_test <- paste0("i", 1:25) + objs_to_test <- letters[1:11] + ind_to_test <- paste0("i", 1:26) for (obj in objs_to_test){ if (!is.raw(get(obj))){ - assign(obj, fill_with_na(get(obj), n = 111)) + assign(obj, `names<-`(fill_with_na(get(obj), n = 111), sample.int(1000))) } } - + # Vectors lose their names here df <- data.frame(a, b, c, d, e, f, g, h) df$i <- i df$j <- j @@ -71,7 +69,6 @@ test_that("subsetting", { test_df <- expand.grid(objs_to_test, ind_to_test, stringsAsFactors = FALSE) names(test_df) <- c("obj", "ind") - # test_df <- test_df |> dplyr::filter(obj != "g") for (i in seq_len(nrow(test_df))){ r_obj <- get(test_df$obj[i]) r_ind <- get(test_df$ind[i]) @@ -133,30 +130,6 @@ test_that("subsetting", { ) }) -test_that("errors", { - expect_null(sset(NULL)) - expect_null(sset(NULL, 1:10)) - expect_error(sset(iris$Sepal.Length, c(-5, 5))) - expect_error(sset(iris$Sepal.Length, 10:-10)) - expect_error(sset(globalenv())) -}) - -test_that("misc", { - expect_identical( - cpp_sset_range(10:1, 3, 0, -1), - rev(cpp_sset_range(10:1, 0, 3, 1)) - ) - expect_identical(cpp_sset_range(1:10, 0, 0, 1), integer()) - expect_identical(cpp_sset_range(integer(), 0, 0, 1), integer()) - expect_identical(cpp_sset_range(letters, 0, 0, 1), character()) - expect_identical(cpp_sset_range(as.list(letters), 0, 0, 1), list()) - expect_identical(cpp_sset_range(as.list(letters)[0], 0, 0, -1), list()) - expect_identical(cpp_sset_range(letters[0], 0, 0, -1), character()) - expect_error(cpp_sset_range(letters, 1, 10, 2)) - expect_error(cpp_sset_range(letters, 1, 10, -1)) - expect_error(cpp_sset_range(global(), 1, 10, 1)) -}) - # test_that("fatal error", { # set.seed(43) # i <- as.integer(sample(seq(0, 1000, 100), 10^6, TRUE))