Skip to content

Commit

Permalink
Updated readme.
Browse files Browse the repository at this point in the history
  • Loading branch information
NicChr committed Oct 11, 2024
1 parent b2207cf commit 23bdd7d
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 59 deletions.
3 changes: 1 addition & 2 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ mark(num_na(x), sum(is.na(x)))
# 4 cores
options(cheapr.cores = 4)
mark(num_na(x), sum(is.na(x)))
options(cheapr.cores = 4)
options(cheapr.cores = 1)
```

## Efficient NA counts by row/col
Expand Down Expand Up @@ -276,7 +276,6 @@ increments <- c(1, 0.5, 0.1)
start <- 1
end <- 5
unlist(lapply(increments, \(x) seq(start, end, x)))
```

In cheapr you can use `seq_()` which accepts vector arguments.
Expand Down
114 changes: 57 additions & 57 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,17 +75,17 @@ mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 986µs 1.04ms 954. 2.41KB 0
#> 2 sum(is.na(x)) 780µs 1.75ms 575. 3.81MB 45.6
#> 1 num_na(x) 120µs 123.1µs 7900. 2.41KB 0
#> 2 sum(is.na(x)) 829µs 1.95ms 521. 3.81MB 41.6
# 4 cores
options(cheapr.cores = 4)
mark(num_na(x), sum(is.na(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 num_na(x) 267µs 338.4µs 2881. 0B 0
#> 2 sum(is.na(x)) 782µs 1.75ms 577. 3.81MB 48.3
options(cheapr.cores = 4)
#> 1 num_na(x) 56.9µs 72.6µs 12840. 0B 0
#> 2 sum(is.na(x)) 893.7µs 2ms 494. 3.81MB 39.9
options(cheapr.cores = 1)
```

## Efficient NA counts by row/col
Expand All @@ -98,16 +98,16 @@ mark(row_na_counts(m),
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 row_na_counts(m) 1.34ms 2.66ms 374. 9.11KB 0
#> 2 rowSums(is.na(m)) 3.48ms 3.69ms 267. 3.82MB 23.9
#> 1 row_na_counts(m) 1.99ms 2.04ms 489. 9.14KB 0
#> 2 rowSums(is.na(m)) 2.83ms 3.86ms 262. 3.82MB 23.1
# Number of NA values by col
mark(col_na_counts(m),
colSums(is.na(m)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 col_na_counts(m) 721.5µs 894.9µs 1123. 9.1KB 0
#> 2 colSums(is.na(m)) 2.63ms 2.83ms 351. 3.82MB 31.9
#> 1 col_na_counts(m) 1.96ms 2ms 499. 9.14KB 0
#> 2 colSums(is.na(m)) 1.88ms 2.92ms 342. 3.82MB 33.3
```

`is_na` is a multi-threaded alternative to `is.na`
Expand All @@ -119,8 +119,8 @@ mark(is.na(x), is_na(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(x) 739µs 1.79ms 564. 3.81MB 66.5
#> 2 is_na(x) 425µs 848.45µs 1184. 3.82MB 109.
#> 1 is.na(x) 812µs 2.02ms 503. 3.81MB 97.8
#> 2 is_na(x) 625µs 1.84ms 559. 3.82MB 52.6

### POSIXlt method is much faster
hours <- as.POSIXlt(seq.int(0, length.out = 10^6, by = 3600),
Expand All @@ -133,8 +133,8 @@ mark(is.na(hours), is_na(hours))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 is.na(hours) 1.22s 1.22s 0.821 61MB 0.821
#> 2 is_na(hours) 5.26ms 7.5ms 111. 13.9MB 13.9
#> 1 is.na(hours) 1.27s 1.27s 0.790 61MB 0.790
#> 2 is_na(hours) 4.05ms 6.37ms 149. 13.9MB 17.9
```

`is_na` differs from `is.na` in 2 regards:
Expand Down Expand Up @@ -188,12 +188,12 @@ overview(df, hist = TRUE)
#> cols: 3
#>
#> ----- Numeric -----
#> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100
#> 1 x integer 0 1 100 50.52 1 26 51 76 100
#> 2 z numeric 0 1 10000000 0 -4.95 -0.67 0 0.67 4.97
#> iqr sd hist
#> 1 50 28.87 ▇▇▇▇▇
#> 2 1.35 1 ▁▇▂▁
#> col class n_missing p_complete n_unique mean p0 p25 p50 p75 p100 iqr
#> 1 x integer 0 1 100 50.5 1 26 51 75 100 49
#> 2 z numeric 0 1 10000000 0 -5.5 -0.68 0 0.67 5.16 1.35
#> sd hist
#> 1 28.86 ▇▇▇▇▇
#> 2 1 ▁▇▂▁
#>
#> ----- Categorical -----
#> col class n_missing p_complete n_unique n_levels min max
Expand All @@ -202,7 +202,7 @@ mark(overview(df))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 overview(df) 1.26s 1.26s 0.793 2.09KB 0
#> 1 overview(df) 1.04s 1.04s 0.966 2.09KB 0
```

## Cheaper and consistent subsetting with `sset`
Expand Down Expand Up @@ -237,9 +237,9 @@ mark(sset(x, x %in_% y), sset(x, x %in% y), x[x %in% y])
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(x, x %in_% y) 82.5µs 122µs 8488. 88.1KB 6.66
#> 2 sset(x, x %in% y) 152.8µs 230µs 4388. 285.3KB 8.80
#> 3 x[x %in% y] 130.5µs 210µs 4824. 324.4KB 14.2
#> 1 sset(x, x %in_% y) 87µs 129µs 8010. 88.2KB 6.48
#> 2 sset(x, x %in% y) 164µs 256µs 3882. 285.4KB 8.79
#> 3 x[x %in% y] 136µs 226µs 4516. 324.5KB 11.2
```

`sset` uses an internal range-based subset when `i` is an ALTREP integer
Expand All @@ -250,8 +250,8 @@ mark(sset(df, 0:10^5), df[0:10^5, , drop = FALSE])
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, 0:10^5) 132.5µs 518.8µs 1944. 1.53MB 26.8
#> 2 df[0:10^5, , drop = FALSE] 6.1ms 7.28ms 136. 4.83MB 6.57
#> 1 sset(df, 0:10^5) 190µs 615µs 1654. 1.53MB 23.9
#> 2 df[0:10^5, , drop = FALSE] 6.69ms 7.88ms 129. 4.83MB 6.55
```

It also accepts negative indexes
Expand All @@ -265,8 +265,8 @@ mark(sset(df, -10^4:0),
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 sset(df, -10^4:0) 47.8ms 49.5ms 15.8 152MB 5.94
#> 2 df[-10^4:0, , drop = FALSE] 748.5ms 748.5ms 1.34 776MB 4.01
#> 1 sset(df, -10^4:0) 54.1ms 72ms 12.7 152MB 9.06
#> 2 df[-10^4:0, , drop = FALSE] 840.4ms 840ms 1.19 776MB 3.57
```

The biggest difference between `sset` and `[` is the way logical vectors
Expand Down Expand Up @@ -409,13 +409,13 @@ mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 1.2µs 1.5µs 626912. 0B 0
#> 1 gcd(x) 1.4µs 1.5µs 618682. 0B 0
x <- seq(0, 10^6, 0.5)
mark(gcd(x))
#> # A tibble: 1 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 gcd(x) 51.5ms 51.8ms 19.2 0B 0
#> 1 gcd(x) 54.5ms 55.9ms 17.8 0B 0
```

## Creating many sequences
Expand Down Expand Up @@ -503,41 +503,41 @@ mark(cheapr_which = which_(x),
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 2.03ms 3.41ms 282. 3.81MB 2.09
#> 2 base_which 629.4µs 2.62ms 397. 7.63MB 9.62
#> 1 cheapr_which 2.77ms 4.24ms 230. 3.81MB 2.07
#> 2 base_which 865.8µs 2.98ms 346. 7.63MB 9.17
x <- rep(FALSE, 10^6)
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 230µs 290µs 3230. 0B 0
#> 2 base_which 456µs 480µs 2055. 3.81MB 20.1
#> 1 cheapr_which 823µs 833µs 1192. 0B 0
#> 2 base_which 487µs 499µs 1896. 3.81MB 24.6
x <- c(rep(TRUE, 5e05), rep(FALSE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 1.24ms 2ms 488. 1.91MB 2.08
#> 2 base_which 781µs 1.74ms 588. 7.63MB 12.0
#> 1 cheapr_which 2.24ms 2.92ms 337. 1.91MB 2.07
#> 2 base_which 923.1µs 2.08ms 485. 7.63MB 11.1
x <- c(rep(FALSE, 5e05), rep(TRUE, 1e06))
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 2.98ms 4.25ms 236. 3.81MB 2.09
#> 2 base_which 938.1µs 3.1ms 330. 9.54MB 9.22
#> 1 cheapr_which 3.83ms 5.2ms 192. 3.81MB 2.06
#> 2 base_which 1.08ms 3.69ms 275. 9.54MB 6.66
x <- sample(c(TRUE, FALSE), 10^6, TRUE)
x[sample.int(10^6, 10^4)] <- NA
mark(cheapr_which = which_(x),
base_which = which(x))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_which 1.93ms 2.56ms 374. 1.89MB 2.09
#> 2 base_which 3.19ms 4.2ms 240. 5.7MB 2.07
#> 1 cheapr_which 2.71ms 3.18ms 313. 1.89MB 2.07
#> 2 base_which 3.42ms 4.41ms 227. 5.7MB 4.24
```

### factor
Expand All @@ -550,29 +550,29 @@ mark(cheapr_factor = factor_(x),
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 9.19ms 9.74ms 102. 4.59MB 0
#> 2 base_factor 490.32ms 494.23ms 2.02 27.84MB 0
#> 1 cheapr_factor 9.83ms 10.2ms 97.5 4.59MB 0
#> 2 base_factor 598.01ms 598ms 1.67 27.84MB 0
mark(cheapr_factor = factor_(x, order = FALSE),
base_factor = factor(x, levels = unique(x)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 4.37ms 5.08ms 192. 1.53MB 0
#> 2 base_factor 752.8ms 752.8ms 1.33 22.79MB 0
#> 1 cheapr_factor 4.51ms 5.17ms 187. 1.53MB 0
#> 2 base_factor 975.8ms 975.8ms 1.02 22.79MB 0
mark(cheapr_factor = factor_(y),
base_factor = factor(y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 208.06ms 211.32ms 4.60 5.23MB 0
#> 2 base_factor 3.23s 3.23s 0.309 54.35MB 0
#> 1 cheapr_factor 231.34ms 247.59ms 4.09 5.23MB 0
#> 2 base_factor 3.87s 3.87s 0.258 54.35MB 0
mark(cheapr_factor = factor_(y, order = FALSE),
base_factor = factor(y, levels = unique(y)))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_factor 4.97ms 8.01ms 119. 3.49MB 0
#> 2 base_factor 53.61ms 59.46ms 16.9 39.89MB 0
#> 1 cheapr_factor 5.46ms 6.72ms 144. 3.49MB 0
#> 2 base_factor 58.82ms 65.16ms 15.4 39.89MB 2.56
```

### intersect & setdiff
Expand All @@ -585,15 +585,15 @@ mark(cheapr_intersect = intersect_(x, y, dups = FALSE),
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_intersect 2.71ms 2.89ms 337. 1.18MB 2.30
#> 2 base_intersect 4.91ms 5.28ms 182. 5.16MB 0
#> 1 cheapr_intersect 2.76ms 3.02ms 326. 1.18MB 0
#> 2 base_intersect 5.12ms 5.42ms 171. 5.16MB 2.16
mark(cheapr_setdiff = setdiff_(x, y, dups = FALSE),
base_setdiff = setdiff(x, y))
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_setdiff 2.97ms 3.15ms 315. 1.76MB 0
#> 2 base_setdiff 4.97ms 5.51ms 179. 5.71MB 2.29
#> 1 cheapr_setdiff 3.03ms 3.29ms 291. 1.76MB 2.14
#> 2 base_setdiff 4.56ms 5.49ms 181. 5.71MB 2.18
```

### `%in_%` and `%!in_%`
Expand All @@ -604,15 +604,15 @@ mark(cheapr = x %in_% y,
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.75ms 1.83ms 539. 781.34KB 0
#> 2 base 2.52ms 3.08ms 325. 2.53MB 2.24
#> 1 cheapr 1.77ms 1.89ms 522. 781.34KB 0
#> 2 base 2.96ms 3.12ms 305. 2.53MB 2.18
mark(cheapr = x %!in_% y,
base = !x %in% y)
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr 1.68ms 1.82ms 543. 787.84KB 0
#> 2 base 2.82ms 3.25ms 306. 2.91MB 2.25
#> 1 cheapr 1.72ms 1.82ms 540. 787.84KB 0
#> 2 base 2.81ms 3.34ms 296. 2.91MB 2.16
```

### `as_discrete`
Expand All @@ -629,6 +629,6 @@ mark(cheapr_cut = as_discrete(x, b, left = FALSE),
#> # A tibble: 2 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 cheapr_cut 216ms 219ms 4.54 38.2MB 0
#> 2 base_cut 642ms 642ms 1.56 267.1MB 1.56
#> 1 cheapr_cut 148ms 149ms 6.42 38.2MB 1.60
#> 2 base_cut 601ms 601ms 1.66 267.1MB 1.66
```

0 comments on commit 23bdd7d

Please sign in to comment.