From a1cecfde7e2d25fdcadab874d1abc4853c1c9f89 Mon Sep 17 00:00:00 2001 From: Lilith Orion Hafner Date: Thu, 30 Sep 2021 22:05:39 -0500 Subject: [PATCH 1/5] document and update default choice between :dict and :radixsort --- src/counts.jl | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 580870598..aef902f3f 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -242,25 +242,28 @@ If a weighting vector `wv` is specified, the sum of the weights is used rather t raw counts. `alg` can be one of: -- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use - `:radixsort`, otherwise use `:dict`. +- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and + `length(x) > 100` then use `:radixsort`, otherwise use `:dict`. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the [radix sort](https://en.wikipedia.org/wiki/Radix_sort) algorithm to sort the input vector which will generally lead to - shorter running time. However the radix sort algorithm creates a - copy of the input vector and hence uses more RAM. Choose `:dict` - if the amount of available RAM is a limitation. + shorter running time for large `x` with many duplicates. However + the radix sort algorithm creates a copy of the input vector and + hence uses more RAM. Choose `:dict` if the amount of available + RAM is a limitation. - `:dict`: use `Dict`-based method which is generally slower but uses less - RAM and is safe for any data type. + RAM, is safe for any data type, is faster for small arrays, and + is faster when `length(x)` is less than about 5 times + `length(unique(x))`. """ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM - if radixsort_safe(T) && (alg == :auto || alg == :radixsort) + if radixsort_safe(T) && ((alg == :auto && length(x) > 100) || alg == :radixsort) addcounts_radixsort!(cm, x) elseif alg == :radixsort throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead")) @@ -395,18 +398,21 @@ end Return a dictionary mapping each unique value in `x` to its number of occurrences. A vector of weights `w` can be provided when `x` is a vector. -- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use - `:radixsort`, otherwise use `:dict`. +- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and + `length(x) > 100` then use `:radixsort`, otherwise use `:dict`. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the [radix sort](https://en.wikipedia.org/wiki/Radix_sort) algorithm to sort the input vector which will generally lead to - shorter running time. However the radix sort algorithm creates a - copy of the input vector and hence uses more RAM. Choose `:dict` - if the amount of available RAM is a limitation. + shorter running time for large `x` with many duplicates. However + the radix sort algorithm creates a copy of the input vector and + hence uses more RAM. Choose `:dict` if the amount of available + RAM is a limitation. - `:dict`: use `Dict`-based method which is generally slower but uses less - RAM and is safe for any data type. + RAM, is safe for any data type, is faster for small arrays, and + is faster when `length(x)` is less than about 5 times + `length(unique(x))`. """ countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv) From efe70b5989080867249e328c92c91c8a2cc0526b Mon Sep 17 00:00:00 2001 From: Lilith Hafner Date: Sat, 2 Sep 2023 13:39:11 -0500 Subject: [PATCH 2/5] update for new Julia --- src/counts.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 83e910f9b..bde3db669 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -252,8 +252,8 @@ If a weighting vector `wv` is specified, the sum of the weights is used rather t raw counts. `alg` is only allowed for unweighted counting and can be one of:c -- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and - `length(x) > 100` then use `:radixsort`, otherwise use `:dict`. +- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use + `:radixsort`, otherwise use `:dict`. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the [radix sort](https://en.wikipedia.org/wiki/Radix_sort) @@ -273,7 +273,7 @@ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T # if it's safe to be sorted using radixsort then it should be faster # albeit using more RAM - if radixsort_safe(T) && ((alg == :auto && length(x) > 100) || alg == :radixsort) + if radixsort_safe(T) && (alg == :auto || alg == :radixsort) addcounts_radixsort!(cm, x) elseif alg == :radixsort throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead")) @@ -427,8 +427,8 @@ If a weighting vector `wv` is specified, the sum of weights is used rather than raw counts. `alg` is only allowed for unweighted counting and can be one of: -- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and - `length(x) > 100` then use `:radixsort`, otherwise use `:dict`. +- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and then use + `:radixsort`, otherwise use `:dict`. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the [radix sort](https://en.wikipedia.org/wiki/Radix_sort) From 2ed0853fe1ab46ccb575b03b69e982ed954f672d Mon Sep 17 00:00:00 2001 From: Lilith Hafner Date: Sat, 2 Sep 2023 13:40:35 -0500 Subject: [PATCH 3/5] fix typo --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index bde3db669..6a114147c 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -251,7 +251,7 @@ Add counts based on `x` to a count map. New entries will be added if new values If a weighting vector `wv` is specified, the sum of the weights is used rather than the raw counts. -`alg` is only allowed for unweighted counting and can be one of:c +`alg` is only allowed for unweighted counting and can be one of: - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. From 7bed1ae4797c341b295242f63f8db79afa935de5 Mon Sep 17 00:00:00 2001 From: Lilith Hafner Date: Sat, 2 Sep 2023 13:41:11 -0500 Subject: [PATCH 4/5] fix typo --- src/counts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/counts.jl b/src/counts.jl index 6a114147c..1867ef61d 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -427,7 +427,7 @@ If a weighting vector `wv` is specified, the sum of weights is used rather than raw counts. `alg` is only allowed for unweighted counting and can be one of: -- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and then use +- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use `:radixsort`, otherwise use `:dict`. - `:radixsort`: if `radixsort_safe(eltype(x)) == true` then use the From 76255b4e1987bad84fc58de45575042a36b0ef24 Mon Sep 17 00:00:00 2001 From: Lilith Hafner Date: Sat, 2 Sep 2023 14:53:34 -0500 Subject: [PATCH 5/5] revise runtime estimate --- src/counts.jl | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/counts.jl b/src/counts.jl index 1867ef61d..790c44fd6 100644 --- a/src/counts.jl +++ b/src/counts.jl @@ -265,8 +265,7 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM, is safe for any data type, is faster for small arrays, and - is faster when `length(x)` is less than about 5 times - `length(unique(x))`. + is faster when there are not many duplicates. """ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg) @@ -440,8 +439,7 @@ raw counts. - `:dict`: use `Dict`-based method which is generally slower but uses less RAM, is safe for any data type, is faster for small arrays, and - is faster when `length(x)` is less than about 5 times - `length(unique(x))`. + is faster when there are not many duplicates. """ countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg) countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv)