From a1cecfde7e2d25fdcadab874d1abc4853c1c9f89 Mon Sep 17 00:00:00 2001
From: Lilith Orion Hafner <lilithhafner@gmail.com>
Date: Thu, 30 Sep 2021 22:05:39 -0500
Subject: [PATCH 1/5] document and update default choice between :dict and
 :radixsort

---
 src/counts.jl | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/counts.jl b/src/counts.jl
index 580870598..aef902f3f 100644
--- a/src/counts.jl
+++ b/src/counts.jl
@@ -242,25 +242,28 @@ If a weighting vector `wv` is specified, the sum of the weights is used rather t
 raw counts.
 
 `alg` can be one of:
-- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
-                     `:radixsort`, otherwise use `:dict`.
+- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and
+                     `length(x) > 100` then use `:radixsort`, otherwise use `:dict`.
 
 - `:radixsort`:      if `radixsort_safe(eltype(x)) == true` then use the
                      [radix sort](https://en.wikipedia.org/wiki/Radix_sort)
                      algorithm to sort the input vector which will generally lead to
-                     shorter running time. However the radix sort algorithm creates a
-                     copy of the input vector and hence uses more RAM. Choose `:dict`
-                     if the amount of available RAM is a limitation.
+                     shorter running time for large `x` with many duplicates. However
+                     the radix sort algorithm creates a copy of the input vector and
+                     hence uses more RAM. Choose `:dict` if the amount of available
+                     RAM is a limitation.
 
 - `:dict`:           use `Dict`-based method which is generally slower but uses less
-                     RAM and is safe for any data type.
+                     RAM, is safe for any data type, is faster for small arrays, and
+                     is faster when `length(x)` is less than about 5 times
+                     `length(unique(x))`.
 """
 addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg)
 
 function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T
     # if it's safe to be sorted using radixsort then it should be faster
     # albeit using more RAM
-    if radixsort_safe(T) && (alg == :auto || alg == :radixsort)
+    if radixsort_safe(T) && ((alg == :auto && length(x) > 100) || alg == :radixsort)
         addcounts_radixsort!(cm, x)
     elseif alg == :radixsort
         throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead"))
@@ -395,18 +398,21 @@ end
 Return a dictionary mapping each unique value in `x` to its number
 of occurrences. A vector of weights `w` can be provided when `x` is a vector.
 
-- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
-                     `:radixsort`, otherwise use `:dict`.
+- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and
+                     `length(x) > 100` then use `:radixsort`, otherwise use `:dict`.
 
 - `:radixsort`:      if `radixsort_safe(eltype(x)) == true` then use the
                      [radix sort](https://en.wikipedia.org/wiki/Radix_sort)
                      algorithm to sort the input vector which will generally lead to
-                     shorter running time. However the radix sort algorithm creates a
-                     copy of the input vector and hence uses more RAM. Choose `:dict`
-                     if the amount of available RAM is a limitation.
+                     shorter running time for large `x` with many duplicates. However
+                     the radix sort algorithm creates a copy of the input vector and
+                     hence uses more RAM. Choose `:dict` if the amount of available
+                     RAM is a limitation.
 
 - `:dict`:           use `Dict`-based method which is generally slower but uses less
-                     RAM and is safe for any data type.
+                     RAM, is safe for any data type, is faster for small arrays, and
+                     is faster when `length(x)` is less than about 5 times
+                     `length(unique(x))`.
 """
 countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg)
 countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv)

From efe70b5989080867249e328c92c91c8a2cc0526b Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 2 Sep 2023 13:39:11 -0500
Subject: [PATCH 2/5] update for new Julia: drop :auto length(x) > 100 cutoff

---
 src/counts.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/counts.jl b/src/counts.jl
index 83e910f9b..bde3db669 100644
--- a/src/counts.jl
+++ b/src/counts.jl
@@ -252,8 +252,8 @@ If a weighting vector `wv` is specified, the sum of the weights is used rather t
 raw counts.
 
 `alg` is only allowed for unweighted counting and can be one of:c
-- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and
-                     `length(x) > 100` then use `:radixsort`, otherwise use `:dict`.
+- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
+                     `:radixsort`, otherwise use `:dict`.
 
 - `:radixsort`:      if `radixsort_safe(eltype(x)) == true` then use the
                      [radix sort](https://en.wikipedia.org/wiki/Radix_sort)
@@ -273,7 +273,7 @@ addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg)
 function _addcounts!(::Type{T}, cm::Dict, x; alg = :auto) where T
     # if it's safe to be sorted using radixsort then it should be faster
     # albeit using more RAM
-    if radixsort_safe(T) && ((alg == :auto && length(x) > 100) || alg == :radixsort)
+    if radixsort_safe(T) && (alg == :auto || alg == :radixsort)
         addcounts_radixsort!(cm, x)
     elseif alg == :radixsort
         throw(ArgumentError("`alg = :radixsort` is chosen but type `radixsort_safe($T)` did not return `true`; use `alg = :auto` or `alg = :dict` instead"))
@@ -427,8 +427,8 @@ If a weighting vector `wv` is specified, the sum of weights is used rather than
 raw counts.
 
 `alg` is only allowed for unweighted counting and can be one of:
-- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and
-                     `length(x) > 100` then use `:radixsort`, otherwise use `:dict`.
+- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and then use
+                     `:radixsort`, otherwise use `:dict`.
 
 - `:radixsort`:      if `radixsort_safe(eltype(x)) == true` then use the
                      [radix sort](https://en.wikipedia.org/wiki/Radix_sort)

From 2ed0853fe1ab46ccb575b03b69e982ed954f672d Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 2 Sep 2023 13:40:35 -0500
Subject: [PATCH 3/5] fix typo in addcounts! docstring

---
 src/counts.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/counts.jl b/src/counts.jl
index bde3db669..6a114147c 100644
--- a/src/counts.jl
+++ b/src/counts.jl
@@ -251,7 +251,7 @@ Add counts based on `x` to a count map. New entries will be added if new values
 If a weighting vector `wv` is specified, the sum of the weights is used rather than the
 raw counts.
 
-`alg` is only allowed for unweighted counting and can be one of:c
+`alg` is only allowed for unweighted counting and can be one of:
 - `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
                      `:radixsort`, otherwise use `:dict`.
 

From 7bed1ae4797c341b295242f63f8db79afa935de5 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 2 Sep 2023 13:41:11 -0500
Subject: [PATCH 4/5] fix typo in countmap docstring

---
 src/counts.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/counts.jl b/src/counts.jl
index 6a114147c..1867ef61d 100644
--- a/src/counts.jl
+++ b/src/counts.jl
@@ -427,7 +427,7 @@ If a weighting vector `wv` is specified, the sum of weights is used rather than
 raw counts.
 
 `alg` is only allowed for unweighted counting and can be one of:
-- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` and then use
+- `:auto` (default): if `StatsBase.radixsort_safe(eltype(x)) == true` then use
                      `:radixsort`, otherwise use `:dict`.
 
 - `:radixsort`:      if `radixsort_safe(eltype(x)) == true` then use the

From 76255b4e1987bad84fc58de45575042a36b0ef24 Mon Sep 17 00:00:00 2001
From: Lilith Hafner <Lilith.Hafner@gmail.com>
Date: Sat, 2 Sep 2023 14:53:34 -0500
Subject: [PATCH 5/5] revise runtime estimate

---
 src/counts.jl | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/counts.jl b/src/counts.jl
index 1867ef61d..790c44fd6 100644
--- a/src/counts.jl
+++ b/src/counts.jl
@@ -265,8 +265,7 @@ raw counts.
 
 - `:dict`:           use `Dict`-based method which is generally slower but uses less
                      RAM, is safe for any data type, is faster for small arrays, and
-                     is faster when `length(x)` is less than about 5 times
-                     `length(unique(x))`.
+                     is faster when there are not many duplicates.
 """
 addcounts!(cm::Dict, x; alg = :auto) = _addcounts!(eltype(x), cm, x, alg = alg)
 
@@ -440,8 +439,7 @@ raw counts.
 
 - `:dict`:           use `Dict`-based method which is generally slower but uses less
                      RAM, is safe for any data type, is faster for small arrays, and
-                     is faster when `length(x)` is less than about 5 times
-                     `length(unique(x))`.
+                     is faster when there are not many duplicates.
 """
 countmap(x; alg = :auto) = addcounts!(Dict{eltype(x),Int}(), x; alg = alg)
 countmap(x::AbstractArray{T}, wv::AbstractVector{W}) where {T,W<:Real} = addcounts!(Dict{T,W}(), x, wv)