Restore old grouping algorithm and improve it

nalimilan · nalimilan · commit d6104d7de8f1 · 2017-06-18T16:23:12.000+02:00
Follow the strategy used by Pandas. The new implementation is more efficient
since it avoids creating a NullableCategoricalArray: the integer codes are
combined on the fly with those computed from previous columns. Hashing only
happens once by giving arbitrary codes to levels in the first pass; after that,
only integer codes are used.

Move the per-column operations to separate functions which can be specialized
by the compiler for each column type. This also allows using a more efficient
method for CategoricalArray.

Fix ordering of CategoricalArray levels when levels have been reordered,
and sort null values last for consistency with other nullable arrays. Enable
sorting by default since its cost is relatively small compared with the rest.

Avoid some allocations by using in-place operations, and use Base.unique!() when available.
diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl
@@ -26,18 +26,111 @@ end
 # Split
 #
 
+function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
+    # Counting sort of row numbers by group code, translated from Wes McKinney's
+    # groupsort_indexer in pandas (file: src/groupby.pyx).
+    # Codes go from 0 to ngroups and code c uses slot c + 1 of the tables below;
+    # code 0 (slot 1) is the location for NULL.
+    nrows = length(x)
+    counts = fill(0, ngroups + 1)
+    for row = 1:nrows
+        counts[x[row] + 1] += 1
+    end
+
+    # where[slot] is the next free output position for the rows of that slot
+    where = fill(1, ngroups + 1)
+    # with null_last, slot 2 opens the output and slot 1 is placed after the last slot
+    firstslot = null_last ? 3 : 2
+    for slot = firstslot:ngroups+1
+        where[slot] = where[slot - 1] + counts[slot - 1]
+    end
+    if null_last
+        where[1] = where[end] + counts[end]
+    end
+
+    # scatter the row numbers so that rows sharing a code are contiguous
+    result = fill(0, nrows)
+    for row = 1:nrows
+        slot = x[row] + 1
+        result[where[slot]] = row
+        where[slot] += 1
+    end
+    # every entry of `where` now points just past the block of rows of its slot
+    # (an empty block leaves the entry at its starting position)
+    result, where, counts
+end
+
+# Assign a 0-based integer code to each level of col and add `code * ngroups` to the
+# matching entry of x. Return the number of levels found.
+function combine_col!{T}(x::AbstractVector, col::AbstractVector{T},
+                         ngroups::Integer, sort::Bool)
+    d = Dict{T, UInt32}()
+    y = Vector{UInt32}(length(x))
+    n = 0
+    # Note: using get! instead of this manual lookup triggers lots of allocations
+    @inbounds for i in eachindex(x)
+        v = col[i]
+        index = Base.ht_keyindex(d, v)
+        if index < 0 # new level, coded by order of first appearance
+            y[i] = d[v] = n
+            n += 1
+        else
+            y[i] = d.vals[index]
+        end
+    end
+
+    if sort
+        # map unsorted codes to sorted ones (keys(d) and values(d) iterate in the same order)
+        tmp = sortperm(collect(keys(d)))
+        perm = ipermute!(collect(0:(n-1)), tmp) # sorted rank of each key, in d's order
+        sortperm!(tmp, collect(values(d))) # position in d of the key holding each code
+        permute!(perm, tmp) # perm[code + 1] is now the sorted rank of that code's key
+        @inbounds for i in eachindex(x)
+            x[i] += perm[y[i] + 1] * ngroups
+        end
+    else
+        @inbounds for i in eachindex(x)
+            x[i] += y[i] * ngroups
+        end
+    end
+
+    n
+end
+
+# Method for categorical vectors: codes come straight from the stored references,
+# so no hashing is needed. `sort` has no effect, codes always follow the level order.
+function combine_col!(x::AbstractVector,
+                      col::Union{AbstractCategoricalVector, AbstractNullableCategoricalVector},
+                      ngroups::Integer, sort::Bool)
+    nlev = length(levels(col))
+    # lut[ref + 1] is the 0-based code of reference `ref`; reference 0 stands for null
+    ord = CategoricalArrays.order(col.pool)
+    lut = similar(ord, length(ord)+1)
+    lut[1] = nlev # nulls sort last; only used if present
+    lut[2:end] .= ord .- 1
+    nullable = eltype(col) <: Nullable
+    sawnull = false
+    @inbounds for i in eachindex(x)
+        r = col.refs[i]
+        x[i] += lut[r + 1] * ngroups
+        sawnull |= nullable & (r == 0)
+    end
+    nlev + sawnull # a null counts as one more level
+end
+
 """
 A view of an AbstractDataTable split into row groups
 
 ```julia
-groupby(d::AbstractDataTable, cols)
-groupby(cols)
+groupby(d::AbstractDataTable, cols; sort = true)
+groupby(cols; sort = true)
 ```
 
 ### Arguments
 
 * `d` : an AbstractDataTable to split (optional, see [Returns](#returns))
 * `cols` : data table columns to group by
+* `sort`: whether to sort row groups; disable sorting for maximum performance
 
 ### Returns
 
@@ -79,17 +172,24 @@ dt |> groupby([:a, :b]) |> [sum, length]
 ```
 
 """
-function groupby{T}(dt::AbstractDataTable, cols::Vector{T}; sort::Bool = false)
-    sdt = dt[cols]
-    dt_groups = group_rows(sdt)
-    # sort the groups
-    if sort
-        group_perm = sortperm(view(sdt, dt_groups.rperm[dt_groups.starts]))
-        permute!(dt_groups.starts, group_perm)
-        Base.permute!!(dt_groups.stops, group_perm)
+function groupby{T}(d::AbstractDataTable, cols::Vector{T}; sort::Bool = true)
+    # Encode each row's combination of key values as a single integer and group rows
+    # on it; a subset of Wes McKinney's algorithm: http://wesmckinney.com/blog/?p=489
+    rowcodes = ones(UInt32, nrow(d))
+    ngroups = 1
+    # keys are folded in from last to first; after each one, ngroups is the product
+    # of the numbers of levels seen so far
+    # TODO if ngroups is really big, shrink it
+    for key in reverse(cols)
+        ngroups *= combine_col!(rowcodes, d[key], ngroups, sort)
     end
-    GroupedDataTable(dt, cols, dt_groups.rperm,
-                     dt_groups.starts, dt_groups.stops)
+    rowperm, bounds = groupsort_indexer(rowcodes, ngroups)
+    # Remove zero-length groupings, which show up as repeated boundaries
+    bounds = _groupedunique!(bounds)
+    # each group stops right before the next boundary (the last one is past the end)
+    stops = bounds[2:end] .- 1
+    pop!(bounds)
+    GroupedDataTable(d, cols, rowperm, bounds, stops)
 end
 groupby(d::AbstractDataTable, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)
 
@@ -263,8 +363,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
 based on columns `col`
 
 ```julia
-by(d::AbstractDataTable, cols, f::Function; sort::Bool = false)
-by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
+by(d::AbstractDataTable, cols, f::Function; sort::Bool = true)
+by(f::Function, d::AbstractDataTable, cols; sort::Bool = true)
 ```
 
 ### Arguments
@@ -273,7 +373,7 @@ by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
 * `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
 * `f` : a function to be applied to groups; expects each argument to
   be an AbstractDataTable
-* `sort`: sort row groups (no sorting by default)
+* `sort`: whether to sort row groups; disable sorting for maximum performance
 
 `f` can return a value, a vector, or a DataTable. For a value or
 vector, these are merged into a column along with the `cols` keys. For
@@ -321,8 +421,8 @@ Split-apply-combine that applies a set of functions over columns of an
 AbstractDataTable or GroupedDataTable
 
 ```julia
-aggregate(d::AbstractDataTable, cols, fs)
-aggregate(gd::GroupedDataTable, fs)
+aggregate(d::AbstractDataTable, cols, fs; sort::Bool=true)
+aggregate(gd::GroupedDataTable, fs; sort::Bool=true)
 ```
 
 ### Arguments
@@ -332,6 +432,7 @@ aggregate(gd::GroupedDataTable, fs)
 * `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
 * `fs` : a function or vector of functions to be applied to vectors
   within groups; expects each argument to be a column vector
+* `sort`: whether to sort row groups; disable sorting for maximum performance
 
 Each `fs` should return a value or vector. All returns must be the
 same length.
@@ -353,15 +454,17 @@ dt |> groupby(:a) |> [sum, x->mean(dropnull(x))]   # equivalent
 ```
 
 """
-aggregate(d::AbstractDataTable, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort)
-function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=false)
+aggregate(d::AbstractDataTable, fs::Function; sort::Bool=true) =
+    aggregate(d, [fs], sort=sort) # single function: wrap it and use the vector method
+function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=true)
     headers = _makeheaders(fs, _names(d))
     _aggregate(d, fs, headers, sort)
 end
 
 # Applies aggregate to non-key cols of each SubDataTable of a GroupedDataTable
-aggregate(gd::GroupedDataTable, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort)
-function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=false)
+aggregate(gd::GroupedDataTable, f::Function; sort::Bool=true) =
+    aggregate(gd, [f], sort=sort) # single function: wrap it and use the vector method
+function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=true)
     headers = _makeheaders(fs, setdiff(_names(gd), gd.cols))
     res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd))
     sort && sort!(res, cols=headers)
@@ -375,7 +478,7 @@ end
 function aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable,
                                                  cols::Union{S, AbstractVector{S}},
                                                  fs::Union{T, Vector{T}};
-                                                 sort::Bool=false)
+                                                 sort::Bool=true)
-    aggregate(groupby(d, cols, sort=sort), fs)
+    aggregate(groupby(d, cols, sort=sort), fs, sort=sort) # grouped method sorts by default
 end
 
@@ -384,7 +487,8 @@ function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol})
     [Symbol(colname,'_',fname) for fname in fnames for colname in cn]
 end
 
-function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false)
+function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T},
+                                 headers::Vector{Symbol}, sort::Bool=true)
     res = DataTable(Any[vcat(f(d[i])) for f in fs for i in 1:size(d, 2)], headers)
     sort && sort!(res, cols=headers)
     res
diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -155,6 +155,26 @@ function countnull(a::CategoricalArray)
     return res
 end
 
+if !isdefined(Base, :unique!) # Julia < 0.7
+    # Collapse runs of consecutive isequal elements of A in place, then shrink A
+    function _groupedunique!(A::AbstractVector)
+        isempty(A) && return A
+        inds = eachindex(A)
+        st = start(inds)
+        k, st = next(inds, st) # k: index of the last element kept so far
+        prev = first(A)
+        for el in A
+            isequal(el, prev) && continue
+            k, st = next(inds, st)
+            prev = A[k] = el
+        end
+        resize!(A, k - first(inds) + 1)
+    end
+else
+    # unique!() includes a fast path for sorted vectors
+    _groupedunique!(A::AbstractVector) = unique!(A)
+end
+
 # Gets the name of a function. Used in groupedatatable/grouping.jl
 function _fnames{T<:Function}(fs::Vector{T})
     λcounter = 0
diff --git a/test/grouping.jl b/test/grouping.jl
@@ -165,11 +165,34 @@ module TestGrouping
     levels!(dt[:Key1], ["Z", "B", "A"])
     levels!(dt[:Key2], ["Z", "B", "A"])
     gd = groupby(dt, :Key1)
-    @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
-    @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
+    @test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4))
+    @test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2))
     gd = groupby(dt, [:Key1, :Key2])
-    @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1))
-    @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2))
-    @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3))
-    @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4))
+    @test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4))
+    @test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3))
+    @test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2))
+    @test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1))
+
+    # nulls in NullableArray and NullableCategoricalArray keys form groups sorted last
+    for (S, T) in ((NullableArray, NullableArray),
+                   (NullableCategoricalArray, NullableCategoricalArray),
+                   (NullableArray, NullableCategoricalArray),
+                   (NullableCategoricalArray, NullableArray))
+        dt = DataTable(Key1 = S(["A", "A", "B", Nullable(), Nullable()]),
+                       Key2 = T(["A", "B", "A", Nullable(), "A"]),
+                       Value = 1:5)
+        gd = groupby(dt, :Key1) # expected order: "A", "B", then the null key
+        @test isequal(gd[1], DataTable(Key1=Nullable{String}["A", "A"],
+                                       Key2=Nullable{String}["A", "B"], Value=1:2))
+        @test isequal(gd[2], DataTable(Key1=Nullable{String}["B"],
+                                       Key2=Nullable{String}["A"], Value=3))
+        @test isequal(gd[3], DataTable(Key1=[Nullable(), Nullable()],
+                                       Key2=Nullable{String}[Nullable(), "A"], Value=4:5))
+        gd = groupby(dt, [:Key1, :Key2]) # null keys sort last within each column
+        @test isequal(gd[1], DataTable(Key1=Nullable("A"), Key2=Nullable("A"), Value=1))
+        @test isequal(gd[2], DataTable(Key1=Nullable("A"), Key2=Nullable("B"), Value=2))
+        @test isequal(gd[3], DataTable(Key1=Nullable("B"), Key2=Nullable("A"), Value=3))
+        @test isequal(gd[4], DataTable(Key1=Nullable(), Key2=Nullable("A"), Value=5))
+        @test isequal(gd[5], DataTable(Key1=Nullable(), Key2=Nullable(), Value=4))
+    end
 end