grouping: replace groupsort by _group_rows()

alyst · alyst · commit df8411e83d35 · 2015-08-07T23:59:53.000+03:00
diff --git a/src/groupeddataframe/grouping.jl b/src/groupeddataframe/grouping.jl
@@ -36,7 +36,7 @@ groupby(cols)
 ### Arguments
 
 * `d` : an AbstractDataFrame
-* `cols` : an 
+* `cols` : data frame columns to group by
 
 If `d` is not provided, a curried version of groupby is given.
 
@@ -82,32 +82,19 @@ df |> groupby([:a, :b]) |> [sum, length]
 
 """
 function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
-    ## a subset of Wes McKinney's algorithm here:
-    ##     http://wesmckinney.com/blog/?p=489
-
     ncols = length(cols)
-    # use the pool trick to get a set of integer references for each unique item
-    dv = PooledDataArray(d[cols[ncols]])
-    # if there are NAs, add 1 to the refs to avoid underflows in x later
-    dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
-    x = copy(dv.refs) .+ dv_has_nas
-    # also compute the number of groups, which is the product of the set lengths
-    ngroups = length(dv.pool) + dv_has_nas
-    # if there's more than 1 column, do roughly the same thing repeatedly
-    for j = (ncols - 1):-1:1
-        dv = PooledDataArray(d[cols[j]])
-        dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
-        for i = 1:nrow(d)
-            x[i] += (dv.refs[i] + dv_has_nas- 1) * ngroups
-        end
-        ngroups = ngroups * (length(dv.pool) + dv_has_nas)
-        # TODO if ngroups is really big, shrink it
+    d_groups = _group_rows(d[cols])
+    d_group_keys = sort!(collect(keys(d_groups)))  # sorted keys fix the group order
+    # build the permutation that arranges rows by group; starts[k] is where group k begins
+    idx = sizehint!(@compat(Vector{Int}()), nrow(d))
+    starts = sizehint!(@compat(Vector{Int}()), length(d_groups))
+    for gr_row in d_group_keys
+        push!(starts, length(idx) + 1)
+        append!(idx, d_groups[gr_row])
     end
-    (idx, starts) = DataArrays.groupsort_indexer(x, ngroups)
-    # Remove zero-length groupings
-    starts = _uniqueofsorted(starts)
-    ends = starts[2:end] - 1
-    GroupedDataFrame(d, cols, idx, starts[1:end-1], ends)
+    # each group ends right before the next one starts; with no groups, keep ends empty too
+    ends = isempty(starts) ? Int[] : push!(starts[2:end] - 1, length(idx))
+    GroupedDataFrame(d, cols, idx, starts, ends)
 end
 groupby(d::AbstractDataFrame, cols) = groupby(d, [cols])
 
@@ -284,7 +271,7 @@ notation can be used.
 
 ### Returns
 
-* `::DataFrame` 
+* `::DataFrame`
 
 ### Examples
 
@@ -330,7 +317,7 @@ same length.
 
 ### Returns
 
-* `::DataFrame` 
+* `::DataFrame`
 
 ### Examples
 
diff --git a/src/other/utils.jl b/src/other/utils.jl
@@ -162,19 +162,6 @@ function _setdiff{T}(a::AbstractVector{T}, b::T)
     diff
 end
 
-function _uniqueofsorted(x::Vector)
-    idx = fill(true, length(x))
-    lastx = x[1]
-    for i = 2:length(x)
-        if lastx == x[i]
-            idx[i] = false
-        else
-            lastx = x[i]
-        end
-    end
-    x[idx]
-end
-
 # Gets the name of a function. Used in groupedataframe/grouping.jl
 function _fnames(fs::Vector{Function})
     λcounter = 0
@@ -188,4 +175,4 @@ function _fnames(fs::Vector{Function})
         name
     end
     names
-end
+end