Skip to content

Commit df8411e

Browse files
committed
grouping: replace groupsort by _group_rows()
1 parent 8951261 commit df8411e

File tree

2 files changed

+15
-41
lines changed

2 files changed

+15
-41
lines changed

src/groupeddataframe/grouping.jl

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ groupby(cols)
3636
### Arguments
3737
3838
* `d` : an AbstractDataFrame
39-
* `cols` : an
39+
* `cols` : data frame columns to group by
4040
4141
If `d` is not provided, a curried version of groupby is given.
4242
@@ -82,32 +82,19 @@ df |> groupby([:a, :b]) |> [sum, length]
8282
8383
"""
8484
function groupby{T}(d::AbstractDataFrame, cols::Vector{T})
85-
## a subset of Wes McKinney's algorithm here:
86-
## http://wesmckinney.com/blog/?p=489
87-
8885
ncols = length(cols)
89-
# use the pool trick to get a set of integer references for each unique item
90-
dv = PooledDataArray(d[cols[ncols]])
91-
# if there are NAs, add 1 to the refs to avoid underflows in x later
92-
dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
93-
x = copy(dv.refs) .+ dv_has_nas
94-
# also compute the number of groups, which is the product of the set lengths
95-
ngroups = length(dv.pool) + dv_has_nas
96-
# if there's more than 1 column, do roughly the same thing repeatedly
97-
for j = (ncols - 1):-1:1
98-
dv = PooledDataArray(d[cols[j]])
99-
dv_has_nas = (findfirst(dv.refs, 0) > 0 ? 1 : 0)
100-
for i = 1:nrow(d)
101-
x[i] += (dv.refs[i] + dv_has_nas- 1) * ngroups
102-
end
103-
ngroups = ngroups * (length(dv.pool) + dv_has_nas)
104-
# TODO if ngroups is really big, shrink it
86+
d_groups = _group_rows(d[cols])
87+
# sort the groups
88+
d_group_keys = sort!(collect(keys(d_groups)))
89+
# generate permutation that arranges rows by groups
90+
idx = sizehint!(@compat(Vector{Int}()), nrow(d))
91+
starts = sizehint!(@compat(Vector{Int}()), length(d_groups))
92+
for gr_row in d_group_keys
93+
push!(starts, length(idx)+1)
94+
append!(idx, d_groups[gr_row])
10595
end
106-
(idx, starts) = DataArrays.groupsort_indexer(x, ngroups)
107-
# Remove zero-length groupings
108-
starts = _uniqueofsorted(starts)
109-
ends = starts[2:end] - 1
110-
GroupedDataFrame(d, cols, idx, starts[1:end-1], ends)
96+
ends = push!(starts[2:end] - 1, length(idx))
97+
GroupedDataFrame(d, cols, idx, starts, ends)
11198
end
11299
groupby(d::AbstractDataFrame, cols) = groupby(d, [cols])
113100

@@ -284,7 +271,7 @@ notation can be used.
284271
285272
### Returns
286273
287-
* `::DataFrame`
274+
* `::DataFrame`
288275
289276
### Examples
290277
@@ -330,7 +317,7 @@ same length.
330317
331318
### Returns
332319
333-
* `::DataFrame`
320+
* `::DataFrame`
334321
335322
### Examples
336323

src/other/utils.jl

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -162,19 +162,6 @@ function _setdiff{T}(a::AbstractVector{T}, b::T)
162162
diff
163163
end
164164

165-
function _uniqueofsorted(x::Vector)
166-
idx = fill(true, length(x))
167-
lastx = x[1]
168-
for i = 2:length(x)
169-
if lastx == x[i]
170-
idx[i] = false
171-
else
172-
lastx = x[i]
173-
end
174-
end
175-
x[idx]
176-
end
177-
178165
# Gets the name of a function. Used in groupeddataframe/grouping.jl
179166
function _fnames(fs::Vector{Function})
180167
λcounter = 0
@@ -188,4 +175,4 @@ function _fnames(fs::Vector{Function})
188175
name
189176
end
190177
names
191-
end
178+
end

0 commit comments

Comments
 (0)