@@ -36,7 +36,7 @@ groupby(cols)
36
36
### Arguments
37
37
38
38
* `d` : an AbstractDataFrame
39
- * `cols` : an
39
+ * `cols` : data frame columns to group by
40
40
41
41
If `d` is not provided, a curried version of groupby is given.
42
42
@@ -82,32 +82,19 @@ df |> groupby([:a, :b]) |> [sum, length]
82
82
83
83
"""
84
84
function groupby {T} (d:: AbstractDataFrame , cols:: Vector{T} )
85
- # # a subset of Wes McKinney's algorithm here:
86
- # # http://wesmckinney.com/blog/?p=489
87
-
88
85
ncols = length (cols)
89
- # use the pool trick to get a set of integer references for each unique item
90
- dv = PooledDataArray (d[cols[ncols]])
91
- # if there are NAs, add 1 to the refs to avoid underflows in x later
92
- dv_has_nas = (findfirst (dv. refs, 0 ) > 0 ? 1 : 0 )
93
- x = copy (dv. refs) .+ dv_has_nas
94
- # also compute the number of groups, which is the product of the set lengths
95
- ngroups = length (dv. pool) + dv_has_nas
96
- # if there's more than 1 column, do roughly the same thing repeatedly
97
- for j = (ncols - 1 ): - 1 : 1
98
- dv = PooledDataArray (d[cols[j]])
99
- dv_has_nas = (findfirst (dv. refs, 0 ) > 0 ? 1 : 0 )
100
- for i = 1 : nrow (d)
101
- x[i] += (dv. refs[i] + dv_has_nas- 1 ) * ngroups
102
- end
103
- ngroups = ngroups * (length (dv. pool) + dv_has_nas)
104
- # TODO if ngroups is really big, shrink it
86
+ d_groups = _group_rows (d[cols])
87
+ # sort the groups
88
+ d_group_keys = sort! (collect (keys (d_groups)))
89
+ # generate permutation that arranges rows by groups
90
+ idx = sizehint! (@compat (Vector {Int} ()), nrow (d))
91
+ starts = sizehint! (@compat (Vector {Int} ()), length (d_groups))
92
+ for gr_row in d_group_keys
93
+ push! (starts, length (idx)+ 1 )
94
+ append! (idx, d_groups[gr_row])
105
95
end
106
- (idx, starts) = DataArrays. groupsort_indexer (x, ngroups)
107
- # Remove zero-length groupings
108
- starts = _uniqueofsorted (starts)
109
- ends = starts[2 : end ] - 1
110
- GroupedDataFrame (d, cols, idx, starts[1 : end - 1 ], ends)
96
+ ends = push! (starts[2 : end ] - 1 , length (idx))
97
+ GroupedDataFrame (d, cols, idx, starts, ends)
111
98
end
112
99
groupby (d:: AbstractDataFrame , cols) = groupby (d, [cols])
113
100
@@ -284,7 +271,7 @@ notation can be used.
284
271
285
272
### Returns
286
273
287
- * `::DataFrame`
274
+ * `::DataFrame`
288
275
289
276
### Examples
290
277
@@ -330,7 +317,7 @@ same length.
330
317
331
318
### Returns
332
319
333
- * `::DataFrame`
320
+ * `::DataFrame`
334
321
335
322
### Examples
336
323
0 commit comments