26
26
# Split
27
27
#
28
28
29
"""
    groupsort_indexer(x, ngroups, null_last=false)

Counting sort of the integer group codes in `x`, where code 0 stands for NULL and
codes `1:ngroups` identify the groups.

Returns a tuple `(indexer, offsets, sizes)`:

* `indexer` : row numbers of `x` arranged so that rows sharing a code are contiguous;
  the NULL rows come first, or last when `null_last` is `true`
* `offsets` : for each code (slot `code + 1`), the position in `indexer` just past
  the last row of that code
* `sizes` : for each code (slot `code + 1`), the number of rows carrying it
"""
function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false)
    # Translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).
    nrows = length(x)

    # Tally the size of every group; slot 1 is reserved for code 0 (NULL).
    sizes = zeros(Int, ngroups + 1)
    for row in 1:nrows
        sizes[x[row] + 1] += 1
    end

    # Running sum of the sizes gives the first output position of each group.
    # With `null_last` the first real group starts at position 1 and the NULL
    # slot is placed after the final group.
    offsets = ones(Int, ngroups + 1)
    firstslot = null_last ? 3 : 2
    for slot in firstslot:(ngroups + 1)
        offsets[slot] = offsets[slot - 1] + sizes[slot - 1]
    end
    if null_last
        offsets[1] = offsets[end] + sizes[end]
    end

    # Drop every row number into the next free position of its group, advancing
    # that group's cursor as we go.
    indexer = zeros(Int, nrows)
    for row in 1:nrows
        slot = x[row] + 1
        indexer[offsets[slot]] = row
        offsets[slot] += 1
    end
    return indexer, offsets, sizes
end
62
+
63
"""
    combine_col!(x, col, ngroups, sort)

Assign a zero-based integer code to each distinct value of `col` and fold it into the
running group codes: `x[i] += code(col[i]) * ngroups`.

* `x` : vector of combined group codes, updated in place
* `col` : column whose values are being encoded
* `ngroups` : multiplier applied to the codes of `col` before they are added to `x`
* `sort` : if `true`, codes follow the sorted order of the distinct values;
  otherwise they follow the order of first appearance in `col`

Returns the number of distinct values found in `col`.
"""
function combine_col!(x::AbstractVector, col::AbstractVector,
                      ngroups::Integer, sort::Bool)
    codeof = Dict{eltype(col), UInt32}()
    y = similar(x, UInt32)
    n = 0
    # Note: using get! here instead triggers lots of allocations, hence the direct
    # lookup of the slot through the Dict internals.
    @inbounds for i in eachindex(x)
        v = col[i]
        slot = Base.ht_keyindex(codeof, v)
        if slot < 0 # new level
            codeof[v] = n
            y[i] = n
            n += 1
        else
            y[i] = codeof.vals[slot]
        end
    end

    if sort
        # rank[code + 1] is the zero-based position of that level in sorted order.
        # keys and values are collected from the same Dict, so entry p of one
        # belongs with entry p of the other.
        levelkeys = collect(keys(codeof))
        levelcodes = collect(values(codeof))
        rank = zeros(Int, n)
        r = 0
        for p in sortperm(levelkeys)
            rank[levelcodes[p] + 1] = r
            r += 1
        end

        @inbounds for i in eachindex(x)
            x[i] += rank[y[i] + 1] * ngroups
        end
    else
        @inbounds for i in eachindex(x)
            x[i] += y[i] * ngroups
        end
    end

    return n
end
99
+
100
"""
    combine_col!(x, col::Union{AbstractCategoricalVector, AbstractNullableCategoricalVector},
                 ngroups, sort)

More efficient method for categorical columns, which works from the references of
`col` directly instead of hashing its values. Levels are always sorted, so `sort`
is not consulted.

For every row, the zero-based order of the level referenced by `col.refs[i]`, times
`ngroups`, is added to `x[i]`. Reference 0 gets the code `nlevels`, which places
nulls after all levels.

Returns the number of levels of `col`, plus one when `col` has a `Nullable` element
type and at least one reference equal to 0 was encountered.
"""
function combine_col!(x::AbstractVector,
                      col::Union{AbstractCategoricalVector, AbstractNullableCategoricalVector},
                      ngroups::Integer, sort::Bool)
    nlevels = length(levels(col))
    ranks = CategoricalArrays.order(col.pool)

    # lookup[ref + 1] is the zero-based code associated with reference `ref`
    lookup = similar(ranks, length(ranks) + 1)
    lookup[1] = nlevels # Sort nulls last, only used if present
    for k in 1:length(ranks)
        lookup[k + 1] = ranks[k] - 1
    end

    # Null references are only tracked for nullable element types
    tracknulls = eltype(col) <: Nullable
    sawnull = false
    @inbounds for i in eachindex(x)
        ref = col.refs[i]
        x[i] += lookup[ref + 1] * ngroups
        if tracknulls && ref == 0
            sawnull = true
        end
    end
    return nlevels + sawnull
end
120
+
29
121
"""
30
122
A view of an AbstractDataTable split into row groups
31
123
32
124
```julia
33
- groupby(d::AbstractDataTable, cols)
34
- groupby(cols)
125
+ groupby(d::AbstractDataTable, cols; sort = true )
126
+ groupby(cols; sort = true )
35
127
```
36
128
37
129
### Arguments
38
130
39
131
* `d` : an AbstractDataTable to split (optional, see [Returns](#returns))
40
132
* `cols` : data table columns to group by
133
+ * `sort`: whether to sort row groups; disable sorting for maximum performance
41
134
42
135
### Returns
43
136
@@ -79,17 +172,24 @@ dt |> groupby([:a, :b]) |> [sum, length]
79
172
```
80
173
81
174
"""
82
function groupby{T}(d::AbstractDataTable, cols::Vector{T}; sort::Bool = true)
    # A subset of Wes McKinney's algorithm, described at
    # http://wesmckinney.com/blog/?p=489

    # Every row starts with code 1; each key column then adds its own level code
    # scaled by the number of groups accumulated so far.
    codes = ones(UInt32, nrow(d))
    ngroups = 1
    for j in reverse(1:length(cols))
        # combine_col! returns the number of levels of the column, so ngroups ends
        # up as the product of the level counts
        ngroups *= combine_col!(codes, d[cols[j]], ngroups, sort)
        # TODO if ngroups is really big, shrink it
    end

    idx, bounds = groupsort_indexer(codes, ngroups)
    # Remove zero-length groupings
    starts = _groupedunique!(bounds)
    # Each group stops right before the next boundary; the final boundary does not
    # open a group, so it is dropped from `starts`.
    stops = starts[2:end]
    stops .-= 1
    pop!(starts)
    return GroupedDataTable(d, cols, idx, starts, stops)
end
94
194
# Convenience method for a single grouping column: wraps `cols` in a one-element
# vector and forwards to the vector method. `sort` defaults to `true` so that this
# method agrees with the vector method and with the documented signature
# `groupby(d::AbstractDataTable, cols; sort = true)`.
groupby(d::AbstractDataTable, cols; sort::Bool = true) = groupby(d, [cols], sort = sort)
95
195
@@ -263,8 +363,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
263
363
based on columns `col`
264
364
265
365
```julia
266
- by(d::AbstractDataTable, cols, f::Function; sort::Bool = false )
267
- by(f::Function, d::AbstractDataTable, cols; sort::Bool = false )
366
+ by(d::AbstractDataTable, cols, f::Function; sort::Bool = true )
367
+ by(f::Function, d::AbstractDataTable, cols; sort::Bool = true )
268
368
```
269
369
270
370
### Arguments
@@ -273,7 +373,7 @@ by(f::Function, d::AbstractDataTable, cols; sort::Bool = false)
273
373
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
274
374
* `f` : a function to be applied to groups; expects each argument to
275
375
be an AbstractDataTable
276
- * `sort`: sort row groups (no sorting by default)
376
+ * `sort`: whether to sort row groups; disable sorting for maximum performance
277
377
278
378
`f` can return a value, a vector, or a DataTable. For a value or
279
379
vector, these are merged into a column along with the `cols` keys. For
@@ -321,8 +421,8 @@ Split-apply-combine that applies a set of functions over columns of an
321
421
AbstractDataTable or GroupedDataTable
322
422
323
423
```julia
324
- aggregate(d::AbstractDataTable, cols, fs)
325
- aggregate(gd::GroupedDataTable, fs)
424
+ aggregate(d::AbstractDataTable, cols, fs; sort::Bool=true )
425
+ aggregate(gd::GroupedDataTable, fs; sort::Bool=true )
326
426
```
327
427
328
428
### Arguments
@@ -332,6 +432,7 @@ aggregate(gd::GroupedDataTable, fs)
332
432
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
333
433
* `fs` : a function or vector of functions to be applied to vectors
334
434
within groups; expects each argument to be a column vector
435
+ * `sort`: whether to sort row groups; disable sorting for maximum performance
335
436
336
437
Each `fs` should return a value or vector. All returns must be the
337
438
same length.
@@ -353,15 +454,17 @@ dt |> groupby(:a) |> [sum, x->mean(dropnull(x))] # equivalent
353
454
```
354
455
355
456
"""
356
function aggregate(d::AbstractDataTable, fs::Function; sort::Bool=true)
    # A lone function is handled as a one-element vector of functions.
    return aggregate(d, [fs], sort=sort)
end

function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=true)
    # Output column names are derived from the functions and the column names of `d`.
    return _aggregate(d, fs, _makeheaders(fs, _names(d)), sort)
end
361
463
362
464
# Applies aggregate to non-key cols of each SubDataTable of a GroupedDataTable
363
- aggregate (gd:: GroupedDataTable , f:: Function ; sort:: Bool = false ) = aggregate (gd, [f], sort= sort)
364
- function aggregate {T<:Function} (gd:: GroupedDataTable , fs:: Vector{T} ; sort:: Bool = false )
465
+ aggregate (gd:: GroupedDataTable , f:: Function ; sort:: Bool = true ) =
466
+ aggregate (gd, [f], sort= sort)
467
+ function aggregate {T<:Function} (gd:: GroupedDataTable , fs:: Vector{T} ; sort:: Bool = true )
365
468
headers = _makeheaders (fs, setdiff (_names (gd), gd. cols))
366
469
res = combine (map (x -> _aggregate (without (x, gd. cols), fs, headers), gd))
367
470
sort && sort! (res, cols= headers)
375
478
# Groups `d` by `cols`, then applies `fs` to the non-key columns of each group.
# `sort` is forwarded to both steps: the GroupedDataTable method of `aggregate`
# defaults to `sort=true`, so without forwarding it the result would still be
# sorted when the caller asked for `sort=false`.
function aggregate{S<:ColumnIndex, T<:Function}(d::AbstractDataTable,
                                                cols::Union{S, AbstractVector{S}},
                                                fs::Union{T, Vector{T}};
                                                sort::Bool=true)
    return aggregate(groupby(d, cols, sort=sort), fs, sort=sort)
end
381
484
@@ -384,7 +487,8 @@ function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol})
384
487
[Symbol (colname,' _' ,fname) for fname in fnames for colname in cn]
385
488
end
386
489
387
- function _aggregate {T<:Function} (d:: AbstractDataTable , fs:: Vector{T} , headers:: Vector{Symbol} , sort:: Bool = false )
490
+ function _aggregate {T<:Function} (d:: AbstractDataTable , fs:: Vector{T} ,
491
+ headers:: Vector{Symbol} , sort:: Bool = true )
388
492
res = DataTable (Any[vcat (f (d[i])) for f in fs for i in 1 : size (d, 2 )], headers)
389
493
sort && sort! (res, cols= headers)
390
494
res
0 commit comments