Skip to content

Commit 50f3154

Browse files
committed
make sorting of row groups optional
By default no sorting is applied: groups keep the order in which their first rows appear in the original data frame, which preserves the original ordering and makes grouping faster.
1 parent c4972fe commit 50f3154

File tree

3 files changed

+39
-22
lines changed

3 files changed

+39
-22
lines changed

src/groupeddataframe/grouping.jl

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ groupby(cols)
3737
3838
* `d` : an AbstractDataFrame
3939
* `cols` : data frame columns to group by
40+
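* `sort` : whether to sort the row groups by the values of the grouping columns (`false` by default, so groups keep the order in which they first appear)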
4041
4142
If `d` is not provided, a curried version of groupby is given.
4243
@@ -81,20 +82,24 @@ df |> groupby([:a, :b]) |> [sum, length]
8182
```
8283
8384
"""
84-
function groupby{T}(df::AbstractDataFrame, cols::Vector{T})
85+
function groupby{T}(df::AbstractDataFrame, cols::Vector{T}; sort::Bool = false)
8586
sdf = df[cols]
8687
df_groups = _group_rows(sdf)
8788
# sort the groups
88-
group_perm = sortperm(sub(sdf, df_groups.rperm[df_groups.starts]))
89+
if sort
90+
group_perm = sortperm(sub(sdf, df_groups.rperm[df_groups.starts]))
91+
permute!(df_groups.starts, group_perm)
92+
permute!(df_groups.stops, group_perm)
93+
end
8994
GroupedDataFrame(df, cols, df_groups.rperm,
90-
df_groups.starts[group_perm],
91-
df_groups.stops[group_perm])
95+
df_groups.starts,
96+
df_groups.stops)
9297
end
93-
groupby(d::AbstractDataFrame, cols) = groupby(d, [cols])
98+
groupby(d::AbstractDataFrame, cols; sort::Bool = false) = groupby(d, [cols], sort = sort)
9499

95100
# add a function curry
96-
groupby{T}(cols::Vector{T}) = x -> groupby(x, cols)
97-
groupby(cols) = x -> groupby(x, cols)
101+
groupby{T}(cols::Vector{T}; sort::Bool = false) = x -> groupby(x, cols, sort = sort)
102+
groupby(cols; sort::Bool = false) = x -> groupby(x, cols, sort = sort)
98103

99104
Base.start(gd::GroupedDataFrame) = 1
100105
Base.next(gd::GroupedDataFrame, state::Int) =
@@ -241,8 +246,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d`
241246
based on columns `cols`
242247
243248
```julia
244-
by(d::AbstractDataFrame, cols, f::Function)
245-
by(f::Function, d::AbstractDataFrame, cols)
249+
by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false)
250+
by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false)
246251
```
247252
248253
### Arguments
@@ -251,6 +256,7 @@ by(f::Function, d::AbstractDataFrame, cols)
251256
* `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.)
252257
* `f` : a function to be applied to groups; expects each argument to
253258
be an AbstractDataFrame
259+
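* `sort` : whether to sort the row groups by the values of the grouping columns (`false` by default, so groups keep the order in which they first appear)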
254260
255261
`f` can return a value, a vector, or a DataFrame. For a value or
256262
vector, these are merged into a column along with the `cols` keys. For
@@ -281,8 +287,10 @@ end
281287
```
282288
283289
"""
284-
by(d::AbstractDataFrame, cols, f::Function) = combine(map(f, groupby(d, cols)))
285-
by(f::Function, d::AbstractDataFrame, cols) = by(d, cols, f)
290+
by(d::AbstractDataFrame, cols, f::Function; sort::Bool = false) =
291+
combine(map(f, groupby(d, cols, sort = sort)))
292+
by(f::Function, d::AbstractDataFrame, cols; sort::Bool = false) =
293+
by(d, cols, f, sort = sort)
286294

287295
#
288296
# Aggregate convenience functions
@@ -342,8 +350,9 @@ Base.(:|>)(gd::GroupedDataFrame, fs::Vector{Function}) = aggregate(gd, fs)
342350
# Groups DataFrame by cols before applying aggregate
343351
function aggregate{T <: ColumnIndex}(d::AbstractDataFrame,
344352
cols::@compat(Union{T, AbstractVector{T}}),
345-
fs::@compat(Union{Function, Vector{Function}}))
346-
aggregate(groupby(d, cols), fs)
353+
fs::@compat(Union{Function, Vector{Function}});
354+
sort::Bool = false)
355+
aggregate(groupby(d, cols, sort = sort), fs)
347356
end
348357

349358
function _makeheaders(fs::Vector{Function}, cn::Vector{Symbol})

test/data.jl

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -102,18 +102,16 @@ module TestData
102102
df8 = aggregate(df7[[1, 3]], sum)
103103
@test df8[1, :d1_sum] == sum(df7[:d1])
104104

105-
df8 = aggregate(df7, :d2, [sum, length])
105+
df8 = aggregate(df7, :d2, [sum, length], sort=true)
106106
@test df8[1:2, :d2] == ["A", "B"]
107107
@test size(df8, 1) == 3
108108
@test size(df8, 2) == 5
109109
@test sum(df8[:d1_length]) == N
110110
@test all(df8[:d1_length] .> 0)
111111
@test df8[:d1_length] == [4, 5, 11]
112-
@test isequal(df8, aggregate(groupby(df7, :d2), [sum, length]))
112+
@test isequal(df8, aggregate(groupby(df7, :d2, sort=true), [sum, length]))
113113

114-
df9 = df7 |> groupby([:d2]) |> [sum, length]
115-
@test isequal(df9, df8)
116-
df9 = aggregate(df7, :d2, [sum, length])
114+
df9 = df7 |> groupby([:d2], sort=true) |> [sum, length]
117115
@test isequal(df9, df8)
118116

119117
df10 = DataFrame(

test/grouping.jl

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,29 +2,39 @@ module TestGrouping
22
using Base.Test
33
using DataFrames
44

5-
df = DataFrame(a=rep(1:4, 2), b=rep(2:-1:1, 4), c=randn(8))
5+
df = DataFrame(a=rep(4:-1:1, 2), b=rep(1:2, 4), c=randn(8))
66
#df[6, :a] = NA
77
#df[7, :b] = NA
88

99
cols = [:a, :b]
1010

1111
f(df) = DataFrame(cmax = maximum(df[:c]))
1212

13-
sdf = sort(df, cols=cols)
13+
sdf = unique(df[cols])
14+
15+
# by() without groups sorting
1416
bdf = by(df, cols, f)
17+
@test bdf[cols] == sdf
1518

16-
@test bdf[cols] == unique(sdf[cols])
19+
# by() with groups sorting
20+
sbdf = by(df, cols, f, sort=true)
21+
@test sbdf[cols] == sort(sdf)
1722

1823
byf = by(df, :a, df -> DataFrame(bsum = sum(df[:b])))
1924

2025
@test all(T -> T <: AbstractVector, map(typeof, colwise([sum], df)))
2126
@test all(T -> T <: AbstractVector, map(typeof, colwise(sum, df)))
2227

28+
# groupby() without groups sorting
2329
gd = groupby(df, cols)
2430
ga = map(f, gd)
25-
2631
@test bdf == combine(ga)
2732

33+
# groupby() with groups sorting
34+
gd = groupby(df, cols, sort=true)
35+
ga = map(f, gd)
36+
@test sbdf == combine(ga)
37+
2838
g(df) = DataFrame(cmax1 = df[:cmax] + 1)
2939
h(df) = g(f(df))
3040

0 commit comments

Comments
 (0)