Skip to content

Commit 00d2b4a

Browse files
committed
Merge pull request #36 from JuliaStats/ts/data-table-timings
Add benchmarks from R's data.table benchmarks
2 parents aa022e5 + dd90af1 commit 00d2b4a

File tree

1 file changed

+86
-0
lines changed

1 file changed

+86
-0
lines changed

test/data.table.timings.jl

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
# Julia benchmarks from R's data.table
# https://github.com/Rdatatable/data.table/wiki/Benchmarks-:-Grouping

using DataFrames, DataFramesMeta
using NullableArrays

# N is the number of rows generated for every benchmark table below.
# K is the number of distinct values drawn for the "large groups" id columns;
# the "small groups" id columns draw from N/K distinct values instead.
N=10_000_000; K=100
# Seed the global RNG before the tables are filled with `rand` draws.
srand(1)

# Array version
#
# Nine columns of N random draws each: six id columns (symbols and integers,
# in large-group and small-group flavours) and three value columns.
#
# NOTE(review): `P` is not defined anywhere in the visible part of this file,
# so each `P(...)` call below fails unless `P` is supplied elsewhere.
# TODO confirm which wrapper/constructor `P` is meant to be.

DA = DataFrame(
    id1 = P(rand([symbol(string("id", i)) for i=1:K], N)),  # large groups (char)
    id2 = P(rand([symbol(string("id", i)) for i=1:K], N)),  # large groups (char)
    id3 = P(rand([symbol(string("id", i)) for i=1:Int(N/K)], N)),  # small groups (char)
    id4 = P(rand(1:K, N)),  # large groups (int)
    id5 = P(rand(1:K, N)),  # large groups (int)
    id6 = P(rand(1:Int(N/K), N)),  # small groups (int)
    v1 = P(rand(1:5, N)),  # int in range [1,5]
    v2 = P(rand(1:5, N)),  # int in range [1,5]
    v3 = P(rand(N))  # numeric e.g. 23.5749
);

# PooledDataArray version
#
# Same nine-column layout as the other benchmark tables, but the six id
# columns are wrapped in `PooledDataArray`. The three value columns go
# through `P` instead.
#
# NOTE(review): `P` is not defined anywhere in the visible part of this file,
# so the `v1`/`v2`/`v3` columns fail to build unless `P` is supplied
# elsewhere. TODO confirm which wrapper/constructor `P` is meant to be.

DPDA = DataFrame(
    id1 = PooledDataArray(rand([symbol(string("id", i)) for i=1:K], N)),  # large groups (char)
    id2 = PooledDataArray(rand([symbol(string("id", i)) for i=1:K], N)),  # large groups (char)
    id3 = PooledDataArray(rand([symbol(string("id", i)) for i=1:Int(N/K)], N)),  # small groups (char)
    id4 = PooledDataArray(rand(1:K, N)),  # large groups (int)
    id5 = PooledDataArray(rand(1:K, N)),  # large groups (int)
    id6 = PooledDataArray(rand(1:Int(N/K), N)),  # small groups (int)
    v1 = P(rand(1:5, N)),  # int in range [1,5]
    v2 = P(rand(1:5, N)),  # int in range [1,5]
    v3 = P(rand(N))  # numeric e.g. 23.5749
);

# DataArray version
#
# Same nine-column layout as the other benchmark tables. Every column is
# passed to `DataFrame` as the plain result of `rand`, with no wrapper call.
#
# NOTE(review): nothing here constructs a DataArray explicitly; presumably the
# `DataFrame` constructor in use converts plain vectors itself. Verify
# against the DataFrames version this benchmark targets.

DDA = DataFrame(
    id1 = (rand([symbol(string("id", i)) for i=1:K], N)),  # large groups (char)
    id2 = (rand([symbol(string("id", i)) for i=1:K], N)),  # large groups (char)
    id3 = (rand([symbol(string("id", i)) for i=1:Int(N/K)], N)),  # small groups (char)
    id4 = (rand(1:K, N)),  # large groups (int)
    id5 = (rand(1:K, N)),  # large groups (int)
    id6 = (rand(1:Int(N/K), N)),  # small groups (int)
    v1 = (rand(1:5, N)),  # int in range [1,5]
    v2 = (rand(1:5, N)),  # int in range [1,5]
    v3 = (rand(N))  # numeric e.g. 23.5749
);

# NullableArray version
#
# Same nine-column layout as the other benchmark tables. Every column is a
# `NullableArray` built from the `rand` draw and then passed through `P`.
#
# NOTE(review): `P` is not defined anywhere in the visible part of this file,
# so each `P(...)` call below fails unless `P` is supplied elsewhere.
# TODO confirm which wrapper/constructor `P` is meant to be.

DNA = DataFrame(
    id1 = P(NullableArray(rand([symbol(string("id", i)) for i=1:K], N))),  # large groups (char)
    id2 = P(NullableArray(rand([symbol(string("id", i)) for i=1:K], N))),  # large groups (char)
    id3 = P(NullableArray(rand([symbol(string("id", i)) for i=1:Int(N/K)], N))),  # small groups (char)
    id4 = P(NullableArray(rand(1:K, N))),  # large groups (int)
    id5 = P(NullableArray(rand(1:K, N))),  # large groups (int)
    id6 = P(NullableArray(rand(1:Int(N/K), N))),  # small groups (int)
    v1 = P(NullableArray(rand(1:5, N))),  # int in range [1,5]
    v2 = P(NullableArray(rand(1:5, N))),  # int in range [1,5]
    v3 = P(NullableArray(rand(N)))  # numeric e.g. 23.5749
);


# Run the grouping benchmarks against the table `D`, printing one `@time`
# report per statement. Returns nothing.
#
# `D` must provide the columns used below: :id1, :id2, :id3, :v1, :v3 by
# name, plus columns 4 and 6-9 by position.
#
# Every query is issued twice in a row; presumably the first run absorbs
# compilation so that the second reflects steady-state cost.
function dt_timings(D)
    # Sum of v1 grouped by one id column.
    @time @by(D, :id1, sv =sum(:v1));
    @time @by(D, :id1, sv =sum(:v1));
    # Sum of v1 grouped by a pair of id columns.
    @time @by(D, [:id1, :id2], sv =sum(:v1));
    @time @by(D, [:id1, :id2], sv =sum(:v1));
    # Sum of v1 and mean of v3 grouped by :id3.
    @time @by(D, :id3, sv = sum(:v1), mv3 = mean(:v3));
    @time @by(D, :id3, sv = sum(:v1), mv3 = mean(:v3));
    # Columns 4 and 7:9 (id4, v1, v2, v3 in the tables built in this file),
    # aggregated with `mean` per :id4.
    @time aggregate(D[[4,7:9;]], :id4, mean);
    @time aggregate(D[[4,7:9;]], :id4, mean);
    # Columns 6 and 7:9 (id6, v1, v2, v3 in the tables built in this file),
    # aggregated with `sum` per :id6.
    @time aggregate(D[[6,7:9;]], :id6, sum);
    @time aggregate(D[[6,7:9;]], :id6, sum);
    return
end

# Time the benchmark suite once per table representation.
dt_timings(DA)
dt_timings(DPDA)
dt_timings(DNA)
dt_timings(DDA)

# Profile a single grouping query.
# The original statement referenced `D`, which is only the parameter name of
# `dt_timings` and is never bound at top level, so it raised an undefined
# variable error. It is pointed at `DA` here.
# NOTE(review): confirm `DA` is the table that was meant to be profiled.
@profile @by(DA, :id1, sv =sum(:v1));

0 commit comments

Comments
 (0)