Skip to content

Commit

Permalink
dplyr syntax according to tidyverse/dplyr#4090
Browse files (browse the repository at this point in the history)
  • Loading branch information
jangorecki committed Jan 20, 2019
1 parent 2f562ae commit d0f256f
Showing 1 changed file with 10 additions and 10 deletions.
20 changes: 10 additions & 10 deletions report-code.R
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@ groupby.code = list(
"sum v1 by id1" = c( # q1
"dask"="x.groupby(['id1']).agg({'v1':'sum'}).compute()",
"data.table"="DT[, .(v1=sum(v1)), by=id1]",
"dplyr"="DF %>% group_by(id1) %>% summarise(sum(v1))",
"dplyr"="DF %>% group_by(id1, .drop=TRUE) %>% summarise(sum(v1))",
"juliadf"="by(x, :id1, v1 = :v1 => sum)",
"pandas"="DF.groupby(['id1']).agg({'v1':'sum'})",
"pydatatable"="DT[:, {'v1': sum(f.v1)}, by(f.id1)]",
Expand All @@ -11,7 +11,7 @@ groupby.code = list(
"sum v1 by id1:id2" = c( # q2
"dask"="x.groupby(['id1','id2']).agg({'v1':'sum'}).compute()",
"data.table"="DT[, .(v1=sum(v1)), by=.(id1, id2)]",
"dplyr"="DF %>% group_by(id1,id2) %>% summarise(sum(v1))",
"dplyr"="DF %>% group_by(id1, id2, .drop=TRUE) %>% summarise(sum(v1))",
"juliadf"="by(x, [:id1, :id2], v1 = :v1 => sum)",
"pandas"="DF.groupby(['id1','id2']).agg({'v1':'sum'})",
"pydatatable"="DT[:, {'v1': sum(f.v1)}, by(f.id1, f.id2)]",
Expand All @@ -20,7 +20,7 @@ groupby.code = list(
"sum v1 mean v3 by id3" = c( # q3
"dask"="x.groupby(['id3']).agg({'v1':'sum', 'v3':'mean'}).compute()",
"data.table"="DT[, .(v1=sum(v1), v3=mean(v3)), by=id3]",
"dplyr"="DF %>% group_by(id3) %>% summarise(sum(v1), mean(v3))",
"dplyr"="DF %>% group_by(id3, .drop=TRUE) %>% summarise(sum(v1), mean(v3))",
"juliadf"="by(x, :id3, v1 = :v1 => sum, v3 = :v3 => mean)",
"pandas"="DF.groupby(['id3']).agg({'v1':'sum', 'v3':'mean'})",
"pydatatable"="DT[:, {'v1': sum(f.v1), 'v3': mean(f.v3)}, by(f.id3)]",
Expand All @@ -29,7 +29,7 @@ groupby.code = list(
"mean v1:v3 by id4" = c( # q4
"dask"="x.groupby(['id4']).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'}).compute()",
"data.table"="DT[, lapply(.SD, mean), by=id4, .SDcols=v1:v3]",
"dplyr"="DF %>% group_by(id4) %>% summarise_each(funs(mean), vars=7:9)",
"dplyr"="DF %>% group_by(id4, .drop=TRUE) %>% summarise_each(funs(mean), vars=7:9)",
"juliadf"="by(x, :id4, v1 = :v1 => mean, v2 = :v2 => mean, v3 = :v3 => mean)",
"pandas"="DF.groupby(['id4']).agg({'v1':'mean', 'v2':'mean', 'v3':'mean'})",
"pydatatable"="DT[:, {'v1': mean(f.v1), 'v2': mean(f.v2), 'v3': mean(f.v3)}, by(f.id4)]",
Expand All @@ -38,7 +38,7 @@ groupby.code = list(
"sum v1:v3 by id6" = c( # q5
"dask"="x.groupby(['id6']).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'}).compute()",
"data.table"="DT[, lapply(.SD, sum), by=id6, .SDcols=v1:v3]",
"dplyr"="DF %>% group_by(id6) %>% summarise_each(funs(sum), vars=7:9)",
"dplyr"="DF %>% group_by(id6, .drop=TRUE) %>% summarise_each(funs(sum), vars=7:9)",
"juliadf"="by(x, :id6, v1 = :v1 => sum, v2 = :v2 => sum, v3 = :v3 => sum)",
"pandas"="DF.groupby(['id6']).agg({'v1':'sum', 'v2':'sum', 'v3':'sum'})",
"pydatatable"="DT[:, {'v1': sum(f.v1), 'v2': sum(f.v2), 'v3': sum(f.v3)}, by(f.id6)]",
Expand All @@ -47,7 +47,7 @@ groupby.code = list(
"median v3 sd v3 by id2 id4" = c ( # q6
"dask" = "median not yet implemented: dask#4362", # x.groupby(['id2','id4']).agg({'v3': ['median','std']}).compute()
"data.table" = "DT[, .(median_v3=median(v3), sd_v3=sd(v3)), by=.(id2, id4)]",
"dplyr" = "DF %>% group_by(id2, id4) %>% summarise(median_v3=median(v3), sd_v3=sd(v3))",
"dplyr" = "DF %>% group_by(id2, id4, .drop=TRUE) %>% summarise(median_v3=median(v3), sd_v3=sd(v3))",
"juliadf" = "by(x, [:id2, :id4], median_v3 = :v3 => median, sd_v3 = :v3 => std)",
"pandas" = "x.groupby(['id2','id4']).agg({'v3': ['median','std']})",
"pydatatable" = "median not yet implemented: datatable#1530", # x[:, {'median_v3': median(f.v3), 'sd_v3': sd(f.v3)}, by(f.id2, f.id4)]
Expand All @@ -56,7 +56,7 @@ groupby.code = list(
"max v1 - min v2 by id2 id4" = c ( # q7
"dask" = "x.groupby(['id2','id4']).apply(lambda x: pd.Series({'range_v1_v2': x['v1'].max()-x['v2'].min()}), meta={'range_v1_v2': 'int64'}).compute()",
"data.table" = "DT[, .(range_v1_v2=max(v1)-min(v2)), by=.(id2, id4)]",
"dplyr" = "DF %>% group_by(id2, id4) %>% summarise(range_v1_v2=max(v1)-min(v2))",
"dplyr" = "DF %>% group_by(id2, id4, .drop=TRUE) %>% summarise(range_v1_v2=max(v1)-min(v2))",
"juliadf" = "by(x, [:id2, :id4], range_v1_v2 = [:v1, :v2] => x -> maximum(skipmissing(x.v1))-minimum(skipmissing(x.v2)))",
"pandas" = "x.groupby(['id2','id4']).apply(lambda x: pd.Series({'range_v1_v2': x['v1'].max()-x['v2'].min()}))",
"pydatatable" = "x[:, {'range_v1_v2': max(f.v1)-min(f.v2)}, by(f.id2, f.id4)]",
Expand All @@ -65,7 +65,7 @@ groupby.code = list(
"largest two v3 by id2 id4" = c ( # q8
"dask" = "x[['id2','id4','v3']].groupby(['id2','id4']).apply(lambda x: x.nlargest(2, columns='v3'), meta={'id2': 'category', 'id4': 'int64', 'v3': 'float64'})[['v3']].compute()",
"data.table" = "DT[order(-v3), .(largest2_v3=head(v3, 2L)), by=.(id2, id4)]",
"dplyr" = "DF %>% select(id2, id4, largest2_v3=v3) %>% arrange(desc(largest2_v3)) %>% group_by(id2, id4) %>% filter(row_number() <= 2L)",
"dplyr" = "DF %>% select(id2, id4, largest2_v3=v3) %>% arrange(desc(largest2_v3)) %>% group_by(id2, id4, .drop=TRUE) %>% filter(row_number() <= 2L)",
"juliadf" = "by(x, [:id2, :id4], largest2_v3 = :v3 => x -> partialsort(x, 1:2, rev=true))",
"pandas" = "x[['id2','id4','v3']].sort_values('v3', ascending=False).groupby(['id2','id4']).head(2)",
"pydatatable" = "not yet implemented: datatable#1531",
Expand All @@ -74,7 +74,7 @@ groupby.code = list(
"regression v1 v2 by id2 id4" = c ( # q9
"dask" = "x[['id2','id4','v1','v2']].groupby(['id2','id4']).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}), meta={'r2': 'float64'}).compute()",
"data.table" = "DT[, .(r2=cor(v1, v2)^2), by=.(id2, id4)]",
"dplyr" = "DF %>% group_by(id2, id4) %>% summarise(r2=cor(v1, v2)^2)",
"dplyr" = "DF %>% group_by(id2, id4, .drop=TRUE) %>% summarise(r2=cor(v1, v2)^2)",
"juliadf" = "by(x, [:id2, :id4], r2 = [:v1, :v2] => x -> cor(x.v1, x.v2)^2)",
"pandas" = "x[['id2','id4','v1','v2']].groupby(['id2','id4']).apply(lambda x: pd.Series({'r2': x.corr()['v1']['v2']**2}))",
"pydatatable" = "not yet implemented: datatable#1543", # x[:, {'r2': cor(v1, v2)^2}, by(f.id2, f.id4)],
Expand All @@ -83,7 +83,7 @@ groupby.code = list(
"sum v3 count by id1:id6" = c( # q10
"dask" = "x.groupby(['id1','id2','id3','id4','id5','id6']).agg({'v3':'sum', 'v1':'count'}).compute()",
"data.table" = "DT[, .(v3=sum(v3), count=.N), by=id1:id6]",
"dplyr" = "DF %>% group_by(id1, id2, id3, id4, id5, id6) %>% summarise(v3=sum(v3), count=n())",
"dplyr" = "DF %>% group_by(id1, id2, id3, id4, id5, id6, .drop=TRUE) %>% summarise(v3=sum(v3), count=n())",
"juliadf" = "by(x, [:id1, :id2, :id3, :id4, :id5, :id6], v3 = :v3 => sum, count = :v3 => length)",
"pandas" = "x.groupby(['id1','id2','id3','id4','id5','id6']).agg({'v3':'sum', 'v1':'count'})",
"pydatatable" = "x[:, {'v3': sum(f.v3), 'count': count()}, by(f.id1, f.id2, f.id3, f.id4, f.id5, f.id6)]",
Expand Down

0 comments on commit d0f256f

Please sign in to comment.