From 20c71d6d40b3b238e902189b8262ba2b2e679b31 Mon Sep 17 00:00:00 2001 From: Cameron Prybol Date: Mon, 6 Mar 2017 12:36:34 -0800 Subject: [PATCH] Enhance joining and grouping (#17) Using a hashing approach rather than converting all columns to categorical arrays. Based on work by @alyst in DataFrames. --- REQUIRE | 2 +- docs/src/man/joins.md | 10 +- src/DataTables.jl | 1 + src/abstractdatatable/abstractdatatable.jl | 17 +- src/abstractdatatable/join.jl | 404 ++++++++++----------- src/abstractdatatable/reshape.jl | 2 +- src/abstractdatatable/sort.jl | 69 +--- src/datatable/datatable.jl | 14 +- src/datatablerow/datatablerow.jl | 75 +++- src/datatablerow/utils.jl | 194 ++++++++++ src/groupeddatatable/grouping.jl | 129 ++----- src/other/utils.jl | 13 - src/subdatatable/subdatatable.jl | 12 +- test/cat.jl | 6 +- test/constructors.jl | 5 + test/data.jl | 38 +- test/datatable.jl | 13 +- test/datatablerow.jl | 50 +++ test/grouping.jl | 29 +- test/join.jl | 10 +- 20 files changed, 640 insertions(+), 453 deletions(-) create mode 100644 src/datatablerow/utils.jl diff --git a/REQUIRE b/REQUIRE index d4d0eff..826e3d1 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,6 +1,6 @@ julia 0.5 NullableArrays 0.1.0 -CategoricalArrays 0.0.6 +CategoricalArrays 0.1.2 StatsBase 0.11.0 GZip SortingAlgorithms diff --git a/docs/src/man/joins.md b/docs/src/man/joins.md index 0212b5f..254ea97 100644 --- a/docs/src/man/joins.md +++ b/docs/src/man/joins.md @@ -51,7 +51,7 @@ Cross joins are the only kind of join that does not use a key: join(a, b, kind = :cross) ``` -In order to join data frames on keys which have different names, you must first rename them so that they match. This can be done using rename!: +In order to join data tables on keys which have different names, you must first rename them so that they match. 
This can be done using rename!: ```julia a = DataTable(ID = [1, 2], Name = ["A", "B"]) @@ -63,11 +63,11 @@ join(a, b, on = :ID, kind = :inner) Or renaming multiple columns at a time: ```julia -a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"], - Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], +a = DataTable(City = ["Amsterdam", "London", "London", "New York", "New York"], + Job = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], Category = [1, 2, 3, 4, 5]) -b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"], - Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], +b = DataTable(Location = ["Amsterdam", "London", "London", "New York", "New York"], + Work = ["Lawyer", "Lawyer", "Lawyer", "Doctor", "Doctor"], Name = ["a", "b", "c", "d", "e"]) rename!(b, [:Location => :City, :Work => :Job]) join(a, b, on = [:City, :Job]) diff --git a/src/DataTables.jl b/src/DataTables.jl index c1b1479..6d68ae4 100644 --- a/src/DataTables.jl +++ b/src/DataTables.jl @@ -104,6 +104,7 @@ for (dir, filename) in [ ("subdatatable", "subdatatable.jl"), ("groupeddatatable", "grouping.jl"), ("datatablerow", "datatablerow.jl"), + ("datatablerow", "utils.jl"), ("abstractdatatable", "iteration.jl"), ("abstractdatatable", "join.jl"), diff --git a/src/abstractdatatable/abstractdatatable.jl b/src/abstractdatatable/abstractdatatable.jl index e616338..a885136 100644 --- a/src/abstractdatatable/abstractdatatable.jl +++ b/src/abstractdatatable/abstractdatatable.jl @@ -602,17 +602,14 @@ nonunique(dt, 1) """ function nonunique(dt::AbstractDataTable) - res = fill(false, nrow(dt)) - rows = Set{DataTableRow}() - for i in 1:nrow(dt) - arow = DataTableRow(dt, i) - if in(arow, rows) - res[i] = true - else - push!(rows, arow) - end + gslots = row_group_slots(dt)[3] + # unique rows are the first encountered group representatives, + # nonunique are everything else + res = fill(true, nrow(dt)) + @inbounds for g_row in gslots + (g_row > 0) 
&& (res[g_row] = false) end - res + return res end nonunique(dt::AbstractDataTable, cols::Union{Real, Symbol}) = nonunique(dt[[cols]]) diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl index 8523a5a..1ad170b 100644 --- a/src/abstractdatatable/join.jl +++ b/src/abstractdatatable/join.jl @@ -15,205 +15,166 @@ similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::@compat(Union{Int, Tuple{ similar_nullable(dt::AbstractDataTable, dims::Int) = DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt))) -function join_idx(left, right, max_groups) - ## adapted from Wes McKinney's full_outer_join in pandas (file: src/join.pyx). - - # NULL group in location 0 - - left_sorter, where, left_count = groupsort_indexer(left, max_groups) - right_sorter, where, right_count = groupsort_indexer(right, max_groups) - - # First pass, determine size of result set - tcount = 0 - rcount = 0 - lcount = 0 - for i in 1:(max_groups + 1) - lc = left_count[i] - rc = right_count[i] - - if rc > 0 && lc > 0 - tcount += lc * rc - elseif rc > 0 - rcount += rc - else - lcount += lc - end - end - - # group 0 is the NULL group - tposition = 0 - lposition = 0 - rposition = 0 - - left_pos = 0 - right_pos = 0 - - left_indexer = Vector{Int}(tcount) - right_indexer = Vector{Int}(tcount) - leftonly_indexer = Vector{Int}(lcount) - rightonly_indexer = Vector{Int}(rcount) - for i in 1:(max_groups + 1) - lc = left_count[i] - rc = right_count[i] - if rc == 0 - for j in 1:lc - leftonly_indexer[lposition + j] = left_pos + j - end - lposition += lc - elseif lc == 0 - for j in 1:rc - rightonly_indexer[rposition + j] = right_pos + j - end - rposition += rc - else - for j in 1:lc - offset = tposition + (j-1) * rc - for k in 1:rc - left_indexer[offset + k] = left_pos + j - right_indexer[offset + k] = right_pos + k - end - end - tposition += lc * rc - end - left_pos += lc - right_pos += rc - end - - ## (left_sorter, left_indexer, leftonly_indexer, - ## right_sorter, 
right_indexer, rightonly_indexer) - (left_sorter[left_indexer], left_sorter[leftonly_indexer], - right_sorter[right_indexer], right_sorter[rightonly_indexer]) -end - -function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, - v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, - index::Vector{S}, - R) - tidx1 = convert(Vector{R}, indexin(CategoricalArrays.index(v1.pool), index)) - tidx2 = convert(Vector{R}, indexin(CategoricalArrays.index(v2.pool), index)) - refs1 = zeros(R, length(v1)) - refs2 = zeros(R, length(v2)) - for i in 1:length(refs1) - if v1.refs[i] != 0 - refs1[i] = tidx1[v1.refs[i]] - end +# helper structure for DataTables joining +immutable DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable} + dtl::DT1 + dtr::DT2 + dtl_on::DT1 + dtr_on::DT2 + on_cols::Vector{Symbol} + + function DataTableJoiner(dtl::DT1, dtr::DT2, on::Union{Symbol,Vector{Symbol}}) + on_cols = isa(on, Symbol) ? [on] : on + new(dtl, dtr, dtl[on_cols], dtr[on_cols], on_cols) end - for i in 1:length(refs2) - if v2.refs[i] != 0 - refs2[i] = tidx2[v2.refs[i]] - end - end - pool = CategoricalPool{S, R}(index) - return (CategoricalArray(refs1, pool), - CategoricalArray(refs2, pool)) end -function sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, - v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}) - index = sort(unique([levels(v1); levels(v2)])) - sz = length(index) - - R = sz <= typemax(UInt8) ? UInt8 : - sz <= typemax(UInt16) ? UInt16 : - sz <= typemax(UInt32) ? 
UInt32 : - UInt64 +DataTableJoiner{DT1<:AbstractDataTable, DT2<:AbstractDataTable}(dtl::DT1, dtr::DT2, on::Union{Symbol,Vector{Symbol}}) = + DataTableJoiner{DT1,DT2}(dtl, dtr, on) - # To ensure type stability during actual work - sharepools(v1, v2, index, R) +# helper map between the row indices in original and joined table +immutable RowIndexMap + "row indices in the original table" + orig::Vector{Int} + "row indices in the resulting joined table" + join::Vector{Int} end -sharepools{S,N}(v1::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}, - v2::AbstractArray{S,N}) = - sharepools(v1, oftype(v1, v2)) - -sharepools{S,N}(v1::AbstractArray{S,N}, - v2::Union{CategoricalArray{S,N}, NullableCategoricalArray{S,N}}) = - sharepools(oftype(v2, v1), v2) - -# TODO: write an optimized version for (Nullable)CategoricalArray -function sharepools{S, T}(v1::AbstractArray{S}, - v2::AbstractArray{T}) - ## Return two categorical arrays that share the same pool. - - ## TODO: allow specification of R - R = CategoricalArrays.DefaultRefType - refs1 = Array{R}(size(v1)) - refs2 = Array{R}(size(v2)) - K = promote_type(S, T) - poolref = Dict{K, R}() - maxref = 0 - - # loop through once to fill the poolref dict - for i = 1:length(v1) - if !_isnull(v1[i]) - poolref[K(v1[i])] = 0 - end +Base.length(x::RowIndexMap) = length(x.orig) + +# composes the joined data table using the maps between the left and right +# table rows and the indices of rows in the result +function compose_joined_table(joiner::DataTableJoiner, + left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap, + right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap) + @assert length(left_ixs) == length(right_ixs) + # compose left half of the result taking all left columns + all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig) + if length(leftonly_ixs) > 0 + # combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows + # preserving the original rows order + all_orig_left_ixs = 
similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs)) + @inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig + @inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig + else + # the result contains only the left rows that are matched to right rows (left_ixs) + all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere + end + ril = length(right_ixs) + loil = length(leftonly_ixs) + roil = length(rightonly_ixs) + left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil) + for col in columns(joiner.dtl)], + names(joiner.dtl)) + + # compose right half of the result taking all right columns excluding on + dtr_noon = without(joiner.dtr, joiner.on_cols) + # permutation to swap rightonly and leftonly rows + right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil) + if length(leftonly_ixs) > 0 + # compose right_perm with the permutation that restores left rows order + right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil] end - for i = 1:length(v2) - if !_isnull(v2[i]) - poolref[K(v2[i])] = 0 + all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig) + right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm] + for col in columns(dtr_noon)], + names(dtr_noon)) + # merge left and right parts of the joined table + res = hcat!(left_dt, right_dt) + + if length(rightonly_ixs.join) > 0 + # some left rows are nulls, so the values of the "on" columns + # need to be taken from the right + for (on_col_ix, on_col) in enumerate(joiner.on_cols) + # fix the result of the rightjoin by taking the nonnull values from the right table + res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix] end end + return res +end - # fill positions in poolref - pool = sort(collect(keys(poolref))) - i = 1 - for p in pool - poolref[p] = i - i += 1 +# map the indices of the left and right joined tables +# to the 
indices of the rows in the resulting table +# if `nothing` is given, the corresponding map is not built +function update_row_maps!(left_table::AbstractDataTable, + right_table::AbstractDataTable, + right_dict::RowGroupDict, + left_ixs::Union{Void, RowIndexMap}, + leftonly_ixs::Union{Void, RowIndexMap}, + right_ixs::Union{Void, RowIndexMap}, + rightonly_mask::Union{Void, Vector{Bool}}) + # helper functions + @inline update!(ixs::Void, orig_ix::Int, join_ix::Int, count::Int = 1) = nothing + @inline function update!(ixs::RowIndexMap, orig_ix::Int, join_ix::Int, count::Int = 1) + n = length(ixs.orig) + resize!(ixs.orig, n+count) + ixs.orig[n+1:end] = orig_ix + append!(ixs.join, join_ix:(join_ix+count-1)) + ixs end - - # fill in newrefs - zeroval = zero(R) - for i = 1:length(v1) - if _isnull(v1[i]) - refs1[i] = zeroval - else - refs1[i] = poolref[K(v1[i])] - end + @inline update!(ixs::Void, orig_ixs::AbstractArray, join_ix::Int) = nothing + @inline function update!(ixs::RowIndexMap, orig_ixs::AbstractArray, join_ix::Int) + append!(ixs.orig, orig_ixs) + append!(ixs.join, join_ix:(join_ix+length(orig_ixs)-1)) + ixs end - for i = 1:length(v2) - if _isnull(v2[i]) - refs2[i] = zeroval + @inline update!(ixs::Void, orig_ixs::AbstractArray) = nothing + @inline update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] = false) + + # iterate over left rows and compose the left<->right index map + next_join_ix = 1 + for l_ix in 1:nrow(left_table) + r_ixs = findrows(right_dict, left_table, l_ix) + if isempty(r_ixs) + update!(leftonly_ixs, l_ix, next_join_ix) + next_join_ix += 1 else - refs2[i] = poolref[K(v2[i])] + update!(left_ixs, l_ix, next_join_ix, length(r_ixs)) + update!(right_ixs, r_ixs, next_join_ix) + update!(rightonly_mask, r_ixs) + next_join_ix += length(r_ixs) end end - - pool = CategoricalPool(pool) - return (NullableCategoricalArray(refs1, pool), - NullableCategoricalArray(refs2, pool)) end -function sharepools(dt1::AbstractDataTable, 
dt2::AbstractDataTable) - # This method exists to allow merge to work with multiple columns. - # It takes the columns of each DataTable and returns a categorical array - # with a merged pool that "keys" the combination of column values. - # The pools of the result don't really mean anything. - dv1, dv2 = sharepools(dt1[1], dt2[1]) - # use UInt32 instead of the minimum integer size chosen by sharepools - # since the number of levels can be high - refs1 = Vector{UInt32}(dv1.refs) - refs2 = Vector{UInt32}(dv2.refs) - # the + 1 handles nulls - refs1[:] += 1 - refs2[:] += 1 - ngroups = length(levels(dv1)) + 1 - for j = 2:ncol(dt1) - dv1, dv2 = sharepools(dt1[j], dt2[j]) - for i = 1:length(refs1) - refs1[i] += (dv1.refs[i]) * ngroups - end - for i = 1:length(refs2) - refs2[i] += (dv2.refs[i]) * ngroups - end - ngroups *= length(levels(dv1)) + 1 +# map the row indices of the left and right joined tables +# to the indices of rows in the resulting table +# returns the 4-tuple of row indices maps for +# - matching left rows +# - non-matching left rows +# - matching right rows +# - non-matching right rows +# if false is provided, the corresponding map is not built and the +# tuple element is empty RowIndexMap +function update_row_maps!(left_table::AbstractDataTable, + right_table::AbstractDataTable, + right_dict::RowGroupDict, + map_left::Bool, map_leftonly::Bool, + map_right::Bool, map_rightonly::Bool) + init_map(dt::AbstractDataTable, init::Bool) = init ? + RowIndexMap(sizehint!(Vector{Int}(), nrow(dt)), + sizehint!(Vector{Int}(), nrow(dt))) : nothing + to_bimap(x::RowIndexMap) = x + to_bimap(::Void) = RowIndexMap(Vector{Int}(), Vector{Int}()) + + # init maps as requested + left_ixs = init_map(left_table, map_left) + leftonly_ixs = init_map(left_table, map_leftonly) + right_ixs = init_map(right_table, map_right) + rightonly_mask = map_rightonly ? 
fill(true, nrow(right_table)) : nothing + update_row_maps!(left_table, right_table, right_dict, left_ixs, leftonly_ixs, right_ixs, rightonly_mask) + if map_rightonly + rightonly_orig_ixs = find(rightonly_mask) + rightonly_ixs = RowIndexMap(rightonly_orig_ixs, + collect(length(right_ixs.orig) + + (leftonly_ixs === nothing ? 0 : length(leftonly_ixs)) + + (1:length(rightonly_orig_ixs)))) + else + rightonly_ixs = nothing end - # recode refs1 and refs2 to drop the unused column combinations and - # limit the pool size - sharepools(refs1, refs2) -end + return to_bimap(left_ixs), to_bimap(leftonly_ixs), to_bimap(right_ixs), to_bimap(rightonly_ixs) +end """ Join two DataTables @@ -270,68 +231,67 @@ join(name, job, kind = :cross) """ function Base.join(dt1::AbstractDataTable, dt2::AbstractDataTable; - on::@compat(Union{Symbol, Vector{Symbol}}) = Symbol[], + on::Union{Symbol, Vector{Symbol}} = Symbol[], kind::Symbol = :inner) if kind == :cross - if on != Symbol[] - throw(ArgumentError("Cross joins don't use argument 'on'.")) - end + (on == Symbol[]) || throw(ArgumentError("Cross joins don't use argument 'on'.")) return crossjoin(dt1, dt2) elseif on == Symbol[] throw(ArgumentError("Missing join argument 'on'.")) end - dv1, dv2 = sharepools(dt1[on], dt2[on]) - - left_idx, leftonly_idx, right_idx, rightonly_idx = - join_idx(dv1.refs, dv2.refs, length(dv1.pool)) + joiner = DataTableJoiner(dt1, dt2, on) if kind == :inner - dt2w = without(dt2, on) - - left = dt1[left_idx, :] - right = dt2w[right_idx, :] - - return hcat!(left, right) + compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, false, true, false)...) 
elseif kind == :left - dt2w = without(dt2, on) - - left = dt1[[left_idx; leftonly_idx], :] - right = vcat(dt2w[right_idx, :], - similar_nullable(dt2w, length(leftonly_idx))) - - return hcat!(left, right) + compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, false)...) elseif kind == :right - dt1w = without(dt1, on) - - left = vcat(dt1w[left_idx, :], - similar_nullable(dt1w, length(rightonly_idx))) - right = dt2[[right_idx; rightonly_idx], :] - - return hcat!(left, right) + right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on, + group_rows(joiner.dtl_on), + true, true, true, false) + compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs) elseif kind == :outer - dt1w, dt2w = without(dt1, on), without(dt2, on) - - mixed = hcat!(dt1[left_idx, :], dt2w[right_idx, :]) - leftonly = hcat!(dt1[leftonly_idx, :], - similar_nullable(dt2w, length(leftonly_idx))) - rightonly = hcat!(similar_nullable(dt1w, length(rightonly_idx)), - dt2[rightonly_idx, :]) - - return vcat(mixed, leftonly, rightonly) + compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on, + group_rows(joiner.dtr_on), + true, true, true, true)...) 
elseif kind == :semi - dt1[unique(left_idx), :] + # hash the right rows + dtr_on_grp = group_rows(joiner.dtr_on) + # iterate over left rows and leave those found in right + left_ixs = Vector{Int}() + sizehint!(left_ixs, nrow(joiner.dtl)) + @inbounds for l_ix in 1:nrow(joiner.dtl_on) + if findrow(dtr_on_grp, joiner.dtl_on, l_ix) != 0 + push!(left_ixs, l_ix) + end + end + return joiner.dtl[left_ixs, :] elseif kind == :anti - dt1[leftonly_idx, :] + # hash the right rows + dtr_on_grp = group_rows(joiner.dtr_on) + # iterate over left rows and leave those not found in right + leftonly_ixs = Vector{Int}() + sizehint!(leftonly_ixs, nrow(joiner.dtl)) + @inbounds for l_ix in 1:nrow(joiner.dtl_on) + if findrow(dtr_on_grp, joiner.dtl_on, l_ix) == 0 + push!(leftonly_ixs, l_ix) + end + end + return joiner.dtl[leftonly_ixs, :] else - throw(ArgumentError("Unknown kind of join requested")) + throw(ArgumentError("Unknown kind of join requested: $kind")) end end function crossjoin(dt1::AbstractDataTable, dt2::AbstractDataTable) r1, r2 = size(dt1, 1), size(dt2, 1) - cols = Any[[Compat.repeat(c, inner=r2) for c in columns(dt1)]; - [Compat.repeat(c, outer=r1) for c in columns(dt2)]] + cols = Any[[repeat(c, inner=r2) for c in columns(dt1)]; + [repeat(c, outer=r1) for c in columns(dt2)]] colindex = merge(index(dt1), index(dt2)) DataTable(cols, colindex) end diff --git a/src/abstractdatatable/reshape.jl b/src/abstractdatatable/reshape.jl index cf40304..ed4d519 100644 --- a/src/abstractdatatable/reshape.jl +++ b/src/abstractdatatable/reshape.jl @@ -227,7 +227,7 @@ unstack(dt::AbstractDataTable, colkey, value) = function unstack(dt::AbstractDataTable, colkey::Int, value::Int) # group on anything not a key or value: - g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]])) + g = groupby(dt, setdiff(_names(dt), _names(dt)[[colkey, value]]), sort=true) groupidxs = [g.idx[g.starts[i]:g.ends[i]] for i in 1:length(g.starts)] rowkey = zeros(Int, size(dt, 1)) for i in 1:length(groupidxs) 
diff --git a/src/abstractdatatable/sort.jl b/src/abstractdatatable/sort.jl index cfe2d11..f732ff6 100644 --- a/src/abstractdatatable/sort.jl +++ b/src/abstractdatatable/sort.jl @@ -55,69 +55,36 @@ ordering(col::ColumnIndex, lt::Function, by::Function, rev::Bool, order::Orderin # the permutation induced by this ordering is used to # sort the original (presumably larger) DataTable -type DTPerm{O<:@compat(Union{Ordering, AbstractVector}), DT<:AbstractDataTable} <: Ordering +immutable DTPerm{O<:Union{Ordering, AbstractVector}, DT<:AbstractDataTable} <: Ordering ord::O dt::DT end -function DTPerm{O<:Ordering}(ords::AbstractVector{O}, dt::AbstractDataTable) +function DTPerm{O<:Ordering, DT<:AbstractDataTable}(ords::AbstractVector{O}, dt::DT) if length(ords) != ncol(dt) error("DTPerm: number of column orderings does not equal the number of DataTable columns") end - DTPerm{AbstractVector{O}, typeof(dt)}(ords, dt) + DTPerm{typeof(ords), DT}(ords, dt) end -DTPerm{O<:Ordering}(o::O, dt::AbstractDataTable) = DTPerm{O,typeof(dt)}(o,dt) +DTPerm{O<:Ordering, DT<:AbstractDataTable}(o::O, dt::DT) = DTPerm{O,DT}(o,dt) -# For sorting, a and b are row indices (first two lt definitions) -# For issorted, the default row iterator returns DataTableRows instead, -# so two more lt function is defined below -function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a, b) - for i = 1:ncol(o.dt) - if lt(o.ord[i], o.dt[a,i], o.dt[b,i]) - return true - end - if lt(o.ord[i], o.dt[b,i], o.dt[a,i]) - return false - end - end - false -end +# get ordering function for the i-th column used for ordering +col_ordering{O<:Ordering}(o::DTPerm{O}, i::Int) = o.ord +col_ordering{V<:AbstractVector}(o::DTPerm{V}, i::Int) = o.ord[i] -function Sort.lt{O<:Ordering}(o::DTPerm{O}, a, b) - for i = 1:ncol(o.dt) - if lt(o.ord, o.dt[a,i], o.dt[b,i]) - return true - end - if lt(o.ord, o.dt[b,i], o.dt[a,i]) - return false - end - end - false -end +Base.@propagate_inbounds Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j] 
+Base.@propagate_inbounds Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j] -function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a::DataTableRow, b::DataTableRow) - for i = 1:ncol(o.dt) - if lt(o.ord[i], a[i], b[i]) - return true - end - if lt(o.ord[i], b[i], a[i]) - return false - end - end - false -end - -function Sort.lt{O<:Ordering}(o::DTPerm{O}, a::DataTableRow, b::DataTableRow) - for i = 1:ncol(o.dt) - if lt(o.ord, a[i], b[i]) - return true - end - if lt(o.ord, b[i], a[i]) - return false - end +function Sort.lt(o::DTPerm, a, b) + @inbounds for i = 1:ncol(o.dt) + ord = col_ordering(o, i) + va = o[a, i] + vb = o[b, i] + lt(ord, va, vb) && return true + lt(ord, vb, va) && return false end - false + false # a and b are equal end ### @@ -306,5 +273,5 @@ for s in [:(Base.sort), :(Base.sortperm)] end Base.sort(dt::AbstractDataTable, a::Algorithm, o::Ordering) = dt[sortperm(dt, a, o),:] -Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::@compat(Union{Perm,DTPerm})) = sort!([1:size(dt, 1);], a, o) +Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Union{Perm,DTPerm}) = sort!([1:size(dt, 1);], a, o) Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt)) diff --git a/src/datatable/datatable.jl b/src/datatable/datatable.jl index 1ebac92..5eb0e7b 100644 --- a/src/datatable/datatable.jl +++ b/src/datatable/datatable.jl @@ -297,7 +297,7 @@ function Base.getindex{R<:Real}(dt::DataTable, return DataTable(new_columns, copy(index(dt))) end -# dt[:, :] => (Sub)?DataTable +# dt[:, :] => DataTable Base.getindex(dt::DataTable, ::Colon, ::Colon) = copy(dt) ############################################################################## @@ -792,17 +792,17 @@ end ## ############################################################################## -function categorical!(dt::DataTable, cname::@compat(Union{Integer, Symbol}), compact::Bool=true) +function categorical!(dt::DataTable, cname::Union{Integer, Symbol}, compact::Bool=true) 
dt[cname] = categorical(dt[cname], compact) - return + dt end -function categorical!{T <: @compat(Union{Integer, Symbol})}(dt::DataTable, cnames::Vector{T}, - compact::Bool=true) +function categorical!{T <: Union{Integer, Symbol}}(dt::DataTable, cnames::Vector{T}, + compact::Bool=true) for cname in cnames dt[cname] = categorical(dt[cname], compact) end - return + dt end function categorical!(dt::DataTable, compact::Bool=true) @@ -811,7 +811,7 @@ function categorical!(dt::DataTable, compact::Bool=true) dt[i] = categorical(dt[i], compact) end end - return + dt end function Base.append!(dt1::DataTable, dt2::AbstractDataTable) diff --git a/src/datatablerow/datatablerow.jl b/src/datatablerow/datatablerow.jl index a6b4a61..8cbade4 100644 --- a/src/datatablerow/datatablerow.jl +++ b/src/datatablerow/datatablerow.jl @@ -27,7 +27,7 @@ Base.length(r::DataTableRow) = size(r.dt, 2) Base.endof(r::DataTableRow) = size(r.dt, 2) -Base.collect(r::DataTableRow) = @compat Tuple{Symbol, Any}[x for x in r] +Base.collect(r::DataTableRow) = Tuple{Symbol, Any}[x for x in r] Base.start(r::DataTableRow) = 1 @@ -37,31 +37,80 @@ Base.done(r::DataTableRow, s) = s > length(r) Base.convert(::Type{Array}, r::DataTableRow) = convert(Array, r.dt[r.row,:]) +# hash column element +Base.@propagate_inbounds hash_colel(v::AbstractArray, i, h::UInt = zero(UInt)) = hash(v[i], h) +Base.@propagate_inbounds hash_colel{T<:Nullable}(v::AbstractArray{T}, i, h::UInt = zero(UInt)) = + isnull(v[i]) ? h + Base.nullablehash_seed : hash(unsafe_get(v[i]), h) +Base.@propagate_inbounds hash_colel{T}(v::NullableArray{T}, i, h::UInt = zero(UInt)) = + isnull(v, i) ? h + Base.nullablehash_seed : hash(v.values[i], h) +Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) = + hash(CategoricalArrays.index(v.pool)[v.refs[i]], h) +Base.@propagate_inbounds function hash_colel{T}(v::AbstractNullableCategoricalArray{T}, i, h::UInt = zero(UInt)) + ref = v.refs[i] + ref == 0 ? 
h + Base.nullablehash_seed : hash(CategoricalArrays.index(v.pool)[ref], h) +end + # hash of DataTable rows based on its values # so that duplicate rows would have the same hash -function Base.hash(r::DataTableRow, h::UInt) - for col in columns(r.dt) - if _isnull(col[r.row]) - h = hash(false, h) - else - h = hash(true, hash(col[r.row], h)) - end +function rowhash(dt::DataTable, r::Int, h::UInt = zero(UInt)) + @inbounds for col in columns(dt) + h = hash_colel(col, r, h) end return h end +Base.hash(r::DataTableRow, h::UInt = zero(UInt)) = rowhash(r.dt, r.row, h) + # comparison of DataTable rows # only the rows of the same DataTable could be compared # rows are equal if they have the same values (while the row indices could differ) +# returns Bool (`==` is defined via `isequal`) +# nulls compare equal to nulls; a null never equals a non-null value @compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow) = isequal(r1, r2) function Base.isequal(r1::DataTableRow, r2::DataTableRow) - r1.dt == r2.dt || throw(ArgumentError("Comparing rows from different frames not supported")) - r1.row == r2.row && return true - for col in columns(r1.dt) - if !isequal(col[r1.row], col[r2.row]) - return false + isequal_row(r1.dt, r1.row, r2.dt, r2.row) +end + +# internal method for comparing the elements of the same data table column +isequal_colel(col::AbstractArray, r1::Int, r2::Int) = + (r1 == r2) || isequal(Base.unsafe_getindex(col, r1), Base.unsafe_getindex(col, r2)) + +isequal_colel(a::Any, b::Any) = isequal(a, b) +isequal_colel(a::Nullable, b::Any) = !isnull(a) && isequal(unsafe_get(a), b) +isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a) +isequal_colel(a::Nullable, b::Nullable) = isequal(a, b) + +function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int) + if dt1 === dt2 + if r1 == r2 + return true end + elseif !(ncol(dt1) == ncol(dt2)) + throw(ArgumentError("Rows of the tables that have different number of columns cannot be compared. 
Got $(ncol(dt1)) and $(ncol(dt2)) columns")) + end + @inbounds for (col1, col2) in zip(columns(dt1), columns(dt2)) + isequal_colel(col1[r1], col2[r2]) || return false end return true end + +# lexicographic ordering on DataTable rows, null > !null; throws ArgumentError when the column counts differ +function Base.isless(r1::DataTableRow, r2::DataTableRow) + (ncol(r1.dt) == ncol(r2.dt)) || + throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(r1.dt)) and $(ncol(r2.dt)))")) + @inbounds for i in 1:ncol(r1.dt) + x = r1.dt[i][r1.row] + y = r2.dt[i][r2.row] + isnullx = _isnull(x) + isnully = _isnull(y) + (isnullx != isnully) && return isnully # null > !null + if !isnullx + v1 = unsafe_get(x) + v2 = unsafe_get(y) + isless(v1, v2) && return true + !isequal(v1, v2) && return false + end + end + return false +end diff --git a/src/datatablerow/utils.jl b/src/datatablerow/utils.jl new file mode 100644 index 0000000..eae6a0d --- /dev/null +++ b/src/datatablerow/utils.jl @@ -0,0 +1,194 @@ +# Rows grouping. +# Maps row contents to the indices of all the equal rows. 
+# Used by groupby(), join(), nonunique() +immutable RowGroupDict{T<:AbstractDataTable} + "source data table" + dt::T + "number of groups" + ngroups::Int + "row hashes" + rhashes::Vector{UInt} + "hashindex -> index of group-representative row" + gslots::Vector{Int} + "group index for each row" + groups::Vector{Int} + "permutation of row indices that sorts them by groups" + rperm::Vector{Int} + "starts of ranges in rperm for each group" + starts::Vector{Int} + "stops of ranges in rperm for each group" + stops::Vector{Int} +end + +# "kernel" functions for hashrows() +# adjust row hashes by the hashes of column elements +function hashrows_col!(h::Vector{UInt}, v::AbstractVector) + @inbounds for i in eachindex(h) + h[i] = hash(v[i], h[i]) + end + h +end + +if !isdefined(Base, :unsafe_get) + unsafe_get(x::Nullable) = x.value + unsafe_get(x::Any) = x +end + +function hashrows_col!{T<:Nullable}(h::Vector{UInt}, v::AbstractVector{T}) + @inbounds for i in eachindex(h) + h[i] = isnull(v[i]) ? + h[i] + Base.nullablehash_seed : + hash(unsafe_get(v[i]), h[i]) + end + h +end + +# should give the same hash as AbstractVector{T} +function hashrows_col!{T}(h::Vector{UInt}, v::AbstractCategoricalVector{T}) + # TODO is it possible to optimize by hashing the pool values once? + @inbounds for (i, ref) in enumerate(v.refs) + h[i] = hash(CategoricalArrays.index(v.pool)[ref], h[i]) + end + h +end + +# should give the same hash as AbstractNullableVector{T} +# enables efficient sequential memory access pattern +function hashrows_col!{T}(h::Vector{UInt}, v::AbstractNullableCategoricalVector{T}) + # TODO is it possible to optimize by hashing the pool values once? + @inbounds for (i, ref) in enumerate(v.refs) + h[i] = ref == 0 ? + h[i] + Base.nullablehash_seed : + hash(CategoricalArrays.index(v.pool)[ref], h[i]) + end + h +end + +# Calculate the vector of `dt` rows hash values. 
+function hashrows(dt::AbstractDataTable) + res = zeros(UInt, nrow(dt)) + for col in columns(dt) + hashrows_col!(res, col) + end + return res +end + +# Helper function for RowGroupDict. +# Returns a tuple: +# 1) the number of row groups in a data table +# 2) vector of row hashes +# 3) slot array for a hash map, non-zero values are +# the indices of the first row in a group +# Optional group vector is set to the group indices of each row +function row_group_slots(dt::AbstractDataTable, + groups::Union{Vector{Int}, Void} = nothing) + @assert groups === nothing || length(groups) == nrow(dt) + rhashes = hashrows(dt) + # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481 + sz = Base._tablesz(length(rhashes)) + @assert sz >= length(rhashes) + szm1 = sz-1 + gslots = zeros(Int, sz) + ngroups = 0 + @inbounds for i in eachindex(rhashes) + # find the slot and group index for a row + slotix = rhashes[i] & szm1 + 1 + gix = 0 + probe = 0 + while true + g_row = gslots[slotix] + if g_row == 0 # unoccupied slot, current row starts a new group + gslots[slotix] = i + gix = ngroups += 1 + break + elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit + eq = true + for col in columns(dt) + if !isequal_colel(col, i, g_row) + eq = false # miss + break + end + end + if eq # hit + gix = groups !== nothing ? groups[g_row] : 0 + break + end + end + slotix = slotix & szm1 + 1 # check the next slot + probe += 1 + @assert probe < sz + end + if groups !== nothing + groups[i] = gix + end + end + return ngroups, rhashes, gslots +end + +# Builds RowGroupDict for a given datatable. +# Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). 
+function group_rows(dt::AbstractDataTable) + groups = Vector{Int}(nrow(dt)) + ngroups, rhashes, gslots = row_group_slots(dt, groups) + + # count elements in each group + stops = zeros(Int, ngroups) + @inbounds for g_ix in groups + stops[g_ix] += 1 + end + + # group start positions in a sorted table + starts = Vector{Int}(ngroups) + if !isempty(starts) + starts[1] = 1 + @inbounds for i in 1:(ngroups-1) + starts[i+1] = starts[i] + stops[i] + end + end + + # define row permutation that sorts them into groups + rperm = Vector{Int}(length(groups)) + copy!(stops, starts) + @inbounds for (i, gix) in enumerate(groups) + rperm[stops[gix]] = i + stops[gix] += 1 + end + stops .-= 1 + return RowGroupDict(dt, ngroups, rhashes, gslots, groups, rperm, starts, stops) +end + +# Find index of a row in gd that matches given row by content, 0 if not found +function findrow(gd::RowGroupDict, dt::DataTable, row::Int) + (gd.dt === dt) && return row # same table, return itself + # different tables, content matching required + rhash = rowhash(dt, row) + szm1 = length(gd.gslots)-1 + slotix = ini_slotix = rhash & szm1 + 1 + while true + g_row = gd.gslots[slotix] + if g_row == 0 || # not found + (rhash == gd.rhashes[g_row] && + isequal_row(gd.dt, g_row, dt, row)) # found + return g_row + end + slotix = (slotix & szm1) + 1 # miss, try the next slot + (slotix == ini_slotix) && break + end + return 0 # not found +end + +# Find indices of rows in 'gd' that match given row by content. 
+# return empty set if no row matches +function findrows(gd::RowGroupDict, dt::DataTable, row::Int) + g_row = findrow(gd, dt, row) + (g_row == 0) && return view(gd.rperm, 0:-1) + gix = gd.groups[g_row] + return view(gd.rperm, gd.starts[gix]:gd.stops[gix]) +end + +function Base.getindex(gd::RowGroupDict, dtr::DataTableRow) + g_row = findrow(gd, dtr.dt, dtr.row) + (g_row == 0) && throw(KeyError(dtr)) + gix = gd.groups[g_row] + return view(gd.rperm, gd.starts[gix]:gd.stops[gix]) +end diff --git a/src/groupeddatatable/grouping.jl b/src/groupeddatatable/grouping.jl index 5857865..83db685 100644 --- a/src/groupeddatatable/grouping.jl +++ b/src/groupeddatatable/grouping.jl @@ -26,40 +26,6 @@ end # Split # -function groupsort_indexer(x::AbstractVector, ngroups::Integer, null_last::Bool=false) - # translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). - - # count group sizes, location 0 for NULL - n = length(x) - # counts = x.pool - counts = fill(0, ngroups + 1) - for i = 1:n - counts[x[i] + 1] += 1 - end - - # mark the start of each contiguous group of like-indexed data - where = fill(1, ngroups + 1) - if null_last - for i = 3:ngroups+1 - where[i] = where[i - 1] + counts[i - 1] - end - where[1] = where[end] + counts[end] - else - for i = 2:ngroups+1 - where[i] = where[i - 1] + counts[i - 1] - end - end - - # this is our indexer - result = fill(0, n) - for i = 1:n - label = x[i] + 1 - result[where[label]] = i - where[label] += 1 - end - result, where, counts -end - """ A view of an AbstractDataTable split into row groups @@ -71,7 +37,7 @@ groupby(cols) ### Arguments * `d` : an AbstractDataTable to split (optional, see [Returns](#returns)) -* `cols` : data frame columns to group by +* `cols` : data table columns to group by ### Returns @@ -113,49 +79,23 @@ dt |> groupby([:a, :b]) |> [sum, length] ``` """ -function groupby{T}(d::AbstractDataTable, cols::Vector{T}) - ## a subset of Wes McKinney's algorithm here: - ## 
http://wesmckinney.com/blog/?p=489 - - ncols = length(cols) - # use CategoricalArray to get a set of integer references for each unique item - nv = NullableCategoricalArray(d[cols[ncols]]) - # if there are NULLs, add 1 to the refs to avoid underflows in x later - anynulls = (findfirst(nv.refs, 0) > 0 ? 1 : 0) - # use UInt32 instead of the original array's integer size since the number of levels can be high - x = similar(nv.refs, UInt32) - for i = 1:nrow(d) - if nv.refs[i] == 0 - x[i] = 1 - else - x[i] = CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls - end +function groupby{T}(dt::AbstractDataTable, cols::Vector{T}; sort::Bool = false) + sdt = dt[cols] + dt_groups = group_rows(sdt) + # sort the groups + if sort + group_perm = sortperm(view(sdt, dt_groups.rperm[dt_groups.starts])) + permute!(dt_groups.starts, group_perm) + Base.permute!!(dt_groups.stops, group_perm) end - # also compute the number of groups, which is the product of the set lengths - ngroups = length(levels(nv)) + anynulls - # if there's more than 1 column, do roughly the same thing repeatedly - for j = (ncols - 1):-1:1 - nv = NullableCategoricalArray(d[cols[j]]) - anynulls = (findfirst(nv.refs, 0) > 0 ? 
1 : 0) - for i = 1:nrow(d) - if nv.refs[i] != 0 - x[i] += (CategoricalArrays.order(nv.pool)[nv.refs[i]] + anynulls - 1) * ngroups - end - end - ngroups = ngroups * (length(levels(nv)) + anynulls) - # TODO if ngroups is really big, shrink it - end - (idx, starts) = groupsort_indexer(x, ngroups) - # Remove zero-length groupings - starts = _uniqueofsorted(starts) - ends = starts[2:end] - 1 - GroupedDataTable(d, cols, idx, starts[1:end-1], ends) + GroupedDataTable(dt, cols, dt_groups.rperm, + dt_groups.starts, dt_groups.stops) end -groupby(d::AbstractDataTable, cols) = groupby(d, [cols]) +groupby(d::AbstractDataTable, cols; sort::Bool = false) = groupby(d, [cols], sort = sort) # add a function curry -groupby{T}(cols::Vector{T}) = x -> groupby(x, cols) -groupby(cols) = x -> groupby(x, cols) +groupby{T}(cols::Vector{T}; sort::Bool = false) = x -> groupby(x, cols, sort = sort) +groupby(cols; sort::Bool = false) = x -> groupby(x, cols, sort = sort) Base.start(gd::GroupedDataTable) = 1 Base.next(gd::GroupedDataTable, state::Int) = @@ -313,8 +253,8 @@ Split-apply-combine in one step; apply `f` to each grouping in `d` based on columns `col` ```julia -by(d::AbstractDataTable, cols, f::Function) -by(f::Function, d::AbstractDataTable, cols) +by(d::AbstractDataTable, cols, f::Function; sort::Bool = false) +by(f::Function, d::AbstractDataTable, cols; sort::Bool = false) ``` ### Arguments @@ -323,6 +263,7 @@ by(f::Function, d::AbstractDataTable, cols) * `cols` : a column indicator (Symbol, Int, Vector{Symbol}, etc.) * `f` : a function to be applied to groups; expects each argument to be an AbstractDataTable +* `sort`: sort row groups (no sorting by default) `f` can return a value, a vector, or a DataTable. For a value or vector, these are merged into a column along with the `cols` keys. 
For @@ -355,8 +296,10 @@ end ``` """ -by(d::AbstractDataTable, cols, f::Function) = combine(map(f, groupby(d, cols))) -by(f::Function, d::AbstractDataTable, cols) = by(d, cols, f) +by(d::AbstractDataTable, cols, f::Function; sort::Bool = false) = + combine(map(f, groupby(d, cols, sort = sort))) +by(f::Function, d::AbstractDataTable, cols; sort::Bool = false) = + by(d, cols, f, sort = sort) # # Aggregate convenience functions @@ -400,26 +343,30 @@ dt |> groupby(:a) |> [sum, x->mean(dropnull(x))] # equivalent ``` """ -aggregate(d::AbstractDataTable, fs::Function) = aggregate(d, [fs]) -function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}) +aggregate(d::AbstractDataTable, fs::Function; sort::Bool=false) = aggregate(d, [fs], sort=sort) +function aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}; sort::Bool=false) headers = _makeheaders(fs, _names(d)) - _aggregate(d, fs, headers) + _aggregate(d, fs, headers, sort) end # Applies aggregate to non-key cols of each SubDataTable of a GroupedDataTable -aggregate(gd::GroupedDataTable, f::Function) = aggregate(gd, [f]) -function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}) +aggregate(gd::GroupedDataTable, f::Function; sort::Bool=false) = aggregate(gd, [f], sort=sort) +function aggregate{T<:Function}(gd::GroupedDataTable, fs::Vector{T}; sort::Bool=false) headers = _makeheaders(fs, _setdiff(_names(gd), gd.cols)) - combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)) + res = combine(map(x -> _aggregate(without(x, gd.cols), fs, headers), gd)) + sort && sort!(res, cols=headers) + res end + (|>)(gd::GroupedDataTable, fs::Function) = aggregate(gd, fs) (|>){T<:Function}(gd::GroupedDataTable, fs::Vector{T}) = aggregate(gd, fs) # Groups DataTable by cols before applying aggregate -function aggregate{S <: ColumnIndex, T <:Function}(d::AbstractDataTable, - cols::@compat(Union{S, AbstractVector{S}}), - fs::@compat(Union{T, Vector{T}})) - aggregate(groupby(d, cols), fs) +function 
aggregate{S<:ColumnIndex, T <:Function}(d::AbstractDataTable, + cols::Union{S, AbstractVector{S}}, + fs::Union{T, Vector{T}}; + sort::Bool=false) + aggregate(groupby(d, cols, sort=sort), fs) end function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol}) @@ -428,6 +375,8 @@ function _makeheaders{T<:Function}(fs::Vector{T}, cn::Vector{Symbol}) length(fnames)*length(cn)) end -function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}, headers::Vector{Symbol}) - DataTable(colwise(fs, d), headers) +function _aggregate{T<:Function}(d::AbstractDataTable, fs::Vector{T}, headers::Vector{Symbol}, sort::Bool=false) + res = DataTable(colwise(fs, d), headers) + sort && sort!(res, cols=headers) + res end diff --git a/src/other/utils.jl b/src/other/utils.jl index 1931284..4badd76 100644 --- a/src/other/utils.jl +++ b/src/other/utils.jl @@ -175,19 +175,6 @@ function _setdiff{T}(a::AbstractVector{T}, b::T) diff end -function _uniqueofsorted(x::Vector) - idx = fill(true, length(x)) - lastx = x[1] - for i = 2:length(x) - if lastx == x[i] - idx[i] = false - else - lastx = x[i] - end - end - x[idx] -end - # Gets the name of a function. 
Used in groupedatatable/grouping.jl function _fnames{T<:Function}(fs::Vector{T}) λcounter = 0 diff --git a/src/subdatatable/subdatatable.jl b/src/subdatatable/subdatatable.jl index 77fee68..83c3c63 100644 --- a/src/subdatatable/subdatatable.jl +++ b/src/subdatatable/subdatatable.jl @@ -9,7 +9,7 @@ if VERSION >= v"0.6.0-dev.2643" include_string(""" immutable SubDataTable{T <: AbstractVector{Int}} <: AbstractDataTable parent::DataTable - rows::T # maps from subdf row indexes to parent row indexes + rows::T # maps from subdt row indexes to parent row indexes function SubDataTable{T}(parent::DataTable, rows::T) where {T <: AbstractVector{Int}} if length(rows) > 0 @@ -26,7 +26,7 @@ else @eval begin immutable SubDataTable{T <: AbstractVector{Int}} <: AbstractDataTable parent::DataTable - rows::T # maps from subdf row indexes to parent row indexes + rows::T # maps from subdt row indexes to parent row indexes function SubDataTable(parent::DataTable, rows::T) if length(rows) > 0 @@ -164,10 +164,4 @@ end Base.map(f::Function, sdt::SubDataTable) = f(sdt) # TODO: deprecate -function Base.delete!(sdt::SubDataTable, c::Any) # TODO: deprecate? 
- return SubDataTable(delete!(sdt.parent, c), sdt.rows) -end - -without(sdt::SubDataTable, c::Vector{Int}) = view(without(sdt.parent, c), sdt.rows) -without(sdt::SubDataTable, c::Int) = view(without(sdt.parent, c), sdt.rows) -without(sdt::SubDataTable, c::Any) = view(without(sdt.parent, c), sdt.rows) +without(sdt::SubDataTable, c) = view(without(sdt.parent, c), sdt.rows) diff --git a/test/cat.jl b/test/cat.jl index 8448bcb..ab4e2ab 100644 --- a/test/cat.jl +++ b/test/cat.jl @@ -58,17 +58,17 @@ module TestCat dt[1] = 3 dt[:x3] = 2 - # assignment of subframes + # assignment of subtables dt[1, 1:2] = dt[2, 2:3] dt[1:2, 1:2] = dt[2:3, 2:3] dt[[true,false,false,true], 2:3] = dt[1:2,1:2] - # scalar broadcasting assignment of subframes + # scalar broadcasting assignment of subtables dt[1, 1:2] = 3 dt[1:2, 1:2] = 3 dt[[true,false,false,true], 2:3] = 3 - # vector broadcasting assignment of subframes + # vector broadcasting assignment of subtables dt[1:2, 1:2] = [3,2] dt[[true,false,false,true], 2:3] = [2,3] diff --git a/test/constructors.jl b/test/constructors.jl index 324a412..6edf2e9 100644 --- a/test/constructors.jl +++ b/test/constructors.jl @@ -44,4 +44,9 @@ module TestConstructors @test isequal(dt, DataTable([Int, Float64], 2)) + @test_throws BoundsError SubDataTable(DataTable(A=1), 0) + @test_throws BoundsError SubDataTable(DataTable(A=1), 0) + @test isequal(SubDataTable(DataTable(A=1), 1), DataTable(A=1)) + @test isequal(SubDataTable(DataTable(A=1:10), 1:4), DataTable(A=1:4)) + @test isequal(view(SubDataTable(DataTable(A=1:10), 1:4), [true, true, false, false]), DataTable(A=1:2)) end diff --git a/test/data.jl b/test/data.jl index cda1e48..9259a6e 100644 --- a/test/data.jl +++ b/test/data.jl @@ -110,15 +110,22 @@ module TestData dt8 = aggregate(dt7[[1, 3]], sum) @test isequal(dt8[1, :d1_sum], sum(dt7[:d1])) - dt8 = aggregate(dt7, :d2, [sum, length]) + dt8 = aggregate(dt7, :d2, [sum, length], sort=true) + @test isequal(dt8[1:2, :d2], NullableCategoricalArray(["A", 
"B"])) @test size(dt8, 1) == 3 @test size(dt8, 2) == 5 - @test isequal(dt8[2, :d1_length], Nullable(4)) - @test isequal(dt8, aggregate(groupby(dt7, :d2), [sum, length])) - - dt9 = dt7 |> groupby([:d2]) |> [sum, length] + @test get(sum(dt8[:d1_length])) == N + @test all(dt8[:d1_length].values .> 0) + @test dt8[:d1_length].values == [4, 5, 11] + @test isequal(dt8, aggregate(groupby(dt7, :d2, sort=true), [sum, length])) + @test isequal(dt8[1, :d1_length], Nullable(4)) + @test isequal(dt8[2, :d1_length], Nullable(5)) + @test isequal(dt8[3, :d1_length], Nullable(11)) + @test isequal(dt8, aggregate(groupby(dt7, :d2), [sum, length], sort=true)) + + dt9 = dt7 |> groupby([:d2], sort=true) |> [sum, length] @test isequal(dt9, dt8) - dt9 = aggregate(dt7, :d2, [sum, length]) + dt9 = aggregate(dt7, :d2, [sum, length], sort=true) @test isequal(dt9, dt8) dt10 = DataTable( @@ -126,8 +133,8 @@ module TestData [:d1, :d2, :d3, :d4] ) - gd = groupby(dt10, [:d3]) - ggd = groupby(gd[1], [:d3, :d4]) # make sure we can groupby subdatatables + gd = groupby(dt10, [:d3], sort=true) + ggd = groupby(gd[1], [:d3, :d4], sort=true) # make sure we can groupby subdatatables @test ggd[1][1, :d3] == "a" @test ggd[1][1, :d4] == "c" @test ggd[1][2, :d3] == "a" @@ -207,8 +214,11 @@ module TestData b2 = [:A,:B,:C][rand(1:3, 5)], v2 = randn(5)) - m1 = join(dt1, dt2, on = :a) - @test isequal(m1[:a], NullableArray([1, 2, 3, 4, 5])) + m1 = join(dt1, dt2, on = :a, kind=:inner) + @test isequal(m1[:a], dt1[:a][dt1[:a].values .<= 5]) # preserves dt1 order + m2 = join(dt1, dt2, on = :a, kind = :outer) + @test isequal(m2[:a], dt1[:a]) # preserves dt1 order + @test isequal(m2[:b], dt1[:b]) # preserves dt1 order # TODO: Re-enable m2 = join(dt1, dt2, on = :a, kind = :outer) # @test isequal(m2[:b2], @@ -239,20 +249,20 @@ module TestData # test with nulls (issue #185) dt1 = DataTable() - dt1[:A] = NullableArray(Nullable{Compat.ASCIIString}["a", "b", "a", Nullable()]) + dt1[:A] = NullableArray(["a", "b", "a", 
Nullable()]) dt1[:B] = NullableArray([1, 2, 1, 3]) dt2 = DataTable() - dt2[:A] = NullableArray(Nullable{Compat.ASCIIString}["a", Nullable(), "c"]) + dt2[:A] = NullableArray(["a", Nullable(), "c"]) dt2[:C] = NullableArray([1, 2, 4]) m1 = join(dt1, dt2, on = :A) @test size(m1) == (3,3) - @test isequal(m1[:A], NullableArray(Nullable{Compat.ASCIIString}[Nullable(),"a","a"])) + @test isequal(m1[:A], NullableArray(["a","a", Nullable()])) m2 = join(dt1, dt2, on = :A, kind = :outer) @test size(m2) == (5,3) - @test isequal(m2[:A], NullableArray(Nullable{Compat.ASCIIString}[Nullable(),"a","a","b","c"])) + @test isequal(m2[:A], NullableArray(["a", "b", "a", Nullable(), "c"])) srand(1) dt1 = DataTable( diff --git a/test/datatable.jl b/test/datatable.jl index 306a176..c75f5fe 100644 --- a/test/datatable.jl +++ b/test/datatable.jl @@ -80,7 +80,6 @@ module TestDataTable b = NullableArray{String}(2), c = NullableCategoricalArray{Float64}(2)) @test isequal(nulldt, similar(dt, 2)) - @test isequal(nulldt, DataTables.similar_nullable(dt, 2)) # Associative methods @@ -330,4 +329,16 @@ module TestDataTable #This changes the expected result dt4[2,:Mass] = Nullable() @test isequal(dt2, dt4) + + dt = DataTable(A = 1:10, B = 'A':'J') + @test !(dt[:,:] === dt) + + @test append!(DataTable(A = 1:2, B = 1:2), DataTable(A = 3:4, B = 3:4)) == DataTable(A=1:4, B = 1:4) + @test !any(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6)).columns) + @test all(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1,2]).columns) + @test all(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A,:B]).columns) + @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [:A]).columns) == [1] + @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), :A).columns) == [1] + @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), [1]).columns) == 
[1] + @test find(c -> isa(c, NullableCategoricalArray), categorical!(DataTable(A=1:3, B=4:6), 1).columns) == [1] end diff --git a/test/datatablerow.jl b/test/datatablerow.jl index 7e5dc8f..c54f264 100644 --- a/test/datatablerow.jl +++ b/test/datatablerow.jl @@ -21,10 +21,60 @@ module TestDataTableRow @test isequal(DataTableRow(dt, 2), DataTableRow(dt, 5)) @test !isequal(DataTableRow(dt, 2), DataTableRow(dt, 6)) + # isless() + dt4 = DataTable(a=NullableArray([1, 1, 2, 2, 2, 2, Nullable(), Nullable()]), + b=NullableArray([2.0, 3.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0]), + c=NullableArray([:B, Nullable(), :A, :C, :D, :D, :A, :A])) + @test isless(DataTableRow(dt4, 1), DataTableRow(dt4, 2)) + @test !isless(DataTableRow(dt4, 2), DataTableRow(dt4, 1)) + @test !isless(DataTableRow(dt4, 1), DataTableRow(dt4, 1)) + @test isless(DataTableRow(dt4, 1), DataTableRow(dt4, 3)) + @test !isless(DataTableRow(dt4, 3), DataTableRow(dt4, 1)) + @test isless(DataTableRow(dt4, 3), DataTableRow(dt4, 4)) + @test !isless(DataTableRow(dt4, 4), DataTableRow(dt4, 3)) + @test isless(DataTableRow(dt4, 4), DataTableRow(dt4, 5)) + @test !isless(DataTableRow(dt4, 5), DataTableRow(dt4, 4)) + @test !isless(DataTableRow(dt4, 6), DataTableRow(dt4, 5)) + @test !isless(DataTableRow(dt4, 5), DataTableRow(dt4, 6)) + @test isless(DataTableRow(dt4, 7), DataTableRow(dt4, 8)) + @test !isless(DataTableRow(dt4, 8), DataTableRow(dt4, 7)) + # hashing @test !isequal(hash(DataTableRow(dt, 1)), hash(DataTableRow(dt, 2))) @test !isequal(hash(DataTableRow(dt, 1)), hash(DataTableRow(dt, 3))) @test isequal(hash(DataTableRow(dt, 1)), hash(DataTableRow(dt, 4))) @test isequal(hash(DataTableRow(dt, 2)), hash(DataTableRow(dt, 5))) @test !isequal(hash(DataTableRow(dt, 2)), hash(DataTableRow(dt, 6))) + + + # check that hashrows() function generates the same hashes as DataTableRow + dt_rowhashes = DataTables.hashrows(dt) + @test dt_rowhashes == [hash(dr) for dr in eachrow(dt)] + + # test incompatible frames + @test_throws UndefVarError 
isequal(DataTableRow(dt, 1), DataTableRow(dt3, 1)) + + # test RowGroupDict + N = 20 + d1 = rand(map(Int64, 1:2), N) + dt5 = DataTable(Any[d1], [:d1]) + dt6 = DataTable(d1 = [2,3]) + + # test_group("groupby") + gd = DataTables.group_rows(dt5) + @test gd.ngroups == 2 + + # getting groups for the rows of the other frames + @test length(gd[DataTableRow(dt6, 1)]) > 0 + @test_throws KeyError gd[DataTableRow(dt6, 2)] + @test isempty(DataTables.findrows(gd, dt6, 2)) + @test length(DataTables.findrows(gd, dt6, 2)) == 0 + + # grouping empty frame + gd = DataTables.group_rows(DataTable(x=Int[])) + @test gd.ngroups == 0 + + # grouping single row + gd = DataTables.group_rows(dt5[1,:]) + @test gd.ngroups == 1 end diff --git a/test/grouping.jl b/test/grouping.jl index c8faac0..9e1ab41 100644 --- a/test/grouping.jl +++ b/test/grouping.jl @@ -12,21 +12,32 @@ module TestGrouping f(dt) = DataTable(cmax = maximum(dt[:c])) - sdt = sort(dt, cols=cols) + sdt = unique(dt[cols]) + + # by() without groups sorting bdt = by(dt, cols, f) + @test bdt[cols] == sdt - @test isequal(bdt[cols], unique(sdt[cols])) + # by() with groups sorting + sbdt = by(dt, cols, f, sort=true) + @test sbdt[cols] == sort(sdt) byf = by(dt, :a, dt -> DataTable(bsum = sum(dt[:b]))) @test all(T -> T <: AbstractVector, map(typeof, colwise([sum], dt))) @test all(T -> T <: AbstractVector, map(typeof, colwise(sum, dt))) + # groupby() without groups sorting gd = groupby(dt, cols) ga = map(f, gd) @test isequal(bdt, combine(ga)) + # groupby() with groups sorting + gd = groupby(dt, cols, sort=true) + ga = map(f, gd) + @test sbdt == combine(ga) + g(dt) = DataTable(cmax1 = Vector(dt[:cmax]) + 1) h(dt) = g(f(dt)) @@ -37,7 +48,7 @@ module TestGrouping @test groupby(dt2, [:v1, :v2]).starts == collect(1:1000) @test groupby(dt2, [:v2, :v1]).starts == collect(1:1000) - # grouping empty frame + # grouping empty table @test groupby(DataTable(A=Int[]), :A).starts == Int[] # grouping single row @test groupby(DataTable(A=Int[1]), :A).starts 
== Int[1] @@ -67,11 +78,11 @@ module TestGrouping levels!(dt[:Key1], ["Z", "B", "A"]) levels!(dt[:Key2], ["Z", "B", "A"]) gd = groupby(dt, :Key1) - @test isequal(gd[1], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) - @test isequal(gd[2], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) + @test isequal(gd[1], DataTable(Key1=["A", "A"], Key2=["A", "B"], Value=1:2)) + @test isequal(gd[2], DataTable(Key1=["B", "B"], Key2=["A", "B"], Value=3:4)) gd = groupby(dt, [:Key1, :Key2]) - @test isequal(gd[1], DataTable(Key1="B", Key2="B", Value=4)) - @test isequal(gd[2], DataTable(Key1="B", Key2="A", Value=3)) - @test isequal(gd[3], DataTable(Key1="A", Key2="B", Value=2)) - @test isequal(gd[4], DataTable(Key1="A", Key2="A", Value=1)) + @test isequal(gd[1], DataTable(Key1="A", Key2="A", Value=1)) + @test isequal(gd[2], DataTable(Key1="A", Key2="B", Value=2)) + @test isequal(gd[3], DataTable(Key1="B", Key2="A", Value=3)) + @test isequal(gd[4], DataTable(Key1="B", Key2="B", Value=4)) end diff --git a/test/join.jl b/test/join.jl index cb9c239..0ac3fe6 100644 --- a/test/join.jl +++ b/test/join.jl @@ -18,7 +18,7 @@ module TestJoin Job = NullableArray(Nullable{String}["Lawyer", "Doctor", "Florist", Nullable(), "Farmer"])) # (Tests use current column ordering but don't promote it) - right = outer[Bool[!isnull(x) for x in outer[:Job]], [:Name, :ID, :Job]] + right = outer[Bool[!isnull(x) for x in outer[:Job]], [:ID, :Name, :Job]] left = outer[Bool[!isnull(x) for x in outer[:Name]], :] inner = left[Bool[!isnull(x) for x in left[:Job]], :] semi = unique(inner[:, [:ID, :Name]]) @@ -31,6 +31,8 @@ module TestJoin @test isequal(join(name, job, on = :ID, kind = :right), right) @test isequal(join(name, job, on = :ID, kind = :semi), semi) @test isequal(join(name, job, on = :ID, kind = :anti), anti) + @test_throws ArgumentError join(name, job) + @test_throws ArgumentError join(name, job, on=:ID, kind=:other) # Join with no non-key columns on = [:ID] @@ -104,9 +106,9 @@ module 
TestJoin Mass = [1.5, 2.2, 1.1]) dt2 = DataTable(Name = ["A", "B", "C", "A"], Quantity = [3, 3, 2, 4]) - @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = ["A", "A", "B", "C"], - Quantity = [3, 4, 3, 2], - Mass = [1.5, 1.5, 2.2, 1.1]) + @test join(dt2, dt, on=:Name, kind=:left) == DataTable(Name = ["A", "B", "C", "A"], + Quantity = [3, 3, 2, 4], + Mass = [1.5, 2.2, 1.1, 1.5]) # Test that join works when mixing Array and NullableArray (#1151) dt = DataTable([collect(1:10), collect(2:11)], [:x, :y])