-
Notifications
You must be signed in to change notification settings - Fork 11
Enhance joining and grouping #17
Changes from 6 commits
dd68a65
a5fd472
9424201
a652768
0cdf755
d292dd3
53774f5
2adc883
d52c791
5e9664a
e1b4d0e
74c36d1
de09a5c
160be5c
7f28a14
61bf607
6147d0c
bab097f
cdac010
8cf4a67
199f96b
f3b06a3
8308879
1c842dc
637b8cf
49d6328
b6c1f98
839c558
46aaae2
01b3ce8
7b9b8e2
7fe0389
cf0486a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,69 +55,36 @@ ordering(col::ColumnIndex, lt::Function, by::Function, rev::Bool, order::Orderin | |
# the permutation induced by this ordering is used to | ||
# sort the original (presumably larger) DataTable | ||
|
||
type DTPerm{O<:@compat(Union{Ordering, AbstractVector}), DT<:AbstractDataTable} <: Ordering | ||
immutable DTPerm{O<:Union{Ordering, AbstractVector}, DT<:AbstractDataTable} <: Ordering | ||
ord::O | ||
dt::DT | ||
end | ||
|
||
function DTPerm{O<:Ordering}(ords::AbstractVector{O}, dt::AbstractDataTable) | ||
function DTPerm{O<:Ordering, DT<:AbstractDataTable}(ords::AbstractVector{O}, dt::DT) | ||
if length(ords) != ncol(dt) | ||
error("DTPerm: number of column orderings does not equal the number of DataTable columns") | ||
end | ||
DTPerm{AbstractVector{O}, typeof(dt)}(ords, dt) | ||
DTPerm{typeof(ords), DT}(ords, dt) | ||
end | ||
|
||
DTPerm{O<:Ordering}(o::O, dt::AbstractDataTable) = DTPerm{O,typeof(dt)}(o,dt) | ||
DTPerm{O<:Ordering, DT<:AbstractDataTable}(o::O, dt::DT) = DTPerm{O,DT}(o,dt) | ||
|
||
# For sorting, a and b are row indices (first two lt definitions) | ||
# For issorted, the default row iterator returns DataTableRows instead, | ||
# so two more lt function is defined below | ||
function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a, b) | ||
for i = 1:ncol(o.dt) | ||
if lt(o.ord[i], o.dt[a,i], o.dt[b,i]) | ||
return true | ||
end | ||
if lt(o.ord[i], o.dt[b,i], o.dt[a,i]) | ||
return false | ||
end | ||
end | ||
false | ||
end | ||
|
||
function Sort.lt{O<:Ordering}(o::DTPerm{O}, a, b) | ||
for i = 1:ncol(o.dt) | ||
if lt(o.ord, o.dt[a,i], o.dt[b,i]) | ||
return true | ||
end | ||
if lt(o.ord, o.dt[b,i], o.dt[a,i]) | ||
return false | ||
end | ||
end | ||
false | ||
end | ||
# get ordering function for the i-th column used for ordering | ||
col_ordering{O<:Ordering}(o::DTPerm{O}, i::Int) = o.ord | ||
col_ordering{V<:AbstractVector}(o::DTPerm{V}, i::Int) = o.ord[i] | ||
|
||
function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a::DataTableRow, b::DataTableRow) | ||
for i = 1:ncol(o.dt) | ||
if lt(o.ord[i], a[i], b[i]) | ||
return true | ||
end | ||
if lt(o.ord[i], b[i], a[i]) | ||
return false | ||
end | ||
end | ||
false | ||
end | ||
Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j] | ||
Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j] | ||
|
||
function Sort.lt{O<:Ordering}(o::DTPerm{O}, a::DataTableRow, b::DataTableRow) | ||
for i = 1:ncol(o.dt) | ||
if lt(o.ord, a[i], b[i]) | ||
return true | ||
end | ||
if lt(o.ord, b[i], a[i]) | ||
return false | ||
end | ||
function Sort.lt(o::DTPerm, a, b) | ||
@inbounds for i = 1:ncol(o.dt) | ||
ord = col_ordering(o, i) | ||
va = o[a, i] | ||
vb = o[b, i] | ||
lt(ord, va, vb) && return true | ||
lt(ord, vb, va) && return false | ||
end | ||
false | ||
false # a and b are equal | ||
end | ||
|
||
### | ||
|
@@ -306,5 +273,35 @@ for s in [:(Base.sort), :(Base.sortperm)] | |
end | ||
|
||
Base.sort(dt::AbstractDataTable, a::Algorithm, o::Ordering) = dt[sortperm(dt, a, o),:] | ||
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::@compat(Union{Perm,DTPerm})) = sort!([1:size(dt, 1);], a, o) | ||
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Union{Perm,DTPerm}) = sort!([1:size(dt, 1);], a, o) | ||
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt)) | ||
|
||
# Extras to speed up sorting | ||
#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this commented out? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't recall now. Maybe something to do with methods ambiguity, but worth rechecking now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nalimilan see here. I'll add these back There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, funny how you can forget about issues. I'll try to add these. |
||
#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec)) | ||
|
||
# permute rows | ||
function Base.permute!(dt::AbstractDataTable, p::AbstractVector) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a docstring to explain that permutation is applied to rows? |
||
pp = similar(p) | ||
for (icol, col) in enumerate(columns(dt)) | ||
# Check if this column has been sorted already | ||
any(j -> dt[j]===col, 1:icol-1) && continue | ||
|
||
copy!(pp, p) | ||
Base.permute!!(col, pp) | ||
end | ||
dt | ||
end | ||
|
||
# apply inverse of given rows permutation | ||
function Base.ipermute!(dt::AbstractDataTable, p::AbstractVector) | ||
pp = similar(p) | ||
for (icol, col) in enumerate(columns(dt)) | ||
# Check if this column has been sorted already | ||
any(j -> dt[j]===col, 1:icol-1) && continue | ||
|
||
copy!(pp, p) | ||
Base.ipermute!!(col, pp) | ||
end | ||
dt | ||
end |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,8 +27,6 @@ Base.length(r::DataTableRow) = size(r.dt, 2) | |
|
||
Base.endof(r::DataTableRow) = size(r.dt, 2) | ||
|
||
Base.collect(r::DataTableRow) = @compat Tuple{Symbol, Any}[x for x in r] | ||
|
||
Base.start(r::DataTableRow) = 1 | ||
|
||
Base.next(r::DataTableRow, s) = ((_names(r)[s], r[s]), s + 1) | ||
|
@@ -37,31 +35,119 @@ Base.done(r::DataTableRow, s) = s > length(r) | |
|
||
Base.convert(::Type{Array}, r::DataTableRow) = convert(Array, r.dt[r.row,:]) | ||
|
||
Base.collect(r::DataTableRow) = Tuple{Symbol, Any}[x for x in r] | ||
|
||
# the equal elements of nullable and normal arrays would have the same hashes | ||
const NULL_MAGIC = 0xBADDEED # what to hash if the element is null | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's already a hash for null values in const nullablehash_seed = UInt === UInt64 ? 0x932e0143e51d0171 : 0xe51d0171 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I missed that. Then we should use |
||
|
||
# hash column element | ||
Base.@propagate_inbounds hash_colel(v::AbstractArray, i, h::UInt = zero(UInt)) = hash(v[i], h) | ||
Base.@propagate_inbounds hash_colel{T}(v::NullableArray{T}, i, h::UInt = zero(UInt)) = | ||
isnull(v, i) ? hash(NULL_MAGIC, h) : hash(get(v[i]), h) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since you special-case You will also need a generic method for |
||
Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) = | ||
hash(CategoricalArrays.index(v.pool)[v.refs[i]], h) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you sure this is really more efficient than the default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it's not as efficient. But in these functions the constraint is to make hashes invariant to the hashed value representation: whether it's nullable or not and whether it's stored "as is" or in a categorical array. Otherwise joins would not work (we may require that joins only use the columns of identical types, but that would result in too much overhead on the user side). So we have to check if the default hash functions have this property ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be surprising that it would be significantly slower, since the code is very similar. Though since we need the special method for |
||
Base.@propagate_inbounds function hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
ref = v.refs[i] | ||
ref == 0 ? hash(NULL_MAGIC, h) : hash(CategoricalArrays.index(v.pool)[ref], h) | ||
end | ||
|
||
# hash of DataTable rows based on its values | ||
# so that duplicate rows would have the same hash | ||
function Base.hash(r::DataTableRow, h::UInt) | ||
for col in columns(r.dt) | ||
if _isnull(col[r.row]) | ||
h = hash(false, h) | ||
else | ||
h = hash(true, hash(col[r.row], h)) | ||
end | ||
function rowhash(dt::DataTable, r::Int, h::UInt = zero(UInt)) | ||
@inbounds for col in columns(dt) | ||
h = hash_colel(col, r, h) | ||
end | ||
return h | ||
end | ||
|
||
Base.hash(r::DataTableRow, h::UInt = zero(UInt)) = rowhash(r.dt, r.row, h) | ||
|
||
# comparison of DataTable rows | ||
# only the rows of the same DataTable could be compared | ||
# rows are equal if they have the same values (while the row indices could differ) | ||
@compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow) = isequal(r1, r2) | ||
|
||
function Base.isequal(r1::DataTableRow, r2::DataTableRow) | ||
r1.dt == r2.dt || throw(ArgumentError("Comparing rows from different frames not supported")) | ||
r1.row == r2.row && return true | ||
for col in columns(r1.dt) | ||
if !isequal(col[r1.row], col[r2.row]) | ||
return false | ||
# returns Nullable{Bool} | ||
# if all non-null values are equal, but there are nulls, returns null | ||
function @compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is that definition actually needed for this PR to work? If not, I'd rather keep the current one returning a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not needed, reverting to current |
||
if r1.dt !== r2.dt | ||
(ncol(r1.dt) != ncol(r2.dt)) && | ||
throw(ArgumentError("Comparing rows from different frames not supported")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. frames -> tables Also, the message doesn't sound very clear: shouldn't it say that the number of columns should be equal? and what about column names? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Checking the names and the column types compatibility would be too expensive for this function, that's why different tables are not supported. What would you suggest as a more descriptive message? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. But say something which described the actual behavior correctly, like |
||
eq = Nullable(true) | ||
@inbounds for (col1, col2) in zip(columns(r1.dt), columns(r2.dt)) | ||
eq_col = convert(Nullable{Bool}, col1[r1.row] == col2[r2.row]) | ||
# If true or null, need to compare remaining columns | ||
get(eq_col, true) || return Nullable(false) | ||
eq &= eq_col | ||
end | ||
return eq | ||
else | ||
r1.row == r2.row && return Nullable(true) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indentation issue. |
||
eq = Nullable(true) | ||
@inbounds for col in columns(r1.dt) | ||
eq_col = convert(Nullable{Bool}, col[r1.row] == col[r2.row]) | ||
# If true or null, need to compare remaining columns | ||
get(eq_col, true) || return Nullable(false) | ||
eq &= eq_col | ||
end | ||
return eq | ||
end | ||
end | ||
|
||
# internal method for comparing the elements of the same data frame column | ||
isequal_colel(col::AbstractArray, r1::Int, r2::Int) = | ||
(r1 == r2) || isequal(Base.unsafe_getindex(col, r1), Base.unsafe_getindex(col, r2)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
function isequal_colel{T}(col::Union{NullableArray{T}, | ||
AbstractNullableCategoricalArray{T}}, | ||
r1::Int, r2::Int) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Indentation issue. |
||
(r1 == r2) && return true | ||
isnull(col[r1]) && return isnull(col[r2]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doesn't this definition exactly match that of |
||
return !isnull(col[r2]) && isequal(get(col[r1]), get(col[r2])) | ||
end | ||
|
||
isequal_colel(a::Any, b::Any) = isequal(a, b) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two-argument definitions are not needed AFAICT. The only place where they are called could use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tests fail after making the changes. They all seem necessary to get around Nullable comparisons julia> using DataTables
julia> DataTables.isequal_colel(Nullable(1), 1)
true
julia> isequal(Nullable(1), 1)
false There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, right, I forgot about the need to unwrap nullables when comparing with non-nullable. That behavior is quite annoying, but well... |
||
isequal_colel(a::Nullable, b::Any) = !isnull(a) && isequal(get(a), b) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use |
||
isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a) | ||
isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(get(a), get(b))) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just use |
||
|
||
# comparison of DataTable rows | ||
function isequal_row(dt::AbstractDataTable, r1::Int, r2::Int) | ||
(r1 == r2) && return true # same raw | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. raw -> row |
||
@inbounds for col in columns(dt) | ||
isequal_colel(col, r1, r2) || return false | ||
end | ||
return true | ||
end | ||
|
||
function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int) | ||
(dt1 === dt2) && return isequal_row(dt1, r1, r2) | ||
(ncol(dt1) == ncol(dt2)) || | ||
throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. frames -> tables |
||
@inbounds for (col1, col2) in zip(columns(dt1), columns(dt2)) | ||
isequal_colel(col1[r1], col2[r2]) || return false | ||
end | ||
return true | ||
end | ||
|
||
# comparison of DataTable rows | ||
# rows are equal if they have the same values (while the row indices could differ) | ||
Base.isequal(r1::DataTableRow, r2::DataTableRow) = | ||
isequal_row(r1.dt, r1.row, r2.dt, r2.row) | ||
|
||
# lexicographic ordering on DataTable rows, null > !null | ||
function Base.isless(r1::DataTableRow, r2::DataTableRow) | ||
(ncol(r1.dt) == ncol(r2.dt)) || | ||
throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))")) | ||
@inbounds for i in 1:ncol(r1.dt) | ||
col1 = r1.dt[i] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of creating this variable, create |
||
col2 = r2.dt[i] | ||
isnull1 = _isnull(col1, r1.row) | ||
isnull2 = _isnull(col2, r2.row) | ||
(isnull1 != isnull2) && return isnull2 # null > !null | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, this seems equivalent to |
||
if !isnull1 | ||
v1 = get(col1[r1.row]) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you'll need a |
||
v2 = get(col2[r2.row]) | ||
isless(v1, v2) && return true | ||
!isequal(v1, v2) && return false | ||
end | ||
end | ||
return false | ||
end |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mark these as
@propagate_inbounds
(see http://julia.readthedocs.io/en/latest/devdocs/boundscheck/).