Skip to content
This repository was archived by the owner on May 5, 2019. It is now read-only.

Enhance joining and grouping #17

Merged
merged 33 commits into from
Mar 6, 2017
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
dd68a65
merge alyst groupby to DataTables
cjprybol Feb 20, 2017
a5fd472
passing tests
cjprybol Feb 20, 2017
9424201
Merge branch 'master' into cjp/alyst-groupby
cjprybol Feb 20, 2017
a652768
revert seemingly unrelated changes
cjprybol Feb 20, 2017
0cdf755
revert unnecessary changes for variable name and spacing
cjprybol Feb 21, 2017
d292dd3
fix indentation issue
cjprybol Feb 21, 2017
53774f5
add nonunique()
cjprybol Feb 22, 2017
2adc883
commit join.jl merge for Alyst to debug
cjprybol Feb 22, 2017
d52c791
make the easy changes requested during review
cjprybol Feb 22, 2017
5e9664a
add docstrings to row permutation functions
cjprybol Feb 23, 2017
e1b4d0e
clarify error message
cjprybol Feb 23, 2017
74c36d1
remove unused function
cjprybol Feb 23, 2017
de09a5c
update function to use isequal(a::Nullable, b::Nullable) from base
cjprybol Feb 23, 2017
160be5c
frame -> table
cjprybol Feb 23, 2017
7f28a14
update merge based on helpful diff
cjprybol Feb 23, 2017
61bf607
pass all tests that don't use Categorical
cjprybol Feb 23, 2017
6147d0c
added back commented out functions
cjprybol Feb 23, 2017
bab097f
minor cleanup
cjprybol Feb 23, 2017
cdac010
Merge branch 'master' into cjp/alyst-groupby
cjprybol Feb 23, 2017
8cf4a67
more changes suggested during review
cjprybol Feb 23, 2017
199f96b
use explicit vcat, indendation, parentheses
cjprybol Feb 23, 2017
f3b06a3
more indentation
cjprybol Feb 23, 2017
8308879
fix test/join.jl errors using `resize!` in
cjprybol Feb 24, 2017
1c842dc
passing all tests!
cjprybol Feb 25, 2017
637b8cf
update categorical Arrays version
cjprybol Feb 27, 2017
49d6328
incorporate edits suggested during review
cjprybol Mar 1, 2017
b6c1f98
more fixes
cjprybol Mar 3, 2017
839c558
added tests and trimmed unneccessary functions
cjprybol Mar 4, 2017
46aaae2
update new function name
cjprybol Mar 4, 2017
01b3ce8
revert code deletions and address most comments
cjprybol Mar 6, 2017
7b9b8e2
revert bad edit, function is untested
cjprybol Mar 6, 2017
7fe0389
revert some changes, clean up tests
cjprybol Mar 6, 2017
cf0486a
change docstring to comment
cjprybol Mar 6, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/DataTables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ using FileIO # remove after read_rda deprecation period

using Base: Sort, Order
import Base: ==, |>
import Base: permute!, ipermute!

##############################################################################
##
Expand Down Expand Up @@ -104,6 +105,7 @@ for (dir, filename) in [
("subdatatable", "subdatatable.jl"),
("groupeddatatable", "grouping.jl"),
("datatablerow", "datatablerow.jl"),
("datatablerow", "utils.jl"),

("abstractdatatable", "iteration.jl"),
("abstractdatatable", "join.jl"),
Expand Down
99 changes: 48 additions & 51 deletions src/abstractdatatable/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -55,69 +55,36 @@ ordering(col::ColumnIndex, lt::Function, by::Function, rev::Bool, order::Orderin
# the permutation induced by this ordering is used to
# sort the original (presumably larger) DataTable

type DTPerm{O<:@compat(Union{Ordering, AbstractVector}), DT<:AbstractDataTable} <: Ordering
immutable DTPerm{O<:Union{Ordering, AbstractVector}, DT<:AbstractDataTable} <: Ordering
ord::O
dt::DT
end

function DTPerm{O<:Ordering}(ords::AbstractVector{O}, dt::AbstractDataTable)
function DTPerm{O<:Ordering, DT<:AbstractDataTable}(ords::AbstractVector{O}, dt::DT)
if length(ords) != ncol(dt)
error("DTPerm: number of column orderings does not equal the number of DataTable columns")
end
DTPerm{AbstractVector{O}, typeof(dt)}(ords, dt)
DTPerm{typeof(ords), DT}(ords, dt)
end

DTPerm{O<:Ordering}(o::O, dt::AbstractDataTable) = DTPerm{O,typeof(dt)}(o,dt)
DTPerm{O<:Ordering, DT<:AbstractDataTable}(o::O, dt::DT) = DTPerm{O,DT}(o,dt)

# For sorting, a and b are row indices (first two lt definitions)
# For issorted, the default row iterator returns DataTableRows instead,
# so two more lt function is defined below
function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a, b)
for i = 1:ncol(o.dt)
if lt(o.ord[i], o.dt[a,i], o.dt[b,i])
return true
end
if lt(o.ord[i], o.dt[b,i], o.dt[a,i])
return false
end
end
false
end

function Sort.lt{O<:Ordering}(o::DTPerm{O}, a, b)
for i = 1:ncol(o.dt)
if lt(o.ord, o.dt[a,i], o.dt[b,i])
return true
end
if lt(o.ord, o.dt[b,i], o.dt[a,i])
return false
end
end
false
end
# get ordering function for the i-th column used for ordering
col_ordering{O<:Ordering}(o::DTPerm{O}, i::Int) = o.ord
col_ordering{V<:AbstractVector}(o::DTPerm{V}, i::Int) = o.ord[i]

function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a::DataTableRow, b::DataTableRow)
for i = 1:ncol(o.dt)
if lt(o.ord[i], a[i], b[i])
return true
end
if lt(o.ord[i], b[i], a[i])
return false
end
end
false
end
Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mark these as @propagate_inbounds (see http://julia.readthedocs.io/en/latest/devdocs/boundscheck/).

Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j]

function Sort.lt{O<:Ordering}(o::DTPerm{O}, a::DataTableRow, b::DataTableRow)
for i = 1:ncol(o.dt)
if lt(o.ord, a[i], b[i])
return true
end
if lt(o.ord, b[i], a[i])
return false
end
function Sort.lt(o::DTPerm, a, b)
@inbounds for i = 1:ncol(o.dt)
ord = col_ordering(o, i)
va = o[a, i]
vb = o[b, i]
lt(ord, va, vb) && return true
lt(ord, vb, va) && return false
end
false
false # a and b are equal
end

###
Expand Down Expand Up @@ -306,5 +273,35 @@ for s in [:(Base.sort), :(Base.sortperm)]
end

Base.sort(dt::AbstractDataTable, a::Algorithm, o::Ordering) = dt[sortperm(dt, a, o),:]
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::@compat(Union{Perm,DTPerm})) = sort!([1:size(dt, 1);], a, o)
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Union{Perm,DTPerm}) = sort!([1:size(dt, 1);], a, o)
Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt))

# Extras to speed up sorting
#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this commented out?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't recall now. Maybe something to do with methods ambiguity, but worth rechecking now.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FastPerm is part of DataArrays so these (this and below) aren't applicable. I'll remove them

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nalimilan see here. I'll add these back

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, funny how you can forget about issues. I'll try to add these.

#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec))

# permute rows
function Base.permute!(dt::AbstractDataTable, p::AbstractVector)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a docstring to explain that permutation is applied to rows?

pp = similar(p)
for (icol, col) in enumerate(columns(dt))
# Check if this column has been sorted already
any(j -> dt[j]===col, 1:icol-1) && continue

copy!(pp, p)
Base.permute!!(col, pp)
end
dt
end

# apply inverse of given rows permutation
function Base.ipermute!(dt::AbstractDataTable, p::AbstractVector)
pp = similar(p)
for (icol, col) in enumerate(columns(dt))
# Check if this column has been sorted already
any(j -> dt[j]===col, 1:icol-1) && continue

copy!(pp, p)
Base.ipermute!!(col, pp)
end
dt
end
122 changes: 104 additions & 18 deletions src/datatablerow/datatablerow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ Base.length(r::DataTableRow) = size(r.dt, 2)

Base.endof(r::DataTableRow) = size(r.dt, 2)

Base.collect(r::DataTableRow) = @compat Tuple{Symbol, Any}[x for x in r]

Base.start(r::DataTableRow) = 1

Base.next(r::DataTableRow, s) = ((_names(r)[s], r[s]), s + 1)
Expand All @@ -37,31 +35,119 @@ Base.done(r::DataTableRow, s) = s > length(r)

Base.convert(::Type{Array}, r::DataTableRow) = convert(Array, r.dt[r.row,:])

Base.collect(r::DataTableRow) = Tuple{Symbol, Any}[x for x in r]

# the equal elements of nullable and normal arrays would have the same hashes
const NULL_MAGIC = 0xBADDEED # what to hash if the element is null
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's already a hash for null values in base/nullable.jl:

const nullablehash_seed = UInt === UInt64 ? 0x932e0143e51d0171 : 0xe51d0171

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I missed that. Then we should use nullablehash_seed instead NULL_MAGIC.


# hash column element
Base.@propagate_inbounds hash_colel(v::AbstractArray, i, h::UInt = zero(UInt)) = hash(v[i], h)
Base.@propagate_inbounds hash_colel{T}(v::NullableArray{T}, i, h::UInt = zero(UInt)) =
isnull(v, i) ? hash(NULL_MAGIC, h) : hash(get(v[i]), h)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since you special-case NullableArray, use v.values[i], which will be faster.

You will also need a generic method for AbstractArray{<:Nullable} which does not rely on isnull(v, i).

Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) =
hash(CategoricalArrays.index(v.pool)[v.refs[i]], h)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are you sure this is really more efficient than the default hash method for CategoricalValue?

Copy link
Contributor

@alyst alyst Feb 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's not as efficient. But in these functions the constraint is to make hashes invariant to the hashed value representation: whether it's nullable or not and whether it's stored "as is" or in a categorical array. Otherwise joins would not work (we may require that joins only use the columns of identical types, but that would result in too much overhead on the user side). So we have to check if the default hash functions have this property (Nullable AFAIR is not).

Copy link
Member

@nalimilan nalimilan Feb 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be surprising that it would be significantly slower, since the code is very similar. Though since we need the special method for NullableCategoricalArray to avoid the cost of creating a Nullable just to unwrap it, I guess it doesn't matter too much what we do here.

Base.@propagate_inbounds function hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AbstractCategoricalArray should be AbstractNullableCategoricalArray AFAICT. That also means a test is missing to catch this.

ref = v.refs[i]
ref == 0 ? hash(NULL_MAGIC, h) : hash(CategoricalArrays.index(v.pool)[ref], h)
end

# hash of DataTable rows based on its values
# so that duplicate rows would have the same hash
function Base.hash(r::DataTableRow, h::UInt)
for col in columns(r.dt)
if _isnull(col[r.row])
h = hash(false, h)
else
h = hash(true, hash(col[r.row], h))
end
function rowhash(dt::DataTable, r::Int, h::UInt = zero(UInt))
@inbounds for col in columns(dt)
h = hash_colel(col, r, h)
end
return h
end

Base.hash(r::DataTableRow, h::UInt = zero(UInt)) = rowhash(r.dt, r.row, h)

# comparison of DataTable rows
# only the rows of the same DataTable could be compared
# rows are equal if they have the same values (while the row indices could differ)
@compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow) = isequal(r1, r2)

function Base.isequal(r1::DataTableRow, r2::DataTableRow)
r1.dt == r2.dt || throw(ArgumentError("Comparing rows from different frames not supported"))
r1.row == r2.row && return true
for col in columns(r1.dt)
if !isequal(col[r1.row], col[r2.row])
return false
# returns Nullable{Bool}
# if all non-null values are equal, but there are nulls, returns null
function @compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that definition actually needed for this PR to work? If not, I'd rather keep the current one returning a Bool and discuss this change separately. This PR is already complex enough to review...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not needed, reverting to current

if r1.dt !== r2.dt
(ncol(r1.dt) != ncol(r2.dt)) &&
throw(ArgumentError("Comparing rows from different frames not supported"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

frames -> tables

Also, the message doesn't sound very clear: shouldn't it say that the number of columns should be equal? and what about column names?

Copy link
Contributor

@alyst alyst Feb 22, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Checking the names and the column types compatibility would be too expensive for this function, that's why different tables are not supported. What would you suggest as a more descriptive message?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. But say something which described the actual behavior correctly, like "Cannot compare rows with different numbers of columns (got $(ncol(r1.dt)) and $(ncol(r2.dt)))" .

eq = Nullable(true)
@inbounds for (col1, col2) in zip(columns(r1.dt), columns(r2.dt))
eq_col = convert(Nullable{Bool}, col1[r1.row] == col2[r2.row])
# If true or null, need to compare remaining columns
get(eq_col, true) || return Nullable(false)
eq &= eq_col
end
return eq
else
r1.row == r2.row && return Nullable(true)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation issue.

eq = Nullable(true)
@inbounds for col in columns(r1.dt)
eq_col = convert(Nullable{Bool}, col[r1.row] == col[r2.row])
# If true or null, need to compare remaining columns
get(eq_col, true) || return Nullable(false)
eq &= eq_col
end
return eq
end
end

# internal method for comparing the elements of the same data frame column
isequal_colel(col::AbstractArray, r1::Int, r2::Int) =
(r1 == r2) || isequal(Base.unsafe_getindex(col, r1), Base.unsafe_getindex(col, r2))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use @inbounds instead of unsafe_getindex? Better, mark the function with Base.@propagate_inbounds and add @inbounds at the call site, which is the only place where we can know that the indices are correct.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@inbounds would not work, because AFAIR the macro interfers with the result of an expression. @propagate_inbounds is much nicer anyway, I became aware of it too late :)


function isequal_colel{T}(col::Union{NullableArray{T},
AbstractNullableCategoricalArray{T}},
r1::Int, r2::Int)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indentation issue.

(r1 == r2) && return true
isnull(col[r1]) && return isnull(col[r2])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't this definition exactly match that of isequal on Nullable (i.e. could be simplified)? Also, indexing col repeatedly could have a performance impact.

return !isnull(col[r2]) && isequal(get(col[r1]), get(col[r2]))
end

isequal_colel(a::Any, b::Any) = isequal(a, b)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These two-argument definitions are not needed AFAICT. The only place where they are called could use isequal directly instead.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests fail after making the changes. They all seem necessary to get around Nullable comparisons

julia> using DataTables

julia> DataTables.isequal_colel(Nullable(1), 1)
true

julia> isequal(Nullable(1), 1)
false

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right, I forgot about the need to unwrap nullables when comparing with non-nullable. That behavior is quite annoying, but well...

isequal_colel(a::Nullable, b::Any) = !isnull(a) && isequal(get(a), b)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use unsafe_get for performance. Also avoid branching by using &, which can allow for optimization in loops.

isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a)
isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(get(a), get(b)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just use isequal(a, b)?


# comparison of DataTable rows
function isequal_row(dt::AbstractDataTable, r1::Int, r2::Int)
(r1 == r2) && return true # same raw
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

raw -> row

@inbounds for col in columns(dt)
isequal_colel(col, r1, r2) || return false
end
return true
end

function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int)
(dt1 === dt2) && return isequal_row(dt1, r1, r2)
(ncol(dt1) == ncol(dt2)) ||
throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

frames -> tables

@inbounds for (col1, col2) in zip(columns(dt1), columns(dt2))
isequal_colel(col1[r1], col2[r2]) || return false
end
return true
end

# comparison of DataTable rows
# rows are equal if they have the same values (while the row indices could differ)
Base.isequal(r1::DataTableRow, r2::DataTableRow) =
isequal_row(r1.dt, r1.row, r2.dt, r2.row)

# lexicographic ordering on DataTable rows, null > !null
function Base.isless(r1::DataTableRow, r2::DataTableRow)
(ncol(r1.dt) == ncol(r2.dt)) ||
throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
@inbounds for i in 1:ncol(r1.dt)
col1 = r1.dt[i]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of creating this variable, create x = r1.dt[i][r1.row] (same for y).

col2 = r2.dt[i]
isnull1 = _isnull(col1, r1.row)
isnull2 = _isnull(col2, r2.row)
(isnull1 != isnull2) && return isnull2 # null > !null
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, this seems equivalent to isless(::Nullable, ::Nullable). I guess that definition didn't exist when the PR was filed.

if !isnull1
v1 = get(col1[r1.row])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you'll need a _get function, just like _isnull, which will be a no-op for non nullables. Looks like we also need a test with non-nullable columns to catch this (unless I'm missing something).

v2 = get(col2[r2.row])
isless(v1, v2) && return true
!isequal(v1, v2) && return false
end
end
return false
end
Loading