JuliaData · nalimilan · Mar 6, 2017 · Feb 20, 2017 · Feb 20, 2017 · Feb 20, 2017
diff --git a/src/DataTables.jl b/src/DataTables.jl
@@ -22,6 +22,7 @@ using FileIO  # remove after read_rda deprecation period
 
 using Base: Sort, Order
 import Base: ==, |>
+import Base: permute!, ipermute!
 
 ##############################################################################
 ##
@@ -104,6 +105,7 @@ for (dir, filename) in [
         ("subdatatable", "subdatatable.jl"),
         ("groupeddatatable", "grouping.jl"),
         ("datatablerow", "datatablerow.jl"),
+        ("datatablerow", "utils.jl"),
 
         ("abstractdatatable", "iteration.jl"),
         ("abstractdatatable", "join.jl"),

diff --git a/src/abstractdatatable/sort.jl b/src/abstractdatatable/sort.jl
@@ -55,69 +55,36 @@ ordering(col::ColumnIndex, lt::Function, by::Function, rev::Bool, order::Orderin
 #         the permutation induced by this ordering is used to
 #         sort the original (presumably larger) DataTable
 
-type DTPerm{O<:@compat(Union{Ordering, AbstractVector}), DT<:AbstractDataTable} <: Ordering
+immutable DTPerm{O<:Union{Ordering, AbstractVector}, DT<:AbstractDataTable} <: Ordering
     ord::O
     dt::DT
 end
 
-function DTPerm{O<:Ordering}(ords::AbstractVector{O}, dt::AbstractDataTable)
+function DTPerm{O<:Ordering, DT<:AbstractDataTable}(ords::AbstractVector{O}, dt::DT)
     if length(ords) != ncol(dt)
         error("DTPerm: number of column orderings does not equal the number of DataTable columns")
     end
-    DTPerm{AbstractVector{O}, typeof(dt)}(ords, dt)
+    DTPerm{typeof(ords), DT}(ords, dt)
 end
 
-DTPerm{O<:Ordering}(o::O, dt::AbstractDataTable) = DTPerm{O,typeof(dt)}(o,dt)
+DTPerm{O<:Ordering, DT<:AbstractDataTable}(o::O, dt::DT) = DTPerm{O,DT}(o,dt)
 
-# For sorting, a and b are row indices (first two lt definitions)
-# For issorted, the default row iterator returns DataTableRows instead,
-# so two more lt function is defined below
-function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a, b)
-    for i = 1:ncol(o.dt)
-        if lt(o.ord[i], o.dt[a,i], o.dt[b,i])
-            return true
-        end
-        if lt(o.ord[i], o.dt[b,i], o.dt[a,i])
-            return false
-        end
-    end
-    false
-end
-
-function Sort.lt{O<:Ordering}(o::DTPerm{O}, a, b)
-    for i = 1:ncol(o.dt)
-        if lt(o.ord, o.dt[a,i], o.dt[b,i])
-            return true
-        end
-        if lt(o.ord, o.dt[b,i], o.dt[a,i])
-            return false
-        end
-    end
-    false
-end
+# ordering for the i-th column: a lone Ordering is shared by all columns, a vector is indexed by i
+col_ordering{O<:Ordering}(o::DTPerm{O}, i::Int) = o.ord
+col_ordering{V<:AbstractVector}(o::DTPerm{V}, i::Int) = o.ord[i]
 
-function Sort.lt{V<:AbstractVector}(o::DTPerm{V}, a::DataTableRow, b::DataTableRow)
-    for i = 1:ncol(o.dt)
-        if lt(o.ord[i], a[i], b[i])
-            return true
-        end
-        if lt(o.ord[i], b[i], a[i])
-            return false
-        end
-    end
-    false
-end
+Base.getindex(o::DTPerm, i::Int, j::Int) = o.dt[i, j]
+Base.getindex(o::DTPerm, a::DataTableRow, j::Int) = a[j]
 
-function Sort.lt{O<:Ordering}(o::DTPerm{O}, a::DataTableRow, b::DataTableRow)
-    for i = 1:ncol(o.dt)
-        if lt(o.ord, a[i], b[i])
-            return true
-        end
-        if lt(o.ord, b[i], a[i])
-            return false
-        end
+function Sort.lt(o::DTPerm, a, b)
+    @inbounds for i = 1:ncol(o.dt)
+        ord = col_ordering(o, i)
+        va = o[a, i]
+        vb = o[b, i]
+        lt(ord, va, vb) && return true
+        lt(ord, vb, va) && return false
     end
-    false
+    false # a and b are equal
 end
 
 ###
@@ -306,5 +273,35 @@ for s in [:(Base.sort), :(Base.sortperm)]
 end
 
 Base.sort(dt::AbstractDataTable, a::Algorithm, o::Ordering) = dt[sortperm(dt, a, o),:]
-Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::@compat(Union{Perm,DTPerm})) = sort!([1:size(dt, 1);], a, o)
+Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Union{Perm,DTPerm}) = sort!([1:size(dt, 1);], a, o)
 Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt))
+
+# Extras to speed up sorting
+#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec)
+#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec))
+
+# permute the rows of dt in place according to p (each column is permuted); returns dt
+function Base.permute!(dt::AbstractDataTable, p::AbstractVector)
+    pp = similar(p) # scratch buffer: p itself is never handed to permute!!
+    for (icol, col) in enumerate(columns(dt))
+        # Skip a column that is the same object (===) as an earlier one: it has been permuted already
+        any(j -> dt[j]===col, 1:icol-1) && continue
+
+        copy!(pp, p) # fresh copy per column; presumably permute!! overwrites its permutation argument
+        Base.permute!!(col, pp)
+    end
+    dt
+end
+
+# apply the inverse of the row permutation p to dt in place (column by column); returns dt
+function Base.ipermute!(dt::AbstractDataTable, p::AbstractVector)
+    pp = similar(p) # scratch buffer: p itself is never handed to ipermute!!
+    for (icol, col) in enumerate(columns(dt))
+        # Skip a column that is the same object (===) as an earlier one: it has been permuted already
+        any(j -> dt[j]===col, 1:icol-1) && continue
+
+        copy!(pp, p) # fresh copy per column; presumably ipermute!! overwrites its permutation argument
+        Base.ipermute!!(col, pp)
+    end
+    dt
+end
diff --git a/src/datatablerow/datatablerow.jl b/src/datatablerow/datatablerow.jl
@@ -27,8 +27,6 @@ Base.length(r::DataTableRow) = size(r.dt, 2)
 
 Base.endof(r::DataTableRow) = size(r.dt, 2)
 
-Base.collect(r::DataTableRow) = @compat Tuple{Symbol, Any}[x for x in r]
-
 Base.start(r::DataTableRow) = 1
 
 Base.next(r::DataTableRow, s) = ((_names(r)[s], r[s]), s + 1)
@@ -37,31 +35,119 @@ Base.done(r::DataTableRow, s) = s > length(r)
 
 Base.convert(::Type{Array}, r::DataTableRow) = convert(Array, r.dt[r.row,:])
 
+Base.collect(r::DataTableRow) = Tuple{Symbol, Any}[x for x in r]
+
+# equal elements of nullable and non-nullable arrays must have the same hash
+const NULL_MAGIC = 0xBADDEED # what to hash if the element is null
+
+# hash column element
+Base.@propagate_inbounds hash_colel(v::AbstractArray, i, h::UInt = zero(UInt)) = hash(v[i], h)
+Base.@propagate_inbounds hash_colel{T}(v::NullableArray{T}, i, h::UInt = zero(UInt)) =
+    isnull(v, i) ? hash(NULL_MAGIC, h) : hash(get(v[i]), h)
+Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) =
+    hash(CategoricalArrays.index(v.pool)[v.refs[i]], h)
+Base.@propagate_inbounds function hash_colel{T}(v::AbstractNullableCategoricalArray{T}, i, h::UInt = zero(UInt))
+    ref = v.refs[i] # a ref of 0 is treated as a null element and hashed via NULL_MAGIC
+    ref == 0 ? hash(NULL_MAGIC, h) : hash(CategoricalArrays.index(v.pool)[ref], h)
+end
+
 # hash of DataTable rows based on its values
 # so that duplicate rows would have the same hash
-function Base.hash(r::DataTableRow, h::UInt)
-    for col in columns(r.dt)
-        if _isnull(col[r.row])
-            h = hash(false, h)
-        else
-            h = hash(true, hash(col[r.row], h))
-        end
+function rowhash(dt::DataTable, r::Int, h::UInt = zero(UInt))
+    @inbounds for col in columns(dt)
+        h = hash_colel(col, r, h)
     end
     return h
 end
 
+Base.hash(r::DataTableRow, h::UInt = zero(UInt)) = rowhash(r.dt, r.row, h)
+
 # comparison of DataTable rows
-# only the rows of the same DataTable could be compared
 # rows are equal if they have the same values (while the row indices could differ)
-@compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow) = isequal(r1, r2)
-
-function Base.isequal(r1::DataTableRow, r2::DataTableRow)
-    r1.dt == r2.dt || throw(ArgumentError("Comparing rows from different frames not supported"))
-    r1.row == r2.row && return true
-    for col in columns(r1.dt)
-        if !isequal(col[r1.row], col[r2.row])
-            return false
+# returns Nullable{Bool}
+# if all non-null values are equal, but there are nulls, returns null
+function @compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow)
+    if r1.dt !== r2.dt
+        (ncol(r1.dt) != ncol(r2.dt)) &&
+            throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(r1.dt)) and $(ncol(r2.dt)))"))
+        eq = Nullable(true)
+        @inbounds for (col1, col2) in zip(columns(r1.dt), columns(r2.dt)) # columns are paired by position
+            eq_col = convert(Nullable{Bool}, col1[r1.row] == col2[r2.row])
+            # If true or null, need to compare remaining columns
+            get(eq_col, true) || return Nullable(false)
+            eq &= eq_col
+        end
+        return eq
+    else
+        r1.row == r2.row && return Nullable(true) # same table and same row index
+        eq = Nullable(true)
+        @inbounds for col in columns(r1.dt)
+            eq_col = convert(Nullable{Bool}, col[r1.row] == col[r2.row])
+            # If true or null, need to compare remaining columns
+            get(eq_col, true) || return Nullable(false)
+            eq &= eq_col
+        end
+        return eq
+    end
+end
+
+# internal method for comparing the elements of the same data frame column
+isequal_colel(col::AbstractArray, r1::Int, r2::Int) =
+    (r1 == r2) || isequal(Base.unsafe_getindex(col, r1), Base.unsafe_getindex(col, r2))
+
+function isequal_colel{T}(col::Union{NullableArray{T},
+                                     AbstractNullableCategoricalArray{T}},
+                                     r1::Int, r2::Int)
+    (r1 == r2) && return true
+    isnull(col[r1]) && return isnull(col[r2])
+    return !isnull(col[r2]) && isequal(get(col[r1]), get(col[r2]))
+end
+
+isequal_colel(a::Any, b::Any) = isequal(a, b)
+isequal_colel(a::Nullable, b::Any) = !isnull(a) && isequal(get(a), b)
+isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a)
+isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(get(a), get(b)))
+
+# isequal-based comparison of rows r1 and r2 of the same table, column by column
+function isequal_row(dt::AbstractDataTable, r1::Int, r2::Int)
+    (r1 == r2) && return true # same row
+    @inbounds for col in columns(dt)
+        isequal_colel(col, r1, r2) || return false # stop at the first column that differs
+    end
+    return true
+end
+
+function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int)
+    (dt1 === dt2) && return isequal_row(dt1, r1, r2)
+    (ncol(dt1) == ncol(dt2)) ||
+        throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
+    @inbounds for (col1, col2) in zip(columns(dt1), columns(dt2))
+        isequal_colel(col1[r1], col2[r2]) || return false
     end
     return true
 end
+
+# comparison of DataTable rows
+# rows are equal if they have the same values (while the row indices could differ)
+Base.isequal(r1::DataTableRow, r2::DataTableRow) =
+    isequal_row(r1.dt, r1.row, r2.dt, r2.row)
+
+# lexicographic ordering on DataTable rows, null > !null; ArgumentError if column counts differ
+function Base.isless(r1::DataTableRow, r2::DataTableRow)
+    (ncol(r1.dt) == ncol(r2.dt)) ||
+        throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(r1.dt)) and $(ncol(r2.dt)))"))
+    @inbounds for i in 1:ncol(r1.dt)
+        col1 = r1.dt[i]
+        col2 = r2.dt[i]
+        isnull1 = _isnull(col1, r1.row)
+        isnull2 = _isnull(col2, r2.row)
+        (isnull1 != isnull2) && return isnull2 # null > !null
+        if !isnull1 # both values are non-null here
+            v1 = get(col1[r1.row])
+            v2 = get(col2[r2.row])
+            isless(v1, v2) && return true
+            !isequal(v1, v2) && return false # first differing column decides
+        end
+    end
+    return false # all columns equal, so r1 is not less than r2
+end