This repository was archived by the owner on May 5, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathdatatablerow.jl
152 lines (128 loc) · 5.67 KB
/
datatablerow.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# Container for a DataTable row.
# Pairs a table (`dt`) with the integer position (`row`) of one of its rows.
# The type is immutable, so a row object always refers to the same table
# object and the same row position.
immutable DataTableRow{T <: AbstractDataTable}
# the table the row is taken from
dt::T
# position of the row within `dt`
row::Int
end
# Array-valued index: index the parent table with `idx` and wrap the result in
# a new `DataTableRow` that keeps the same row number.
# NOTE(review): presumably `r.dt[idx]` is a column selection -- confirm against
# the table's own `getindex`.
Base.getindex(r::DataTableRow, idx::AbstractArray) =
    DataTableRow(getindex(r.dt, idx), r.row)
# Any other index: read from the parent table at this row, i.e. the same
# lookup as `r.dt[r.row, idx]`.
Base.getindex(r::DataTableRow, idx::Any) = getindex(r.dt, r.row, idx)
# Write `value` into the parent table at (this row, `idx`); the result of the
# table's own `setindex!` is passed back to the caller unchanged.
Base.setindex!(r::DataTableRow, value::Any, idx::Any) =
    setindex!(r.dt, value, r.row, idx)
# Names of a row are the names of its parent table.
function Base.names(r::DataTableRow)
    return names(r.dt)
end

# Internal counterpart of `names`: forwards to `_names` of the parent table.
function _names(r::DataTableRow)
    return _names(r.dt)
end
# Row restricted to the single selector `c`: the parent table is indexed with
# the one-element vector `[c]` and the same row number is kept.
function Base.view(r::DataTableRow, c)
    subtable = r.dt[[c]]
    return DataTableRow(subtable, r.row)
end
# Forwards to `index` of the parent table.
function index(r::DataTableRow)
    return index(r.dt)
end
# A row has one element per column of its table: its length is the second
# dimension of the table's size.
function Base.length(r::DataTableRow)
    return size(r.dt, 2)
end

# The last valid index of a row is likewise the second dimension of the
# table's size.
function Base.endof(r::DataTableRow)
    return size(r.dt, 2)
end
# Iteration protocol (start/next/done). The state is a column position that
# starts at 1; each step yields a `(name, value)` tuple for that position.
function Base.start(r::DataTableRow)
    return 1
end

function Base.next(r::DataTableRow, s)
    item = (_names(r)[s], r[s])
    return (item, s + 1)
end

# Iteration is finished once the state has moved past the last column.
function Base.done(r::DataTableRow, s)
    return s > length(r)
end
# Conversion to `Array`: slice this row out of the parent table
# (`r.dt[r.row, :]`) and convert that slice.
function Base.convert(::Type{Array}, r::DataTableRow)
    single_row = r.dt[r.row, :]
    return convert(Array, single_row)
end
# Materialise the row by iterating it: returns an array with element type
# `Tuple{Symbol, Any}` holding every item the row iterator produces.
function Base.collect(r::DataTableRow)
    return Tuple{Symbol, Any}[pair for pair in r]
end
# Hash a column element.
# Generic fallback: mix the `i`-th element of `v` into the running hash `h`
# (`h` defaults to `zero(UInt)`).
Base.@propagate_inbounds function hash_colel(v::AbstractArray, i, h::UInt = zero(UInt))
    return hash(v[i], h)
end
# Arrays whose elements are `Nullable`: a null element contributes the fixed
# `Base.nullablehash_seed`, a non-null element contributes its unwrapped value.
Base.@propagate_inbounds function hash_colel{T<:Nullable}(v::AbstractArray{T}, i,
                                                          h::UInt = zero(UInt))
    el = v[i]
    if isnull(el)
        return hash(Base.nullablehash_seed, h)
    else
        return hash(get(el), h)
    end
end
# `NullableArray`: ask the array whether position `i` is null and, if it is
# not, hash the raw entry of its `values` field.
Base.@propagate_inbounds function hash_colel{T}(v::NullableArray{T}, i,
                                                h::UInt = zero(UInt))
    if isnull(v, i)
        return hash(Base.nullablehash_seed, h)
    else
        return hash(v.values[i], h)
    end
end
# Categorical array: translate the stored reference `v.refs[i]` through the
# pool's index and hash the entry found there.
Base.@propagate_inbounds function hash_colel{T}(v::AbstractCategoricalArray{T}, i,
                                                h::UInt = zero(UInt))
    pool_index = CategoricalArrays.index(v.pool)
    return hash(pool_index[v.refs[i]], h)
end
# Nullable categorical array: a reference of 0 is hashed like a null (using
# `Base.nullablehash_seed`); any other reference is translated through the
# pool's index and the entry found there is hashed.
Base.@propagate_inbounds function hash_colel{T}(v::AbstractNullableCategoricalArray{T}, i,
                                                h::UInt = zero(UInt))
    code = v.refs[i]
    if code == 0
        return hash(Base.nullablehash_seed, h)
    end
    return hash(CategoricalArrays.index(v.pool)[code], h)
end
# Hash of a DataTable row, computed from the row's values only, so that
# duplicate rows end up with the same hash.
# The running hash starts at `h` and is threaded through every column, in
# column order, via `hash_colel`.
function rowhash(dt::DataTable, r::Int, h::UInt = zero(UInt))
    acc = h
    @inbounds for column in columns(dt)
        acc = hash_colel(column, r, acc)
    end
    return acc
end
# Hash of a row object: the value-based `rowhash` of its table at its row.
function Base.hash(r::DataTableRow, h::UInt = zero(UInt))
    return rowhash(r.dt, r.row, h)
end
# Comparison of DataTable rows.
# Rows are equal if they hold the same values (the row indices may differ).
# Returns a Nullable{Bool}:
#   * Nullable(false) as soon as one column compares unequal,
#   * null when every non-null comparison is true but some comparison is null,
#   * Nullable(true) otherwise.
# Throws an ArgumentError when the rows come from different tables that do not
# have the same number of columns.
function @compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow)
    if r1.dt === r2.dt
        # same table, same position: trivially equal, nothing to inspect
        r1.row == r2.row && return Nullable(true)
    elseif ncol(r1.dt) != ncol(r2.dt)
        throw(ArgumentError("Comparing rows from different frames not supported"))
    end
    result = Nullable(true)
    # When both rows share a table the two column sequences are the same, so
    # this single loop covers the same-table and the two-table case alike.
    @inbounds for (c1, c2) in zip(columns(r1.dt), columns(r2.dt))
        col_result = convert(Nullable{Bool}, c1[r1.row] == c2[r2.row])
        # Only a definite `false` settles the answer; true or null means the
        # remaining columns still have to be compared.
        if !get(col_result, true)
            return Nullable(false)
        end
        result &= col_result
    end
    return result
end
# Internal method for comparing two elements of the same column.
# Identical positions are equal by definition; otherwise the two elements are
# fetched with `Base.unsafe_getindex` and compared with `isequal`.
function isequal_colel(col::AbstractArray, r1::Int, r2::Int)
    r1 == r2 && return true
    lhs = Base.unsafe_getindex(col, r1)
    rhs = Base.unsafe_getindex(col, r2)
    return isequal(lhs, rhs)
end
# Same-column comparison for nullable columns: two nulls are equal, a null and
# a non-null are not, and two non-nulls are compared by their unwrapped values.
function isequal_colel{T}(col::Union{NullableArray{T},
                                     AbstractNullableCategoricalArray{T}},
                          r1::Int, r2::Int)
    r1 == r2 && return true
    x = col[r1]
    y = col[r2]
    if isnull(x)
        return isnull(y)
    elseif isnull(y)
        return false
    else
        return isequal(get(x), get(y))
    end
end
# Value-level comparison of two already-extracted elements.
# Plain values are compared with `isequal`; a null `Nullable` never equals a
# plain value; two `Nullable`s are equal when both are null or when both are
# non-null and `isequal`.
isequal_colel(a::Any, b::Any) = isequal(a, b)
# Must short-circuit with `&&`: with the eager `&`, `unsafe_get(a)` was also
# evaluated for a null `a`, which throws `UndefRefError` when the wrapped type
# is not a bits type (e.g. `Nullable{String}()`).
isequal_colel(a::Nullable, b::Any) = !isnull(a) && isequal(unsafe_get(a), b)
# Mixed order: reuse the (Nullable, Any) method with the arguments swapped.
isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a)
isequal_colel(a::Nullable, b::Nullable) =
    isnull(a) == isnull(b) && (isnull(a) || isequal(a, b))
# Comparison of two rows of the same DataTable.
# Returns `true` when `r1` and `r2` are the same position, or when every
# column reports the two positions as equal via `isequal_colel`.
function isequal_row(dt::AbstractDataTable, r1::Int, r2::Int)
    r1 == r2 && return true # same row
    @inbounds for column in columns(dt)
        if !isequal_colel(column, r1, r2)
            return false
        end
    end
    return true
end
# Comparison of row `r1` of `dt1` with row `r2` of `dt2`.
# When both arguments are the very same table this defers to the
# single-table method. Otherwise the tables must have the same number of
# columns (ArgumentError if not) and the rows are compared column by column
# on the extracted element values.
function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int)
    if dt1 === dt2
        return isequal_row(dt1, r1, r2)
    end
    if ncol(dt1) != ncol(dt2)
        throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))"))
    end
    @inbounds for (a, b) in zip(columns(dt1), columns(dt2))
        if !isequal_colel(a[r1], b[r2])
            return false
        end
    end
    return true
end
# `isequal` on DataTable rows.
# Rows are equal if they hold the same values (the row indices may differ);
# the actual work is done by `isequal_row`.
function Base.isequal(r1::DataTableRow, r2::DataTableRow)
    return isequal_row(r1.dt, r1.row, r2.dt, r2.row)
end
# Lexicographic ordering on DataTable rows, null > !null.
# Columns are compared left to right; the first column in which the two rows
# differ decides the result. Returns `false` when no column differs.
# Throws an ArgumentError when the two tables do not have the same number of
# columns.
function Base.isless(r1::DataTableRow, r2::DataTableRow)
    # The message used to interpolate `dt1`/`dt2`, which do not exist in this
    # method, so a column-count mismatch raised UndefVarError instead of the
    # intended ArgumentError.
    (ncol(r1.dt) == ncol(r2.dt)) ||
        throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(r1.dt)) and $(ncol(r2.dt)))"))
    @inbounds for i in 1:ncol(r1.dt)
        col1 = r1.dt[i]
        col2 = r2.dt[i]
        isnull1 = _isnull(col1, r1.row)
        isnull2 = _isnull(col2, r2.row)
        # Exactly one side is null: the non-null side sorts first, so `r1` is
        # less only when the null is on the `r2` side.
        (isnull1 != isnull2) && return isnull2 # null > !null
        if !isnull1
            # NOTE(review): `get` assumes the stored elements are Nullable --
            # confirm for columns that hold plain values.
            v1 = get(col1[r1.row])
            v2 = get(col2[r2.row])
            isless(v1, v2) && return true
            !isequal(v1, v2) && return false
        end
        # both null, or both values equal: the next column decides
    end
    return false
end