Skip to content
This repository was archived by the owner on May 5, 2019. It is now read-only.

Commit 776f293

Browse files
cjprybol authored and nalimilan committed
Update vcat for consistency w/ Base.vcat and improve array promotion
This PR removes vcat support for arrays of datatables and makes the Base.vcat style of vcat(args...) the only call option. It also removes the previous handling of datatables whose columns are missing, unique to one argument, or in a different order: vcat'ing datatables with mismatched headers now throws an error whose message explains how the columns are inconsistent. Uses @nalimilan's @generated function to implement a new AbstractArray promotion rule that improves handling of NullableArrays and CategoricalArrays. Extends vcat testing.
1 parent f047cd2 commit 776f293

File tree

2 files changed

+208
-110
lines changed

2 files changed

+208
-110
lines changed

src/abstractdatatable/abstractdatatable.jl

Lines changed: 83 additions & 69 deletions
Original file line number | Diff line number | Diff line change
@@ -706,83 +706,97 @@ Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable) = hcat!(dt[:, :], dt2)
706706
Base.hcat(dt::AbstractDataTable, x, y...) = hcat!(hcat(dt, x), y...)
707707
Base.hcat(dt1::AbstractDataTable, dt2::AbstractDataTable, dtn::AbstractDataTable...) = hcat!(hcat(dt1, dt2), dtn...)
708708

709-
# vcat only accepts DataTables. Finds union of columns, maintaining order
710-
# of first dt. Missing data become null values.
711-
712-
Base.vcat(dt::AbstractDataTable) = dt
713-
714-
Base.vcat(dts::AbstractDataTable...) = vcat(AbstractDataTable[dts...])
715-
716-
function Base.vcat{T<:AbstractDataTable}(dts::Vector{T})
717-
isempty(dts) && return DataTable()
718-
coltyps, colnams, similars = _colinfo(dts)
719-
720-
res = DataTable()
721-
Nrow = sum(nrow, dts)
722-
for j in 1:length(colnams)
723-
colnam = colnams[j]
724-
col = similar(similars[j], coltyps[j], Nrow)
725-
726-
i = 1
727-
for dt in dts
728-
if haskey(dt, colnam)
729-
copy!(col, i, dt[colnam])
730-
end
731-
i += size(dt, 1)
709+
@generated function promote_col_type(cols::AbstractVector...)
710+
elty = Base.promote_eltype(cols...)
711+
if elty <: Nullable
712+
elty = eltype(elty)
713+
end
714+
if elty <: CategoricalValue
715+
elty = elty.parameters[1]
716+
end
717+
if any(col -> eltype(col) <: Nullable, cols)
718+
if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols)
719+
return :(NullableCategoricalVector{$elty})
720+
else
721+
return :(NullableVector{$elty})
722+
end
723+
else
724+
if any(col -> col <: Union{AbstractCategoricalArray, AbstractNullableCategoricalArray}, cols)
725+
return :(CategoricalVector{$elty})
726+
else
727+
return :(Vector{$elty})
732728
end
733-
734-
res[colnam] = col
735729
end
736-
res
737730
end
738731

739-
_isnullable{T}(::AbstractArray{T}) = T <: Nullable
740-
const EMPTY_DATA = NullableArray(Void, 0)
741-
742-
function _colinfo{T<:AbstractDataTable}(dts::Vector{T})
743-
dt1 = dts[1]
744-
colindex = copy(index(dt1))
745-
coltyps = eltypes(dt1)
746-
similars = collect(columns(dt1))
747-
nonnull_ct = Int[_isnullable(c) for c in columns(dt1)]
748-
749-
for i in 2:length(dts)
750-
dt = dts[i]
751-
for j in 1:size(dt, 2)
752-
col = dt[j]
753-
cn, ct = _names(dt)[j], eltype(col)
754-
if haskey(colindex, cn)
755-
idx = colindex[cn]
756-
757-
oldtyp = coltyps[idx]
758-
if !(ct <: oldtyp)
759-
coltyps[idx] = promote_type(oldtyp, ct)
760-
# Needed on Julia 0.4 since e.g.
761-
# promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T},
762-
# which is not a usable type: fall back to Nullable{Any}
763-
if VERSION < v"0.5.0-dev" &&
764-
coltyps[idx] <: Nullable && !isa(coltyps[idx].types[2], DataType)
765-
coltyps[idx] = Nullable{Any}
766-
end
767-
end
768-
nonnull_ct[idx] += !_isnullable(col)
769-
else # new column
770-
push!(colindex, cn)
771-
push!(coltyps, ct)
772-
push!(similars, col)
773-
push!(nonnull_ct, !_isnullable(col))
732+
"""
733+
vcat(dts::AbstractDataTable...)
734+
735+
Vertically concatenate `AbstractDataTables` that have the same column names in
736+
the same order.
737+
738+
# Example
739+
```jldoctest
740+
julia> dt1 = DataTable(A=1:3, B=1:3);
741+
julia> dt2 = DataTable(A=4:6, B=4:6);
742+
julia> vcat(dt1, dt2)
743+
6×2 DataTables.DataTable
744+
│ Row │ A │ B │
745+
├─────┼───┼───┤
746+
│ 1 │ 1 │ 1 │
747+
│ 2 │ 2 │ 2 │
748+
│ 3 │ 3 │ 3 │
749+
│ 4 │ 4 │ 4 │
750+
│ 5 │ 5 │ 5 │
751+
│ 6 │ 6 │ 6 │
752+
```
753+
"""
754+
Base.vcat(dt::AbstractDataTable) = dt
755+
function Base.vcat(dts::AbstractDataTable...)
756+
isempty(dts) && return DataTable()
757+
allheaders = map(names, dts)
758+
if all(h -> length(h) == 0, allheaders)
759+
return DataTable()
760+
end
761+
uniqueheaders = unique(allheaders)
762+
if length(uniqueheaders) > 1
763+
unionunique = union(uniqueheaders...)
764+
coldiff = setdiff(unionunique, intersect(uniqueheaders...))
765+
if !isempty(coldiff)
766+
# if any datatables are a full superset of names, skip them
767+
filter!(u -> Set(u) != Set(unionunique), uniqueheaders)
768+
estrings = Vector{String}(length(uniqueheaders))
769+
for (i, u) in enumerate(uniqueheaders)
770+
matching = find(h -> u == h, allheaders)
771+
headerdiff = setdiff(coldiff, u)
772+
cols = join(headerdiff, ", ", " and ")
773+
args = join(matching, ", ", " and ")
774+
estrings[i] = "column(s) $cols are missing from argument(s) $args"
774775
end
776+
throw(ArgumentError(join(estrings, ", ", ", and ")))
777+
else
778+
estrings = Vector{String}(length(uniqueheaders))
779+
for (i, u) in enumerate(uniqueheaders)
780+
indices = find(a -> a == u, allheaders)
781+
estrings[i] = "column order of argument(s) $(join(indices, ", ", " and "))"
782+
end
783+
throw(ArgumentError(join(estrings, " != ")))
775784
end
776-
end
777-
778-
for j in 1:length(colindex)
779-
if nonnull_ct[j] < length(dts) && !_isnullable(similars[j])
780-
similars[j] = EMPTY_DATA
785+
else
786+
header = uniqueheaders[1]
787+
cols = Vector{Any}(length(header))
788+
for i in 1:length(cols)
789+
data = [dt[i] for dt in dts]
790+
lens = map(length, data)
791+
cols[i] = promote_col_type(data...)(sum(lens))
792+
offset = 1
793+
for j in 1:length(data)
794+
copy!(cols[i], offset, data[j])
795+
offset += lens[j]
796+
end
781797
end
798+
return DataTable(cols, header)
782799
end
783-
colnams = _names(colindex)
784-
785-
coltyps, colnams, similars
786800
end
787801

788802
##############################################################################

test/cat.jl

Lines changed: 125 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -72,14 +72,14 @@ module TestCat
7272
dt[1:2, 1:2] = [3,2]
7373
dt[[true,false,false,true], 2:3] = [2,3]
7474

75-
vcat([])
76-
vcat(null_dt)
77-
vcat(null_dt, null_dt)
78-
vcat(null_dt, dt)
79-
vcat(dt, null_dt)
80-
vcat(dt, dt)
81-
vcat(dt, dt, dt)
82-
@test vcat(DataTable[]) == DataTable()
75+
@test vcat(null_dt) == DataTable()
76+
@test vcat(null_dt, null_dt) == DataTable()
77+
@test_throws ArgumentError vcat(null_dt, dt)
78+
@test_throws ArgumentError vcat(dt, null_dt)
79+
@test eltypes(vcat(dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}]
80+
@test size(vcat(dt, dt)) == (size(dt,1)*2, size(dt,2))
81+
@test eltypes(vcat(dt, dt, dt)) == [Nullable{Float64}, Nullable{Float64}, Nullable{Int}]
82+
@test size(vcat(dt, dt, dt)) == (size(dt,1)*3, size(dt,2))
8383

8484
alt_dt = deepcopy(dt)
8585
vcat(dt, alt_dt)
@@ -88,29 +88,13 @@ module TestCat
8888
dt[1] = zeros(Int, nrow(dt))
8989
vcat(dt, alt_dt)
9090

91-
# Don't fail on non-matching names
92-
names!(alt_dt, [:A, :B, :C])
93-
vcat(dt, alt_dt)
94-
9591
dtr = vcat(dt4, dt4)
9692
@test size(dtr, 1) == 8
9793
@test names(dt4) == names(dtr)
9894
@test isequal(dtr, [dt4; dt4])
9995

100-
dtr = vcat(dt2, dt3)
101-
@test size(dtr) == (8,2)
102-
@test names(dt2) == names(dtr)
103-
@test isnull(dtr[8,:x2])
104-
105-
# Eltype promotion
106-
# Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
107-
if VERSION >= v"0.5.0-dev"
108-
@test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}]
109-
@test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}]
110-
else
111-
@test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Any}]
112-
@test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Any}]
113-
end
96+
@test eltypes(vcat(DataTable(a = [1]), DataTable(a = [2.1]))) == [Nullable{Float64}]
97+
@test eltypes(vcat(DataTable(a = NullableArray(Int, 1)), DataTable(a = [2.1]))) == [Nullable{Float64}]
11498

11599
# Minimal container type promotion
116100
dta = DataTable(a = CategoricalArray([1, 2, 2]))
@@ -122,12 +106,7 @@ module TestCat
122106
@test isequal(dtab[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
123107
@test isequal(dtac[:a], Nullable{Int}[1, 2, 2, 2, 3, 4])
124108
@test isa(dtab[:a], NullableCategoricalVector{Int})
125-
# Fails on Julia 0.4 since promote_type(Nullable{Int}, Nullable{Float64}) gives Nullable{T}
126-
if VERSION >= v"0.5.0-dev"
127-
@test isa(dtac[:a], NullableCategoricalVector{Int})
128-
else
129-
@test isa(dtac[:a], NullableCategoricalVector{Any})
130-
end
109+
@test isa(dtac[:a], NullableCategoricalVector{Int})
131110
# ^^ container may flip if container promotion happens in Base/DataArrays
132111
dc = vcat(dtd, dtc)
133112
@test isequal(vcat(dtc, dtd), dc)
@@ -137,15 +116,120 @@ module TestCat
137116
@test isequal(vcat(dtd, dtc0, dtc), dc)
138117
@test eltypes(vcat(dtd, dtc0)) == eltypes(dc)
139118

140-
# Missing columns
141-
rename!(dtd, :a, :b)
142-
dtda = DataTable(b = NullableArray(Nullable{Int}[2, 3, 4, Nullable(), Nullable(), Nullable()]),
143-
a = NullableCategoricalVector(Nullable{Int}[Nullable(), Nullable(), Nullable(), 1, 2, 2]))
144-
@test isequal(vcat(dtd, dta), dtda)
145-
146-
# Alignment
147-
@test isequal(vcat(dtda, dtd, dta), vcat(dtda, dtda))
148-
149119
# vcat should be able to concatenate different implementations of AbstractDataTable (PR #944)
150120
@test isequal(vcat(view(DataTable(A=1:3),2),DataTable(A=4:5)), DataTable(A=[2,4,5]))
121+
122+
@testset "vcat >2 args" begin
123+
@test vcat(DataTable(), DataTable(), DataTable()) == DataTable()
124+
dt = DataTable(x = trues(1), y = falses(1))
125+
@test vcat(dt, dt, dt) == DataTable(x = trues(3), y = falses(3))
126+
end
127+
128+
@testset "vcat mixed coltypes" begin
129+
drf = CategoricalArrays.DefaultRefType
130+
dt = vcat(DataTable([[1]], [:x]), DataTable([[1.0]], [:x]))
131+
@test dt == DataTable([[1.0, 1.0]], [:x])
132+
@test typeof.(dt.columns) == [Vector{Float64}]
133+
dt = vcat(DataTable([[1]], [:x]), DataTable([["1"]], [:x]))
134+
@test dt == DataTable([[1, "1"]], [:x])
135+
@test typeof.(dt.columns) == [Vector{Any}]
136+
dt = vcat(DataTable([NullableArray([1])], [:x]), DataTable([[1]], [:x]))
137+
@test dt == DataTable([NullableArray([1, 1])], [:x])
138+
@test typeof.(dt.columns) == [NullableVector{Int}]
139+
dt = vcat(DataTable([CategoricalArray([1])], [:x]), DataTable([[1]], [:x]))
140+
@test dt == DataTable([CategoricalArray([1, 1])], [:x])
141+
@test typeof.(dt.columns) == [CategoricalVector{Int, drf}]
142+
dt = vcat(DataTable([CategoricalArray([1])], [:x]),
143+
DataTable([NullableArray([1])], [:x]))
144+
@test dt == DataTable([NullableCategoricalArray([1, 1])], [:x])
145+
@test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}]
146+
dt = vcat(DataTable([CategoricalArray([1])], [:x]),
147+
DataTable([NullableCategoricalArray([1])], [:x]))
148+
@test dt == DataTable([NullableCategoricalArray([1, 1])], [:x])
149+
@test typeof.(dt.columns) == [NullableCategoricalVector{Int, drf}]
150+
dt = vcat(DataTable([NullableArray([1])], [:x]),
151+
DataTable([NullableArray(["1"])], [:x]))
152+
@test dt == DataTable([NullableArray([1, "1"])], [:x])
153+
@test typeof.(dt.columns) == [NullableVector{Any}]
154+
dt = vcat(DataTable([CategoricalArray([1])], [:x]),
155+
DataTable([CategoricalArray(["1"])], [:x]))
156+
@test dt == DataTable([CategoricalArray([1, "1"])], [:x])
157+
@test typeof.(dt.columns) == [CategoricalVector{Any, drf}]
158+
dt = vcat(DataTable([trues(1)], [:x]), DataTable([[false]], [:x]))
159+
@test dt == DataTable([[true, false]], [:x])
160+
@test typeof.(dt.columns) == [Vector{Bool}]
161+
end
162+
163+
@testset "vcat errors" begin
164+
err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[]))
165+
@test err.value.msg == "column(s) x are missing from argument(s) 1 and 2"
166+
err = @test_throws ArgumentError vcat(DataTable(), DataTable(), DataTable(x=[1]))
167+
@test err.value.msg == "column(s) x are missing from argument(s) 1 and 2"
168+
dt1 = DataTable(A = 1:3, B = 1:3)
169+
dt2 = DataTable(A = 1:3)
170+
# right missing 1 column
171+
err = @test_throws ArgumentError vcat(dt1, dt2)
172+
@test err.value.msg == "column(s) B are missing from argument(s) 2"
173+
# left missing 1 column
174+
err = @test_throws ArgumentError vcat(dt2, dt1)
175+
@test err.value.msg == "column(s) B are missing from argument(s) 1"
176+
# multiple missing 1 column
177+
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2, dt2)
178+
@test err.value.msg == "column(s) B are missing from argument(s) 2, 3, 4, 5 and 6"
179+
# argument missing >1 columns
180+
dt1 = DataTable(A = 1:3, B = 1:3, C = 1:3, D = 1:3, E = 1:3)
181+
err = @test_throws ArgumentError vcat(dt1, dt2)
182+
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2"
183+
# >1 arguments missing >1 columns
184+
err = @test_throws ArgumentError vcat(dt1, dt2, dt2, dt2, dt2)
185+
@test err.value.msg == "column(s) B, C, D and E are missing from argument(s) 2, 3, 4 and 5"
186+
# out of order
187+
dt2 = dt1[reverse(names(dt1))]
188+
err = @test_throws ArgumentError vcat(dt1, dt2)
189+
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2"
190+
# first group >1 arguments
191+
err = @test_throws ArgumentError vcat(dt1, dt1, dt2)
192+
@test err.value.msg == "column order of argument(s) 1 and 2 != column order of argument(s) 3"
193+
# second group >1 arguments
194+
err = @test_throws ArgumentError vcat(dt1, dt2, dt2)
195+
@test err.value.msg == "column order of argument(s) 1 != column order of argument(s) 2 and 3"
196+
# first and second groups >1 argument
197+
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2)
198+
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6"
199+
# >2 groups out of order
200+
srand(1)
201+
dt3 = dt1[shuffle(names(dt1))]
202+
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt3)
203+
@test err.value.msg == "column order of argument(s) 1, 2 and 3 != column order of argument(s) 4, 5 and 6 != column order of argument(s) 7, 8, 9 and 10"
204+
# missing columns throws error before out of order columns
205+
dt1 = DataTable(A = 1, B = 1)
206+
dt2 = DataTable(A = 1)
207+
dt3 = DataTable(B = 1, A = 1)
208+
err = @test_throws ArgumentError vcat(dt1, dt2, dt3)
209+
@test err.value.msg == "column(s) B are missing from argument(s) 2"
210+
# unique columns for both sides
211+
dt1 = DataTable(A = 1, B = 1, C = 1, D = 1)
212+
dt2 = DataTable(A = 1, C = 1, D = 1, E = 1, F = 1)
213+
err = @test_throws ArgumentError vcat(dt1, dt2)
214+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, and column(s) B are missing from argument(s) 2"
215+
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2)
216+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, and column(s) B are missing from argument(s) 3 and 4"
217+
dt3 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1)
218+
err = @test_throws ArgumentError vcat(dt1, dt2, dt3)
219+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3"
220+
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3)
221+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6"
222+
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3)
223+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9"
224+
# dt4 is a superset of names found in all other datatables and won't be shown in error
225+
dt4 = DataTable(A = 1, B = 1, C = 1, D = 1, E = 1, F = 1)
226+
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4)
227+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, column(s) B are missing from argument(s) 2, and column(s) F are missing from argument(s) 3"
228+
err = @test_throws ArgumentError vcat(dt1, dt1, dt2, dt2, dt3, dt3, dt4, dt4)
229+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1 and 2, column(s) B are missing from argument(s) 3 and 4, and column(s) F are missing from argument(s) 5 and 6"
230+
err = @test_throws ArgumentError vcat(dt1, dt1, dt1, dt2, dt2, dt2, dt3, dt3, dt3, dt4, dt4, dt4)
231+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 2 and 3, column(s) B are missing from argument(s) 4, 5 and 6, and column(s) F are missing from argument(s) 7, 8 and 9"
232+
err = @test_throws ArgumentError vcat(dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4, dt1, dt2, dt3, dt4)
233+
@test err.value.msg == "column(s) E and F are missing from argument(s) 1, 5 and 9, column(s) B are missing from argument(s) 2, 6 and 10, and column(s) F are missing from argument(s) 3, 7 and 11"
234+
end
151235
end

0 commit comments

Comments
 (0)