Skip to content
This repository was archived by the owner on May 5, 2019. It is now read-only.

Commit 12443f4

Browse files
cjprybolnalimilan
authored andcommitted
adjust outer join behavior (types and right outer join bug)
outer joins need to return nullable tables as they may introduce missing data. similar_nullable on DataTables has been removed (unused) and replaced with a similar_nullable that works on NullableCategoricalArrays, and this change is made to support the new changes to join. The 3 outer joins share a function with inner joins, and this shared function (compose_joined_table) now performs a check to see if the join type is :inner, and if so, it will return the same column type as the parent table rather than promoting to a nullable column. A bug was found in right-outer join behavior where the values unique to the right table were added to the table in the incorrect locations, overwriting data and leaving nulls where they shouldn't be. This bug, due to incorrect values in rightonly_ixs.join, was fixed by filling the last n-rows of the datatable where n = length(rightonly_ixs.join). Tests were checked for accuracy against pandas.
1 parent 776f293 commit 12443f4

File tree

2 files changed

+254
-38
lines changed

2 files changed

+254
-38
lines changed

src/abstractdatatable/join.jl

Lines changed: 53 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,10 @@ similar_nullable{T}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}})
99
similar_nullable{T<:Nullable}(dv::AbstractArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =
1010
NullableArray{eltype(T)}(dims)
1111

12-
similar_nullable{T,R}(dv::CategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) =
12+
similar_nullable{T}(dv::CategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =
1313
NullableCategoricalArray{T}(dims)
1414

15-
similar_nullable(dt::AbstractDataTable, dims::Int) =
16-
DataTable(Any[similar_nullable(x, dims) for x in columns(dt)], copy(index(dt)))
17-
18-
similar_nullable{T,R}(dv::NullableCategoricalArray{T,R}, dims::Union{Int, Tuple{Vararg{Int}}}) =
15+
similar_nullable{T}(dv::NullableCategoricalArray{T}, dims::Union{Int, Tuple{Vararg{Int}}}) =
1916
NullableCategoricalArray{T}(dims)
2017

2118
# helper structure for DataTables joining
@@ -47,55 +44,74 @@ Base.length(x::RowIndexMap) = length(x.orig)
4744

4845
# composes the joined data table using the maps between the left and right
4946
# table rows and the indices of rows in the result
50-
function compose_joined_table(joiner::DataTableJoiner,
47+
function compose_joined_table(joiner::DataTableJoiner, kind::Symbol,
5148
left_ixs::RowIndexMap, leftonly_ixs::RowIndexMap,
5249
right_ixs::RowIndexMap, rightonly_ixs::RowIndexMap)
5350
@assert length(left_ixs) == length(right_ixs)
5451
# compose left half of the result taking all left columns
5552
all_orig_left_ixs = vcat(left_ixs.orig, leftonly_ixs.orig)
56-
if length(leftonly_ixs) > 0
53+
54+
ril = length(right_ixs)
55+
lil = length(left_ixs)
56+
loil = length(leftonly_ixs)
57+
roil = length(rightonly_ixs)
58+
59+
if loil > 0
5760
# combine the matched (left_ixs.orig) and non-matched (leftonly_ixs.orig) indices of the left table rows
5861
# preserving the original rows order
59-
all_orig_left_ixs = similar(left_ixs.orig, length(left_ixs)+length(leftonly_ixs))
62+
all_orig_left_ixs = similar(left_ixs.orig, lil + loil)
6063
@inbounds all_orig_left_ixs[left_ixs.join] = left_ixs.orig
6164
@inbounds all_orig_left_ixs[leftonly_ixs.join] = leftonly_ixs.orig
6265
else
6366
# the result contains only the left rows that are matched to right rows (left_ixs)
6467
all_orig_left_ixs = left_ixs.orig # no need to copy left_ixs.orig as it's not used elsewhere
6568
end
66-
ril = length(right_ixs)
67-
loil = length(leftonly_ixs)
68-
roil = length(rightonly_ixs)
69-
left_dt = DataTable(Any[resize!(col[all_orig_left_ixs], length(all_orig_left_ixs)+roil)
70-
for col in columns(joiner.dtl)],
71-
names(joiner.dtl))
72-
73-
# compose right half of the result taking all right columns excluding on
74-
dtr_noon = without(joiner.dtr, joiner.on_cols)
7569
# permutation to swap rightonly and leftonly rows
7670
right_perm = vcat(1:ril, ril+roil+1:ril+roil+loil, ril+1:ril+roil)
7771
if length(leftonly_ixs) > 0
7872
# compose right_perm with the permutation that restores left rows order
7973
right_perm[vcat(right_ixs.join, leftonly_ixs.join)] = right_perm[1:ril+loil]
8074
end
8175
all_orig_right_ixs = vcat(right_ixs.orig, rightonly_ixs.orig)
82-
right_dt = DataTable(Any[resize!(col[all_orig_right_ixs], length(all_orig_right_ixs)+loil)[right_perm]
83-
for col in columns(dtr_noon)],
84-
names(dtr_noon))
85-
# merge left and right parts of the joined table
86-
res = hcat!(left_dt, right_dt)
76+
77+
# compose right half of the result taking all right columns excluding on
78+
dtr_noon = without(joiner.dtr, joiner.on_cols)
79+
80+
nrow = length(all_orig_left_ixs) + roil
81+
@assert nrow == length(all_orig_right_ixs) + loil
82+
ncleft = ncol(joiner.dtl)
83+
cols = Vector{Any}(ncleft + ncol(dtr_noon))
84+
_similar = kind == :inner ? similar : similar_nullable
85+
for (i, col) in enumerate(columns(joiner.dtl))
86+
cols[i] = _similar(col, nrow)
87+
fillcolumn!(cols[i], col, all_orig_left_ixs)
88+
end
89+
for (i, col) in enumerate(columns(dtr_noon))
90+
cols[i+ncleft] = _similar(col, nrow)
91+
fillcolumn!(cols[i+ncleft], col, all_orig_right_ixs)
92+
permute!(cols[i+ncleft], right_perm)
93+
end
94+
res = DataTable(cols, vcat(names(joiner.dtl), names(dtr_noon)))
8795

8896
if length(rightonly_ixs.join) > 0
8997
# some left rows are nulls, so the values of the "on" columns
9098
# need to be taken from the right
9199
for (on_col_ix, on_col) in enumerate(joiner.on_cols)
92100
# fix the result of the rightjoin by taking the nonnull values from the right table
93-
res[on_col][rightonly_ixs.join] = joiner.dtr_on[rightonly_ixs.orig, on_col_ix]
101+
offset = nrow - length(rightonly_ixs.orig)
102+
fillcolumn!(res[on_col], joiner.dtr_on[on_col_ix], rightonly_ixs.orig, offset)
94103
end
95104
end
96105
return res
97106
end
98107

108+
function fillcolumn!{T1, T2}(dtcol::AbstractVector{T1}, refcol::AbstractVector{T2},
109+
indices::Vector{Int}, offset::Int=0)
110+
@inbounds for (j, k) in enumerate(indices)
111+
dtcol[j+offset] = refcol[k]
112+
end
113+
end
114+
99115
# map the indices of the left and right joined tables
100116
# to the indices of the rows in the resulting table
101117
# if `nothing` is given, the corresponding map is not built
@@ -210,7 +226,8 @@ join(dt1::AbstractDataTable,
210226
- `:cross` : a full Cartesian product of the key combinations; every
211227
row of `dt1` is matched with every row of `dt2`
212228
213-
Null values are filled in where needed to complete joins.
229+
For the three join operations that may introduce missing values (`:outer`, `:left`,
230+
and `:right`), all columns of the returned data table will be nullable.
214231
215232
### Result
216233
@@ -246,22 +263,21 @@ function Base.join(dt1::AbstractDataTable,
246263
joiner = DataTableJoiner(dt1, dt2, on)
247264

248265
if kind == :inner
249-
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
250-
group_rows(joiner.dtr_on),
251-
true, false, true, false)...)
266+
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
267+
group_rows(joiner.dtr_on),
268+
true, false, true, false)...)
252269
elseif kind == :left
253-
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
254-
group_rows(joiner.dtr_on),
255-
true, true, true, false)...)
270+
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
271+
group_rows(joiner.dtr_on),
272+
true, true, true, false)...)
256273
elseif kind == :right
257-
right_ixs, rightonly_ixs, left_ixs, leftonly_ixs = update_row_maps!(joiner.dtr_on, joiner.dtl_on,
258-
group_rows(joiner.dtl_on),
259-
true, true, true, false)
260-
compose_joined_table(joiner, left_ixs, leftonly_ixs, right_ixs, rightonly_ixs)
274+
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtr_on, joiner.dtl_on,
275+
group_rows(joiner.dtl_on),
276+
true, true, true, false)[[3, 4, 1, 2]]...)
261277
elseif kind == :outer
262-
compose_joined_table(joiner, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
263-
group_rows(joiner.dtr_on),
264-
true, true, true, true)...)
278+
compose_joined_table(joiner, kind, update_row_maps!(joiner.dtl_on, joiner.dtr_on,
279+
group_rows(joiner.dtr_on),
280+
true, true, true, true)...)
265281
elseif kind == :semi
266282
# hash the right rows
267283
dtr_on_grp = group_rows(joiner.dtr_on)

0 commit comments

Comments
 (0)