Specialize row_group_slots() and findrow() on column types to improve performance

nalimilan · nalimilan · commit f5007b52b6e4 · 2017-08-07T14:49:43.000+02:00
Looping over columns is very slow when their types are unknown at compile time.
Specialize the method on the types of the key (grouping) columns by passing
a tuple of columns rather than a DataTable. This will force compiling a specific
method for each combination of key types, but their number should remain relatively low
and the one-time cost is worth it.

This dramatically improves performance of groupby(), but does not have a large
effect on join() since it is very inefficient in other areas.

Also add a return type assertion for rowhash(). The fact that the types of
the columns aren't known at compile time appears to confuse inference,
which isn't able to detect that this function always returns UInt.
This greatly reduces the number of allocations when calling join(),
but has little effect on its overall performance.
diff --git a/src/abstractdatatable/join.jl b/src/abstractdatatable/join.jl
@@ -141,9 +141,11 @@ function update_row_maps!(left_table::AbstractDataTable,
     @inline update!(mask::Vector{Bool}, orig_ixs::AbstractArray) = (mask[orig_ixs] = false)
 
     # iterate over left rows and compose the left<->right index map
+    right_dict_cols = ntuple(i -> right_dict.dt[i], ncol(right_dict.dt))
+    left_table_cols = ntuple(i -> left_table[i], ncol(left_table))
     next_join_ix = 1
     for l_ix in 1:nrow(left_table)
-        r_ixs = findrows(right_dict, left_table, l_ix)
+        r_ixs = findrows(right_dict, left_table, right_dict_cols, left_table_cols, l_ix)
         if isempty(r_ixs)
             update!(leftonly_ixs, l_ix, next_join_ix)
             next_join_ix += 1
@@ -284,8 +286,10 @@ function Base.join(dt1::AbstractDataTable,
         # iterate over left rows and leave those found in right
         left_ixs = Vector{Int}()
         sizehint!(left_ixs, nrow(joiner.dtl))
+        dtr_on_grp_cols = ntuple(i -> dtr_on_grp.dt[i], ncol(dtr_on_grp.dt))
+        dtl_on_cols = ntuple(i -> joiner.dtl_on[i], ncol(joiner.dtl_on))
         @inbounds for l_ix in 1:nrow(joiner.dtl_on)
-            if findrow(dtr_on_grp, joiner.dtl_on, l_ix) != 0
+            if findrow(dtr_on_grp, joiner.dtl_on, dtr_on_grp_cols, dtl_on_cols, l_ix) != 0
                 push!(left_ixs, l_ix)
             end
         end
@@ -296,8 +300,10 @@ function Base.join(dt1::AbstractDataTable,
         # iterate over left rows and leave those not found in right
         leftonly_ixs = Vector{Int}()
         sizehint!(leftonly_ixs, nrow(joiner.dtl))
+        dtr_on_grp_cols = ntuple(i -> dtr_on_grp.dt[i], ncol(dtr_on_grp.dt))
+        dtl_on_cols = ntuple(i -> joiner.dtl_on[i], ncol(joiner.dtl_on))
         @inbounds for l_ix in 1:nrow(joiner.dtl_on)
-            if findrow(dtr_on_grp, joiner.dtl_on, l_ix) == 0
+            if findrow(dtr_on_grp, joiner.dtl_on, dtr_on_grp_cols, dtl_on_cols, l_ix) == 0
                 push!(leftonly_ixs, l_ix)
             end
         end
diff --git a/src/datatablerow/datatablerow.jl b/src/datatablerow/datatablerow.jl
@@ -52,14 +52,16 @@ end
 
 # hash of DataTable rows based on its values
 # so that duplicate rows would have the same hash
-function rowhash(dt::DataTable, r::Int, h::UInt = zero(UInt))
-    @inbounds for col in columns(dt)
-        h = hash_colel(col, r, h)
-    end
-    return h
+# table columns are passed as a tuple of vectors to ensure type specialization
+rowhash(cols::Tuple{AbstractVector}, r::Int, h::UInt = zero(UInt))::UInt =
+    hash_colel(cols[1], r, h)
+function rowhash(cols::Tuple{Vararg{AbstractVector}}, r::Int, h::UInt = zero(UInt))::UInt
+    h = hash_colel(cols[1], r, h)
+    rowhash(Base.tail(cols), r, h)
 end
 
-Base.hash(r::DataTableRow, h::UInt = zero(UInt)) = rowhash(r.dt, r.row, h)
+Base.hash(r::DataTableRow, h::UInt = zero(UInt)) =
+    rowhash(ntuple(i -> r.dt[i], ncol(r.dt)), r.row, h)
 
 # comparison of DataTable rows
 # only the rows of the same DataTable could be compared
@@ -81,6 +83,19 @@ isequal_colel(a::Nullable, b::Any) = !isnull(a) & isequal(unsafe_get(a), b)
 isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a)
 isequal_colel(a::Nullable, b::Nullable) = isequal(a, b)
 
+# table columns are passed as a tuple of vectors to ensure type specialization
+isequal_row(cols::Tuple{AbstractVector}, r1::Int, r2::Int) =
+    isequal_colel(cols[1][r1], cols[1][r2])
+isequal_row(cols::Tuple{Vararg{AbstractVector}}, r1::Int, r2::Int) =
+    isequal_colel(cols[1][r1], cols[1][r2]) && isequal_row(Base.tail(cols), r1, r2)
+
+isequal_row(cols1::Tuple{AbstractVector}, r1::Int, cols2::Tuple{AbstractVector}, r2::Int) =
+    isequal_colel(cols1[1][r1], cols2[1][r2])
+isequal_row(cols1::Tuple{Vararg{AbstractVector}}, r1::Int,
+            cols2::Tuple{Vararg{AbstractVector}}, r2::Int) =
+    isequal_colel(cols1[1][r1], cols2[1][r2]) &&
+        isequal_row(Base.tail(cols1), r1, Base.tail(cols2), r2)
+
 function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int)
     if dt1 === dt2
         if r1 == r2
diff --git a/src/datatablerow/utils.jl b/src/datatablerow/utils.jl
@@ -80,10 +80,13 @@ end
 # 3) slot array for a hash map, non-zero values are
 #    the indices of the first row in a group
 # Optional group vector is set to the group indices of each row
-function row_group_slots(dt::AbstractDataTable,
+row_group_slots(dt::AbstractDataTable, groups::Union{Vector{Int}, Void} = nothing) =
+    row_group_slots(ntuple(i -> dt[i], ncol(dt)), hashrows(dt), groups)
+
+function row_group_slots(cols::Tuple{Vararg{AbstractVector}},
+                         rhashes::AbstractVector{UInt},
                          groups::Union{Vector{Int}, Void} = nothing)
-    @assert groups === nothing || length(groups) == nrow(dt)
-    rhashes = hashrows(dt)
+    @assert groups === nothing || length(groups) == length(cols[1])
     # inspired by Dict code from base cf. https://github.com/JuliaData/DataTables.jl/pull/17#discussion_r102481481
     sz = Base._tablesz(length(rhashes))
     @assert sz >= length(rhashes)
@@ -102,17 +105,10 @@ function row_group_slots(dt::AbstractDataTable,
                 gix = ngroups += 1
                 break
             elseif rhashes[i] == rhashes[g_row] # occupied slot, check if miss or hit
-                eq = true
-                for col in columns(dt)
-                    if !isequal_colel(col, i, g_row)
-                        eq = false # miss
-                        break
-                    end
-                end
-                if eq # hit
+                if isequal_row(cols, i, g_row) # hit; a miss (hash collision) keeps probing
                     gix = groups !== nothing ? groups[g_row] : 0
                     break
                 end
             end
             slotix = slotix & szm1 + 1 # check the next slot
             probe += 1
@@ -158,17 +154,21 @@ function group_rows(dt::AbstractDataTable)
 end
 
 # Find index of a row in gd that matches given row by content, 0 if not found
-function findrow(gd::RowGroupDict, dt::DataTable, row::Int)
+function findrow(gd::RowGroupDict,
+                 dt::DataTable,
+                 gd_cols::Tuple{Vararg{AbstractVector}},
+                 dt_cols::Tuple{Vararg{AbstractVector}},
+                 row::Int)
     (gd.dt === dt) && return row # same table, return itself
     # different tables, content matching required
-    rhash = rowhash(dt, row)
+    rhash = rowhash(dt_cols, row)
     szm1 = length(gd.gslots)-1
     slotix = ini_slotix = rhash & szm1 + 1
     while true
         g_row = gd.gslots[slotix]
         if g_row == 0 || # not found
             (rhash == gd.rhashes[g_row] &&
-            isequal_row(gd.dt, g_row, dt, row)) # found
+            isequal_row(gd_cols, g_row, dt_cols, row)) # found
             return g_row
         end
         slotix = (slotix & szm1) + 1 # miss, try the next slot
@@ -179,15 +179,20 @@ end
 
 # Find indices of rows in 'gd' that match given row by content.
 # return empty set if no row matches
-function findrows(gd::RowGroupDict, dt::DataTable, row::Int)
-    g_row = findrow(gd, dt, row)
+function findrows(gd::RowGroupDict,
+                  dt::DataTable,
+                  gd_cols::Tuple{Vararg{AbstractVector}},
+                  dt_cols::Tuple{Vararg{AbstractVector}},
+                  row::Int)
+    g_row = findrow(gd, dt, gd_cols, dt_cols, row)
     (g_row == 0) && return view(gd.rperm, 0:-1)
     gix = gd.groups[g_row]
     return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
 end
 
 function Base.getindex(gd::RowGroupDict, dtr::DataTableRow)
-    g_row = findrow(gd, dtr.dt, dtr.row)
+    g_row = findrow(gd, dtr.dt, ntuple(i -> gd.dt[i], ncol(gd.dt)),
+                    ntuple(i -> dtr.dt[i], ncol(dtr.dt)), dtr.row)
     (g_row == 0) && throw(KeyError(dtr))
     gix = gd.groups[g_row]
     return view(gd.rperm, gd.starts[gix]:gd.stops[gix])
diff --git a/test/datatablerow.jl b/test/datatablerow.jl
@@ -67,8 +67,7 @@ module TestDataTableRow
     # getting groups for the rows of the other frames
     @test length(gd[DataTableRow(dt6, 1)]) > 0
     @test_throws KeyError gd[DataTableRow(dt6, 2)]
-    @test isempty(DataTables.findrows(gd, dt6, 2))
-    @test length(DataTables.findrows(gd, dt6, 2)) == 0
+    @test isempty(DataTables.findrows(gd, dt6, (gd.dt[1],), (dt6[1],), 2))
 
     # grouping empty frame
     gd = DataTables.group_rows(DataTable(x=Int[]))