JuliaStats · kleinschmidt · Sep 13, 2021 · Sep 18, 2021 · Oct 25, 2021 · Oct 26, 2021
diff --git a/Project.toml b/Project.toml
@@ -11,6 +11,7 @@ ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
+TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
 Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
 
 [compat]
@@ -21,7 +22,7 @@ DataStructures = "0.17, 0.18"
 ShiftedArrays = "1"
 StatsBase = "0.33.5"
 StatsFuns = "0.9"
-Tables = "0.2, 1"
+Tables = "1.6"
 julia = "1"
 
 [extras]

diff --git a/src/StatsModels.jl b/src/StatsModels.jl
@@ -1,6 +1,7 @@
 module StatsModels
 
 using Tables
+using TableOperations
 using StatsBase
 using ShiftedArrays
 using ShiftedArrays: lag, lead
@@ -13,7 +14,7 @@ using StatsFuns: chisqccdf
 using SparseArrays
 using LinearAlgebra
 
-using Tables: ColumnTable
+using Tables: ColumnTable, Columns, getcolumn
 
 export
     #re-export from StatsBase:

diff --git a/src/contrasts.jl b/src/contrasts.jl
@@ -201,7 +201,7 @@ function ContrastsMatrix(contrasts::C, levels::AbstractVector{T}) where {C<:Abst
 
     mat = contrasts_matrix(contrasts, baseind, n)
 
-    ContrastsMatrix(mat, tnames, c_levels, contrasts)
+    ContrastsMatrix(mat, Vector(tnames), Vector(c_levels), contrasts)
 end
 
 ContrastsMatrix(c::Type{<:AbstractContrasts}, levels::AbstractVector) =

diff --git a/src/modelframe.jl b/src/modelframe.jl
@@ -41,47 +41,24 @@ mutable struct ModelFrame{D,M}
     model::Type{M}
 end
 
-
-
-## copied from DataFrames:
-function _nonmissing!(res, col)
-    # workaround until JuliaLang/julia#21256 is fixed
-    eltype(col) >: Missing || return
-    res .&= .!ismissing.(col)
+function missing_omit(data, formula::AbstractTerm)
+    cols = termvars(formula)
+    materialize = Tables.materializer(data)
+    data = materialize(TableOperations.select(cols...)(data))
+    drop = TableOperations.narrowtypes() ∘ TableOperations.dropmissing()
+    return materialize(drop(data))
 end
 
-
-function missing_omit(d::T) where T<:ColumnTable
-    nonmissings = trues(length(first(d)))
-    for col in d
-        _nonmissing!(nonmissings, col)
-    end
-
-    rows = findall(nonmissings)
-    d_nonmissing =
-        NamedTuple{Tables.names(T)}(tuple((copyto!(similar(col,
-                                                           Base.nonmissingtype(eltype(col)),
-                                                           length(rows)),
-                                                   view(col, rows)) for col in d)...))
-    d_nonmissing, nonmissings
-end
-
-missing_omit(data::T, formula::AbstractTerm) where T<:ColumnTable =
-    missing_omit(NamedTuple{tuple(termvars(formula)...)}(data))
-
-function ModelFrame(f::FormulaTerm, data::ColumnTable;
+function ModelFrame(f::FormulaTerm, data;
                     model::Type{M}=StatisticalModel, contrasts=Dict{Symbol,Any}()) where M
-    data, _ = missing_omit(data, f)
+    data = missing_omit(data, f)
 
     sch = schema(f, data, contrasts)
     f = apply_schema(f, sch, M)
 
     ModelFrame(f, sch, data, model)
 end
 
-ModelFrame(f::FormulaTerm, data; model=StatisticalModel, contrasts=Dict{Symbol,Any}()) =
-    ModelFrame(f, columntable(data); model=model, contrasts=contrasts)
-
 StatsBase.modelmatrix(f::FormulaTerm, data; kwargs...) = modelmatrix(f.rhs, data; kwargs...)
 
 """

diff --git a/src/schema.jl b/src/schema.jl
@@ -112,16 +112,16 @@ julia> sch[term(:y)]
 y(continuous)
 ```
 """
-schema(data, hints=Dict{Symbol,Any}()) = schema(columntable(data), hints)
-schema(dt::D, hints=Dict{Symbol,Any}()) where {D<:ColumnTable} =
-    schema(Term.(collect(fieldnames(D))), dt, hints)
-schema(ts::AbstractVector{<:AbstractTerm}, data, hints::Dict{Symbol}) =
-    schema(ts, columntable(data), hints)
+schema(data, hints=Dict{Symbol,Any}()) =
+    schema(Term.(collect(Tables.columnnames(data))), data, hints)
 
 # handle hints:
-schema(ts::AbstractVector{<:AbstractTerm}, dt::ColumnTable,
-      hints::Dict{Symbol}=Dict{Symbol,Any}()) =
-    sch = Schema(t=>concrete_term(t, dt, hints) for t in ts)
+function schema(ts::AbstractVector{<:AbstractTerm},
+                data,
+                hints::Dict{Symbol}=Dict{Symbol,Any}())
+    data = Tables.Columns(Tables.columns(data))
+    sch = Schema(t=>concrete_term(t, data, hints) for t in ts)
+end
 
 schema(f::TermOrTerms, data, hints::Dict{Symbol}) =
     schema(filter(needs_schema, terms(f)), data, hints)
@@ -168,15 +168,15 @@ a(continuous)
 """
 concrete_term(t::Term, d, hints::Dict{Symbol}) =
     concrete_term(t, d, get(hints, t.sym, nothing))
-concrete_term(t::Term, dt::ColumnTable, hint) =
-    concrete_term(t, getproperty(dt, t.sym), hint)
-concrete_term(t::Term, dt::ColumnTable, hints::Dict{Symbol}) =
-    concrete_term(t, getproperty(dt, t.sym), get(hints, t.sym, nothing))
+concrete_term(t::Term, d, hint) =
+    concrete_term(t, getcolumn(d, t.sym), hint)
+concrete_term(t::Term, d, hints::Dict{Symbol}) =
+    concrete_term(t, getcolumn(d, t.sym), get(hints, t.sym, nothing))
 concrete_term(t::Term, d) = concrete_term(t, d, nothing)
 
 # if the "hint" is already an AbstractTerm, use that
 # need this specified to avoid ambiguity
-concrete_term(t::Term, d::ColumnTable, hint::AbstractTerm) = hint
+concrete_term(t::Term, d::Tables.Columns, hint::AbstractTerm) = hint
-concrete_term(t::Term, d::Tables.Columns, hint::AbstractTerm) = hint
+concrete_term(t::Term, d, hint::AbstractTerm) = hint
-concrete_term(t::Term, d::Tables.Columns, hint::AbstractTerm) = hint
+concrete_term(t::Term, d, hint::AbstractTerm) = hint
 concrete_term(t::Term, x, hint::AbstractTerm) = hint
 
 # second possible fix for #97

diff --git a/src/statsmodel.jl b/src/statsmodel.jl
@@ -80,7 +80,7 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, TableStatisticalModel),
                                kwargs...) where T<:$modeltype
 
             Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))"))
-            cols = columntable(data)
+            cols = Tables.Columns(data)
 
             mf = ModelFrame(f, cols, model=T, contrasts=contrasts)
             mm = ModelMatrix(mf)
@@ -172,7 +172,7 @@ function StatsBase.predict(mm::TableRegressionModel, data; kwargs...)
         throw(ArgumentError("expected data in a Table, got $(typeof(data))"))
 
     f = mm.mf.f
-    cols, nonmissings = missing_omit(columntable(data), f.rhs)
+    cols, nonmissings = missing_omit(Tables.Columns(data), f.rhs)
     new_x = modelcols(f.rhs, cols)
     y_pred = predict(mm.model, reshape(new_x, size(new_x, 1), :);
                      kwargs...)

diff --git a/src/temporal_terms.jl b/src/temporal_terms.jl
@@ -54,7 +54,7 @@ end
 ShiftedArrays.lead(t::T, n=1) where {T<:AbstractTerm} = LeadLagTerm{T,typeof(lead)}(t, n)
 ShiftedArrays.lag(t::T, n=1) where {T<:AbstractTerm} = LeadLagTerm{T,typeof(lag)}(t, n)
 
-function modelcols(ll::LeadLagTerm{<:Any, F}, d::Tables.ColumnTable) where F
+function modelcols(ll::LeadLagTerm{<:Any, F}, d::Columns) where F
     original_cols = modelcols(ll.term, d)
     return F.instance(original_cols, ll.nsteps)
 end

diff --git a/src/terms.jl b/src/terms.jl
@@ -430,9 +430,9 @@ function modelcols(t, d::D) where D
     ## like modelcols(::Any, ::NamedTuple) or modelcols(::AbstractTerm, ::NamedTuple)
     ## but that causes ambiguity errors or under-constrained modelcols methods for
     ## custom term types...
-    d isa NamedTuple && throw(ArgumentError("don't know how to generate modelcols for " *
-                                            "term $t. Did you forget to call apply_schema?"))
-    modelcols(t, columntable(d))
+    d isa Tables.Columns && throw(ArgumentError("don't know how to generate modelcols for " *
+                                                "term $t. Did you forget to call apply_schema?"))
+    modelcols(t, Tables.Columns(d))
 end
 
 """
@@ -481,19 +481,19 @@ julia> modelcols(MatrixTerm(ts), d)
  9.0  0.05079    0.0  1.0
 ```
 """
-modelcols(ts::TupleTerm, d::NamedTuple) = modelcols.(ts, Ref(d))
+modelcols(ts::TupleTerm, d) = modelcols.(ts, Ref(d))
 
-modelcols(t::Term, d::NamedTuple) =
+modelcols(t::Term, d) =
     throw(ArgumentError("can't generate modelcols for un-typed term $t. " *
                         "Use apply_schema to create concrete terms first"))
 
 # TODO: @generated to unroll the getfield stuff
-modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::NamedTuple) where {Fo,Fa,Names} =
-    ft.fanon.(getfield.(Ref(d), Names)...)
+modelcols(ft::FunctionTerm{Fo,Fa,Names}, d) where {Fo,Fa,Names} =
+    ft.fanon.(getcolumn.(Ref(d), Names)...)
 
-modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym])
+modelcols(t::ContinuousTerm, d) = copy.(getcolumn(d, t.sym))
 
-modelcols(t::CategoricalTerm, d::NamedTuple) = t.contrasts[d[t.sym], :]
+modelcols(t::CategoricalTerm, d) = t.contrasts[getcolumn(d, t.sym), :]
 
 
 """
@@ -523,28 +523,28 @@ function row_kron_insideout(op::Function, args...)
     reshape(broadcast(op, args...), rows, :)
 end
 
-# two options here: either special-case ColumnTable (named tuple of vectors)
-# vs. vanilla NamedTuple, or reshape and use normal broadcasting
-modelcols(t::InteractionTerm, d::NamedTuple) =
-    kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
-
-function modelcols(t::InteractionTerm, d::ColumnTable)
-    row_kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
+function modelcols(t::InteractionTerm, d)
+    if Tables.istable(d)
+        return row_kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
+    else
+        return kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
+    end
 end
 
-modelcols(t::InterceptTerm{true}, d::NamedTuple) = ones(size(first(d)))
-modelcols(t::InterceptTerm{false}, d) = Matrix{Float64}(undef, size(first(d),1), 0)
+modelcols(t::InterceptTerm{true}, d) = ones(size(Tables.getcolumn(d, 1), 1))
+modelcols(t::InterceptTerm{false}, d) = Matrix{Float64}(undef, size(Tables.getcolumn(d, 1), 1), 0)
 
-modelcols(t::FormulaTerm, d::NamedTuple) = (modelcols(t.lhs,d), modelcols(t.rhs, d))
+modelcols(t::FormulaTerm, d) = (modelcols(t.lhs,d), modelcols(t.rhs, d))
 
-function modelcols(t::MatrixTerm, d::ColumnTable)
-    mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms])
-    reshape(mat, size(mat, 1), :)
+function modelcols(t::MatrixTerm, d)
+    if Tables.istable(d)
+        mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms])
+        return reshape(mat, size(mat, 1), :)
+    else # single row
+        return reduce(vcat, [modelcols(tt, d) for tt in t.terms])
+    end
 end
 
-modelcols(t::MatrixTerm, d::NamedTuple) =
-    reduce(vcat, [modelcols(tt, d) for tt in t.terms])
-
 vectorize(x::Tuple) = collect(x)
 vectorize(x::AbstractVector) = x
 vectorize(x) = [x]

diff --git a/test/extension.jl b/test/extension.jl
@@ -12,7 +12,7 @@ PolyTerm(t::Term, deg::ConstantTerm) = PolyTerm(t.sym, deg.n)
 StatsModels.apply_schema(t::FunctionTerm{typeof(poly)}, sch, ::Type{<:PolyModel}) =
     PolyTerm(t.args_parsed...)
 
-StatsModels.modelcols(p::PolyTerm, d::NamedTuple) =
+StatsModels.modelcols(p::PolyTerm, d) =
     reduce(hcat, [d[p.term].^n for n in 1:p.deg])
 
 struct NonMatrixTerm{T} <: AbstractTerm

diff --git a/test/modelmatrix.jl b/test/modelmatrix.jl
@@ -324,7 +324,7 @@
     mf = ModelFrame(@formula(y ~ 0 + x), d)
     X = ModelMatrix(mf).m
     X[1] = 0.0
-    @test mf.data[:x][1] === 1.0
+    @test mf.data[1, :x] === 1.0
 
     # Ensure string columns are supported
     d1 = DataFrame(A = 1:4, B = categorical(["M", "F", "F", "M"]))
@@ -345,7 +345,6 @@
                       z = repeat([:e, :f], inner = 4))
 
         f = apply_schema(@formula(r ~ 1 + w*x*y*z), schema(d))
-        modelmatrix(f, d)
         @test reduce(vcat, last.(modelcols.(Ref(f), Tables.rowtable(d)))') == modelmatrix(f,d)
     end