From e6a5365c3938e992526d67c1178a53de1b247b97 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 13 Sep 2021 08:52:29 -0400 Subject: [PATCH 1/8] WIP replace ColumnTable with Columns --- Project.toml | 1 + src/schema.jl | 18 +++++++++--------- src/terms.jl | 6 +++--- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/Project.toml b/Project.toml index 0ba99964..1d97b60c 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" +TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] diff --git a/src/schema.jl b/src/schema.jl index 7245793a..21d55a87 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -112,15 +112,15 @@ julia> sch[term(:y)] y(continuous) ``` """ -schema(data, hints=Dict{Symbol,Any}()) = schema(columntable(data), hints) -schema(dt::D, hints=Dict{Symbol,Any}()) where {D<:ColumnTable} = - schema(Term.(collect(fieldnames(D))), dt, hints) +schema(data, hints=Dict{Symbol,Any}()) = schema(Tables.Columns(data), hints) +schema(dt::Tables.Columns, hints=Dict{Symbol,Any}()) = + schema(Term.(collect(Tables.columnnames(dt))), dt, hints) schema(ts::AbstractVector{<:AbstractTerm}, data, hints::Dict{Symbol}) = - schema(ts, columntable(data), hints) + schema(ts, Tables.Columns(data), hints) # handle hints: -schema(ts::AbstractVector{<:AbstractTerm}, dt::ColumnTable, - hints::Dict{Symbol}=Dict{Symbol,Any}()) = +schema(ts::AbstractVector{<:AbstractTerm}, dt::Tables.Columns, + hints::Dict{Symbol}=Dict{Symbol,Any}()) = sch = Schema(t=>concrete_term(t, dt, hints) for t in ts) schema(f::TermOrTerms, data, hints::Dict{Symbol}) = @@ -168,15 +168,15 @@ a(continuous) """ concrete_term(t::Term, d, hints::Dict{Symbol}) = concrete_term(t, d, get(hints, t.sym, nothing)) -concrete_term(t::Term, dt::ColumnTable, hint) = +concrete_term(t::Term, dt::Tables.Columns, hint) = concrete_term(t, getproperty(dt, t.sym), hint) -concrete_term(t::Term, dt::ColumnTable, hints::Dict{Symbol}) = +concrete_term(t::Term, dt::Tables.Columns, hints::Dict{Symbol}) = concrete_term(t, getproperty(dt, t.sym), get(hints, t.sym, nothing)) concrete_term(t::Term, d) = concrete_term(t, d, nothing) # if the "hint" is already an AbstractTerm, use that # need this specified to avoid ambiguity -concrete_term(t::Term, d::ColumnTable, hint::AbstractTerm) = hint +concrete_term(t::Term, d::Tables.Columns, hint::AbstractTerm) = hint concrete_term(t::Term, x, hint::AbstractTerm) = hint # second possible fix for #97 diff --git a/src/terms.jl b/src/terms.jl index b49a9027..5628b042 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -430,9 +430,9 @@ function modelcols(t, d::D) where D ## like modelcols(::Any, ::NamedTuple) or modelcols(::AbstractTerm, ::NamedTuple) ## but that causes ambiguity errors or under-constrained modelcols methods for ## custom term types... - d isa NamedTuple && throw(ArgumentError("don't know how to generate modelcols for " * - "term $t. Did you forget to call apply_schema?")) - modelcols(t, columntable(d)) + d isa Tables.Columns && throw(ArgumentError("don't know how to generate modelcols for " * + "term $t. Did you forget to call apply_schema?")) + modelcols(t, Tables.Columns(d)) end """ From d7d1283ef69eb9c8049cf40120fb24ebdf9b2c25 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Fri, 17 Sep 2021 23:45:06 -0400 Subject: [PATCH 2/8] WIP --- src/StatsModels.jl | 3 ++- src/modelframe.jl | 39 ++++++++---------------------------- src/schema.jl | 24 +++++++++++----------- src/temporal_terms.jl | 2 +- src/terms.jl | 46 +++++++++++++++++++++---------------------- 5 files changed, 46 insertions(+), 68 deletions(-) diff --git a/src/StatsModels.jl b/src/StatsModels.jl index 7c6f2bf1..b076d6e1 100644 --- a/src/StatsModels.jl +++ b/src/StatsModels.jl @@ -1,6 +1,7 @@ module StatsModels using Tables +using TableOperations using StatsBase using ShiftedArrays using ShiftedArrays: lag, lead @@ -13,7 +14,7 @@ using StatsFuns: chisqccdf using SparseArrays using LinearAlgebra -using Tables: ColumnTable +using Tables: ColumnTable, Columns, getcolumn export #re-export from StatsBase: diff --git a/src/modelframe.jl b/src/modelframe.jl index cf31d122..8844f188 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -41,37 +41,17 @@ mutable struct ModelFrame{D,M} model::Type{M} end - - -## copied from DataFrames: -function _nonmissing!(res, col) - # workaround until JuliaLang/julia#21256 is fixed - eltype(col) >: Missing || return - res .&= .!ismissing.(col) +function missing_omit(data, formula::AbstractTerm) + cols = termvars(formula) + sel = TableOperations.select(cols...) + drop = TableOperations.narrowtypes() ∘ TableOperations.dropmissing() + materialize = Tables.materializer(data) + return materialize(drop(sel(data))) end - -function missing_omit(d::T) where T<:ColumnTable - nonmissings = trues(length(first(d))) - for col in d - _nonmissing!(nonmissings, col) - end - - rows = findall(nonmissings) - d_nonmissing = - NamedTuple{Tables.names(T)}(tuple((copyto!(similar(col, - Base.nonmissingtype(eltype(col)), - length(rows)), - view(col, rows)) for col in d)...)) - d_nonmissing, nonmissings -end - -missing_omit(data::T, formula::AbstractTerm) where T<:ColumnTable = - missing_omit(NamedTuple{tuple(termvars(formula)...)}(data)) - -function ModelFrame(f::FormulaTerm, data::ColumnTable; +function ModelFrame(f::FormulaTerm, data; model::Type{M}=StatisticalModel, contrasts=Dict{Symbol,Any}()) where M - data, _ = missing_omit(data, f) + data = missing_omit(data, f) sch = schema(f, data, contrasts) f = apply_schema(f, sch, M) @@ -79,9 +59,6 @@ function ModelFrame(f::FormulaTerm, data::ColumnTable; ModelFrame(f, sch, data, model) end -ModelFrame(f::FormulaTerm, data; model=StatisticalModel, contrasts=Dict{Symbol,Any}()) = - ModelFrame(f, columntable(data); model=model, contrasts=contrasts) - StatsBase.modelmatrix(f::FormulaTerm, data; kwargs...) = modelmatrix(f.rhs, data; kwargs...) """ diff --git a/src/schema.jl b/src/schema.jl index 21d55a87..fc5a7387 100644 --- a/src/schema.jl +++ b/src/schema.jl @@ -112,16 +112,16 @@ julia> sch[term(:y)] y(continuous) ``` """ -schema(data, hints=Dict{Symbol,Any}()) = schema(Tables.Columns(data), hints) -schema(dt::Tables.Columns, hints=Dict{Symbol,Any}()) = - schema(Term.(collect(Tables.columnnames(dt))), dt, hints) -schema(ts::AbstractVector{<:AbstractTerm}, data, hints::Dict{Symbol}) = - schema(ts, Tables.Columns(data), hints) +schema(data, hints=Dict{Symbol,Any}()) = + schema(Term.(collect(Tables.columnnames(data))), data, hints) # handle hints: -schema(ts::AbstractVector{<:AbstractTerm}, dt::Tables.Columns, - hints::Dict{Symbol}=Dict{Symbol,Any}()) = - sch = Schema(t=>concrete_term(t, dt, hints) for t in ts) +function schema(ts::AbstractVector{<:AbstractTerm}, + data, + hints::Dict{Symbol}=Dict{Symbol,Any}()) + data = Tables.Columns(Tables.columns(data)) + sch = Schema(t=>concrete_term(t, data, hints) for t in ts) +end schema(f::TermOrTerms, data, hints::Dict{Symbol}) = schema(filter(needs_schema, terms(f)), data, hints) @@ -168,10 +168,10 @@ a(continuous) """ concrete_term(t::Term, d, hints::Dict{Symbol}) = concrete_term(t, d, get(hints, t.sym, nothing)) -concrete_term(t::Term, dt::Tables.Columns, hint) = - concrete_term(t, getproperty(dt, t.sym), hint) -concrete_term(t::Term, dt::Tables.Columns, hints::Dict{Symbol}) = - concrete_term(t, getproperty(dt, t.sym), get(hints, t.sym, nothing)) +concrete_term(t::Term, d, hint) = + concrete_term(t, getcolumn(d, t.sym), hint) +concrete_term(t::Term, d, hints::Dict{Symbol}) = + concrete_term(t, getcolumn(d, t.sym), get(hints, t.sym, nothing)) concrete_term(t::Term, d) = concrete_term(t, d, nothing) # if the "hint" is already an AbstractTerm, use that diff --git a/src/temporal_terms.jl b/src/temporal_terms.jl index f3fe4e12..1ecde111 100644 --- a/src/temporal_terms.jl +++ b/src/temporal_terms.jl @@ -54,7 +54,7 @@ end ShiftedArrays.lead(t::T, n=1) where {T<:AbstractTerm} = LeadLagTerm{T,typeof(lead)}(t, n) ShiftedArrays.lag(t::T, n=1) where {T<:AbstractTerm} = LeadLagTerm{T,typeof(lag)}(t, n) -function modelcols(ll::LeadLagTerm{<:Any, F}, d::Tables.ColumnTable) where F +function modelcols(ll::LeadLagTerm{<:Any, F}, d::Columns) where F original_cols = modelcols(ll.term, d) return F.instance(original_cols, ll.nsteps) end diff --git a/src/terms.jl b/src/terms.jl index 5628b042..e4f0796f 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -432,7 +432,7 @@ function modelcols(t, d::D) where D ## custom term types... d isa Tables.Columns && throw(ArgumentError("don't know how to generate modelcols for " * "term $t. Did you forget to call apply_schema?")) - modelcols(t, Tables.Columns(d)) + modelcols(t, Tables.Columns(Tables.columns(d))) end """ @@ -481,19 +481,19 @@ julia> modelcols(MatrixTerm(ts), d) 9.0 0.05079 0.0 1.0 ``` """ -modelcols(ts::TupleTerm, d::NamedTuple) = modelcols.(ts, Ref(d)) +modelcols(ts::TupleTerm, d) = modelcols.(ts, Ref(d)) -modelcols(t::Term, d::NamedTuple) = +modelcols(t::Term, d) = throw(ArgumentError("can't generate modelcols for un-typed term $t. " * "Use apply_schema to create concrete terms first")) # TODO: @generated to unroll the getfield stuff -modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::NamedTuple) where {Fo,Fa,Names} = - ft.fanon.(getfield.(Ref(d), Names)...) +modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::Columns) where {Fo,Fa,Names} = + ft.fanon.(getcolumn.(Ref(d), Names)...) -modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym]) +modelcols(t::ContinuousTerm, d::Columns) = copy.(getcolumn(d, t.sym)) -modelcols(t::CategoricalTerm, d::NamedTuple) = t.contrasts[d[t.sym], :] +modelcols(t::CategoricalTerm, d::Columns) = t.contrasts[getcolumn(d, t.sym), :] """ @@ -523,28 +523,28 @@ function row_kron_insideout(op::Function, args...) reshape(broadcast(op, args...), rows, :) end -# two options here: either special-case ColumnTable (named tuple of vectors) -# vs. vanilla NamedTuple, or reshape and use normal broadcasting -modelcols(t::InteractionTerm, d::NamedTuple) = - kron_insideout(*, (modelcols(term, d) for term in t.terms)...) - -function modelcols(t::InteractionTerm, d::ColumnTable) - row_kron_insideout(*, (modelcols(term, d) for term in t.terms)...) +function modelcols(t::InteractionTerm, d) + if Tables.istable(d) + return row_kron_insideout(*, (modelcols(term, d) for term in t.terms)...) + else + return kron_insideout(*, (modelcols(term, d) for term in t.terms)...) + end end -modelcols(t::InterceptTerm{true}, d::NamedTuple) = ones(size(first(d))) -modelcols(t::InterceptTerm{false}, d) = Matrix{Float64}(undef, size(first(d),1), 0) +modelcols(t::InterceptTerm{true}, d::Columns) = ones(size(Tables.getcolumn(d, 1), 1)) +modelcols(t::InterceptTerm{false}, d::Columns) = Matrix{Float64}(undef, size(first(d),1), 0) -modelcols(t::FormulaTerm, d::NamedTuple) = (modelcols(t.lhs,d), modelcols(t.rhs, d)) +modelcols(t::FormulaTerm, d::Columns) = (modelcols(t.lhs,d), modelcols(t.rhs, d)) -function modelcols(t::MatrixTerm, d::ColumnTable) - mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms]) - reshape(mat, size(mat, 1), :) +function modelcols(t::MatrixTerm, d::Columns) + if Tables.istable(d) + mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms]) + return reshape(mat, size(mat, 1), :) + else # single row + return reduce(vcat, [modelcols(tt, d) for tt in t.terms]) + end end -modelcols(t::MatrixTerm, d::NamedTuple) = - reduce(vcat, [modelcols(tt, d) for tt in t.terms]) - vectorize(x::Tuple) = collect(x) vectorize(x::AbstractVector) = x vectorize(x) = [x] From 8a000974de2d87e118a93778c016c96ae217e1a8 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 25 Oct 2021 16:51:39 -0400 Subject: [PATCH 3/8] Tables does this for us --- src/terms.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/terms.jl b/src/terms.jl index e4f0796f..999613a6 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -432,7 +432,7 @@ function modelcols(t, d::D) where D ## custom term types... d isa Tables.Columns && throw(ArgumentError("don't know how to generate modelcols for " * "term $t. Did you forget to call apply_schema?")) - modelcols(t, Tables.Columns(Tables.columns(d))) + modelcols(t, Tables.Columns(d)) end """ From 89adab5095717c01ccc027e699b81007600999e0 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 26 Oct 2021 09:55:42 -0400 Subject: [PATCH 4/8] tables compat for Columns --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 1d97b60c..0935def5 100644 --- a/Project.toml +++ b/Project.toml @@ -22,7 +22,7 @@ DataStructures = "0.17, 0.18" ShiftedArrays = "1" StatsBase = "0.33.5" StatsFuns = "0.9" -Tables = "0.2, 1" +Tables = "1.6" julia = "1" [extras] From 3810c4854eedb5fbe79fc5ff467c1ff359407151 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 26 Oct 2021 11:02:26 -0400 Subject: [PATCH 5/8] use Columns --- src/statsmodel.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/statsmodel.jl b/src/statsmodel.jl index 1d3a03af..0dee3ab3 100644 --- a/src/statsmodel.jl +++ b/src/statsmodel.jl @@ -80,7 +80,7 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, TableStatisticalModel), kwargs...) where T<:$modeltype Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))")) - cols = columntable(data) + cols = Tables.Columns(data) mf = ModelFrame(f, cols, model=T, contrasts=contrasts) mm = ModelMatrix(mf) @@ -172,7 +172,7 @@ function StatsBase.predict(mm::TableRegressionModel, data; kwargs...) throw(ArgumentError("expected data in a Table, got $(typeof(data))")) f = mm.mf.f - cols, nonmissings = missing_omit(columntable(data), f.rhs) + cols, nonmissings = missing_omit(Tables.Columns(data), f.rhs) new_x = modelcols(f.rhs, cols) y_pred = predict(mm.model, reshape(new_x, size(new_x, 1), :); kwargs...) From 8dcbf62789ca10ee1842409d98c091a037c5564b Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 26 Oct 2021 11:02:34 -0400 Subject: [PATCH 6/8] make it say Vector --- src/contrasts.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/contrasts.jl b/src/contrasts.jl index 7a330649..e40be806 100644 --- a/src/contrasts.jl +++ b/src/contrasts.jl @@ -201,7 +201,7 @@ function ContrastsMatrix(contrasts::C, levels::AbstractVector{T}) where {C<:Abst mat = contrasts_matrix(contrasts, baseind, n) - ContrastsMatrix(mat, tnames, c_levels, contrasts) + ContrastsMatrix(mat, Vector(tnames), Vector(c_levels), contrasts) end ContrastsMatrix(c::Type{<:AbstractContrasts}, levels::AbstractVector) = From 6032e6e148d02eddd3c2a2bb0ed9e0c1bfd4c98c Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Tue, 26 Oct 2021 11:06:04 -0400 Subject: [PATCH 7/8] need to matieralize after select before dropmissing --- src/modelframe.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/modelframe.jl b/src/modelframe.jl index 8844f188..5dc7377d 100644 --- a/src/modelframe.jl +++ b/src/modelframe.jl @@ -43,10 +43,10 @@ end function missing_omit(data, formula::AbstractTerm) cols = termvars(formula) - sel = TableOperations.select(cols...) - drop = TableOperations.narrowtypes() ∘ TableOperations.dropmissing() materialize = Tables.materializer(data) - return materialize(drop(sel(data))) + data = materialize(TableOperations.select(cols...)(data)) + drop = TableOperations.narrowtypes() ∘ TableOperations.dropmissing() + return materialize(drop(data)) end function ModelFrame(f::FormulaTerm, data; From 59601d8a27c45d489b6797009cb0ff3a5bdc65a7 Mon Sep 17 00:00:00 2001 From: Dave Kleinschmidt Date: Mon, 14 Feb 2022 10:13:43 -0500 Subject: [PATCH 8/8] do we _really_ need to specialize on Columns? --- src/terms.jl | 14 +++++++------- test/extension.jl | 2 +- test/modelmatrix.jl | 3 +-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/terms.jl b/src/terms.jl index 999613a6..cba7299c 100644 --- a/src/terms.jl +++ b/src/terms.jl @@ -488,12 +488,12 @@ modelcols(t::Term, d) = "Use apply_schema to create concrete terms first")) # TODO: @generated to unroll the getfield stuff -modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::Columns) where {Fo,Fa,Names} = +modelcols(ft::FunctionTerm{Fo,Fa,Names}, d) where {Fo,Fa,Names} = ft.fanon.(getcolumn.(Ref(d), Names)...) -modelcols(t::ContinuousTerm, d::Columns) = copy.(getcolumn(d, t.sym)) +modelcols(t::ContinuousTerm, d) = copy.(getcolumn(d, t.sym)) -modelcols(t::CategoricalTerm, d::Columns) = t.contrasts[getcolumn(d, t.sym), :] +modelcols(t::CategoricalTerm, d) = t.contrasts[getcolumn(d, t.sym), :] """ @@ -531,12 +531,12 @@ function modelcols(t::InteractionTerm, d) end end -modelcols(t::InterceptTerm{true}, d::Columns) = ones(size(Tables.getcolumn(d, 1), 1)) -modelcols(t::InterceptTerm{false}, d::Columns) = Matrix{Float64}(undef, size(first(d),1), 0) +modelcols(t::InterceptTerm{true}, d) = ones(size(Tables.getcolumn(d, 1), 1)) +modelcols(t::InterceptTerm{false}, d) = Matrix{Float64}(undef, size(Tables.getcolumn(d, 1), 1), 0) -modelcols(t::FormulaTerm, d::Columns) = (modelcols(t.lhs,d), modelcols(t.rhs, d)) +modelcols(t::FormulaTerm, d) = (modelcols(t.lhs,d), modelcols(t.rhs, d)) -function modelcols(t::MatrixTerm, d::Columns) +function modelcols(t::MatrixTerm, d) if Tables.istable(d) mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms]) return reshape(mat, size(mat, 1), :) diff --git a/test/extension.jl b/test/extension.jl index 2f056928..2e0cc8f6 100644 --- a/test/extension.jl +++ b/test/extension.jl @@ -12,7 +12,7 @@ PolyTerm(t::Term, deg::ConstantTerm) = PolyTerm(t.sym, deg.n) StatsModels.apply_schema(t::FunctionTerm{typeof(poly)}, sch, ::Type{<:PolyModel}) = PolyTerm(t.args_parsed...) -StatsModels.modelcols(p::PolyTerm, d::NamedTuple) = +StatsModels.modelcols(p::PolyTerm, d) = reduce(hcat, [d[p.term].^n for n in 1:p.deg]) struct NonMatrixTerm{T} <: AbstractTerm diff --git a/test/modelmatrix.jl b/test/modelmatrix.jl index 7b4e073d..0cdc82b2 100644 --- a/test/modelmatrix.jl +++ b/test/modelmatrix.jl @@ -324,7 +324,7 @@ mf = ModelFrame(@formula(y ~ 0 + x), d) X = ModelMatrix(mf).m X[1] = 0.0 - @test mf.data[:x][1] === 1.0 + @test mf.data[1, :x] === 1.0 # Ensure string columns are supported d1 = DataFrame(A = 1:4, B = categorical(["M", "F", "F", "M"])) @@ -345,7 +345,6 @@ z = repeat([:e, :f], inner = 4)) f = apply_schema(@formula(r ~ 1 + w*x*y*z), schema(d)) - modelmatrix(f, d) @test reduce(vcat, last.(modelcols.(Ref(f), Tables.rowtable(d)))') == modelmatrix(f,d) end