Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP use Tables.Columns instead of columntable #247

Draft
wants to merge 8 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ShiftedArrays = "1277b4bf-5013-50f5-be3d-901d8477a67a"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c"
TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Expand All @@ -21,7 +22,7 @@ DataStructures = "0.17, 0.18"
ShiftedArrays = "1"
StatsBase = "0.33.5"
StatsFuns = "0.9"
Tables = "0.2, 1"
Tables = "1.6"
julia = "1"

[extras]
Expand Down
3 changes: 2 additions & 1 deletion src/StatsModels.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module StatsModels

using Tables
using TableOperations
using StatsBase
using ShiftedArrays
using ShiftedArrays: lag, lead
Expand All @@ -13,7 +14,7 @@ using StatsFuns: chisqccdf
using SparseArrays
using LinearAlgebra

using Tables: ColumnTable
using Tables: ColumnTable, Columns, getcolumn

export
#re-export from StatsBase:
Expand Down
2 changes: 1 addition & 1 deletion src/contrasts.jl
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ function ContrastsMatrix(contrasts::C, levels::AbstractVector{T}) where {C<:Abst

mat = contrasts_matrix(contrasts, baseind, n)

ContrastsMatrix(mat, tnames, c_levels, contrasts)
ContrastsMatrix(mat, Vector(tnames), Vector(c_levels), contrasts)
end

ContrastsMatrix(c::Type{<:AbstractContrasts}, levels::AbstractVector) =
Expand Down
39 changes: 8 additions & 31 deletions src/modelframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,47 +41,24 @@ mutable struct ModelFrame{D,M}
model::Type{M}
end



## copied from DataFrames:
function _nonmissing!(res, col)
# workaround until JuliaLang/julia#21256 is fixed
eltype(col) >: Missing || return
res .&= .!ismissing.(col)
function missing_omit(data, formula::AbstractTerm)
cols = termvars(formula)
materialize = Tables.materializer(data)
data = materialize(TableOperations.select(cols...)(data))
drop = TableOperations.narrowtypes() ∘ TableOperations.dropmissing()
Copy link
Member

@nalimilan nalimilan Jan 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAICT TableOperations.dropmissing operates row-wise (it calls filter). I'm afraid this is going to kill performance for data frames.

Maybe an optimized method for column tables could be added? (EDIT: That's probably doable, as we can use a faster approach than filter since we know that the condition can be computed separately for each row.) Another solution would be to define dropmissing in DataAPI, say that dropmissing(::Any) is owned by TableOperations, but have dropmissing(::DataFrame) be defined in DataFrames.

Also, narrowtypes is a much more costly operation that just doing nonmissingtype(eltype(col)) as it requires going over all entries. DataFrames's dropmissing does that by default, maybe TableOperations could take a similar argument.

return materialize(drop(data))
end


function missing_omit(d::T) where T<:ColumnTable
nonmissings = trues(length(first(d)))
for col in d
_nonmissing!(nonmissings, col)
end

rows = findall(nonmissings)
d_nonmissing =
NamedTuple{Tables.names(T)}(tuple((copyto!(similar(col,
Base.nonmissingtype(eltype(col)),
length(rows)),
view(col, rows)) for col in d)...))
d_nonmissing, nonmissings
end

missing_omit(data::T, formula::AbstractTerm) where T<:ColumnTable =
missing_omit(NamedTuple{tuple(termvars(formula)...)}(data))

function ModelFrame(f::FormulaTerm, data::ColumnTable;
function ModelFrame(f::FormulaTerm, data;
model::Type{M}=StatisticalModel, contrasts=Dict{Symbol,Any}()) where M
data, _ = missing_omit(data, f)
data = missing_omit(data, f)

sch = schema(f, data, contrasts)
f = apply_schema(f, sch, M)

ModelFrame(f, sch, data, model)
end

ModelFrame(f::FormulaTerm, data; model=StatisticalModel, contrasts=Dict{Symbol,Any}()) =
ModelFrame(f, columntable(data); model=model, contrasts=contrasts)

StatsBase.modelmatrix(f::FormulaTerm, data; kwargs...) = modelmatrix(f.rhs, data; kwargs...)

"""
Expand Down
26 changes: 13 additions & 13 deletions src/schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -112,16 +112,16 @@ julia> sch[term(:y)]
y(continuous)
```
"""
schema(data, hints=Dict{Symbol,Any}()) = schema(columntable(data), hints)
schema(dt::D, hints=Dict{Symbol,Any}()) where {D<:ColumnTable} =
schema(Term.(collect(fieldnames(D))), dt, hints)
schema(ts::AbstractVector{<:AbstractTerm}, data, hints::Dict{Symbol}) =
schema(ts, columntable(data), hints)
schema(data, hints=Dict{Symbol,Any}()) =
schema(Term.(collect(Tables.columnnames(data))), data, hints)

# handle hints:
schema(ts::AbstractVector{<:AbstractTerm}, dt::ColumnTable,
hints::Dict{Symbol}=Dict{Symbol,Any}()) =
sch = Schema(t=>concrete_term(t, dt, hints) for t in ts)
function schema(ts::AbstractVector{<:AbstractTerm},
data,
hints::Dict{Symbol}=Dict{Symbol,Any}())
data = Tables.Columns(Tables.columns(data))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the advantage of wrapping the result of Tables.columns in a Tables.Columns object?

sch = Schema(t=>concrete_term(t, data, hints) for t in ts)
end

schema(f::TermOrTerms, data, hints::Dict{Symbol}) =
schema(filter(needs_schema, terms(f)), data, hints)
Expand Down Expand Up @@ -168,15 +168,15 @@ a(continuous)
"""
concrete_term(t::Term, d, hints::Dict{Symbol}) =
concrete_term(t, d, get(hints, t.sym, nothing))
concrete_term(t::Term, dt::ColumnTable, hint) =
concrete_term(t, getproperty(dt, t.sym), hint)
concrete_term(t::Term, dt::ColumnTable, hints::Dict{Symbol}) =
concrete_term(t, getproperty(dt, t.sym), get(hints, t.sym, nothing))
concrete_term(t::Term, d, hint) =
concrete_term(t, getcolumn(d, t.sym), hint)
concrete_term(t::Term, d, hints::Dict{Symbol}) =
concrete_term(t, getcolumn(d, t.sym), get(hints, t.sym, nothing))
concrete_term(t::Term, d) = concrete_term(t, d, nothing)

# if the "hint" is already an AbstractTerm, use that
# need this specified to avoid ambiguity
concrete_term(t::Term, d::ColumnTable, hint::AbstractTerm) = hint
concrete_term(t::Term, d::Tables.Columns, hint::AbstractTerm) = hint
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just

Suggested change
concrete_term(t::Term, d::Tables.Columns, hint::AbstractTerm) = hint
concrete_term(t::Term, d, hint::AbstractTerm) = hint

concrete_term(t::Term, x, hint::AbstractTerm) = hint

# second possible fix for #97
Expand Down
4 changes: 2 additions & 2 deletions src/statsmodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ for (modeltype, dfmodeltype) in ((:StatisticalModel, TableStatisticalModel),
kwargs...) where T<:$modeltype

Tables.istable(data) || throw(ArgumentError("expected data in a Table, got $(typeof(data))"))
cols = columntable(data)
cols = Tables.Columns(data)

mf = ModelFrame(f, cols, model=T, contrasts=contrasts)
mm = ModelMatrix(mf)
Expand Down Expand Up @@ -172,7 +172,7 @@ function StatsBase.predict(mm::TableRegressionModel, data; kwargs...)
throw(ArgumentError("expected data in a Table, got $(typeof(data))"))

f = mm.mf.f
cols, nonmissings = missing_omit(columntable(data), f.rhs)
cols, nonmissings = missing_omit(Tables.Columns(data), f.rhs)
new_x = modelcols(f.rhs, cols)
y_pred = predict(mm.model, reshape(new_x, size(new_x, 1), :);
kwargs...)
Expand Down
2 changes: 1 addition & 1 deletion src/temporal_terms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ end
ShiftedArrays.lead(t::T, n=1) where {T<:AbstractTerm} = LeadLagTerm{T,typeof(lead)}(t, n)
ShiftedArrays.lag(t::T, n=1) where {T<:AbstractTerm} = LeadLagTerm{T,typeof(lag)}(t, n)

function modelcols(ll::LeadLagTerm{<:Any, F}, d::Tables.ColumnTable) where F
function modelcols(ll::LeadLagTerm{<:Any, F}, d::Columns) where F
original_cols = modelcols(ll.term, d)
return F.instance(original_cols, ll.nsteps)
end
Expand Down
50 changes: 25 additions & 25 deletions src/terms.jl
Original file line number Diff line number Diff line change
Expand Up @@ -430,9 +430,9 @@ function modelcols(t, d::D) where D
## like modelcols(::Any, ::NamedTuple) or modelcols(::AbstractTerm, ::NamedTuple)
## but that causes ambiguity errors or under-constrained modelcols methods for
## custom term types...
d isa NamedTuple && throw(ArgumentError("don't know how to generate modelcols for " *
"term $t. Did you forget to call apply_schema?"))
modelcols(t, columntable(d))
d isa Tables.Columns && throw(ArgumentError("don't know how to generate modelcols for " *
"term $t. Did you forget to call apply_schema?"))
modelcols(t, Tables.Columns(d))
end

"""
Expand Down Expand Up @@ -481,19 +481,19 @@ julia> modelcols(MatrixTerm(ts), d)
9.0 0.05079 0.0 1.0
```
"""
modelcols(ts::TupleTerm, d::NamedTuple) = modelcols.(ts, Ref(d))
modelcols(ts::TupleTerm, d) = modelcols.(ts, Ref(d))

modelcols(t::Term, d::NamedTuple) =
modelcols(t::Term, d) =
throw(ArgumentError("can't generate modelcols for un-typed term $t. " *
"Use apply_schema to create concrete terms first"))

# TODO: @generated to unroll the getfield stuff
modelcols(ft::FunctionTerm{Fo,Fa,Names}, d::NamedTuple) where {Fo,Fa,Names} =
ft.fanon.(getfield.(Ref(d), Names)...)
modelcols(ft::FunctionTerm{Fo,Fa,Names}, d) where {Fo,Fa,Names} =
ft.fanon.(getcolumn.(Ref(d), Names)...)

modelcols(t::ContinuousTerm, d::NamedTuple) = copy.(d[t.sym])
modelcols(t::ContinuousTerm, d) = copy.(getcolumn(d, t.sym))

modelcols(t::CategoricalTerm, d::NamedTuple) = t.contrasts[d[t.sym], :]
modelcols(t::CategoricalTerm, d) = t.contrasts[getcolumn(d, t.sym), :]


"""
Expand Down Expand Up @@ -523,28 +523,28 @@ function row_kron_insideout(op::Function, args...)
reshape(broadcast(op, args...), rows, :)
end

# two options here: either special-case ColumnTable (named tuple of vectors)
# vs. vanilla NamedTuple, or reshape and use normal broadcasting
modelcols(t::InteractionTerm, d::NamedTuple) =
kron_insideout(*, (modelcols(term, d) for term in t.terms)...)

function modelcols(t::InteractionTerm, d::ColumnTable)
row_kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
function modelcols(t::InteractionTerm, d)
if Tables.istable(d)
return row_kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
else
return kron_insideout(*, (modelcols(term, d) for term in t.terms)...)
end
end

modelcols(t::InterceptTerm{true}, d::NamedTuple) = ones(size(first(d)))
modelcols(t::InterceptTerm{false}, d) = Matrix{Float64}(undef, size(first(d),1), 0)
modelcols(t::InterceptTerm{true}, d) = ones(size(Tables.getcolumn(d, 1), 1))
modelcols(t::InterceptTerm{false}, d) = Matrix{Float64}(undef, size(Tables.getcolumn(d, 1), 1), 0)

modelcols(t::FormulaTerm, d::NamedTuple) = (modelcols(t.lhs,d), modelcols(t.rhs, d))
modelcols(t::FormulaTerm, d) = (modelcols(t.lhs,d), modelcols(t.rhs, d))

function modelcols(t::MatrixTerm, d::ColumnTable)
mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms])
reshape(mat, size(mat, 1), :)
function modelcols(t::MatrixTerm, d)
if Tables.istable(d)
mat = reduce(hcat, [modelcols(tt, d) for tt in t.terms])
return reshape(mat, size(mat, 1), :)
else # single row
return reduce(vcat, [modelcols(tt, d) for tt in t.terms])
end
end

modelcols(t::MatrixTerm, d::NamedTuple) =
reduce(vcat, [modelcols(tt, d) for tt in t.terms])

vectorize(x::Tuple) = collect(x)
vectorize(x::AbstractVector) = x
vectorize(x) = [x]
Expand Down
2 changes: 1 addition & 1 deletion test/extension.jl
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ PolyTerm(t::Term, deg::ConstantTerm) = PolyTerm(t.sym, deg.n)
StatsModels.apply_schema(t::FunctionTerm{typeof(poly)}, sch, ::Type{<:PolyModel}) =
PolyTerm(t.args_parsed...)

StatsModels.modelcols(p::PolyTerm, d::NamedTuple) =
StatsModels.modelcols(p::PolyTerm, d) =
reduce(hcat, [d[p.term].^n for n in 1:p.deg])

struct NonMatrixTerm{T} <: AbstractTerm
Expand Down
3 changes: 1 addition & 2 deletions test/modelmatrix.jl
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@
mf = ModelFrame(@formula(y ~ 0 + x), d)
X = ModelMatrix(mf).m
X[1] = 0.0
@test mf.data[:x][1] === 1.0
@test mf.data[1, :x] === 1.0

# Ensure string columns are supported
d1 = DataFrame(A = 1:4, B = categorical(["M", "F", "F", "M"]))
Expand All @@ -345,7 +345,6 @@
z = repeat([:e, :f], inner = 4))

f = apply_schema(@formula(r ~ 1 + w*x*y*z), schema(d))
modelmatrix(f, d)
@test reduce(vcat, last.(modelcols.(Ref(f), Tables.rowtable(d)))') == modelmatrix(f,d)
end

Expand Down