From 1d8560549a51206581026a517bcbfbf45fcad885 Mon Sep 17 00:00:00 2001 From: rofinn Date: Tue, 2 Jul 2019 17:56:37 -0500 Subject: [PATCH 01/34] Started work on using Tables API. --- Project.toml | 10 +++++-- src/Impute.jl | 62 ++++++++++++++++++++++++------------------- src/context.jl | 4 +-- src/imputors.jl | 29 +++++++++++--------- src/imputors/chain.jl | 20 +++++++------- src/imputors/drop.jl | 28 ++++++++++--------- test/runtests.jl | 43 +++++++++++++++++++++--------- 7 files changed, 117 insertions(+), 79 deletions(-) diff --git a/Project.toml b/Project.toml index 85a029f..7938c56 100644 --- a/Project.toml +++ b/Project.toml @@ -4,16 +4,22 @@ authors = ["Invenia Technical Computing"] version = "0.2.0" [deps] -DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" +Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" + +[compat] +Tables = "0.2" [compat] DataFrames = "0.17, 0.18" julia = "1" [extras] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["RDatasets", "Test"] +test = ["DataFrames", "RDatasets", "Test"] diff --git a/src/Impute.jl b/src/Impute.jl index 96941a5..a037949 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -1,14 +1,22 @@ module Impute -using DataFrames +using IterTools using Statistics +using Tables: Tables, materializer, columntable, rowtable, istable -import DataFrames: DataFrameRow import Base.Iterators export impute, impute!, chain, chain!, drop, drop!, interp, interp!, ImputeError -const Dataset = Union{AbstractArray, DataFrame} +function __init__() + for sym in [:chain, :chain!, :drop, :drop!, :interp, :interp!] + Base.depwarn( + "`$sym` will no longer be exported in future releases. 
" * + "Please qualify your calls with `Impute.$sym(...).` or explicitly import it.", + sym + ) + end +end """ ImputeError{T} <: Exception @@ -36,39 +44,39 @@ const global imputation_methods = Dict{Symbol, Type}( ) """ - impute!(data::Dataset, method::Symbol=:interp, args...; limit::Float64=0.1) + impute!(data, method::Symbol=:interp, args...; limit::Float64=0.1) Looks up the `Imputor` type for the `method`, creates it and calls -`impute!(imputor::Imputor, data::Dataset, limit::Float64)` with it. +`impute!(imputor::Imputor, data, limit::Float64)` with it. # Arguments -* `data::Dataset`: the datset containing missing elements we should impute. +* `data`: the datset containing missing elements we should impute. * `method::Symbol`: the imputation method to use (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) * `args::Any...`: any arguments you should pass to the `Imputor` constructor. * `limit::Float64`: missing data ratio limit/threshold (default: 0.1) """ -function impute!(data::Dataset, method::Symbol, args...; limit::Float64=0.1) +function impute!(data, method::Symbol, args...; limit::Float64=0.1) imputor_type = imputation_methods[method] imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() return impute!(imputor, data, limit) end """ - impute!(data::Dataset, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) + impute!(data, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call -`impute!(imputor::Imputor, ctx::Context, data::Dataset)` with them. +`impute!(imputor::Imputor, ctx::Context, data)` with them. # Arguments -* `data::Dataset`: the datset containing missing elements we should impute. +* `data`: the datset containing missing elements we should impute. 
* `missing::Function`: the missing data function to use * `method::Symbol`: the imputation method to use (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) * `args::Any...`: any arguments you should pass to the `Imputor` constructor. * `limit::Float64`: missing data ratio limit/threshold (default: 0.1) """ -function impute!(data::Dataset, missing::Function, method::Symbol, args...; limit::Float64=0.1) +function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) imputor_type = imputation_methods[method] imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() ctx = Context(*(size(data)...), 0, limit, missing) @@ -76,70 +84,70 @@ function impute!(data::Dataset, missing::Function, method::Symbol, args...; limi end """ - impute(data::Dataset, args...; kwargs...) + impute(data, args...; kwargs...) Copies the `data` before calling `impute!(new_data, args...; kwargs...)` """ -function impute(data::Dataset, args...; kwargs...) +function impute(data, args...; kwargs...) return impute!(deepcopy(data), args...; kwargs...) end """ - chain!(data::Dataset, missing::Function, imputors::Imputor...; kwargs...) + chain!(data, missing::Function, imputors::Imputor...; kwargs...) Creates a `Chain` with `imputors` and calls `impute!(imputor, missing, data; kwargs...)` """ -function chain!(data::Dataset, missing::Function, imputors::Imputor...; kwargs...) +function chain!(data, missing::Function, imputors::Imputor...; kwargs...) imputor = Chain(imputors...) return impute!(imputor, missing, data; kwargs...) end """ - chain!(data::Dataset, imputors::Imputor...; kwargs...) + chain!(data, imputors::Imputor...; kwargs...) Creates a `Chain` with `imputors` and calls `impute!(imputor, data; kwargs...)` """ -function chain!(data::Dataset, imputors::Imputor...; kwargs...) +function chain!(data, imputors::Imputor...; kwargs...) imputor = Chain(imputors...) return impute!(imputor, data; kwargs...) end """ - chain(data::Dataset, args...; kwargs...) 
+ chain(data, args...; kwargs...) Copies the `data` before calling `chain!(data, args...; kwargs...)` """ -function chain(data::Dataset, args...; kwargs...) +function chain(data, args...; kwargs...) result = deepcopy(data) return chain!(data, args...; kwargs...) end """ - drop!(data::Dataset; limit=1.0) + drop!(data; limit=1.0) Utility method for `impute!(data, :drop; limit=limit)` """ -drop!(data::Dataset; limit=1.0) = impute!(data, :drop; limit=limit) +drop!(data; limit=1.0) = impute!(data, :drop; limit=limit) """ - drop(data::Dataset; limit=1.0) + drop(data; limit=1.0) Utility method for `impute(data, :drop; limit=limit)` """ -Iterators.drop(data::Dataset; limit=1.0) = impute(data, :drop; limit=limit) +Iterators.drop(data; limit=1.0) = impute(data, :drop; limit=limit) """ - interp!(data::Dataset; limit=1.0) + interp!(data; limit=1.0) Utility method for `impute!(data, :interp; limit=limit)` """ -interp!(data::Dataset; limit=1.0) = impute!(data, :interp; limit=limit) +interp!(data; limit=1.0) = impute!(data, :interp; limit=limit) """ - interp(data::Dataset; limit=1.0) + interp(data; limit=1.0) Utility method for `impute(data, :interp; limit=limit)` """ -interp(data::Dataset; limit=1.0) = impute(data, :interp; limit=limit) +interp(data; limit=1.0) = impute(data, :interp; limit=limit) end # module diff --git a/src/context.jl b/src/context.jl index 201965b..f399b99 100644 --- a/src/context.jl +++ b/src/context.jl @@ -23,7 +23,7 @@ Base.copy(x::Context) = Context(x.num, x.count, x.limit, x.missing) """ ismissing(ctx::Context, x) -> Bool -Uses `ctx.missing` to determine if x is missing. If x is a data row or an abstract array +Uses `ctx.missing` to determine if x is missing. If x is a named tuple or an abstract array then `ismissing` will return true if `ctx.missing` returns true for any element. 
The ctx.count is increased whenever whenever we return true and if `ctx.count / ctx.num` exceeds our `ctx.limit` we throw an `ImputeError` @@ -33,7 +33,7 @@ exceeds our `ctx.limit` we throw an `ImputeError` * `x`: the value to check (may be an single values, abstract array or row) """ function Base.ismissing(ctx::Context, x) - missing = if isa(x, DataFrameRow) + missing = if isa(x, NamedTuple) any(entry -> ctx.missing(entry[2]), pairs(x)) elseif isa(x, AbstractArray) any(ctx.missing, x) diff --git a/src/imputors.jl b/src/imputors.jl index 9837750..e9a1fe2 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -1,7 +1,7 @@ """ Imputor -An imputor stores information about imputing values in `AbstractArray`s and `DataFrame`s. +An imputor stores information about imputing values in `AbstractArray`s and `Tables.table`s. New imputation methods are expected to sutype `Imputor` and, at minimum, implement the `impute!{T<:Any}(imp::, ctx::Context, data::AbstractArray{T, 1})` method. @@ -10,7 +10,7 @@ method. abstract type Imputor end """ - impute!(imp::Imputor, data::Dataset, limit::Float64=0.1) + impute!(imp::Imputor, data, limit::Float64=0.1) Creates a `Context` using information about `data`. These include @@ -20,13 +20,13 @@ Creates a `Context` using information about `data`. These include # Arguments * `imp::Imputor`: the Imputor method to use -* `data::Dataset`: the data to impute +* `data`: the data to impute * `limit::Float64: missing data ratio limit/threshold (default: 0.1)` # Return -* `Dataset`: the input `data` with values imputed. +* the input `data` with values imputed. 
""" -function impute!(imp::Imputor, data::Dataset, limit::Float64=0.1) +function impute!(imp::Imputor, data, limit::Float64=0.1) ctx = Context(*(size(data)...), 0, limit, ismissing) return impute!(imp, ctx, data) end @@ -53,25 +53,28 @@ function impute!(imp::Imputor, ctx::Context, data::AbstractMatrix) end """ - impute!(imp::Imputor, ctx::Context, data::DataFrame) + impute!(imp::Imputor, ctx::Context, table) -Imputes the data in a DataFrame by imputing the values 1 column at a time; +Imputes the data in a table by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. # Arguments * `imp::Imputor`: the Imputor method to use * `ctx::Context`: the contextual information for missing data -* `data::DataFrame`: the data to impute +* `table`: the data to impute # Returns -* `DataFrame`: the input `data` with values imputed +* the input `data` with values imputed """ -function impute!(imp::Imputor, ctx::Context, data::DataFrame) - colwise(data) do c - impute!(imp, ctx, c) +function impute!(imp::Imputor, ctx::Context, table) + @assert istable(table) + result = columntable(table) + + for cname in propertynames(result) + impute!(imp, ctx, getproperty(table, cname)) end - return data + return materializer(table)(result) end diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl index c416b6b..c2b5f0a 100644 --- a/src/imputors/chain.jl +++ b/src/imputors/chain.jl @@ -18,44 +18,44 @@ Creates a Chain using the `Imputor`s provided (ordering matters). Chain(imputors::Imputor...) = Chain(collect(imputors)) """ - impute!(imp::Chain, missing::Function, data::Dataset; limit::Float64=0.1) + impute!(imp::Chain, missing::Function, data; limit::Float64=0.1) Creates a `Context` and runs the `Imputor`s on the supplied data. 
# Arguments * `imp::Chain`: the chain to run * `missing::Function`: the missing function to use in the `Context` to pass to the `Imputor`s -* `data::Dataset`: our data to impute +* `data`: our data to impute * `limit::Float64`: the missing data ration limit/threshold # Returns -* `Dataset`: our imputed data +* our imputed data """ -function impute!(imp::Chain, missing::Function, data::Dataset; limit::Float64=0.1) +function impute!(imp::Chain, missing::Function, data; limit::Float64=0.1) ctx = Context(*(size(data)...), 0, limit, missing) for imputor in imp.imputors - impute!(imputor, copy(ctx), data) + data = impute!(imputor, copy(ctx), data) end return data end """ - impute!(imp::Chain, data::Dataset; limit::Float64=0.1) + impute!(imp::Chain, data; limit::Float64=0.1) Infers the missing data function from the `data` and passes that to -`impute!(imp::Chain, missing::Function, data::Dataset; limit::Float64=0.1)`. +`impute!(imp::Chain, missing::Function, data; limit::Float64=0.1)`. # Arguments * `imp::Chain`: the chain to run -* `data::Dataset`: our data to impute +* `data`: our data to impute * `limit::Float64`: the missing data ration limit/threshold # Returns -* `Dataset`: our imputed data +* our imputed data """ -function impute!(imp::Chain, data::Dataset; limit::Float64=0.1) +function impute!(imp::Chain, data; limit::Float64=0.1) impute!(imp, ismissing, data; limit=limit) end diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 0c6c0cb..eb28a97 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -1,7 +1,7 @@ """ Drop <: Imputor -Removes missing values from the `AbstractArray` or `DataFrame` provided. +Removes missing values from the `AbstractArray` or `Tables.table` provided. 
""" struct Drop <: Imputor end @@ -50,24 +50,26 @@ function impute!(imp::Drop, ctx::Context, data::AbstractMatrix) end """ - impute!(imp::Drop, ctx::Context, data::DataFrame) + impute!(imp::Drop, ctx::Context, table) -Finds the missing rows in the `DataFrame` and deletes them. - -NOTE: this isn't quite as fast as `dropnull` in `DataFrames`s as we're using an arbitrary -`missing` function rather than using the precomputed `dt.isnull` vector of bools. +Finds the missing rows in the table and deletes them. # Arguments * `imp::Drop`: this `Imputor` method * `ctx::Context`: contextual information for missing data -* `data::DataFrame`: the data to impute +* `table`: a type that implements the Tables API. # Returns -* `DataFrame`: our data with the missing rows removed. +* our data with the missing rows removed. """ -function impute!(imp::Drop, ctx::Context, data::DataFrame) - ctx.num = size(data, 1) - m = typeof(data).name.module - m.deleterows!(data, findall(r -> ismissing(ctx, r), m.eachrow(data))) - return data +function impute!(imp::Drop, ctx::Context, table) + @assert istable(table) + rows = rowtable(table) + ctx.num = length(rows) + + filter!(rows) do r + !any(x -> ismissing(ctx, x), propertyvalues(r)) + end + + return materializer(table)(rows) end diff --git a/test/runtests.jl b/test/runtests.jl index 1a268e5..c28b39f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ using Impute +using Tables using Test using DataFrames using RDatasets @@ -98,18 +99,36 @@ using Statistics end @testset "Chain" begin - data = Matrix(dataset("boot", "neuro")) - result = chain( - data, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) - - @test size(result) == size(data) - # Confirm that we don't have any more missing values - @test !any(ismissing, result) + orig = dataset("boot", "neuro") + + @testset "DataFrame" begin + result = chain( + orig, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) + + @test size(result) 
== size(orig) + # Confirm that we don't have any more missing values + @test !any(ismissing, Matrix(result)) + end + + @testset "Matrix" begin + data = Matrix(orig) + result = chain( + data, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) + + @test size(result) == size(data) + # Confirm that we don't have any more missing values + @test !any(ismissing, result) + end end @testset "Alternate missing functions" begin From 78ba17feeb77cb709634e8b2e598864da7c3b1f4 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 Jul 2019 15:22:23 -0500 Subject: [PATCH 02/34] Fixed up Context code to better fit with Tables interface changes. --- src/Impute.jl | 8 +- src/context.jl | 187 +++++++++++++++++++++++++++++++----------- src/imputors.jl | 15 ++-- src/imputors/chain.jl | 6 +- src/imputors/drop.jl | 16 ++-- test/runtests.jl | 15 ++++ 6 files changed, 184 insertions(+), 63 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index a037949..ce777e6 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -2,7 +2,8 @@ module Impute using IterTools using Statistics -using Tables: Tables, materializer, columntable, rowtable, istable +using StatsBase +using Tables: Tables, materializer, istable import Base.Iterators @@ -79,8 +80,9 @@ Creates the appropriate `Imputor` type and `Context` (using `missing` function) function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) imputor_type = imputation_methods[method] imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() - ctx = Context(*(size(data)...), 0, limit, missing) - return impute!(imputor, ctx, data) + return Context(; limit=limit, is_missing=missing)() do ctx + impute!(imputor, ctx, data) + end end """ diff --git a/src/context.jl b/src/context.jl index f399b99..93dbd6a 100644 --- a/src/context.jl +++ b/src/context.jl @@ -1,30 +1,15 @@ """ - Context - -Stores common summary information for all Imputor types. 
+ AbstractContext -# Fields -* `num::Int`: number of observations -* `count::Int`: number of missing values found -* `limit::Float64`: allowable limit for missing values to impute -* `missing::Function`: returns a Bool if the value counts as missing. +An imputation context records summary information about missing data for an imputation algorithm. """ -mutable struct Context - num::Int - count::Int - limit::Float64 - missing::Function -end - -Context(limit::Float64, missing::Function=ismissing) = Context(0, 0, limit, missing) - -Base.copy(x::Context) = Context(x.num, x.count, x.limit, x.missing) +abstract type AbstractContext end """ - ismissing(ctx::Context, x) -> Bool + ismissing(ctx::AbstractContext, x) -> Bool -Uses `ctx.missing` to determine if x is missing. If x is a named tuple or an abstract array -then `ismissing` will return true if `ctx.missing` returns true for any element. +Uses `ctx.is_missing` to determine if x is missing. If x is a named tuple or an abstract array +then `ismissing` will return true if `ctx.is_missing` returns true for any element. The ctx.count is increased whenever whenever we return true and if `ctx.count / ctx.num` exceeds our `ctx.limit` we throw an `ImputeError` @@ -32,74 +17,184 @@ exceeds our `ctx.limit` we throw an `ImputeError` * `ctx::Context`: the contextual information about missing information. * `x`: the value to check (may be an single values, abstract array or row) """ -function Base.ismissing(ctx::Context, x) +function Base.ismissing(ctx::AbstractContext, x) missing = if isa(x, NamedTuple) - any(entry -> ctx.missing(entry[2]), pairs(x)) + any(entry -> ctx.is_missing(entry[2]), pairs(x)) elseif isa(x, AbstractArray) - any(ctx.missing, x) + any(ctx.is_missing, x) else - ctx.missing(x) + ctx.is_missing(x) end - if missing - ctx.count += 1 - - if (ctx.count / ctx.num) > ctx.limit - throw(ImputeError( - "More than $(ctx.limit * 100)% of values were missing ()." 
- )) - end + missing_update!(ctx, missing) - return true - else - return false - end + return missing end """ - findfirst(ctx::Context, data::AbstractVector) -> Int + findfirst(ctx::AbstractContext, data::AbstractVector) -> Int Returns the first not missing index in `data`. # Arguments -* `ctx::Context`: the context to pass into `ismissing` +* `ctx::AbstractContext`: the context to pass into `ismissing` * `data::AbstractVector`: the data array to search # Returns * `Int`: the first index in `data` that isn't missing """ -function Base.findfirst(ctx::Context, data::AbstractVector) +function Base.findfirst(ctx::AbstractContext, data::AbstractVector) return findfirst(x -> !ismissing(ctx, x), data) end """ - findlast(ctx::Context, data::AbstractVector) -> Int + findlast(ctx::AbstractContext, data::AbstractVector) -> Int Returns the last not missing index in `data`. # Arguments -* `ctx::Context`: the context to pass into `ismissing` +* `ctx::AbstractContext`: the context to pass into `ismissing` * `data::AbstractVector`: the data array to search # Returns * `Int`: the last index in `data` that isn't missing """ -function Base.findlast(ctx::Context, data::AbstractVector) +function Base.findlast(ctx::AbstractContext, data::AbstractVector) return findlast(x -> !ismissing(ctx, x), data) end """ - findnext(ctx::Context, data::AbstractVector) -> Int + findnext(ctx::AbstractContext, data::AbstractVector) -> Int Returns the next not missing index in `data`. 
# Arguments -* `ctx::Context`: the context to pass into `ismissing` +* `ctx::AbstractContext`: the context to pass into `ismissing` * `data::AbstractVector`: the data array to search # Returns * `Int`: the next index in `data` that isn't missing """ -function Base.findnext(ctx::Context, data::AbstractVector, idx::Int) +function Base.findnext(ctx::AbstractContext, data::AbstractVector, idx::Int) return findnext(x -> !ismissing(ctx, x), data, idx) end + +""" + Context + +Records base information about the missing data and assume all observations are equally +weighted. + +# Fields +* `n::Int`: number of observations +* `count::Int`: number of missing values found +* `limit::Float64`: allowable limit for missing values to impute +* `is_missing::Function`: returns a Bool if the value counts as missing +* `on_complete::Function`: a function to run when imputation is complete +""" +mutable struct Context <: AbstractContext + num::Int + count::Int + limit::Float64 + is_missing::Function + on_complete::Function +end + +function Context(; + limit::Float64=1.0, + is_missing::Function=ismissing, + on_complete::Function=complete +) + Context(0, 0, limit, is_missing, on_complete) +end + +function (ctx::Context)(f::Function) + _ctx = copy(ctx) + _ctx.num = 0 + _ctx.count = 0 + + result = f(_ctx) + ctx.on_complete(_ctx) + return result +end + +Base.copy(x::Context) = Context(x.num, x.count, x.limit, x.is_missing, x.on_complete) + +function missing_update!(ctx::Context, miss) + ctx.num += 1 + + if miss + ctx.count += 1 + end +end + +function complete(ctx::Context) + if (ctx.count / ctx.num) > ctx.limit + throw(ImputeError( + "More than $(ctx.limit * 100)% of values were missing ()." + )) + end +end + + +""" + WeightedContext + +Records information about the missing data relative to a set of weights. 
+ +# Fields +* `num::Int`: number of observations +* `s::Float64`: sum of missing values weights +* `limit::Float64`: allowable limit for missing values to impute +* `is_missing::Function`: returns a Bool if the value counts as missing +* `on_complete::Function`: a function to run when imputation is complete +* `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance + of each observation +""" +mutable struct WeightedContext <: AbstractContext + num::Int + s::Float64 + limit::Float64 + is_missing::Function + on_complete::Function + wv::AbstractWeights +end + +function WeightedContext( + wv::AbstractWeights; + limit::Float64=1.0, + is_missing::Function=ismissing, + on_complete::Function=complete +) + WeightedContext(0, 0.0, limit, is_missing, on_complete, wv) +end + +function (ctx::WeightedContext)(f::Function) + _ctx = copy(ctx) + _ctx.num = 0 + _ctx.s = 0.0 + + result = f(_ctx) + ctx.on_complete(_ctx) + return result +end + +function Base.copy(x::WeightedContext) + WeightedContext(x.num, x.s, x.limit, x.is_missing, x.on_complete, wv) +end + +function missing_update!(ctx::WeightedContext, miss) + ctx.num += 1 + + if miss + ctx.s += ctx.wv[ctx.num] + end +end + +function complete(ctx::WeightedContext) + if (ctx.s / sum(ctx.wv)) > ctx.limit + throw(ImputeError( + "More than $(ctx.limit * 100)% of weighted values were missing ()." + )) + end +end diff --git a/src/imputors.jl b/src/imputors.jl index e9a1fe2..0a89ec6 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -27,8 +27,9 @@ Creates a `Context` using information about `data`. These include * the input `data` with values imputed. 
""" function impute!(imp::Imputor, data, limit::Float64=0.1) - ctx = Context(*(size(data)...), 0, limit, ismissing) - return impute!(imp, ctx, data) + Context(; limit=limit)() do ctx + return impute!(imp, ctx, data) + end end """ @@ -68,13 +69,15 @@ if this is not the desired behaviour custom imputor methods should overload this """ function impute!(imp::Imputor, ctx::Context, table) @assert istable(table) - result = columntable(table) + # Extract a columns iterate that we should be able to use to mutate the data. + # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data + columntable = Tables.columns(table) - for cname in propertynames(result) - impute!(imp, ctx, getproperty(table, cname)) + for cname in propertynames(columntable) + impute!(imp, ctx, getproperty(columntable, cname)) end - return materializer(table)(result) + return table end diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl index c2b5f0a..91e39e7 100644 --- a/src/imputors/chain.jl +++ b/src/imputors/chain.jl @@ -32,10 +32,12 @@ Creates a `Context` and runs the `Imputor`s on the supplied data. 
* our imputed data """ function impute!(imp::Chain, missing::Function, data; limit::Float64=0.1) - ctx = Context(*(size(data)...), 0, limit, missing) + context = Context(; limit=limit, is_missing=missing) for imputor in imp.imputors - data = impute!(imputor, copy(ctx), data) + data = context() do c + impute!(imputor, c, data) + end end return data diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index eb28a97..7ca0282 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -44,8 +44,10 @@ NOTES (or premature optimizations): * `AbstractMatrix`: a new matrix with missing rows removed """ function impute!(imp::Drop, ctx::Context, data::AbstractMatrix) - ctx.num = size(data, 1) - mask = map(i -> !ismissing(ctx, data[i, :]), 1:size(data, 1)) + mask = ctx() do c + map(i -> !ismissing(ctx, data[i, :]), 1:size(data, 1)) + end + return data[mask, :] end @@ -64,12 +66,14 @@ Finds the missing rows in the table and deletes them. """ function impute!(imp::Drop, ctx::Context, table) @assert istable(table) - rows = rowtable(table) - ctx.num = length(rows) + rows = Tables.rows(table) - filter!(rows) do r + result = Iterators.filter(rows) do r !any(x -> ismissing(ctx, x), propertyvalues(r)) end - return materializer(table)(rows) + # Unfortunately, we'll need to construct a new table + # since Tables.rows is just an iterator + table = materializer(table)(result) + return table end diff --git a/test/runtests.jl b/test/runtests.jl index c28b39f..140b683 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -115,6 +115,21 @@ using Statistics @test !any(ismissing, Matrix(result)) end + @testset "Column Table" begin + data = Tables.columntable(orig) + result = chain( + data, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) |> Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test !any(ismissing, result) + end + @testset "Matrix" begin data = Matrix(orig) result = chain( From 
37b7ef254615a7dbf80c02a25b27742e40dbf105 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 3 Jul 2019 17:00:27 -0500 Subject: [PATCH 03/34] Tests and bug fixes for working with Context types directly. --- Project.toml | 1 + src/context.jl | 16 ++++++++---- src/imputors.jl | 22 +++++++++++----- src/imputors/chain.jl | 4 +-- src/imputors/drop.jl | 50 +++++++++++++++++++----------------- src/imputors/fill.jl | 26 ++++++++++--------- src/imputors/interp.jl | 58 ++++++++++++++++++++++-------------------- src/imputors/locf.jl | 18 +++++++------ src/imputors/nocb.jl | 18 +++++++------ test/runtests.jl | 27 ++++++++++++++++++++ 10 files changed, 147 insertions(+), 93 deletions(-) diff --git a/Project.toml b/Project.toml index 7938c56..363a101 100644 --- a/Project.toml +++ b/Project.toml @@ -7,6 +7,7 @@ version = "0.2.0" IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" +StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] diff --git a/src/context.jl b/src/context.jl index 93dbd6a..14aa0bf 100644 --- a/src/context.jl +++ b/src/context.jl @@ -129,9 +129,11 @@ function missing_update!(ctx::Context, miss) end function complete(ctx::Context) - if (ctx.count / ctx.num) > ctx.limit + missing_ratio = ctx.count / ctx.num + + if missing_ratio > ctx.limit throw(ImputeError( - "More than $(ctx.limit * 100)% of values were missing ()." + "More than $(ctx.limit * 100)% of values were missing ($missing_ratio)." )) end end @@ -141,6 +143,8 @@ end WeightedContext Records information about the missing data relative to a set of weights. 
+This context type can be useful if some missing observation are more important than others +(e.g., more recent observations in time series datasets) # Fields * `num::Int`: number of observations @@ -180,7 +184,7 @@ function (ctx::WeightedContext)(f::Function) end function Base.copy(x::WeightedContext) - WeightedContext(x.num, x.s, x.limit, x.is_missing, x.on_complete, wv) + WeightedContext(x.num, x.s, x.limit, x.is_missing, x.on_complete, x.wv) end function missing_update!(ctx::WeightedContext, miss) @@ -192,9 +196,11 @@ function missing_update!(ctx::WeightedContext, miss) end function complete(ctx::WeightedContext) - if (ctx.s / sum(ctx.wv)) > ctx.limit + missing_ratio = ctx.s / sum(ctx.wv) + + if missing_ratio > ctx.limit throw(ImputeError( - "More than $(ctx.limit * 100)% of weighted values were missing ()." + "More than $(ctx.limit * 100)% of weighted values were missing ($missing_ratio)." )) end end diff --git a/src/imputors.jl b/src/imputors.jl index 0a89ec6..5ab0adc 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -9,6 +9,16 @@ method. abstract type Imputor end + +""" + impute(imp::Imputor, data, limit=0.1) + impute(imp::Imputor, ctx, data) + +Copies the `data` before calling the corresponding `impute!(imp, ...)` call. +""" +impute(imp::Imputor, data) = impute!(imp, deepcopy(data)) +impute(imp::Imputor, ctx::AbstractContext, data) = impute!(imp, ctx, deepcopy(data)) + """ impute!(imp::Imputor, data, limit::Float64=0.1) @@ -33,20 +43,20 @@ function impute!(imp::Imputor, data, limit::Float64=0.1) end """ - impute!(imp::Imputor, ctx::Context, data::AbstractMatrix) + impute!(imp::Imputor, ctx::AbstractContext, data::AbstractMatrix) Imputes the data in a matrix by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. 
# Arguments * `imp::Imputor`: the Imputor method to use -* `ctx::Context`: the contextual information for missing data +* `ctx::AbstractContext`: the contextual information for missing data * `data::AbstractMatrix`: the data to impute # Returns * `AbstractMatrix`: the input `data` with values imputed """ -function impute!(imp::Imputor, ctx::Context, data::AbstractMatrix) +function impute!(imp::Imputor, ctx::AbstractContext, data::AbstractMatrix) for i in 1:size(data, 2) impute!(imp, ctx, view(data, :, i)) end @@ -54,20 +64,20 @@ function impute!(imp::Imputor, ctx::Context, data::AbstractMatrix) end """ - impute!(imp::Imputor, ctx::Context, table) + impute!(imp::Imputor, ctx::AbstractContext, table) Imputes the data in a table by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. # Arguments * `imp::Imputor`: the Imputor method to use -* `ctx::Context`: the contextual information for missing data +* `ctx::AbstractContext`: the contextual information for missing data * `table`: the data to impute # Returns * the input `data` with values imputed """ -function impute!(imp::Imputor, ctx::Context, table) +function impute!(imp::Imputor, ctx::AbstractContext, table) @assert istable(table) # Extract a columns iterate that we should be able to use to mutate the data. 
# NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl index 91e39e7..b9238a8 100644 --- a/src/imputors/chain.jl +++ b/src/imputors/chain.jl @@ -35,9 +35,7 @@ function impute!(imp::Chain, missing::Function, data; limit::Float64=0.1) context = Context(; limit=limit, is_missing=missing) for imputor in imp.imputors - data = context() do c - impute!(imputor, c, data) - end + data = impute!(imputor, context, data) end return data diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 7ca0282..b9a5ca5 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -6,24 +6,26 @@ Removes missing values from the `AbstractArray` or `Tables.table` provided. struct Drop <: Imputor end """ - impute!(imp::Drop, ctx::Context, data::AbstractVector) + impute!(imp::Drop, context::AbstractContext, data::AbstractVector) Uses `filter!` to remove missing elements from the array. # Arguments * `imp::Drop`: this `Imputor` method -* `ctx::Context`: contextual information for missing data +* `context::AbstractContext`: contextual information for missing data * `data::AbstractVector`: the data to impute # Returns * `AbstractVector`: our data array with missing elements removed """ -function impute!(imp::Drop, ctx::Context, data::AbstractVector) - return filter!(x -> !ismissing(ctx, x), data) +function impute!(imp::Drop, context::AbstractContext, data::AbstractVector) + context() do c + filter!(x -> !ismissing(c, x), data) + end end """ - impute!(imp::Drop, ctx::Context, data::AbstractMatrix) + impute!(imp::Drop, context::AbstractContext, data::AbstractMatrix) Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the `data` with those rows removed. Unfortunately, the mask approach requires copying the matrix. 
@@ -37,43 +39,43 @@ NOTES (or premature optimizations): # Arguments * `imp::Drop`: this `Imputor` method -* `ctx::Context`: contextual information for missing data +* `context::AbstractContext`: contextual information for missing data * `data::AbstractMatrix`: the data to impute # Returns * `AbstractMatrix`: a new matrix with missing rows removed """ -function impute!(imp::Drop, ctx::Context, data::AbstractMatrix) - mask = ctx() do c - map(i -> !ismissing(ctx, data[i, :]), 1:size(data, 1)) +function impute!(imp::Drop, context::AbstractContext, data::AbstractMatrix) + context() do c + mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1)) + return data[mask, :] end - - return data[mask, :] end """ - impute!(imp::Drop, ctx::Context, table) + impute!(imp::Drop, context::AbstractContext, table) Finds the missing rows in the table and deletes them. # Arguments * `imp::Drop`: this `Imputor` method -* `ctx::Context`: contextual information for missing data +* `context::AbstractContext`: contextual information for missing data * `table`: a type that implements the Tables API. # Returns * our data with the missing rows removed. 
""" -function impute!(imp::Drop, ctx::Context, table) - @assert istable(table) - rows = Tables.rows(table) - - result = Iterators.filter(rows) do r - !any(x -> ismissing(ctx, x), propertyvalues(r)) +function impute!(imp::Drop, context::AbstractContext, table) + context() do c + @assert istable(table) + rows = Tables.rows(table) + + # Unfortunately, we'll need to construct a new table + # since Tables.rows is just an iterator + table = Iterators.filter(rows) do r + !any(x -> ismissing(c, x), propertyvalues(r)) + end |> materializer(table) + + return table end - - # Unfortunately, we'll need to construct a new table - # since Tables.rows is just an iterator - table = materializer(table)(result) - return table end diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index c7deec9..5dc274d 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -19,23 +19,25 @@ By default `Fill()` will use the mean of the existing values as the fill value. Fill() = Fill(mean) """ - impute!(imp::Fill, ctx::Context, data::AbstractVector) + impute!(imp::Fill, context::AbstractContext, data::AbstractVector) Computes the fill value if `imp.value` is a `Function` (i.e., `imp.value(drop(copy(data)))`) and replaces all missing values in the `data` with that value. 
""" -function impute!(imp::Fill, ctx::Context, data::AbstractVector) - fill_val = if isa(imp.value, Function) - imp.value(Iterators.drop(copy(data))) - else - imp.value - end +function impute!(imp::Fill, context::AbstractContext, data::AbstractVector) + context() do c + fill_val = if isa(imp.value, Function) + imp.value(Iterators.drop(copy(data))) + else + imp.value + end - for i in 1:length(data) - if ismissing(ctx, data[i]) - data[i] = fill_val + for i in 1:length(data) + if ismissing(c, data[i]) + data[i] = fill_val + end end - end - return data + return data + end end diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index a2a8c92..d59ecdd 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -6,7 +6,7 @@ Performs linear interpolation between the nearest values in an vector. struct Interpolate <: Imputor end """ - impute!(imp::Interpolate, ctx::Context, data::AbstractVector) + impute!(imp::Interpolate, context::AbstractContext, data::AbstractVector) Uses linear interpolation between existing elements of a vector to fill in missing data. @@ -14,34 +14,38 @@ WARNING: Missing values at the head or tail of the array cannot be interpolated are no existing values on both sides. As a result, this method does not guarantee that all missing values will be imputed. 
""" -function impute!(imp::Interpolate, ctx::Context, data::AbstractVector{<:Union{T, Missing}}) where T - i = findfirst(ctx, data) + 1 - - while i < length(data) - if ismissing(ctx, data[i]) - prev_idx = i - 1 - next_idx = findnext(ctx, data, i + 1) - - if next_idx !== nothing - gap_sz = (next_idx - prev_idx) - 1 - - diff = data[next_idx] - data[prev_idx] - incr = diff / T(gap_sz + 1) - val = data[prev_idx] + incr - - # Iteratively fill in the values - for j in i:(next_idx - 1) - data[j] = val - val += incr +function impute!( + imp::Interpolate, context::AbstractContext, data::AbstractVector{<:Union{T, Missing}} +) where T + context() do c + i = findfirst(c, data) + 1 + + while i < length(data) + if ismissing(c, data[i]) + prev_idx = i - 1 + next_idx = findnext(c, data, i + 1) + + if next_idx !== nothing + gap_sz = (next_idx - prev_idx) - 1 + + diff = data[next_idx] - data[prev_idx] + incr = diff / T(gap_sz + 1) + val = data[prev_idx] + incr + + # Iteratively fill in the values + for j in i:(next_idx - 1) + data[j] = val + val += incr + end + + i = next_idx + else + break end - - i = next_idx - else - break end + i += 1 end - i += 1 - end - return data + return data + end end diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index f911abf..a118c1e 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -1,7 +1,7 @@ struct LOCF <: Imputor end """ - impute!(imp::LOCF, ctx::Context, data::AbstractVector) + impute!(imp::LOCF, context::AbstractContext, data::AbstractVector) Iterates forwards through the `data` and fills missing data with the last existing observation. @@ -15,13 +15,15 @@ that all missing values will be imputed. 
``` """ -function impute!(imp::LOCF, ctx::Context, data::AbstractVector) - start_idx = findfirst(ctx, data) + 1 - for i in start_idx:length(data) - if ismissing(ctx, data[i]) - data[i] = data[i-1] +function impute!(imp::LOCF, context::AbstractContext, data::AbstractVector) + context() do c + start_idx = findfirst(c, data) + 1 + for i in start_idx:length(data) + if ismissing(c, data[i]) + data[i] = data[i-1] + end end - end - return data + return data + end end diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index dd7c914..5a56e4b 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -6,7 +6,7 @@ Fills in missing data using the Next Observation Carried Backward (NOCB) approac struct NOCB <: Imputor end """ - impute!(imp::NOCB, ctx::Context, data::AbstractVector) + impute!(imp::NOCB, context::AbstractContext, data::AbstractVector) Iterates backwards through the `data` and fills missing data with the next existing observation. @@ -20,13 +20,15 @@ that all missing values will be imputed. 
``` """ -function impute!(imp::NOCB, ctx::Context, data::AbstractVector) - end_idx = findlast(ctx, data) - 1 - for i in end_idx:-1:1 - if ismissing(ctx, data[i]) - data[i] = data[i+1] +function impute!(imp::NOCB, context::AbstractContext, data::AbstractVector) + context() do c + end_idx = findlast(c, data) - 1 + for i in end_idx:-1:1 + if ismissing(c, data[i]) + data[i] = data[i+1] + end end - end - return data + return data + end end diff --git a/test/runtests.jl b/test/runtests.jl index 140b683..062249f 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,6 +4,9 @@ using Test using DataFrames using RDatasets using Statistics +using StatsBase + +import Impute: Drop, Context, WeightedContext @testset "Impute" begin a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) @@ -156,4 +159,28 @@ using Statistics result2 = chain(data2, isnan, Impute.Interpolate(), Impute.Drop(); limit=1.0) @test result1 == result2 end + + @testset "Contexts" begin + @testset "Base" begin + @test_throws ImputeError impute(a, :drop; limit=0.1) + @test_throws ImputeError impute(Drop(), Context(; limit=0.1), a) + end + + @testset "Weighted" begin + # If we use an exponentially weighted context then we won't pass the limit + # because missing earlier observations is less important than later ones. + @show a + ctx = WeightedContext(eweights(20, 0.3); limit=0.1) + @test isa(ctx, WeightedContext) + result = impute(Drop(), ctx, a) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) + @test result == expected + + # If we reverse the weights such that earlier observations are more important + # then our previous limit of 0.2 won't be enough to succeed. + ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2) + @test_throws ImputeError impute(Drop(), ctx, a) + end + end end From 45dfea90cd9aec9c04b26350df88da67f85b0450 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 4 Jul 2019 17:40:49 -0500 Subject: [PATCH 04/34] Simplify exports deprecation. 
--- src/Impute.jl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index ce777e6..bfff85a 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -10,13 +10,14 @@ import Base.Iterators export impute, impute!, chain, chain!, drop, drop!, interp, interp!, ImputeError function __init__() - for sym in [:chain, :chain!, :drop, :drop!, :interp, :interp!] - Base.depwarn( - "`$sym` will no longer be exported in future releases. " * - "Please qualify your calls with `Impute.$sym(...).` or explicitly import it.", - sym - ) - end + sym = join(["chain", "chain!", "drop", "drop!", "interp", "interp!"], ", ", " and ") + + @warn( + """ + The following symbols will not be exported in future releases: $sym. + Please qualify your calls with `Impute.(...)` or explicitly import the symbol. + """ + ) end """ From 7c6ceedc9b6bb49ad61f29097f7350e219042684 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 5 Jul 2019 14:08:24 -0500 Subject: [PATCH 05/34] API simplification. 
--- Project.toml | 1 + src/Impute.jl | 129 ++++++++--------------------------------- src/context.jl | 18 +++++- src/imputors.jl | 43 +++----------- src/imputors/chain.jl | 31 ++-------- src/imputors/drop.jl | 32 +++++----- src/imputors/fill.jl | 21 ++++--- src/imputors/interp.jl | 15 +++-- src/imputors/locf.jl | 13 +++-- src/imputors/nocb.jl | 13 +++-- test/runtests.jl | 1 - 11 files changed, 110 insertions(+), 207 deletions(-) diff --git a/Project.toml b/Project.toml index 363a101..4b234b8 100644 --- a/Project.toml +++ b/Project.toml @@ -11,6 +11,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] +IterTools = "1.2" Tables = "0.2" [compat] diff --git a/src/Impute.jl b/src/Impute.jl index bfff85a..5ada2fd 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -5,7 +5,7 @@ using Statistics using StatsBase using Tables: Tables, materializer, istable -import Base.Iterators +import Base.Iterators: drop export impute, impute!, chain, chain!, drop, drop!, interp, interp!, ImputeError @@ -18,6 +18,13 @@ function __init__() Please qualify your calls with `Impute.(...)` or explicitly import the symbol. """ ) + + @warn( + """ + The default limit for all impute functions will be 1.0 going forward. + If you depend on a specific threshold please pass in an appropriate `AbstractContext`. + """ + ) end """ @@ -45,112 +52,22 @@ const global imputation_methods = Dict{Symbol, Type}( :nocb => NOCB, ) -""" - impute!(data, method::Symbol=:interp, args...; limit::Float64=0.1) - -Looks up the `Imputor` type for the `method`, creates it and calls -`impute!(imputor::Imputor, data, limit::Float64)` with it. - -# Arguments -* `data`: the datset containing missing elements we should impute. -* `method::Symbol`: the imputation method to use - (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) -* `args::Any...`: any arguments you should pass to the `Imputor` constructor. 
-* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) -""" -function impute!(data, method::Symbol, args...; limit::Float64=0.1) - imputor_type = imputation_methods[method] - imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() - return impute!(imputor, data, limit) -end - -""" - impute!(data, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) - -Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call -`impute!(imputor::Imputor, ctx::Context, data)` with them. - -# Arguments -* `data`: the datset containing missing elements we should impute. -* `missing::Function`: the missing data function to use -* `method::Symbol`: the imputation method to use - (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) -* `args::Any...`: any arguments you should pass to the `Imputor` constructor. -* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) -""" -function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) - imputor_type = imputation_methods[method] - imputor = length(args) > 0 ? imputor_type(args...) : imputor_type() - return Context(; limit=limit, is_missing=missing)() do ctx - impute!(imputor, ctx, data) +include("deprecated.jl") + +let + for (k, v) in imputation_methods + local typename = nameof(v) + local f = k + local f! = Symbol(k, :!) + + # NOTE: The + @eval begin + $f(data; kwargs...) = impute($typename(; context=Context(Dict(kwargs...))), data) + $f!(data; kwargs...) = impute!($typename(; context=Context(Dict(kwargs...))), data) + $f(; kwargs...) = data -> impute($typename(; context=Context(Dict(kwargs...))), data) + $f!(; kwargs...) = data -> impute!($typename(; context=Context(Dict(kwargs...))), data) + end end end -""" - impute(data, args...; kwargs...) - -Copies the `data` before calling `impute!(new_data, args...; kwargs...)` -""" -function impute(data, args...; kwargs...) 
- return impute!(deepcopy(data), args...; kwargs...) -end - -""" - chain!(data, missing::Function, imputors::Imputor...; kwargs...) - -Creates a `Chain` with `imputors` and calls `impute!(imputor, missing, data; kwargs...)` -""" -function chain!(data, missing::Function, imputors::Imputor...; kwargs...) - imputor = Chain(imputors...) - return impute!(imputor, missing, data; kwargs...) -end - -""" - chain!(data, imputors::Imputor...; kwargs...) - -Creates a `Chain` with `imputors` and calls `impute!(imputor, data; kwargs...)` -""" -function chain!(data, imputors::Imputor...; kwargs...) - imputor = Chain(imputors...) - return impute!(imputor, data; kwargs...) -end - -""" - chain(data, args...; kwargs...) - -Copies the `data` before calling `chain!(data, args...; kwargs...)` -""" -function chain(data, args...; kwargs...) - result = deepcopy(data) - return chain!(data, args...; kwargs...) -end - -""" - drop!(data; limit=1.0) - -Utility method for `impute!(data, :drop; limit=limit)` -""" -drop!(data; limit=1.0) = impute!(data, :drop; limit=limit) - -""" - drop(data; limit=1.0) - -Utility method for `impute(data, :drop; limit=limit)` -""" -Iterators.drop(data; limit=1.0) = impute(data, :drop; limit=limit) - -""" - interp!(data; limit=1.0) - -Utility method for `impute!(data, :interp; limit=limit)` -""" -interp!(data; limit=1.0) = impute!(data, :interp; limit=limit) - -""" - interp(data; limit=1.0) - -Utility method for `impute(data, :interp; limit=limit)` -""" -interp(data; limit=1.0) = impute(data, :interp; limit=limit) - end # module diff --git a/src/context.jl b/src/context.jl index 14aa0bf..68b2000 100644 --- a/src/context.jl +++ b/src/context.jl @@ -101,13 +101,29 @@ mutable struct Context <: AbstractContext end function Context(; - limit::Float64=1.0, + limit::Float64=0.1, is_missing::Function=ismissing, on_complete::Function=complete ) Context(0, 0, limit, is_missing, on_complete) end +# The constructor only exists for legacy reasons +# We should drop this when 
we're ready to stop accepting limit in +# arbitrary impute functions. +function Context(d::Dict) + if haskey(d, :context) + return d[:context] + else haskey(d, :limit) + return Context(; + # We using a different default limit value here for legacy reason. + limit=get(d, :limit, 1.0), + is_missing=get(d, :is_missing, ismissing), + on_complete=get(d, :on_complete, complete), + ) + end +end + function (ctx::Context)(f::Function) _ctx = copy(ctx) _ctx.num = 0 diff --git a/src/imputors.jl b/src/imputors.jl index 5ab0adc..769926e 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -11,80 +11,55 @@ abstract type Imputor end """ - impute(imp::Imputor, data, limit=0.1) - impute(imp::Imputor, ctx, data) + impute(imp::Imputor, data) Copies the `data` before calling the corresponding `impute!(imp, ...)` call. """ -impute(imp::Imputor, data) = impute!(imp, deepcopy(data)) -impute(imp::Imputor, ctx::AbstractContext, data) = impute!(imp, ctx, deepcopy(data)) - -""" - impute!(imp::Imputor, data, limit::Float64=0.1) - -Creates a `Context` using information about `data`. These include - -1. missing data function which defaults to `missing` - -2. number of elements: `*(size(data)...)` - -# Arguments -* `imp::Imputor`: the Imputor method to use -* `data`: the data to impute -* `limit::Float64: missing data ratio limit/threshold (default: 0.1)` - -# Return -* the input `data` with values imputed. -""" -function impute!(imp::Imputor, data, limit::Float64=0.1) - Context(; limit=limit)() do ctx - return impute!(imp, ctx, data) - end +function impute(imp::Imputor, data) + impute!(imp, deepcopy(data)) end """ - impute!(imp::Imputor, ctx::AbstractContext, data::AbstractMatrix) + impute!(imp::Imputor, data::AbstractMatrix) Imputes the data in a matrix by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. 
# Arguments * `imp::Imputor`: the Imputor method to use -* `ctx::AbstractContext`: the contextual information for missing data * `data::AbstractMatrix`: the data to impute # Returns * `AbstractMatrix`: the input `data` with values imputed """ -function impute!(imp::Imputor, ctx::AbstractContext, data::AbstractMatrix) +function impute!(imp::Imputor, data::AbstractMatrix) for i in 1:size(data, 2) - impute!(imp, ctx, view(data, :, i)) + impute!(imp, view(data, :, i)) end return data end """ - impute!(imp::Imputor, ctx::AbstractContext, table) + impute!(imp::Imputor, table) Imputes the data in a table by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. # Arguments * `imp::Imputor`: the Imputor method to use -* `ctx::AbstractContext`: the contextual information for missing data * `table`: the data to impute # Returns * the input `data` with values imputed """ -function impute!(imp::Imputor, ctx::AbstractContext, table) +function impute!(imp::Imputor, table) @assert istable(table) # Extract a columns iterate that we should be able to use to mutate the data. # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data columntable = Tables.columns(table) for cname in propertynames(columntable) - impute!(imp, ctx, getproperty(columntable, cname)) + impute!(imp, getproperty(columntable, cname)) end return table diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl index b9238a8..d9164a0 100644 --- a/src/imputors/chain.jl +++ b/src/imputors/chain.jl @@ -18,44 +18,21 @@ Creates a Chain using the `Imputor`s provided (ordering matters). Chain(imputors::Imputor...) = Chain(collect(imputors)) """ - impute!(imp::Chain, missing::Function, data; limit::Float64=0.1) + impute!(imp::Chain, data) -Creates a `Context` and runs the `Imputor`s on the supplied data. +Runs the `Imputor`s on the supplied data. 
# Arguments * `imp::Chain`: the chain to run -* `missing::Function`: the missing function to use in the `Context` to pass to the `Imputor`s * `data`: our data to impute -* `limit::Float64`: the missing data ration limit/threshold # Returns * our imputed data """ -function impute!(imp::Chain, missing::Function, data; limit::Float64=0.1) - context = Context(; limit=limit, is_missing=missing) - +function impute!(imp::Chain, data) for imputor in imp.imputors - data = impute!(imputor, context, data) + data = impute!(imputor, data) end return data end - -""" - impute!(imp::Chain, data; limit::Float64=0.1) - - -Infers the missing data function from the `data` and passes that to -`impute!(imp::Chain, missing::Function, data; limit::Float64=0.1)`. - -# Arguments -* `imp::Chain`: the chain to run -* `data`: our data to impute -* `limit::Float64`: the missing data ration limit/threshold - -# Returns -* our imputed data -""" -function impute!(imp::Chain, data; limit::Float64=0.1) - impute!(imp, ismissing, data; limit=limit) -end diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index b9a5ca5..1e44355 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -2,30 +2,38 @@ Drop <: Imputor Removes missing values from the `AbstractArray` or `Tables.table` provided. + +# Fields +* `context::AbstractContext`: A context which keeps track of missing data + summary information """ -struct Drop <: Imputor end +struct Drop <: Imputor + context::AbstractContext +end + +"""Drop(; context=Context()) -> Drop""" +Drop(; context=Context()) = Drop(context) """ - impute!(imp::Drop, context::AbstractContext, data::AbstractVector) + impute!(imp::Drop, data::AbstractVector) Uses `filter!` to remove missing elements from the array. 
# Arguments * `imp::Drop`: this `Imputor` method -* `context::AbstractContext`: contextual information for missing data * `data::AbstractVector`: the data to impute # Returns * `AbstractVector`: our data array with missing elements removed """ -function impute!(imp::Drop, context::AbstractContext, data::AbstractVector) - context() do c +function impute!(imp::Drop, data::AbstractVector) + imp.context() do c filter!(x -> !ismissing(c, x), data) end end """ - impute!(imp::Drop, context::AbstractContext, data::AbstractMatrix) + impute!(imp::Drop, data::AbstractMatrix) Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the `data` with those rows removed. Unfortunately, the mask approach requires copying the matrix. @@ -39,34 +47,32 @@ NOTES (or premature optimizations): # Arguments * `imp::Drop`: this `Imputor` method -* `context::AbstractContext`: contextual information for missing data * `data::AbstractMatrix`: the data to impute # Returns * `AbstractMatrix`: a new matrix with missing rows removed """ -function impute!(imp::Drop, context::AbstractContext, data::AbstractMatrix) - context() do c +function impute!(imp::Drop, data::AbstractMatrix) + imp.context() do c mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1)) return data[mask, :] end end """ - impute!(imp::Drop, context::AbstractContext, table) + impute!(imp::Drop, table) Finds the missing rows in the table and deletes them. # Arguments * `imp::Drop`: this `Imputor` method -* `context::AbstractContext`: contextual information for missing data * `table`: a type that implements the Tables API. # Returns * our data with the missing rows removed. 
""" -function impute!(imp::Drop, context::AbstractContext, table) - context() do c +function impute!(imp::Drop, table) + imp.context() do c @assert istable(table) rows = Tables.rows(table) diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index 5dc274d..fa68d31 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -5,29 +5,28 @@ Fills in the missing data with a specific value. # Fields * `value::Any`: A scalar missing value or a function that returns the a scalar if - passed the data with missing data removed (e.g, `mean`) + passed the data with missing data removed (e.g, `mean`) +* `context::AbstractContext`: A context which keeps track of missing data + summary information """ struct Fill{T} <: Imputor value::T + context::AbstractContext end -""" - Fill() -> Fill - -By default `Fill()` will use the mean of the existing values as the fill value. -""" -Fill() = Fill(mean) +"""Fill(; value=mean, context=Context()) -> Fill""" +Fill(; value=mean, context=Context()) = Fill(value, context) """ - impute!(imp::Fill, context::AbstractContext, data::AbstractVector) + impute!(imp::Fill, data::AbstractVector) Computes the fill value if `imp.value` is a `Function` (i.e., `imp.value(drop(copy(data)))`) and replaces all missing values in the `data` with that value. """ -function impute!(imp::Fill, context::AbstractContext, data::AbstractVector) - context() do c +function impute!(imp::Fill, data::AbstractVector) + imp.context() do c fill_val = if isa(imp.value, Function) - imp.value(Iterators.drop(copy(data))) + imp.value(Iterators.drop(copy(data); context=c)) else imp.value end diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index d59ecdd..a6c178a 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -3,10 +3,15 @@ Performs linear interpolation between the nearest values in an vector. 
""" -struct Interpolate <: Imputor end +struct Interpolate <: Imputor + context::AbstractContext +end + +"""Interpolate(; context=Context()) -> Interpolate""" +Interpolate(; context=Context()) = Interpolate(context) """ - impute!(imp::Interpolate, context::AbstractContext, data::AbstractVector) + impute!(imp::Interpolate, data::AbstractVector) Uses linear interpolation between existing elements of a vector to fill in missing data. @@ -14,10 +19,8 @@ WARNING: Missing values at the head or tail of the array cannot be interpolated are no existing values on both sides. As a result, this method does not guarantee that all missing values will be imputed. """ -function impute!( - imp::Interpolate, context::AbstractContext, data::AbstractVector{<:Union{T, Missing}} -) where T - context() do c +function impute!(imp::Interpolate, data::AbstractVector{<:Union{T, Missing}}) where T + imp.context() do c i = findfirst(c, data) + 1 while i < length(data) diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index a118c1e..10ccb8b 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -1,7 +1,12 @@ -struct LOCF <: Imputor end +struct LOCF <: Imputor + context::AbstractContext +end + +"""LOCF(; context=Context()) -> LOCF""" +LOCF(; context=Context()) = LOCF(context) """ - impute!(imp::LOCF, context::AbstractContext, data::AbstractVector) + impute!(imp::LOCF, data::AbstractVector) Iterates forwards through the `data` and fills missing data with the last existing observation. @@ -15,8 +20,8 @@ that all missing values will be imputed. 
``` """ -function impute!(imp::LOCF, context::AbstractContext, data::AbstractVector) - context() do c +function impute!(imp::LOCF, data::AbstractVector) + imp.context() do c start_idx = findfirst(c, data) + 1 for i in start_idx:length(data) if ismissing(c, data[i]) diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index 5a56e4b..32690a1 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -3,10 +3,15 @@ Fills in missing data using the Next Observation Carried Backward (NOCB) approach. """ -struct NOCB <: Imputor end +struct NOCB <: Imputor + context::AbstractContext +end + +"""NOCB(; context=Context()) -> NOCB""" +NOCB(; context=Context()) = NOCB(context) """ - impute!(imp::NOCB, context::AbstractContext, data::AbstractVector) + impute!(imp::NOCB, data::AbstractVector) Iterates backwards through the `data` and fills missing data with the next existing observation. @@ -20,8 +25,8 @@ that all missing values will be imputed. ``` """ -function impute!(imp::NOCB, context::AbstractContext, data::AbstractVector) - context() do c +function impute!(imp::NOCB, data::AbstractVector) + imp.context() do c end_idx = findlast(c, data) - 1 for i in end_idx:-1:1 if ismissing(c, data[i]) diff --git a/test/runtests.jl b/test/runtests.jl index 062249f..c1f954a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -169,7 +169,6 @@ import Impute: Drop, Context, WeightedContext @testset "Weighted" begin # If we use an exponentially weighted context then we won't pass the limit # because missing earlier observations is less important than later ones. 
- @show a ctx = WeightedContext(eweights(20, 0.3); limit=0.1) @test isa(ctx, WeightedContext) result = impute(Drop(), ctx, a) From f54e2e2451ab2b0db92dc88e800a71c4db0055af Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 5 Jul 2019 14:09:36 -0500 Subject: [PATCH 06/34] Fix automerge on Project.toml --- Project.toml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 4b234b8..328122e 100644 --- a/Project.toml +++ b/Project.toml @@ -11,11 +11,9 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] +DataFrames = "0.17, 0.18" IterTools = "1.2" Tables = "0.2" - -[compat] -DataFrames = "0.17, 0.18" julia = "1" [extras] From a0ab2ea575285325a511567711977630fd238fc5 Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 5 Jul 2019 14:28:47 -0500 Subject: [PATCH 07/34] Drop 0.7 tests and add the deprecated file. --- .appveyor.yml | 1 - .travis.yml | 1 - src/deprecated.jl | 158 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 src/deprecated.jl diff --git a/.appveyor.yml b/.appveyor.yml index 2a62834..756f881 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,6 +1,5 @@ environment: matrix: - - julia_version: 0.7 - julia_version: 1.0 - julia_version: nightly diff --git a/.travis.yml b/.travis.yml index 56e488c..abafdbe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ os: - linux - osx julia: - - 0.7 - 1.0 - nightly notifications: diff --git a/src/deprecated.jl b/src/deprecated.jl new file mode 100644 index 0000000..29a6ea0 --- /dev/null +++ b/src/deprecated.jl @@ -0,0 +1,158 @@ +############################################################################### +# Deprecations for calling impute on an Imputor with a custom AbstractContext # +############################################################################### +Base.@deprecate( + impute(imp::Imputor, context::AbstractContext, data; 
kwargs...), + impute(typeof(imp)(; context=context), data; kwargs...) +) + +Base.@deprecate( + impute!(imp::Imputor, context::AbstractContext, data; kwargs...), + impute!(typeof(imp)(; context=context), data; kwargs...) +) +##################################################################### +# Deprecate all impute calls where the first argument is an Imputor # +##################################################################### +""" + impute!(data, method::Symbol=:interp, args...; limit::Float64=0.1) + +Looks up the `Imputor` type for the `method`, creates it and calls +`impute!(imputor::Imputor, data, limit::Float64)` with it. + +# Arguments +* `data`: the datset containing missing elements we should impute. +* `method::Symbol`: the imputation method to use + (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) +* `args::Any...`: any arguments you should pass to the `Imputor` constructor. +* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) +""" +function impute!(data, method::Symbol, args...; limit::Float64=0.1) + Base.depwarn( + """ + impute!(data, method) is deprecated. + Please use Impute.method!(data) or impute!(imputor, data). + """, + :impute! + ) + imputor_type = imputation_methods[method] + imputor = if length(args) > 0 + imputor_type(args...; context=Context(; limit=limit)) + else + imputor_type(; context=Context(; limit=limit)) + end + + return impute!(imputor, data) +end + +""" + impute!(data, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) + +Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call +`impute!(imputor::Imputor, ctx::Context, data)` with them. + +# Arguments +* `data`: the datset containing missing elements we should impute. 
+* `missing::Function`: the missing data function to use +* `method::Symbol`: the imputation method to use + (options: [`:drop`, `:fill`, `:interp`, `:locf`, `:nocb`]) +* `args::Any...`: any arguments you should pass to the `Imputor` constructor. +* `limit::Float64`: missing data ratio limit/threshold (default: 0.1) +""" +function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) + Base.depwarn( + """ + impute!(data, missing, method) is deprecated. Please use impute!(imputor, data). + """, + :impute! + ) + imputor_type = imputation_methods[method] + imputor = if length(args) > 0 + imputor_type(args...; context=Context(; limit=limit)) + else + imputor_type(; context=Context(; limit=limit)) + end + + return impute!(imputor, data) +end + +""" + impute(data, args...; kwargs...) + +Copies the `data` before calling `impute!(new_data, args...; kwargs...)` +""" +function impute(data, args...; kwargs...) + Base.depwarn( + """ + impute(data, args...; kwargs...) is deprecated. + Please use Impute.method(data) or impute(imputor, data). + """, + :impute + ) + return impute!(deepcopy(data), args...; kwargs...) +end + +################################# +# Deprecate the chain functions # +################################# +""" + chain!(data, missing::Function, imputors::Imputor...; kwargs...) + +Creates a `Chain` with `imputors` and calls `impute!(imputor, missing, data; kwargs...)` +""" +function chain!(data, missing::Function, imputors::Imputor...; kwargs...) + Base.depwarn( + """ + chain!(data, missing, imputors...) is deprecated. + Please use data = imp1(data) |> imp2 |> imp3 + """, + :chain! + ) + return chain!(data, imputors...; is_missing=missing, kwargs...) +end + +""" + chain!(data, imputors::Imputor...; kwargs...) + +Creates a `Chain` with `imputors` and calls `impute!(imputor, data; kwargs...)` +""" +function chain!(data, imputors::Imputor...; kwargs...) + Base.depwarn( + """ + chain!(data, imputors...) is deprecated. 
+ Please use data = imp1(data) |> imp2 |> imp3 + """, + :chain! + ) + ctx = Context(; kwargs...) + + for imputor in imputors + imp = typeof(imputor)( + (isa(x, AbstractContext) ? ctx : x for x in fieldvalues(imputor))... + ) + data = impute!(imp, data) + end + + return data +end + +""" + chain(data, args...; kwargs...) + +Copies the `data` before calling `chain!(data, args...; kwargs...)` +""" +function chain(data, args...; kwargs...) + Base.depwarn( + """ + chain(data, args...) is deprecated. + Please use result = imp1(data) |> imp2 |> imp3 + """, + :chain + ) + result = deepcopy(data) + return chain!(result, args...; kwargs...) +end + +##################### +# Misc Deprecations # +##################### +Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) From 0b2bbe7eabc2084f30aafe44aeac11c98a95aefc Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 5 Jul 2019 15:45:35 -0500 Subject: [PATCH 08/34] Added a deprecation for switching to the column-major convention. --- src/Impute.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Impute.jl b/src/Impute.jl index 5ada2fd..e8e143c 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -25,6 +25,13 @@ function __init__() If you depend on a specific threshold please pass in an appropriate `AbstractContext`. """ ) + + @warn( + """ + All matrix imputation methods will be switching to the JuliaStats column-major convention + (e.g., each column corresponds to an observation, and each row corresponds to a variable). + """ + ) end """ From 1f99bbd1886543238af6823cd7e1ab93410f91bc Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 5 Jul 2019 17:20:13 -0500 Subject: [PATCH 09/34] Updated tests to new API and moved existing deprecated tests to a different file.
--- src/Impute.jl | 8 +-- src/context.jl | 16 ----- src/deprecated.jl | 23 +++++++ test/deprecated.jl | 152 +++++++++++++++++++++++++++++++++++++++++++++ test/runtests.jl | 90 ++++++++++++++------------- 5 files changed, 226 insertions(+), 63 deletions(-) create mode 100644 test/deprecated.jl diff --git a/src/Impute.jl b/src/Impute.jl index e8e143c..7c5c062 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -69,10 +69,10 @@ let # NOTE: The @eval begin - $f(data; kwargs...) = impute($typename(; context=Context(Dict(kwargs...))), data) - $f!(data; kwargs...) = impute!($typename(; context=Context(Dict(kwargs...))), data) - $f(; kwargs...) = data -> impute($typename(; context=Context(Dict(kwargs...))), data) - $f!(; kwargs...) = data -> impute!($typename(; context=Context(Dict(kwargs...))), data) + $f(data; kwargs...) = impute($typename(; _extract_context_kwargs(kwargs...)...), data) + $f!(data; kwargs...) = impute!($typename(; _extract_context_kwargs(kwargs...)...), data) + $f(; kwargs...) = data -> impute($typename(; _extract_context_kwargs(kwargs...)...), data) + $f!(; kwargs...) = data -> impute!($typename(; _extract_context_kwargs(kwargs...)...), data) end end end diff --git a/src/context.jl b/src/context.jl index 68b2000..6645174 100644 --- a/src/context.jl +++ b/src/context.jl @@ -108,22 +108,6 @@ function Context(; Context(0, 0, limit, is_missing, on_complete) end -# The constructor only exists for legacy reasons -# We should drop this when we're ready to stop accepting limit in -# arbitrary impute functions. -function Context(d::Dict) - if haskey(d, :context) - return d[:context] - else haskey(d, :limit) - return Context(; - # We using a different default limit value here for legacy reason. 
- limit=get(d, :limit, 1.0), - is_missing=get(d, :is_missing, ismissing), - on_complete=get(d, :on_complete, complete), - ) - end -end - function (ctx::Context)(f::Function) _ctx = copy(ctx) _ctx.num = 0 diff --git a/src/deprecated.jl b/src/deprecated.jl index 29a6ea0..0060035 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -156,3 +156,26 @@ end # Misc Deprecations # ##################### Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) + +# This function is just used to support legacy behaviour and should be removed in a +# future release when we dropping accepting the limit kwarg to impute functions. +function _extract_context_kwargs(kwargs...) + d = Dict(kwargs...) + limit = 1.0 + + if haskey(d, :limit) + warn( + "Passing `limit` directly to impute functions is deprecated. " * + "Please pass a `context` in the future." + ) + + limit = d[:limit] + delete!(d, :limit) + end + + if !haskey(d, :context) + d[:context] = Context(; limit=limit) + end + + return d +end diff --git a/test/deprecated.jl b/test/deprecated.jl new file mode 100644 index 0000000..8c80832 --- /dev/null +++ b/test/deprecated.jl @@ -0,0 +1,152 @@ +@testset "deprecated" begin + a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) + a[[2, 3, 7]] .= missing + mask = map(!ismissing, a) + + @testset "Drop" begin + result = impute(a, :drop; limit=0.2) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) + + @test result == expected + end + + @testset "Interpolate" begin + result = impute(a, :interp; limit=0.2) + @test result == collect(1.0:1.0:20) + @test result == interp(a) + + # Test interpolation between identical points + b = ones(Union{Float64, Missing}, 20) + b[[2, 3, 7]] .= missing + @test interp(b) == ones(Union{Float64, Missing}, 20) + + # Test interpolation at endpoints + b = ones(Union{Float64, Missing}, 20) + b[[1, 3, 20]] .= missing + result = interp(b) + @test ismissing(result[1]) + @test ismissing(result[20]) + end + + @testset "Fill" begin + @testset "Value" 
begin + fill_val = -1.0 + result = impute(a, :fill, fill_val; limit=0.2) + expected = copy(a) + expected[[2, 3, 7]] .= fill_val + + @test result == expected + end + + @testset "Mean" begin + result = impute(a, :fill; limit=0.2) + expected = copy(a) + expected[[2, 3, 7]] .= mean(a[mask]) + + @test result == expected + end + end + + @testset "LOCF" begin + result = impute(a, :locf; limit=0.2) + expected = copy(a) + expected[2] = 1.0 + expected[3] = 1.0 + expected[7] = 6.0 + + @test result == expected + end + + @testset "NOCB" begin + result = impute(a, :nocb; limit=0.2) + expected = copy(a) + expected[2] = 4.0 + expected[3] = 4.0 + expected[7] = 8.0 + + @test result == expected + end + + @testset "DataFrame" begin + data = dataset("boot", "neuro") + df = impute(data, :interp; limit=1.0) + end + + @testset "Matrix" begin + data = Matrix(dataset("boot", "neuro")) + + @testset "Drop" begin + result = Iterators.drop(data) + @test size(result, 1) == 4 + end + + @testset "Fill" begin + result = impute(data, :fill, 0.0; limit=1.0) + @test size(result) == size(data) + end + end + + @testset "Not enough data" begin + @test_throws ImputeError impute(a, :drop) + end + + @testset "Chain" begin + orig = dataset("boot", "neuro") + + @testset "DataFrame" begin + result = chain( + orig, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test !any(ismissing, Matrix(result)) + end + + @testset "Column Table" begin + data = Tables.columntable(orig) + result = chain( + data, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) |> Tables.matrix + + @test size(result) == size(orig) + # Confirm that we don't have any more missing values + @test !any(ismissing, result) + end + + @testset "Matrix" begin + data = Matrix(orig) + result = chain( + data, + Impute.Interpolate(), + Impute.LOCF(), + Impute.NOCB(); + limit=1.0 + ) + + @test size(result) == 
size(data) + # Confirm that we don't have any more missing values + @test !any(ismissing, result) + end + end + + @testset "Alternate missing functions" begin + data1 = dataset("boot", "neuro") # Missing values with `missing` + data2 = impute(data1, :fill, NaN; limit=1.0) # Missing values with `NaN` + + @test impute(data1, :drop; limit=1.0) == dropmissing(data1) + + result1 = chain(data1, Impute.Interpolate(), Impute.Drop(); limit=1.0) + result2 = chain(data2, isnan, Impute.Interpolate(), Impute.Drop(); limit=1.0) + @test result1 == result2 + end +end diff --git a/test/runtests.jl b/test/runtests.jl index c1f954a..c5b7f55 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,35 +6,37 @@ using RDatasets using Statistics using StatsBase -import Impute: Drop, Context, WeightedContext +import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, ImputeError @testset "Impute" begin a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) a[[2, 3, 7]] .= missing mask = map(!ismissing, a) + ctx = Context(; limit=0.2) @testset "Drop" begin - result = impute(a, :drop; limit=0.2) + result = impute(Drop(; context=ctx), a) expected = copy(a) deleteat!(expected, [2, 3, 7]) @test result == expected + @test result == Impute.drop(a; context=ctx) end @testset "Interpolate" begin - result = impute(a, :interp; limit=0.2) + result = impute(Interpolate(; context=ctx), a) @test result == collect(1.0:1.0:20) - @test result == interp(a) + @test result == interp(a; context=ctx) # Test interpolation between identical points b = ones(Union{Float64, Missing}, 20) b[[2, 3, 7]] .= missing - @test interp(b) == ones(Union{Float64, Missing}, 20) + @test interp(b; context=ctx) == ones(Union{Float64, Missing}, 20) # Test interpolation at endpoints b = ones(Union{Float64, Missing}, 20) b[[1, 3, 20]] .= missing - result = interp(b) + result = interp(b; context=ctx) @test ismissing(result[1]) @test ismissing(result[20]) end @@ -42,76 +44,81 @@ import Impute: Drop, Context, 
WeightedContext @testset "Fill" begin @testset "Value" begin fill_val = -1.0 - result = impute(a, :fill, fill_val; limit=0.2) + result = impute(Fill(; value=fill_val, context=ctx), a) expected = copy(a) expected[[2, 3, 7]] .= fill_val @test result == expected + @test result == Impute.fill(a; value=fill_val, context=ctx) end @testset "Mean" begin - result = impute(a, :fill; limit=0.2) + result = impute(Fill(; value=mean, context=ctx), a) expected = copy(a) expected[[2, 3, 7]] .= mean(a[mask]) @test result == expected + @test result == Impute.fill(a; value=mean, context=ctx) end end @testset "LOCF" begin - result = impute(a, :locf; limit=0.2) + result = impute(LOCF(; context=ctx), a) expected = copy(a) expected[2] = 1.0 expected[3] = 1.0 expected[7] = 6.0 @test result == expected + @test result == Impute.locf(a; context=ctx) end @testset "NOCB" begin - result = impute(a, :nocb; limit=0.2) + result = impute(NOCB(; context=ctx), a) expected = copy(a) expected[2] = 4.0 expected[3] = 4.0 expected[7] = 8.0 @test result == expected + @test result == Impute.nocb(a; context=ctx) end @testset "DataFrame" begin + ctx = Context(; limit=1.0) data = dataset("boot", "neuro") - df = impute(data, :interp; limit=1.0) + df = impute(Interpolate(; context=ctx), data) end @testset "Matrix" begin + ctx = Context(; limit=1.0) data = Matrix(dataset("boot", "neuro")) @testset "Drop" begin - result = Iterators.drop(data) + result = impute(Drop(; context=ctx), data) @test size(result, 1) == 4 + @test result == Impute.drop(data; context=ctx) end @testset "Fill" begin - result = impute(data, :fill, 0.0; limit=1.0) + result = impute(Fill(; value=0.0, context=ctx), data) @test size(result) == size(data) + @test result == Impute.fill(data; value=0.0, context=ctx) end end @testset "Not enough data" begin - @test_throws ImputeError impute(a, :drop) + ctx = Context(; limit=0.1) + @test_throws ImputeError impute(Drop(; context=ctx), a) + @test_throws ImputeError Impute.drop(a; context=ctx) end @testset 
"Chain" begin orig = dataset("boot", "neuro") + ctx = Context(; limit=1.0) @testset "DataFrame" begin - result = chain( - orig, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) + result = Impute.interp(orig; context=ctx) |> Impute.locf() |> Impute.nocb() @test size(result) == size(orig) # Confirm that we don't have any more missing values @@ -119,14 +126,11 @@ import Impute: Drop, Context, WeightedContext end @testset "Column Table" begin - data = Tables.columntable(orig) - result = chain( - data, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) |> Tables.matrix + result = Tables.columntable(orig) |> + Impute.interp(; context=ctx) |> + Impute.locf() |> + Impute.nocb() |> + Tables.matrix @test size(result) == size(orig) # Confirm that we don't have any more missing values @@ -135,13 +139,7 @@ import Impute: Drop, Context, WeightedContext @testset "Matrix" begin data = Matrix(orig) - result = chain( - data, - Impute.Interpolate(), - Impute.LOCF(), - Impute.NOCB(); - limit=1.0 - ) + result = Impute.interp(data; context=ctx) |> Impute.locf() |> Impute.nocb() @test size(result) == size(data) # Confirm that we don't have any more missing values @@ -150,20 +148,24 @@ import Impute: Drop, Context, WeightedContext end @testset "Alternate missing functions" begin - data1 = dataset("boot", "neuro") # Missing values with `missing` - data2 = impute(data1, :fill, NaN; limit=1.0) # Missing values with `NaN` + ctx1 = Context(; limit=1.0) + ctx2 = Context(; limit=1.0, is_missing=isnan) + data1 = dataset("boot", "neuro") # Missing values with `missing` + data2 = Impute.fill(data1; value=NaN, context=ctx1) # Missing values with `NaN` - @test impute(data1, :drop; limit=1.0) == dropmissing(data1) + @test Impute.drop(data1; context=ctx1) == dropmissing(data1) + + result1 = Impute.interp(data1; context=ctx1) |> Impute.drop() + result2 = Impute.interp(data2; context=ctx2) |> Impute.drop(; context=ctx2) - result1 = chain(data1, 
Impute.Interpolate(), Impute.Drop(); limit=1.0) - result2 = chain(data2, isnan, Impute.Interpolate(), Impute.Drop(); limit=1.0) @test result1 == result2 end @testset "Contexts" begin @testset "Base" begin - @test_throws ImputeError impute(a, :drop; limit=0.1) - @test_throws ImputeError impute(Drop(), Context(; limit=0.1), a) + ctx = Context(; limit=0.1) + @test_throws ImputeError Impute.drop(a; context=ctx) + @test_throws ImputeError impute(Drop(; context=ctx), a) end @testset "Weighted" begin @@ -182,4 +184,6 @@ import Impute: Drop, Context, WeightedContext @test_throws ImputeError impute(Drop(), ctx, a) end end + + include("deprecated.jl") end From 20f084eeb49e07b5960273485a620a13b1aca560 Mon Sep 17 00:00:00 2001 From: rofinn Date: Sun, 7 Jul 2019 13:07:02 -0500 Subject: [PATCH 10/34] Added some more tests for Chain and mutating methods. --- src/deprecated.jl | 8 +++---- test/deprecated.jl | 22 ++++++++++++++++++++ test/runtests.jl | 52 +++++++++++++++++++++++++++++++++++++++------- 3 files changed, 71 insertions(+), 11 deletions(-) diff --git a/src/deprecated.jl b/src/deprecated.jl index 0060035..26dab75 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -67,9 +67,9 @@ function impute!(data, missing::Function, method::Symbol, args...; limit::Float6 ) imputor_type = imputation_methods[method] imputor = if length(args) > 0 - imputor_type(args...; context=Context(; limit=limit)) + imputor_type(args...; context=Context(; is_missing=missing, limit=limit)) else - imputor_type(; context=Context(; limit=limit)) + imputor_type(; context=Context(; is_missing=missing, limit=limit)) end return impute!(imputor, data) @@ -160,11 +160,11 @@ Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) # This function is just used to support legacy behaviour and should be removed in a # future release when we dropping accepting the limit kwarg to impute functions. function _extract_context_kwargs(kwargs...) - d = Dict(kwargs...) + d = Dict{Symbol, Any}(kwargs...) 
limit = 1.0 if haskey(d, :limit) - warn( + @warn( "Passing `limit` directly to impute functions is deprecated. " * "Please pass a `context` in the future." ) diff --git a/test/deprecated.jl b/test/deprecated.jl index 8c80832..6c93d43 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -9,6 +9,11 @@ deleteat!(expected, [2, 3, 7]) @test result == expected + + # Mutating method + a2 = copy(a) + Impute.drop!(a2; limit=0.2) + @test a2 == expected end @testset "Interpolate" begin @@ -16,6 +21,11 @@ @test result == collect(1.0:1.0:20) @test result == interp(a) + # Test in-place method + a2 = copy(a) + Impute.interp!(a2; limit=0.2) + @test a2 == result + # Test interpolation between identical points b = ones(Union{Float64, Missing}, 20) b[[2, 3, 7]] .= missing @@ -45,6 +55,10 @@ expected[[2, 3, 7]] .= mean(a[mask]) @test result == expected + + a2 = copy(a) + Impute.fill!(a2; limit=0.2) + @test a2 == result end end @@ -56,6 +70,9 @@ expected[7] = 6.0 @test result == expected + a2 = copy(a) + impute!(a2, :locf; limit=0.2) + @test a2 == result end @testset "NOCB" begin @@ -66,6 +83,9 @@ expected[7] = 8.0 @test result == expected + a2 = copy(a) + Impute.nocb!(a2; limit=0.2) + @test a2 == result end @testset "DataFrame" begin @@ -148,5 +168,7 @@ result1 = chain(data1, Impute.Interpolate(), Impute.Drop(); limit=1.0) result2 = chain(data2, isnan, Impute.Interpolate(), Impute.Drop(); limit=1.0) @test result1 == result2 + + @test Impute.drop(data1; limit=1.0) == impute(data2, isnan, :drop; limit=1.0) end end diff --git a/test/runtests.jl b/test/runtests.jl index c5b7f55..085d643 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -21,6 +21,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @test result == expected @test result == Impute.drop(a; context=ctx) + + a2 = copy(a) + Impute.drop!(a2; context=ctx) + @test a2 == expected end @testset "Interpolate" begin @@ -28,6 +32,11 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, 
Context, WeightedContext, Im @test result == collect(1.0:1.0:20) @test result == interp(a; context=ctx) + # Test in-place method + a2 = copy(a) + Impute.interp!(a2; context=ctx) + @test a2 == result + # Test interpolation between identical points b = ones(Union{Float64, Missing}, 20) b[[2, 3, 7]] .= missing @@ -59,6 +68,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @test result == expected @test result == Impute.fill(a; value=mean, context=ctx) + + a2 = copy(a) + Impute.fill!(a2; context=ctx) + @test a2 == result end end @@ -71,6 +84,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @test result == expected @test result == Impute.locf(a; context=ctx) + + a2 = copy(a) + Impute.locf!(a2; context=ctx) + @test a2 == result end @testset "NOCB" begin @@ -82,6 +99,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @test result == expected @test result == Impute.nocb(a; context=ctx) + + a2 = copy(a) + Impute.nocb!(a2; context=ctx) + @test a2 == result end @testset "DataFrame" begin @@ -104,6 +125,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im result = impute(Fill(; value=0.0, context=ctx), data) @test size(result) == size(data) @test result == Impute.fill(data; value=0.0, context=ctx) + + data2 = copy(data) + Impute.fill!(data2; value=0.0, context=ctx) + @test data2 == result end end @@ -118,18 +143,31 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im ctx = Context(; limit=1.0) @testset "DataFrame" begin - result = Impute.interp(orig; context=ctx) |> Impute.locf() |> Impute.nocb() + result = Impute.interp(orig; context=ctx) |> Impute.locf!() |> Impute.nocb!() @test size(result) == size(orig) # Confirm that we don't have any more missing values @test !any(ismissing, Matrix(result)) + + + # We can also use the Chain type with explicit Imputor types + result2 = impute( + Impute.Chain( + 
Impute.Interpolate(; context=ctx), + Impute.LOCF(), + Impute.NOCB() + ), + orig, + ) + + @test result == result2 end @testset "Column Table" begin result = Tables.columntable(orig) |> - Impute.interp(; context=ctx) |> - Impute.locf() |> - Impute.nocb() |> + Impute.interp!(; context=ctx) |> + Impute.locf!() |> + Impute.nocb!() |> Tables.matrix @test size(result) == size(orig) @@ -139,7 +177,7 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @testset "Matrix" begin data = Matrix(orig) - result = Impute.interp(data; context=ctx) |> Impute.locf() |> Impute.nocb() + result = Impute.interp(data; context=ctx) |> Impute.locf!() |> Impute.nocb!() @test size(result) == size(data) # Confirm that we don't have any more missing values @@ -155,8 +193,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @test Impute.drop(data1; context=ctx1) == dropmissing(data1) - result1 = Impute.interp(data1; context=ctx1) |> Impute.drop() - result2 = Impute.interp(data2; context=ctx2) |> Impute.drop(; context=ctx2) + result1 = Impute.interp(data1; context=ctx1) |> Impute.drop!() + result2 = Impute.interp(data2; context=ctx2) |> Impute.drop!(; context=ctx2) @test result1 == result2 end From aedd1abfe9fe14512c89100333fa453e7fb7099d Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 8 Jul 2019 16:57:41 -0500 Subject: [PATCH 11/34] Introduce dropobs and dropvars and deprecate Drop. 
--- src/Impute.jl | 4 +- src/deprecated.jl | 1 + src/imputors/drop.jl | 118 ++++++++++++++++++++++++++++++++++++++----- test/runtests.jl | 87 +++++++++++++++++++++++-------- 4 files changed, 176 insertions(+), 34 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index 7c5c062..e72a9fd 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -52,7 +52,9 @@ include("context.jl") include("imputors.jl") const global imputation_methods = Dict{Symbol, Type}( - :drop => Drop, + :drop => DropObs, + :dropobs => DropObs, + :dropvars => DropVars, :interp => Interpolate, :fill => Fill, :locf => LOCF, diff --git a/src/deprecated.jl b/src/deprecated.jl index 26dab75..a5174df 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -156,6 +156,7 @@ end # Misc Deprecations # ##################### Base.@deprecate Fill(val; kwargs...) Fill(; value=val, kwargs...) +Base.@deprecate_binding Drop DropObs false # This function is just used to support legacy behaviour and should be removed in a # future release when we dropping accepting the limit kwarg to impute functions. diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 1e44355..8f02b88 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -1,5 +1,5 @@ """ - Drop <: Imputor + DropObs <: Imputor Removes missing values from the `AbstractArray` or `Tables.table` provided. @@ -7,33 +7,33 @@ Removes missing values from the `AbstractArray` or `Tables.table` provided. * `context::AbstractContext`: A context which keeps track of missing data summary information """ -struct Drop <: Imputor +struct DropObs <: Imputor context::AbstractContext end -"""Drop(; context=Context()) -> Drop""" -Drop(; context=Context()) = Drop(context) +"""DropObs(; context=Context()) -> DropObs""" +DropObs(; context=Context()) = DropObs(context) """ - impute!(imp::Drop, data::AbstractVector) + impute!(imp::DropObs, data::AbstractVector) Uses `filter!` to remove missing elements from the array. 
# Arguments -* `imp::Drop`: this `Imputor` method +* `imp::DropObs`: this `Imputor` method * `data::AbstractVector`: the data to impute # Returns * `AbstractVector`: our data array with missing elements removed """ -function impute!(imp::Drop, data::AbstractVector) +function impute!(imp::DropObs, data::AbstractVector) imp.context() do c filter!(x -> !ismissing(c, x), data) end end """ - impute!(imp::Drop, data::AbstractMatrix) + impute!(imp::DropObs, data::AbstractMatrix) Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the `data` with those rows removed. Unfortunately, the mask approach requires copying the matrix. @@ -46,13 +46,13 @@ NOTES (or premature optimizations): 3. reshaping the data back to the desired shape. # Arguments -* `imp::Drop`: this `Imputor` method +* `imp::DropObs`: this `Imputor` method * `data::AbstractMatrix`: the data to impute # Returns * `AbstractMatrix`: a new matrix with missing rows removed """ -function impute!(imp::Drop, data::AbstractMatrix) +function impute!(imp::DropObs, data::AbstractMatrix) imp.context() do c mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1)) return data[mask, :] @@ -60,18 +60,18 @@ function impute!(imp::Drop, data::AbstractMatrix) end """ - impute!(imp::Drop, table) + impute!(imp::DropObs, table) Finds the missing rows in the table and deletes them. # Arguments -* `imp::Drop`: this `Imputor` method +* `imp::DropObs`: this `Imputor` method * `table`: a type that implements the Tables API. # Returns * our data with the missing rows removed. """ -function impute!(imp::Drop, table) +function impute!(imp::DropObs, table) imp.context() do c @assert istable(table) rows = Tables.rows(table) @@ -85,3 +85,95 @@ function impute!(imp::Drop, table) return table end end + + +""" + DropVars <: Imputor + + +Removes missing values from the `AbstractArray` or `Tables.table` provided. 
+ +# Fields +* `context::AbstractContext`: A context which keeps track of missing data + summary information +""" +struct DropVars <: Imputor + context::AbstractContext +end + +"""DropVars(; context=Context()) -> DropVars""" +DropVars(; context=Context()) = DropVars(context) + +""" + impute!(imp::DropVars, data::AbstractMatrix) + +Finds columns in the matrix with too many missing values and uses a mask (Vector{Bool}) to +return the `data` with those columns removed. Unfortunately, the mask approach +requires copying the matrix. + +# Arguments +* `imp::DropVars`: this `Imputor` method +* `data::AbstractMatrix`: the data to impute + +# Returns +* `AbstractMatrix`: a new matrix with missing columns removed +""" +function impute!(imp::DropVars, data::AbstractMatrix) + mask = map(1:size(data, 2)) do i + try + imp.context() do c + for j in 1:size(data, 1) + ismissing(c, data[j, i]) + end + end + return true + catch e + if isa(e, ImputeError) + return false + else + rethrow(e) + end + end + end + + data = data[:, mask] + return data +end + +""" + impute!(imp::DropVars, table) + +Find remove columns in the table with too many missing elements. + +# Arguments +* `imp::DropVars`: this `Imputor` method +* `table`: a type that implements the Tables API. + +# Returns +* our data with the missing columns removed. +""" +function impute!(imp::DropVars, table) + @assert istable(table) + cols = Tables.columns(table) + + cnames = Iterators.filter(propertynames(cols)) do cname + try + imp.context() do c + col = getproperty(cols, cname) + for i in 1:length(col) + ismissing(c, col[i]) + end + end + return true + catch e + if isa(e, ImputeError) + return false + else + rethrow(e) + end + end + end + + table = Tables.select(table, cnames...) 
|> materializer(table) + return table +end diff --git a/test/runtests.jl b/test/runtests.jl index 085d643..7d064a6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,7 +6,17 @@ using RDatasets using Statistics using StatsBase -import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, ImputeError +import Impute: + Drop, + DropObs, + DropVars, + Interpolate, + Fill, + LOCF, + NOCB, + Context, + WeightedContext, + ImputeError @testset "Impute" begin a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) @@ -15,16 +25,53 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im ctx = Context(; limit=0.2) @testset "Drop" begin - result = impute(Drop(; context=ctx), a) - expected = copy(a) - deleteat!(expected, [2, 3, 7]) + @testset "DropObs" begin + result = impute(DropObs(; context=ctx), a) + expected = copy(a) + deleteat!(expected, [2, 3, 7]) - @test result == expected - @test result == Impute.drop(a; context=ctx) + @test result == expected + @test result == Impute.dropobs(a; context=ctx) - a2 = copy(a) - Impute.drop!(a2; context=ctx) - @test a2 == expected + a2 = copy(a) + Impute.dropobs!(a2; context=ctx) + @test a2 == expected + end + @testset "DropVars" begin + @testset "Matrix" begin + m = reshape(a, 5, 4) + + result = impute(DropVars(; context=ctx), m) + expected = copy(m)[:, 2:4] + + @test isequal(result, expected) + @test isequal(result, Impute.dropvars(m; context=ctx)) + + Impute.dropvars!(m; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original matrix + @test_broken isequal(m, expected) + end + @testset "DataFrame" begin + df = DataFrame( + :sin => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)), + :cos => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)), + ) + df.sin[[2, 3, 7, 12, 19]] .= missing + df.cos[[4, 9]] .= missing + + result = impute(DropVars(; context=ctx), df) + expected = df[[:cos]] + + @test isequal(result, expected) + @test isequal(result, 
Impute.dropvars(df; context=ctx)) + + Impute.dropvars!(df; context=ctx) + # The mutating test is broken because we need to making a copy of + # the original table + @test_broken isequal(df, expected) + end + end end @testset "Interpolate" begin @@ -116,9 +163,9 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im data = Matrix(dataset("boot", "neuro")) @testset "Drop" begin - result = impute(Drop(; context=ctx), data) + result = impute(DropObs(; context=ctx), data) @test size(result, 1) == 4 - @test result == Impute.drop(data; context=ctx) + @test result == Impute.dropobs(data; context=ctx) end @testset "Fill" begin @@ -134,8 +181,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @testset "Not enough data" begin ctx = Context(; limit=0.1) - @test_throws ImputeError impute(Drop(; context=ctx), a) - @test_throws ImputeError Impute.drop(a; context=ctx) + @test_throws ImputeError impute(DropObs(; context=ctx), a) + @test_throws ImputeError Impute.dropobs(a; context=ctx) end @testset "Chain" begin @@ -191,10 +238,10 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im data1 = dataset("boot", "neuro") # Missing values with `missing` data2 = Impute.fill(data1; value=NaN, context=ctx1) # Missing values with `NaN` - @test Impute.drop(data1; context=ctx1) == dropmissing(data1) + @test Impute.dropobs(data1; context=ctx1) == dropmissing(data1) - result1 = Impute.interp(data1; context=ctx1) |> Impute.drop!() - result2 = Impute.interp(data2; context=ctx2) |> Impute.drop!(; context=ctx2) + result1 = Impute.interp(data1; context=ctx1) |> Impute.dropobs!() + result2 = Impute.interp(data2; context=ctx2) |> Impute.dropobs!(; context=ctx2) @test result1 == result2 end @@ -202,8 +249,8 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im @testset "Contexts" begin @testset "Base" begin ctx = Context(; limit=0.1) - @test_throws ImputeError Impute.drop(a; 
context=ctx) - @test_throws ImputeError impute(Drop(; context=ctx), a) + @test_throws ImputeError Impute.dropobs(a; context=ctx) + @test_throws ImputeError impute(DropObs(; context=ctx), a) end @testset "Weighted" begin @@ -211,7 +258,7 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im # because missing earlier observations is less important than later ones. ctx = WeightedContext(eweights(20, 0.3); limit=0.1) @test isa(ctx, WeightedContext) - result = impute(Drop(), ctx, a) + result = impute(DropObs(), ctx, a) expected = copy(a) deleteat!(expected, [2, 3, 7]) @test result == expected @@ -219,7 +266,7 @@ import Impute: Drop, Interpolate, Fill, LOCF, NOCB, Context, WeightedContext, Im # If we reverse the weights such that earlier observations are more important # then our previous limit of 0.2 won't be enough to succeed. ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2) - @test_throws ImputeError impute(Drop(), ctx, a) + @test_throws ImputeError impute(DropObs(), ctx, a) end end From 5f1f4d8d5f49fef20b7d9fe49ab570c2e210228c Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 8 Jul 2019 18:28:12 -0500 Subject: [PATCH 12/34] Add a test for broadcasted imputation over a groupby. 
--- test/runtests.jl | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 7d064a6..fba4481 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -154,8 +154,40 @@ import Impute: @testset "DataFrame" begin ctx = Context(; limit=1.0) - data = dataset("boot", "neuro") - df = impute(Interpolate(; context=ctx), data) + @testset "Single DataFrame" begin + data = dataset("boot", "neuro") + df = impute(Interpolate(; context=ctx), data) + @test isequal(df, Impute.interp(data; context=ctx)) + end + @testset "GroupedDataFrame" begin + hod = repeat(1:24, 12 * 10) + obj = repeat(1:12, 24 * 10) + n = length(hod) + + df = DataFrame( + :hod => hod, + :obj => obj, + :val => Vector{Union{Float64, Missing}}( + [sin(x) * cos(y) for (x, y) in zip(hod, obj)] + ), + ) + + df.val[rand(1:n, 20)] .= missing + gdf1 = groupby(deepcopy(df), [:hod, :obj]) + gdf2 = groupby(df, [:hod, :obj]) + + f1 = x -> Impute.interp(x; context=ctx) |> Impute.locf!() |> Impute.nocb!() + f2 = x -> Impute.interp!(x; context=ctx) |> Impute.locf!() |> Impute.nocb!() + + result = vcat(f1.(gdf1)...) + @test df != result + @test size(result) == (24 * 12 * 10, 3) + @test !any(ismissing, Tables.matrix(result)) + + # Test that we can also mutate the dataframe directly + f2.(gdf2) + @test result == sort(df, (:hod, :obj)) + end end @testset "Matrix" begin From e51217145a197e9394212f5d692b3faebe77951b Mon Sep 17 00:00:00 2001 From: rofinn Date: Tue, 9 Jul 2019 13:55:35 -0500 Subject: [PATCH 13/34] Review changes. 
--- src/Impute.jl | 40 +++++++++++------------ src/context.jl | 73 +++++++++++++++++++++++------------------- src/deprecated.jl | 6 ++-- src/imputors.jl | 7 ++-- src/imputors/drop.jl | 10 +++--- src/imputors/fill.jl | 8 +++-- src/imputors/interp.jl | 2 +- src/imputors/locf.jl | 16 +++++---- src/imputors/nocb.jl | 5 ++- 9 files changed, 91 insertions(+), 76 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index e72a9fd..6c63747 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -28,7 +28,7 @@ function __init__() @warn( """ - All matrix imputation methods will be switching to the JuliaStats column-major convention + All matrix imputation methods will be switching to the column-major convention (e.g., each column corresponds to an observation, and each row corresponds to a variable). """ ) @@ -51,31 +51,27 @@ Base.showerror(io::IO, err::ImputeError) = println(io, "ImputeError: $(err.msg)" include("context.jl") include("imputors.jl") -const global imputation_methods = Dict{Symbol, Type}( - :drop => DropObs, - :dropobs => DropObs, - :dropvars => DropVars, - :interp => Interpolate, - :fill => Fill, - :locf => LOCF, - :nocb => NOCB, +const global imputation_methods = ( + drop = DropObs, + dropobs = DropObs, + dropvars = DropVars, + interp = Interpolate, + fill = Fill, + locf = LOCF, + nocb = NOCB, ) include("deprecated.jl") -let - for (k, v) in imputation_methods - local typename = nameof(v) - local f = k - local f! = Symbol(k, :!) - - # NOTE: The - @eval begin - $f(data; kwargs...) = impute($typename(; _extract_context_kwargs(kwargs...)...), data) - $f!(data; kwargs...) = impute!($typename(; _extract_context_kwargs(kwargs...)...), data) - $f(; kwargs...) = data -> impute($typename(; _extract_context_kwargs(kwargs...)...), data) - $f!(; kwargs...) = data -> impute!($typename(; _extract_context_kwargs(kwargs...)...), data) - end +for (f, v) in pairs(imputation_methods) + typename = nameof(v) + f! = Symbol(f, :!) + + @eval begin + $f(data; kwargs...) 
= impute($typename(; _extract_context_kwargs(kwargs...)...), data) + $f!(data; kwargs...) = impute!($typename(; _extract_context_kwargs(kwargs...)...), data) + $f(; kwargs...) = data -> impute($typename(; _extract_context_kwargs(kwargs...)...), data) + $f!(; kwargs...) = data -> impute!($typename(; _extract_context_kwargs(kwargs...)...), data) end end diff --git a/src/context.jl b/src/context.jl index 6645174..2e2df5a 100644 --- a/src/context.jl +++ b/src/context.jl @@ -5,6 +5,10 @@ An imputation context records summary information about missing data for an impu """ abstract type AbstractContext end +# We implement a version of copy for all contexts which reconstructs the context from the +# raw fields. +Base.copy(ctx::T) where {T <: AbstractContext} = T(fieldvalues(ctx)...) + """ ismissing(ctx::AbstractContext, x) -> Bool @@ -18,23 +22,23 @@ exceeds our `ctx.limit` we throw an `ImputeError` * `x`: the value to check (may be an single values, abstract array or row) """ function Base.ismissing(ctx::AbstractContext, x) - missing = if isa(x, NamedTuple) - any(entry -> ctx.is_missing(entry[2]), pairs(x)) + was_missing = if isa(x, NamedTuple) + any(ctx.is_missing, Tuple(x)) elseif isa(x, AbstractArray) any(ctx.is_missing, x) else ctx.is_missing(x) end - missing_update!(ctx, missing) + missing_update!(ctx, was_missing) - return missing + return was_missing end """ findfirst(ctx::AbstractContext, data::AbstractVector) -> Int -Returns the first not missing index in `data`. +Returns the first non-missing index in `data`. # Arguments * `ctx::AbstractContext`: the context to pass into `ismissing` @@ -50,7 +54,7 @@ end """ findlast(ctx::AbstractContext, data::AbstractVector) -> Int -Returns the last not missing index in `data`. +Returns the last non-missing index in `data`. 
# Arguments * `ctx::AbstractContext`: the context to pass into `ismissing` @@ -66,7 +70,7 @@ end """ findnext(ctx::AbstractContext, data::AbstractVector) -> Int -Returns the next not missing index in `data`. +Returns the next non-missing index in `data`. # Arguments * `ctx::AbstractContext`: the context to pass into `ismissing` @@ -88,7 +92,7 @@ weighted. # Fields * `n::Int`: number of observations * `count::Int`: number of missing values found -* `limit::Float64`: allowable limit for missing values to impute +* `limit::Float64`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `is_missing::Function`: returns a Bool if the value counts as missing * `on_complete::Function`: a function to run when imputation is complete """ @@ -105,30 +109,26 @@ function Context(; is_missing::Function=ismissing, on_complete::Function=complete ) - Context(0, 0, limit, is_missing, on_complete) + return Context(0, 0, limit, is_missing, on_complete) end -function (ctx::Context)(f::Function) +function Base.empty(ctx::Context) _ctx = copy(ctx) _ctx.num = 0 _ctx.count = 0 - result = f(_ctx) - ctx.on_complete(_ctx) - return result + return _ctx end -Base.copy(x::Context) = Context(x.num, x.count, x.limit, x.is_missing, x.on_complete) - -function missing_update!(ctx::Context, miss) +function missing_update!(ctx::Context, was_missing) ctx.num += 1 - if miss + if was_missing ctx.count += 1 end end -function complete(ctx::Context) +function complete(ctx::Context, data) missing_ratio = ctx.count / ctx.num if missing_ratio > ctx.limit @@ -136,6 +136,8 @@ function complete(ctx::Context) "More than $(ctx.limit * 100)% of values were missing ($missing_ratio)." 
)) end + + return data end @@ -149,11 +151,11 @@ This context type can be useful if some missing observation are more important t # Fields * `num::Int`: number of observations * `s::Float64`: sum of missing values weights -* `limit::Float64`: allowable limit for missing values to impute +* `limit::Float64`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `is_missing::Function`: returns a Bool if the value counts as missing -* `on_complete::Function`: a function to run when imputation is complete +* `on_complete::Function`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance - of each observation + of each observation. Will be accumulated during imputation. """ mutable struct WeightedContext <: AbstractContext num::Int @@ -170,32 +172,26 @@ function WeightedContext( is_missing::Function=ismissing, on_complete::Function=complete ) - WeightedContext(0, 0.0, limit, is_missing, on_complete, wv) + return WeightedContext(0, 0.0, limit, is_missing, on_complete, wv) end -function (ctx::WeightedContext)(f::Function) +function Base.empty(ctx::WeightedContext) _ctx = copy(ctx) _ctx.num = 0 _ctx.s = 0.0 - result = f(_ctx) - ctx.on_complete(_ctx) - return result -end - -function Base.copy(x::WeightedContext) - WeightedContext(x.num, x.s, x.limit, x.is_missing, x.on_complete, x.wv) + return _ctx end -function missing_update!(ctx::WeightedContext, miss) +function missing_update!(ctx::WeightedContext, was_missing) ctx.num += 1 - if miss + if was_missing ctx.s += ctx.wv[ctx.num] end end -function complete(ctx::WeightedContext) +function complete(ctx::WeightedContext, data) missing_ratio = ctx.s / sum(ctx.wv) if missing_ratio > ctx.limit @@ -203,4 +199,15 @@ function complete(ctx::WeightedContext) "More than $(ctx.limit * 100)% of weighted values were missing ($missing_ratio)." 
)) end + + return data +end + +for T in [Context, WeightedContext] + @eval begin + function (ctx::$T)(f::Function) + _ctx = empty(ctx) + return ctx.on_complete(_ctx, f(_ctx)) + end + end end diff --git a/src/deprecated.jl b/src/deprecated.jl index a5174df..6f62a14 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -10,6 +10,7 @@ Base.@deprecate( impute!(imp::Imputor, context::AbstractContext, data; kwargs...), impute!(typeof(imp)(; context=context), data; kwargs...) ) + ##################################################################### # Deprecate all impute calls where the first argument is an Imputor # ##################################################################### @@ -88,6 +89,7 @@ function impute(data, args...; kwargs...) """, :impute ) + # Call `deepcopy` because we can trust that it's available for all types. return impute!(deepcopy(data), args...; kwargs...) end @@ -148,8 +150,8 @@ function chain(data, args...; kwargs...) """, :chain ) - result = deepcopy(data) - return chain!(data, args...; kwargs...) + # Call `deepcopy` because we can trust that it's available for all types. + return chain!(deepcopy(data), args...; kwargs...) end ##################### diff --git a/src/imputors.jl b/src/imputors.jl index 769926e..1ed2783 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -13,10 +13,11 @@ abstract type Imputor end """ impute(imp::Imputor, data) -Copies the `data` before calling the corresponding `impute!(imp, ...)` call. +Returns a new copy of the `data` with the missing data imputed by the imputor `imp`. """ function impute(imp::Imputor, data) - impute!(imp, deepcopy(data)) + # Call `deepcopy` because we can trust that it's available for all types. 
+ return impute!(imp, deepcopy(data)) end """ @@ -33,7 +34,7 @@ if this is not the desired behaviour custom imputor methods should overload this * `AbstractMatrix`: the input `data` with values imputed """ function impute!(imp::Imputor, data::AbstractMatrix) - for i in 1:size(data, 2) + for i in axes(data, 2) impute!(imp, view(data, :, i)) end return data diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 8f02b88..e15f88c 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -11,7 +11,6 @@ struct DropObs <: Imputor context::AbstractContext end -"""DropObs(; context=Context()) -> DropObs""" DropObs(; context=Context()) = DropObs(context) """ @@ -54,7 +53,9 @@ NOTES (or premature optimizations): """ function impute!(imp::DropObs, data::AbstractMatrix) imp.context() do c - mask = map(i -> !ismissing(c, data[i, :]), 1:size(data, 1)) + mask = map(axes(data, 1)) do i + !ismissing(c, view(data, i, :)) + end return data[mask, :] end end @@ -101,7 +102,6 @@ struct DropVars <: Imputor context::AbstractContext end -"""DropVars(; context=Context()) -> DropVars""" DropVars(; context=Context()) = DropVars(context) """ @@ -119,10 +119,10 @@ requires copying the matrix. * `AbstractMatrix`: a new matrix with missing columns removed """ function impute!(imp::DropVars, data::AbstractMatrix) - mask = map(1:size(data, 2)) do i + mask = map(axes(data, 2)) do i try imp.context() do c - for j in 1:size(data, 1) + for j in axes(data, 1) ismissing(c, data[j, i]) end end diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index fa68d31..ee94955 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -20,13 +20,15 @@ Fill(; value=mean, context=Context()) = Fill(value, context) """ impute!(imp::Fill, data::AbstractVector) -Computes the fill value if `imp.value` is a `Function` (i.e., `imp.value(drop(copy(data)))`) -and replaces all missing values in the `data` with that value. +Fill in missing values with a values determined by `imp.value`. 
+If `imp.value` is a function then the fill values calculated by invoking that function on +the collection of all nonmissing values. """ function impute!(imp::Fill, data::AbstractVector) imp.context() do c fill_val = if isa(imp.value, Function) - imp.value(Iterators.drop(copy(data); context=c)) + # Call `deepcopy` because we can trust that it's available for all types. + imp.value(Iterators.drop(deepcopy(data); context=c)) else imp.value end diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index a6c178a..21f81f8 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -23,7 +23,7 @@ function impute!(imp::Interpolate, data::AbstractVector{<:Union{T, Missing}}) wh imp.context() do c i = findfirst(c, data) + 1 - while i < length(data) + while i < lastindex(data) if ismissing(c, data[i]) prev_idx = i - 1 next_idx = findnext(c, data, i + 1) diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index 10ccb8b..56a7438 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -1,3 +1,12 @@ +""" + LOCF <: Imputor + +Last observation carried forward. Fill in missing values with the most recent +nonmissing value. + +See also: +- [NOCB](@ref): Next Observation Carried Backward +""" struct LOCF <: Imputor context::AbstractContext end @@ -14,16 +23,11 @@ existing observation. WARNING: missing elements at the head of the array may not be imputed if there is no existing observation to carry forward. As a result, this method does not guarantee that all missing values will be imputed. 
- -# Usage -``` - -``` """ function impute!(imp::LOCF, data::AbstractVector) imp.context() do c start_idx = findfirst(c, data) + 1 - for i in start_idx:length(data) + for i in start_idx:lastindex(data) if ismissing(c, data[i]) data[i] = data[i-1] end diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index 32690a1..9475ac0 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -2,6 +2,9 @@ NOCB <: Imputor Fills in missing data using the Next Observation Carried Backward (NOCB) approach. + +See also: +- [LOCF](@ref): Last Observation Carried Forward """ struct NOCB <: Imputor context::AbstractContext @@ -28,7 +31,7 @@ that all missing values will be imputed. function impute!(imp::NOCB, data::AbstractVector) imp.context() do c end_idx = findlast(c, data) - 1 - for i in end_idx:-1:1 + for i in end_idx:-1:firstindex(data) if ismissing(c, data[i]) data[i] = data[i+1] end From 83a4bf595e761c542e70670ee765fb40be89e620 Mon Sep 17 00:00:00 2001 From: rofinn Date: Tue, 9 Jul 2019 16:06:20 -0500 Subject: [PATCH 14/34] Introduce a vardim kwarg to make the column-major convention easier to handle. --- src/Impute.jl | 2 ++ src/imputors.jl | 25 +++++++++++++++++++++++-- src/imputors/drop.jl | 20 +++++++++----------- src/imputors/fill.jl | 3 ++- src/imputors/interp.jl | 3 ++- src/imputors/locf.jl | 3 ++- src/imputors/nocb.jl | 3 ++- test/runtests.jl | 5 +++++ 8 files changed, 47 insertions(+), 17 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index 6c63747..d1dbdcb 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -30,6 +30,8 @@ function __init__() """ All matrix imputation methods will be switching to the column-major convention (e.g., each column corresponds to an observation, and each row corresponds to a variable). + To maintain the existing behaviour please pass `vardim=2` to the `Imputor` constructors + or impute functions (e.g., `fill`, `interp`, `locf`). 
""" ) end diff --git a/src/imputors.jl b/src/imputors.jl index 1ed2783..6af333b 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -9,6 +9,27 @@ method. abstract type Imputor end +# A couple utility methods to avoid messing up var and obs dimensions +obsdim(imp::Imputor) = imp.vardim == 1 ? 2 : 1 +vardim(imp::Imputor) = imp.vardim + +function obswise(imp::Imputor, data::AbstractMatrix) + (imp.vardim == 1 ? view(data, :, i) : view(data, i, :) for i in axes(data, obsdim(imp))) +end + +function varwise(imp::Imputor, data::AbstractMatrix) + (imp.vardim == 1 ? view(data, i, :) : view(data, :, i) for i in axes(data, vardim(imp))) +end + +function filterobs(f::Function, imp::Imputor, data::AbstractMatrix) + mask = [f(x) for x in obswise(imp, data)] + return imp.vardim == 1 ? data[:, mask] : data[mask, :] +end + +function filtervars(f::Function, imp::Imputor, data::AbstractMatrix) + mask = [f(x) for x in varwise(imp, data)] + return imp.vardim == 1 ? data[mask, :] : data[:, mask] +end """ impute(imp::Imputor, data) @@ -34,8 +55,8 @@ if this is not the desired behaviour custom imputor methods should overload this * `AbstractMatrix`: the input `data` with values imputed """ function impute!(imp::Imputor, data::AbstractMatrix) - for i in axes(data, 2) - impute!(imp, view(data, :, i)) + for var in varwise(imp, data) + impute!(imp, var) end return data end diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index e15f88c..7bed526 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -8,10 +8,11 @@ Removes missing values from the `AbstractArray` or `Tables.table` provided. 
summary information """ struct DropObs <: Imputor + vardim::Int context::AbstractContext end -DropObs(; context=Context()) = DropObs(context) +DropObs(; vardim=2, context=Context()) = DropObs(vardim, context) """ impute!(imp::DropObs, data::AbstractVector) @@ -53,10 +54,9 @@ NOTES (or premature optimizations): """ function impute!(imp::DropObs, data::AbstractMatrix) imp.context() do c - mask = map(axes(data, 1)) do i - !ismissing(c, view(data, i, :)) + return filterobs(imp, data) do obs + !ismissing(c, obs) end - return data[mask, :] end end @@ -99,10 +99,11 @@ Removes missing values from the `AbstractArray` or `Tables.table` provided. summary information """ struct DropVars <: Imputor + vardim::Int context::AbstractContext end -DropVars(; context=Context()) = DropVars(context) +DropVars(; vardim=2, context=Context()) = DropVars(vardim, context) """ impute!(imp::DropVars, data::AbstractMatrix) @@ -119,11 +120,11 @@ requires copying the matrix. * `AbstractMatrix`: a new matrix with missing columns removed """ function impute!(imp::DropVars, data::AbstractMatrix) - mask = map(axes(data, 2)) do i + return filtervars(imp, data) do var try imp.context() do c - for j in axes(data, 1) - ismissing(c, data[j, i]) + for x in var + ismissing(c, x) end end return true @@ -135,9 +136,6 @@ function impute!(imp::DropVars, data::AbstractMatrix) end end end - - data = data[:, mask] - return data end """ diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index ee94955..582a8e4 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -11,11 +11,12 @@ Fills in the missing data with a specific value. 
""" struct Fill{T} <: Imputor value::T + vardim::Int context::AbstractContext end """Fill(; value=mean, context=Context()) -> Fill""" -Fill(; value=mean, context=Context()) = Fill(value, context) +Fill(; value=mean, vardim=2, context=Context()) = Fill(value, vardim, context) """ impute!(imp::Fill, data::AbstractVector) diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index 21f81f8..e8b4839 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -4,11 +4,12 @@ Performs linear interpolation between the nearest values in an vector. """ struct Interpolate <: Imputor + vardim::Int context::AbstractContext end """Interpolate(; context=Context()) -> Interpolate""" -Interpolate(; context=Context()) = Interpolate(context) +Interpolate(; vardim=2, context=Context()) = Interpolate(vardim, context) """ impute!(imp::Interpolate, data::AbstractVector) diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index 56a7438..19247b6 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -8,11 +8,12 @@ See also: - [NOCB](@ref): Next Observation Carried Backward """ struct LOCF <: Imputor + vardim::Int context::AbstractContext end """LOCF(; context=Context()) -> LOCF""" -LOCF(; context=Context()) = LOCF(context) +LOCF(; vardim=2, context=Context()) = LOCF(vardim, context) """ impute!(imp::LOCF, data::AbstractVector) diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index 9475ac0..3390991 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -7,11 +7,12 @@ See also: - [LOCF](@ref): Last Observation Carried Forward """ struct NOCB <: Imputor + vardim::Int context::AbstractContext end """NOCB(; context=Context()) -> NOCB""" -NOCB(; context=Context()) = NOCB(context) +NOCB(; vardim=2, context=Context()) = NOCB(vardim, context) """ impute!(imp::NOCB, data::AbstractVector) diff --git a/test/runtests.jl b/test/runtests.jl index fba4481..f1a972c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -46,6 +46,7 @@ import Impute: @test 
isequal(result, expected) @test isequal(result, Impute.dropvars(m; context=ctx)) + @test isequal(result', Impute.dropvars(m'; vardim=1, context=ctx)) Impute.dropvars!(m; context=ctx) # The mutating test is broken because we need to making a copy of @@ -192,12 +193,16 @@ import Impute: @testset "Matrix" begin ctx = Context(; limit=1.0) + expected = Matrix(Impute.dropobs(dataset("boot", "neuro"); context=ctx)) data = Matrix(dataset("boot", "neuro")) @testset "Drop" begin result = impute(DropObs(; context=ctx), data) @test size(result, 1) == 4 @test result == Impute.dropobs(data; context=ctx) + + @test result == expected + @test Impute.dropobs(data'; vardim=1, context=ctx) == expected' end @testset "Fill" begin From 7f90aad4be4aad16afcfe7358c29b479db5770cb Mon Sep 17 00:00:00 2001 From: rofinn Date: Tue, 9 Jul 2019 23:51:07 -0500 Subject: [PATCH 15/34] Cleanup docstrings and add jldoctests. --- .travis.yml | 25 ++--- docs/Project.toml | 9 ++ docs/make.jl | 2 +- docs/src/index.md | 10 ++ src/Impute.jl | 202 +++++++++++++++++++++++++++++++++++++++++ src/context.jl | 46 +++++----- src/imputors.jl | 43 ++++++++- src/imputors/drop.jl | 130 ++++++++++---------------- src/imputors/fill.jl | 42 +++++---- src/imputors/interp.jl | 36 ++++++-- src/imputors/locf.jl | 44 ++++++--- src/imputors/nocb.jl | 41 +++++---- 12 files changed, 451 insertions(+), 179 deletions(-) create mode 100644 docs/Project.toml diff --git a/.travis.yml b/.travis.yml index abafdbe..3e44a10 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,18 +17,13 @@ matrix: # - if [[ -a .git/shallow ]]; then git fetch --unshallow; fi # - julia -e 'Pkg.clone(pwd()); Pkg.build("Impute"); Pkg.test("Impute"; coverage=true)' after_success: - - | - julia -e ' - VERSION >= v"0.7.0-DEV.3656" && using Pkg - VERSION >= v"0.7.0-DEV.5183" || cd(Pkg.dir("Impute")) - Pkg.add("Coverage") - using Coverage - Codecov.submit(Codecov.process_folder()) - ' - - | - julia -e ' - VERSION >= v"0.7.0-DEV.3656" && using Pkg - VERSION >= 
v"0.7.0-DEV.5183" || cd(Pkg.dir("Impute")) - Pkg.add("Documenter") - include(joinpath("docs", "make.jl")) - ' + - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())' +jobs: + include: + - stage: "Documentation" + julia: 1.0 + os: linux + script: + - julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' + - julia --project=docs/ docs/make.jl + after_success: skip diff --git a/docs/Project.toml b/docs/Project.toml new file mode 100644 index 0000000..df82f6a --- /dev/null +++ b/docs/Project.toml @@ -0,0 +1,9 @@ +[deps] +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575" +RDatasets = "ce6b1742-4840-55fa-b093-852dadbb1d8b" + +[compat] +DataFrames = ">= 0.16" +Documenter = "~0.22" diff --git a/docs/make.jl b/docs/make.jl index 6c28e25..c634dcb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,4 +1,4 @@ -using Documenter, Impute, RDatasets +using Documenter, Impute makedocs( modules=[Impute], diff --git a/docs/src/index.md b/docs/src/index.md index 3afda96..48c5d72 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,4 +1,14 @@ # Impute + +```@meta +DocTestSetup = quote + @eval Main begin + using DataFrames + using Impute: Impute, DropObs, DropVars, Fill, Interpolate, NOCB, LOCF, Context, impute + end +end +``` + [![stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://invenia.github.io/Impute.jl/stable/) [![latest](https://img.shields.io/badge/docs-latest-blue.svg)](https://invenia.github.io/Impute.jl/latest/) [![Build Status](https://travis-ci.org/invenia/Impute.jl.svg?branch=master)](https://travis-ci.org/invenia/Impute.jl) diff --git a/src/Impute.jl b/src/Impute.jl index d1dbdcb..6ce0fcd 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -77,4 +77,206 @@ for (f, v) in pairs(imputation_methods) end end +@doc """ + Impute.dropobs(data; vardim=2, context=Context()) + 
+Removes missing observations from the `AbstractArray` or `Tables.table` provided. +See [DropObs](@ref) for details. + +# Example +``` +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.dropobs(df; vardim=1, context=Context(; limit=1.0)) +3×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64 │ Float64 │ +├─────┼─────────┼─────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 5.5 │ +``` +""" dropobs + +@doc """ + Impute.dropvars(data; vardim=2, context=Context()) + +Finds variables with too many missing values in a `AbstractMatrix` or `Tables.table` and +removes them from the input data. See [DropVars](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.dropvars(df; vardim=1, context=Context(; limit=0.2)) +5×1 DataFrames.DataFrame +│ Row │ b │ +│ │ Float64⍰ │ +├─────┼──────────┤ +│ 1 │ 1.1 │ +│ 2 │ 2.2 │ +│ 3 │ 3.3 │ +│ 4 │ missing │ +│ 5 │ 5.5 │ +``` +""" dropvars + +@doc """ + Impute.interp(data; vardim=2, context=Context()) + +Performs linear interpolation between the nearest values in an vector. +See [Interpolate](@ref) for details. 
+ +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.interp(df; vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 3.0 │ 3.3 │ +│ 4 │ 4.0 │ 4.4 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" interp + +@doc """ + Impute.fill(data; value=mean, vardim=2, context=Context()) + +Fills in the missing data with a specific value. See [Fill](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.fill(df; value=-1.0, vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ -1.0 │ 3.3 │ +│ 4 │ -1.0 │ -1.0 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" fill + +@doc """ + Impute.locf(data; vardim=2, context=Context()) + +Iterates forwards through the `data` and fills missing data with the last existing +observation. See [LOCF](@ref) for details. 
+ +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.locf(df; vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 2.0 │ 3.3 │ +│ 4 │ 2.0 │ 3.3 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" locf + +@doc """ + Impute.nocb(data; vardim=2, context=Context()) + +Iterates backwards through the `data` and fills missing data with the next existing +observation. See [LOCF](@ref) for details. + +# Example +```jldoctest +julia> using DataFrames; using Impute: Impute, Context + +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> Impute.nocb(df; vardim=1, context=Context(; limit=1.0)) +5×2 DataFrames.DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 5.0 │ 3.3 │ +│ 4 │ 5.0 │ 5.5 │ +│ 5 │ 5.0 │ 5.5 │ +``` +""" nocb + + end # module diff --git a/src/context.jl b/src/context.jl index 2e2df5a..e7bc9ee 100644 --- a/src/context.jl +++ b/src/context.jl @@ -83,27 +83,27 @@ function Base.findnext(ctx::AbstractContext, data::AbstractVector, idx::Int) return findnext(x -> !ismissing(ctx, x), data, idx) end +mutable struct Context <: AbstractContext + num::Int + count::Int + limit::Float64 + is_missing::Function + on_complete::Function +end + """ Context Records base information 
about the missing data and assume all observations are equally weighted. -# Fields +# Keyword Arguments * `n::Int`: number of observations * `count::Int`: number of missing values found * `limit::Float64`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `is_missing::Function`: returns a Bool if the value counts as missing * `on_complete::Function`: a function to run when imputation is complete """ -mutable struct Context <: AbstractContext - num::Int - count::Int - limit::Float64 - is_missing::Function - on_complete::Function -end - function Context(; limit::Float64=0.1, is_missing::Function=ismissing, @@ -141,31 +141,33 @@ function complete(ctx::Context, data) end +mutable struct WeightedContext <: AbstractContext + num::Int + s::Float64 + limit::Float64 + is_missing::Function + on_complete::Function + wv::AbstractWeights +end + """ - WeightedContext + WeightedContext(wv; limit=1.0, is_missing=ismissing, on_complete=complete) Records information about the missing data relative to a set of weights. This context type can be useful if some missing observation are more important than others (e.g., more recent observations in time series datasets) -# Fields +# Arguments +* `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance + of each observation. Will be accumulated during imputation. + +# Keyword Arguments * `num::Int`: number of observations * `s::Float64`: sum of missing values weights * `limit::Float64`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `is_missing::Function`: returns a Bool if the value counts as missing * `on_complete::Function`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). -* `wv::AbstractWeights`: a set of statistical weights to use when evaluating the importance - of each observation. Will be accumulated during imputation. 
""" -mutable struct WeightedContext <: AbstractContext - num::Int - s::Float64 - limit::Float64 - is_missing::Function - on_complete::Function - wv::AbstractWeights -end - function WeightedContext( wv::AbstractWeights; limit::Float64=1.0, diff --git a/src/imputors.jl b/src/imputors.jl index 6af333b..a9f3e07 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -6,7 +6,6 @@ New imputation methods are expected to sutype `Imputor` and, at minimum, implement the `impute!{T<:Any}(imp::, ctx::Context, data::AbstractArray{T, 1})` method. """ - abstract type Imputor end # A couple utility methods to avoid messing up var and obs dimensions @@ -44,7 +43,7 @@ end """ impute!(imp::Imputor, data::AbstractMatrix) -Imputes the data in a matrix by imputing the values 1 column at a time; +Imputes the data in a matrix by imputing the values 1 variable at a time; if this is not the desired behaviour custom imputor methods should overload this method. # Arguments @@ -53,6 +52,21 @@ if this is not the desired behaviour custom imputor methods should overload this # Returns * `AbstractMatrix`: the input `data` with values imputed + +# Example +```jldoctest +julia> using Impute: Interpolate, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), M) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 +``` """ function impute!(imp::Imputor, data::AbstractMatrix) for var in varwise(imp, data) @@ -73,6 +87,31 @@ if this is not the desired behaviour custom imputor methods should overload this # Returns * the input `data` with values imputed + +# Example +``jldoctest +julia> using DataFrames; using Impute: Interpolate, Context, impute +julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, 3.3, missing, 5.5]) +5×2 DataFrame +│ Row │ a │ b │ +│ │ 
Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ missing │ 3.3 │ +│ 4 │ missing │ missing │ +│ 5 │ 5.0 │ 5.5 │ + +julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), df) +5×2 DataFrame +│ Row │ a │ b │ +│ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┤ +│ 1 │ 1.0 │ 1.1 │ +│ 2 │ 2.0 │ 2.2 │ +│ 3 │ 3.0 │ 3.3 │ +│ 4 │ 4.0 │ 4.4 │ +│ 5 │ 5.0 │ 5.5 │ """ function impute!(imp::Imputor, table) @assert istable(table) diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 7bed526..819c292 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -1,57 +1,41 @@ -""" - DropObs <: Imputor - -Removes missing values from the `AbstractArray` or `Tables.table` provided. - -# Fields -* `context::AbstractContext`: A context which keeps track of missing data - summary information -""" struct DropObs <: Imputor vardim::Int context::AbstractContext end -DropObs(; vardim=2, context=Context()) = DropObs(vardim, context) - """ - impute!(imp::DropObs, data::AbstractVector) + DropObs(; vardim=2, context=Context()) + +Removes missing observations from the `AbstractArray` or `Tables.table` provided. + +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext=Context()`: A context which keeps track of missing data + summary information -Uses `filter!` to remove missing elements from the array. 
+# Example +```jldoctest +julia> using Impute: DropObs, Context, impute -# Arguments -* `imp::DropObs`: this `Imputor` method -* `data::AbstractVector`: the data to impute +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 -# Returns -* `AbstractVector`: our data array with missing elements removed +julia> impute(DropObs(; vardim=1, context=Context(; limit=1.0)), M) +2×3 Array{Union{Missing, Float64},2}: + 1.0 2.0 5.0 + 1.1 2.2 5.5 +``` """ +DropObs(; vardim=2, context=Context()) = DropObs(vardim, context) + function impute!(imp::DropObs, data::AbstractVector) imp.context() do c filter!(x -> !ismissing(c, x), data) end end -""" - impute!(imp::DropObs, data::AbstractMatrix) - -Finds the missing rows in the matrix and uses a mask (Vector{Bool}) to return the -`data` with those rows removed. Unfortunately, the mask approach requires copying the matrix. - -NOTES (or premature optimizations): -* We use `view`, but this will change the type of the `data` by returning a `SubArray` -* We might be able to do something clever by: - 1. reshaping the data to a vector - 2. running `deleteat!` for the appropriate indices and - 3. reshaping the data back to the desired shape. - -# Arguments -* `imp::DropObs`: this `Imputor` method -* `data::AbstractMatrix`: the data to impute - -# Returns -* `AbstractMatrix`: a new matrix with missing rows removed -""" function impute!(imp::DropObs, data::AbstractMatrix) imp.context() do c return filterobs(imp, data) do obs @@ -60,18 +44,10 @@ function impute!(imp::DropObs, data::AbstractMatrix) end end -""" - impute!(imp::DropObs, table) - -Finds the missing rows in the table and deletes them. - -# Arguments -* `imp::DropObs`: this `Imputor` method -* `table`: a type that implements the Tables API. +# Deleting elements from subarrays doesn't work so we need to collect that data into +# a separate array. 
+impute!(imp::DropObs, data::SubArray) = impute!(imp::DropObs, collect(data)) -# Returns -* our data with the missing rows removed. -""" function impute!(imp::DropObs, table) imp.context() do c @assert istable(table) @@ -88,37 +64,39 @@ function impute!(imp::DropObs, table) end -""" - DropVars <: Imputor - - -Removes missing values from the `AbstractArray` or `Tables.table` provided. - -# Fields -* `context::AbstractContext`: A context which keeps track of missing data - summary information -""" struct DropVars <: Imputor vardim::Int context::AbstractContext end -DropVars(; vardim=2, context=Context()) = DropVars(vardim, context) - """ - impute!(imp::DropVars, data::AbstractMatrix) + DropVars(; vardim=2, context=Context()) + -Finds columns in the matrix with too many missing values and uses a mask (Vector{Bool}) to -return the `data` with those columns removed. Unfortunately, the mask approach -requires copying the matrix. +Finds variables with too many missing values in an `AbstractMatrix` or `Tables.table` and +removes them from the input data. 
-# Arguments -* `imp::DropVars`: this `Imputor` method -* `data::AbstractMatrix`: the data to impute +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Examples +```jldoctest +julia> using Impute: DropVars, Context, impute -# Returns -* `AbstractMatrix`: a new matrix with missing columns removed +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(DropVars(; vardim=1, context=Context(; limit=0.2)), M) +1×5 Array{Union{Missing, Float64},2}: + 1.1 2.2 3.3 missing 5.5 +``` """ +DropVars(; vardim=2, context=Context()) = DropVars(vardim, context) + function impute!(imp::DropVars, data::AbstractMatrix) return filtervars(imp, data) do var try @@ -138,18 +116,6 @@ function impute!(imp::DropVars, data::AbstractMatrix) end end -""" - impute!(imp::DropVars, table) - -Find remove columns in the table with too many missing elements. - -# Arguments -* `imp::DropVars`: this `Imputor` method -* `table`: a type that implements the Tables API. - -# Returns -* our data with the missing columns removed. -""" function impute!(imp::DropVars, table) @assert istable(table) cols = Tables.columns(table) diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index 582a8e4..03b4fd7 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -1,35 +1,45 @@ +struct Fill{T} <: Imputor + value::T + vardim::Int + context::AbstractContext +end + """ - Fill <: Imputor + Fill(; value=mean, vardim=2, context=Context()) Fills in the missing data with a specific value. +The current implementation is univariate, so each variable in a table or matrix will +be handled independently. 
-# Fields +# Keyword Arguments * `value::Any`: A scalar missing value or a function that returns the a scalar if passed the data with missing data removed (e.g, `mean`) +* `vardim=2::Int`: Specify the dimension for variables in matrix input data * `context::AbstractContext`: A context which keeps track of missing data summary information -""" -struct Fill{T} <: Imputor - value::T - vardim::Int - context::AbstractContext -end -"""Fill(; value=mean, context=Context()) -> Fill""" -Fill(; value=mean, vardim=2, context=Context()) = Fill(value, vardim, context) +# Example +```jldoctest +julia> using Impute: Fill, Context, impute -""" - impute!(imp::Fill, data::AbstractVector) +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 -Fill in missing values with a values determined by `imp.value`. -If `imp.value` is a function then the fill values calculated by invoking that function on -the collection of all nonmissing values. +julia> impute(Fill(; vardim=1, context=Context(; limit=1.0)), M) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 2.66667 2.66667 5.0 + 1.1 2.2 3.3 3.025 5.5 +``` """ +Fill(; value=mean, vardim=2, context=Context()) = Fill(value, vardim, context) + function impute!(imp::Fill, data::AbstractVector) imp.context() do c fill_val = if isa(imp.value, Function) # Call `deepcopy` because we can trust that it's available for all types. - imp.value(Iterators.drop(deepcopy(data); context=c)) + imp.value(Impute.drop(deepcopy(data); context=c)) else imp.value end diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index e8b4839..c13aad2 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -1,25 +1,41 @@ -""" - Interpolate <: Imputor - -Performs linear interpolation between the nearest values in an vector. 
-""" struct Interpolate <: Imputor vardim::Int context::AbstractContext end -"""Interpolate(; context=Context()) -> Interpolate""" -Interpolate(; vardim=2, context=Context()) = Interpolate(vardim, context) - """ - impute!(imp::Interpolate, data::AbstractVector) + Interpolate(; vardim=2, context=Context()) -Uses linear interpolation between existing elements of a vector to fill in missing data. +Performs linear interpolation between the nearest values in a vector. +The current implementation is univariate, so each variable in a table or matrix will +be handled independently. WARNING: Missing values at the head or tail of the array cannot be interpolated if there are no existing values on both sides. As a result, this method does not guarantee that all missing values will be imputed. + +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Example +```jldoctest +julia> using Impute: Interpolate, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), M) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 3.0 4.0 5.0 + 1.1 2.2 3.3 4.4 5.5 +``` """ +Interpolate(; vardim=2, context=Context()) = Interpolate(vardim, context) + function impute!(imp::Interpolate, data::AbstractVector{<:Union{T, Missing}}) where T imp.context() do c i = findfirst(c, data) + 1 diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index 19247b6..768353f 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -1,30 +1,44 @@ -""" - LOCF <: Imputor - -Last observation carried forward. Fill in missing values with the most recent -nonmissing value. 
- -See also: -- [NOCB](@ref): Next Observation Carried Backward -""" struct LOCF <: Imputor vardim::Int context::AbstractContext end -"""LOCF(; context=Context()) -> LOCF""" -LOCF(; vardim=2, context=Context()) = LOCF(vardim, context) - """ - impute!(imp::LOCF, data::AbstractVector) + LOCF(; vardim=2, context=Context()) -Iterates forwards through the `data` and fills missing data with the last -existing observation. +Last observation carried forward (LOCF) iterates forwards through the `data` and fills +missing data with the last existing observation. The current implementation is univariate, +so each variable in a table or matrix will be handled independently. + +See also: +- [NOCB](@ref): Next Observation Carried Backward WARNING: missing elements at the head of the array may not be imputed if there is no existing observation to carry forward. As a result, this method does not guarantee that all missing values will be imputed. + +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information + +# Example +```jldoctest +julia> using Impute: LOCF, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(LOCF(; vardim=1, context=Context(; limit=1.0)), M) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 2.0 2.0 5.0 + 1.1 2.2 3.3 3.3 5.5 +``` """ +LOCF(; vardim=2, context=Context()) = LOCF(vardim, context) + function impute!(imp::LOCF, data::AbstractVector) imp.context() do c start_idx = findfirst(c, data) + 1 diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index 3390991..49c8e71 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -1,34 +1,43 @@ -""" - NOCB <: Imputor - -Fills in missing data using the Next Observation Carried Backward (NOCB) approach. 
- -See also: -- [LOCF](@ref): Last Observation Carried Forward -""" struct NOCB <: Imputor vardim::Int context::AbstractContext end -"""NOCB(; context=Context()) -> NOCB""" -NOCB(; vardim=2, context=Context()) = NOCB(vardim, context) - """ - impute!(imp::NOCB, data::AbstractVector) + NOCB(; vardim=2, context=Context()) -Iterates backwards through the `data` and fills missing data with the next -existing observation. +Next observation carried backward (NOCB) iterates backwards through the `data` and fills +missing data with the next existing observation. + +See also: +- [LOCF](@ref): Last Observation Carried Forward WARNING: missing elements at the tail of the array may not be imputed if there is no existing observation to carry backward. As a result, this method does not guarantee that all missing values will be imputed. -# Usage -``` +# Keyword Arguments +* `vardim=2::Int`: Specify the dimension for variables in matrix input data +* `context::AbstractContext`: A context which keeps track of missing data + summary information +# Example +```jldoctest +julia> using Impute: NOCB, Context, impute + +julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 missing missing 5.0 + 1.1 2.2 3.3 missing 5.5 + +julia> impute(NOCB(; vardim=1, context=Context(; limit=1.0)), M) +2×5 Array{Union{Missing, Float64},2}: + 1.0 2.0 5.0 5.0 5.0 + 1.1 2.2 3.3 5.5 5.5 ``` """ +NOCB(; vardim=2, context=Context()) = NOCB(vardim, context) + function impute!(imp::NOCB, data::AbstractVector) imp.context() do c end_idx = findlast(c, data) - 1 From 0d97c086a6849faea48e29867ba7d620df32ea77 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 10 Jul 2019 17:09:52 -0500 Subject: [PATCH 16/34] Remove test REQUIRE file. 
--- test/REQUIRE | 1 - 1 file changed, 1 deletion(-) delete mode 100644 test/REQUIRE diff --git a/test/REQUIRE b/test/REQUIRE deleted file mode 100644 index b163b18..0000000 --- a/test/REQUIRE +++ /dev/null @@ -1 +0,0 @@ -RDatasets 0.5 From eecc2d49279491e6748fc0caa391410fa5afa562 Mon Sep 17 00:00:00 2001 From: rofinn Date: Wed, 10 Jul 2019 18:16:13 -0500 Subject: [PATCH 17/34] Cleanup docs in README and index page. --- README.md | 116 ++++++++++-- docs/src/index.md | 450 +++------------------------------------------- 2 files changed, 123 insertions(+), 443 deletions(-) diff --git a/README.md b/README.md index 5c7f530..7d7e1d2 100644 --- a/README.md +++ b/README.md @@ -5,30 +5,110 @@ [![Build status](https://ci.appveyor.com/api/projects/status/github/invenia/Impute.jl?svg=true)](https://ci.appveyor.com/project/invenia/Impute-jl) [![codecov](https://codecov.io/gh/invenia/Impute.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/invenia/Impute.jl) -Impute.jl provides various data imputation methods for `Arrays` and `DataFrames` with various types of missing values. +Impute.jl provides various methods for handling missing data in Vectors, Matrices and [Tables](https://github.com/JuliaData/Tables.jl). 
## Installation ```julia -Pkg.clone("https://github.com/invenia/Impute.jl") +julia> using Pkg; Pkg.add("Impute") ``` -## Features -* Operate over Vectors, Matrices or DataFrames -* Chaining of methods +## Quickstart +Let's start by loading our dependencies: +```julia +julia> using DataFrames, RDatasets, Impute +``` + +We'll also want some test data containing missings to work with: + +```julia +julia> df = dataset("boot", "neuro") +469×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤ +│ 1 │ missing │ -203.7 │ -84.1 │ 18.5 │ missing │ missing │ +│ 2 │ missing │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ missing │ +│ 3 │ missing │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ missing │ +│ 4 │ missing │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ missing │ +│ 5 │ missing │ missing │ -130.1 │ 25.8 │ 160.0 │ missing │ +│ 6 │ missing │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ missing │ +│ 7 │ missing │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ missing │ +⋮ +│ 462 │ missing │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ +│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ missing │ +│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ missing │ +│ 465 │ missing │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ missing │ +│ 466 │ missing │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ +│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +│ 468 │ missing │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ +│ 469 │ missing │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +``` -## Methods +Our first instinct might be to drop all observations, but this leaves us too few rows to work with: -* drop - remove missing -* locf - last observation carried forward -* nocb - next observation carried backward -* interp - linear interpolation of values in vector -* fill - replace with a specific value or a function which returns a value given the existing vector with missing values dropped. 
+```julia +julia> Impute.drop(df) +4×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ Float64 │ +├─────┼─────────┼─────────┼─────────┼─────────┼─────────┼─────────┤ +│ 1 │ -247.0 │ -132.2 │ -18.8 │ 28.2 │ 81.4 │ 237.9 │ +│ 2 │ -234.0 │ -140.8 │ -56.5 │ 28.0 │ 114.3 │ 222.9 │ +│ 3 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ +│ 4 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +``` -## TODO +We could try imputing the values with linear interpolation, but that still leaves missing +data at the head and tail of our dataset: + +```julia +julia> Impute.interp(df) +469×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64⍰ │ Float64⍰ │ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤ +│ 1 │ missing │ -203.7 │ -84.1 │ 18.5 │ missing │ missing │ +│ 2 │ missing │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ missing │ +│ 3 │ missing │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ missing │ +│ 4 │ missing │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ missing │ +│ 5 │ missing │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ missing │ +│ 6 │ missing │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ missing │ +│ 7 │ missing │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ missing │ +⋮ +│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ +│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ +│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ +│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ +│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ +│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +│ 468 │ missing │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ +│ 469 │ missing │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +``` + +Finally, we can chain multiple simple methods together to give a complete dataset: + +```julia +julia> Impute.interp(df) |> Impute.locf() |> Impute.nocb() +469×6 DataFrames.DataFrame +│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ +│ │ Float64⍰ │ Float64⍰ 
│ Float64 │ Float64⍰ │ Float64⍰ │ Float64⍰ │ +├─────┼──────────┼──────────┼─────────┼──────────┼──────────┼──────────┤ +│ 1 │ -233.6 │ -203.7 │ -84.1 │ 18.5 │ 134.7 │ 222.7 │ +│ 2 │ -233.6 │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ 222.7 │ +│ 3 │ -233.6 │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ 222.7 │ +│ 4 │ -233.6 │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ 222.7 │ +│ 5 │ -233.6 │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ 222.7 │ +│ 6 │ -233.6 │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ 222.7 │ +│ 7 │ -233.6 │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ 222.7 │ +⋮ +│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ +│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ +│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ +│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ +│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ +│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +│ 468 │ -247.6 │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ +│ 469 │ -247.6 │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +``` -* Dropping rows in a matrix allocates extra memory (ie: `data[mask, :]` make a copy). -* More sophisticated imputation methods - 1. MICE - 2. EM - 3. kNN - 4. 
Regression +**Warning**: Your approach should depend on the properties of your data (e.g., MCAR, MAR, MNAR) diff --git a/docs/src/index.md b/docs/src/index.md index 48c5d72..8e83549 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,12 +1,8 @@ # Impute -```@meta -DocTestSetup = quote - @eval Main begin - using DataFrames - using Impute: Impute, DropObs, DropVars, Fill, Interpolate, NOCB, LOCF, Context, impute - end -end +```@setup quickstart +using DataFrames, RDatasets, Impute +df = dataset("boot", "neuro") ``` [![stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://invenia.github.io/Impute.jl/stable/) @@ -15,441 +11,45 @@ end [![Build status](https://ci.appveyor.com/api/projects/status/github/invenia/Impute.jl?svg=true)](https://ci.appveyor.com/project/invenia/Impute-jl) [![codecov](https://codecov.io/gh/invenia/Impute.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/invenia/Impute.jl) -Impute.jl provides various data imputation methods for `Arrays` and `DataFrames` with various types of missing values. +Impute.jl provides various methods for handling missing data in Vectors, Matrices and [Tables](https://github.com/JuliaData/Tables.jl). ## Installation ```julia -Pkg.clone("https://github.com/invenia/Impute.jl") +julia> using Pkg; Pkg.add("Impute") ``` -## Features - -* Operating over Vectors, Matrices and DataFrames -* Chaining of methods - -## Methods - -* drop - remove missing -* locf - last observation carried forward -* nocb - next observation carried backward -* interp - linear interpolation of values in vector -* fill - replace with a specific value or a function which returns a value given the existing vector with missing values dropped. - ## Quickstart -We'll start by imputing `NaN` values in 1-dimension vector. 
-```julia -julia> using Impute +Let's start by loading our dependencies: -julia> a = collect(1.0:1.0:20.0) -20-element Array{Float64,1}: - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - 6.0 - 7.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> a[[2, 3, 7]] = NaN -NaN +```@repl +using DataFrames, RDatasets, Impute ``` -The most common approach to missing data is to remove them. -```julia -julia> impute(a, :drop; limit=0.2) -17-element Array{Float64,1}: - 1.0 - 4.0 - 5.0 - 6.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 -``` +We'll also want some test data containing `missing`s to work with: -But we may want use linear interpolation, filling, etc -```julia -julia> impute(a, :interp; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 2.0 - 3.0 - 4.0 - 5.0 - 6.0 - 7.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> impute(a, :fill; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 11.6471 - 11.6471 - 4.0 - 5.0 - 6.0 - 11.6471 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> impute(a, :locf; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 1.0 - 1.0 - 4.0 - 5.0 - 6.0 - 6.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 - -julia> impute(a, :nocb; limit=0.2) -20-element Array{Float64,1}: - 1.0 - 4.0 - 4.0 - 4.0 - 5.0 - 6.0 - 8.0 - 8.0 - 9.0 - 10.0 - 11.0 - 12.0 - 13.0 - 14.0 - 15.0 - 16.0 - 17.0 - 18.0 - 19.0 - 20.0 +```@repl quickstart +df = dataset("boot", "neuro") ``` -We can also perform these operations on `DataFrame`s. 
+Our first instinct might be to drop all observations, but this leaves us too few +rows to work with: -```julia -julia> using DataFrames +```@repl quickstart +Impute.drop(df) +``` -julia> using RDatasets +We could try imputing the values with linear interpolation, but that still leaves missing +data at the head and tail of our dataset: -julia> df = dataset("boot", "neuro") -2814 -Symbol[:V1,:V2,:V3,:V4,:V5,:V6] -6 -469×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼────────┼────────┼────────┼───────┼───────┼───────┤ -│ 1 │ NA │ -203.7 │ -84.1 │ 18.5 │ NA │ NA │ -│ 2 │ NA │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ NA │ -│ 3 │ NA │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ NA │ -│ 4 │ NA │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ NA │ -│ 5 │ NA │ NA │ -130.1 │ 25.8 │ 160.0 │ NA │ -│ 6 │ NA │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ NA │ -│ 7 │ NA │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ NA │ -│ 8 │ NA │ -221.6 │ -81.9 │ 27.5 │ 144.5 │ NA │ -│ 9 │ NA │ -153.7 │ -17.0 │ 76.1 │ 222.4 │ NA │ -│ 10 │ NA │ -184.7 │ -47.3 │ 74.4 │ 208.9 │ NA │ -│ 11 │ NA │ NA │ -148.8 │ 11.4 │ 137.7 │ NA │ -│ 12 │ NA │ -197.6 │ -6.4 │ 137.1 │ NA │ NA │ -│ 13 │ NA │ -247.8 │ -35.4 │ 80.9 │ 229.5 │ NA │ -│ 14 │ NA │ -227.0 │ -104.7 │ 20.2 │ 140.2 │ NA │ -│ 15 │ -233.6 │ -115.9 │ -10.5 │ 70.0 │ 202.6 │ NA │ -│ 16 │ NA │ -232.4 │ -100.6 │ 16.8 │ 145.1 │ NA │ -│ 17 │ NA │ -199.4 │ -58.2 │ 29.1 │ 184.4 │ NA │ -│ 18 │ NA │ -195.7 │ -89.5 │ 26.4 │ 142.7 │ NA │ -│ 19 │ NA │ -180.1 │ -65.0 │ 27.3 │ 171.1 │ NA │ -│ 20 │ NA │ NA │ -85.2 │ 27.1 │ NA │ NA │ -│ 21 │ NA │ -217.3 │ -77.1 │ 27.6 │ 151.5 │ NA │ -│ 22 │ NA │ -139.7 │ -15.8 │ 83.0 │ 215.5 │ NA │ -│ 23 │ -249.6 │ -132.8 │ -14.1 │ 78.1 │ 205.7 │ NA │ -│ 24 │ NA │ -152.7 │ -36.9 │ 29.7 │ 149.8 │ NA │ -│ 25 │ NA │ -224.1 │ -81.9 │ 29.1 │ 172.2 │ NA │ -│ 26 │ NA │ NA │ -235.8 │ 6.0 │ 144.4 │ NA │ -│ 27 │ NA │ -202.8 │ -45.1 │ 84.0 │ 227.3 │ NA │ -│ 28 │ -240.9 │ -138.4 │ -21.5 │ 73.4 │ 210.6 │ NA │ -│ 29 │ -247.1 │ -128.2 │ -31.3 │ 29.2 │ 143.1 │ NA │ -│ 30 │ NA │ 
-185.4 │ -80.3 │ 23.9 │ 115.8 │ 222.7 │ -│ 31 │ NA │ -182.5 │ -75.8 │ 27.5 │ 165.2 │ NA │ -│ 32 │ NA │ -202.2 │ -99.1 │ 23.8 │ 136.3 │ 242.5 │ -│ 33 │ NA │ -193.3 │ -82.6 │ 26.3 │ 160.5 │ NA │ -│ 34 │ NA │ -189.4 │ -63.3 │ 27.6 │ 136.8 │ NA │ -│ 35 │ NA │ -149.0 │ -31.0 │ 73.5 │ 187.8 │ NA │ -│ 36 │ NA │ -162.4 │ -26.5 │ 72.6 │ NA │ NA │ -│ 37 │ NA │ -213.4 │ -107.2 │ 24.7 │ 158.5 │ NA │ -⋮ -│ 432 │ NA │ -156.2 │ -32.9 │ 63.3 │ 182.8 │ NA │ -│ 433 │ NA │ -220.6 │ -114.2 │ 9.7 │ 106.4 │ 227.9 │ -│ 434 │ -219.9 │ -120.9 │ -1.3 │ 99.5 │ 207.6 │ NA │ -│ 435 │ NA │ -240.5 │ -110.3 │ 26.1 │ 142.8 │ NA │ -│ 436 │ NA │ -239.6 │ -121.4 │ 2.9 │ 124.9 │ NA │ -│ 437 │ NA │ -139.8 │ -7.3 │ 121.0 │ NA │ NA │ -│ 438 │ NA │ -212.0 │ -66.2 │ 50.4 │ 178.2 │ NA │ -│ 439 │ NA │ -232.7 │ -109.2 │ 18.4 │ 127.5 │ NA │ -│ 440 │ NA │ -236.3 │ -115.1 │ 5.1 │ 109.0 │ 212.0 │ -│ 441 │ -241.2 │ -107.1 │ -9.1 │ 95.1 │ 198.6 │ NA │ -│ 442 │ -226.7 │ -143.8 │ -30.4 │ 75.8 │ 196.6 │ NA │ -│ 443 │ NA │ -131.8 │ -26.5 │ 64.7 │ 177.2 │ NA │ -│ 444 │ NA │ -144.9 │ -0.9 │ 105.3 │ 230.9 │ NA │ -│ 445 │ NA │ -214.0 │ -81.8 │ 66.1 │ 191.3 │ NA │ -│ 446 │ NA │ -210.6 │ -94.3 │ 16.7 │ 125.5 │ 239.7 │ -│ 447 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 448 │ NA │ -156.0 │ -14.0 │ 113.7 │ 249.3 │ NA │ -│ 449 │ NA │ -210.5 │ -41.9 │ NA │ NA │ NA │ -│ 450 │ NA │ -189.2 │ -72.0 │ 56.8 │ 133.8 │ 246.7 │ -│ 451 │ NA │ -214.2 │ -102.2 │ 5.5 │ 75.6 │ 154.3 │ -│ 452 │ -219.6 │ -107.9 │ -16.0 │ 101.7 │ 186.0 │ NA │ -│ 453 │ NA │ -153.0 │ -38.0 │ 61.3 │ 144.4 │ 245.9 │ -│ 454 │ NA │ -179.8 │ -63.4 │ 56.0 │ 157.5 │ NA │ -│ 455 │ NA │ -174.5 │ -44.8 │ 73.3 │ 179.7 │ NA │ -│ 456 │ NA │ -206.8 │ -108.9 │ 3.7 │ 102.1 │ 210.3 │ -│ 457 │ NA │ -169.5 │ -79.7 │ 27.9 │ 129.4 │ 242.8 │ -│ 458 │ -222.2 │ -104.6 │ -2.4 │ 84.3 │ 204.7 │ NA │ -│ 459 │ -236.3 │ -124.0 │ -6.8 │ 95.7 │ 196.0 │ NA │ -│ 460 │ NA │ -216.5 │ -90.2 │ 27.8 │ 138.9 │ NA │ -│ 461 │ NA │ -163.2 │ -43.6 │ 69.5 │ 173.9 │ NA │ -│ 462 │ NA │ -207.3 │ -88.3 
│ 9.6 │ 104.1 │ 218.0 │ -│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ NA │ -│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ NA │ -│ 465 │ NA │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ NA │ -│ 466 │ NA │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ -│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ -│ 468 │ NA │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ -│ 469 │ NA │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +```@repl quickstart +Impute.interp(df) +``` -julia> drop(df) -4×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼────────┼────────┼───────┼──────┼───────┼───────┤ -│ 1 │ -247.0 │ -132.2 │ -18.8 │ 28.2 │ 81.4 │ 237.9 │ -│ 2 │ -234.0 │ -140.8 │ -56.5 │ 28.0 │ 114.3 │ 222.9 │ -│ 3 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 4 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ +Finally, we can chain multiple simple methods together to give a complete dataset: -julia> interp(df) -469×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼──────────┼─────────┼────────┼───────┼────────┼─────────┤ -│ 1 │ NA │ -203.7 │ -84.1 │ 18.5 │ NA │ NA │ -│ 2 │ NA │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ NA │ -│ 3 │ NA │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ NA │ -│ 4 │ NA │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ NA │ -│ 5 │ NA │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ NA │ -│ 6 │ NA │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ NA │ -│ 7 │ NA │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ NA │ -│ 8 │ NA │ -221.6 │ -81.9 │ 27.5 │ 144.5 │ NA │ -│ 9 │ NA │ -153.7 │ -17.0 │ 76.1 │ 222.4 │ NA │ -│ 10 │ NA │ -184.7 │ -47.3 │ 74.4 │ 208.9 │ NA │ -│ 11 │ NA │ -191.15 │ -148.8 │ 11.4 │ 137.7 │ NA │ -│ 12 │ NA │ -197.6 │ -6.4 │ 137.1 │ 183.6 │ NA │ -│ 13 │ NA │ -247.8 │ -35.4 │ 80.9 │ 229.5 │ NA │ -│ 14 │ NA │ -227.0 │ -104.7 │ 20.2 │ 140.2 │ NA │ -│ 15 │ -233.6 │ -115.9 │ -10.5 │ 70.0 │ 202.6 │ NA │ -│ 16 │ -235.6 │ -232.4 │ -100.6 │ 16.8 │ 145.1 │ NA │ -│ 17 │ -237.6 │ -199.4 │ -58.2 │ 29.1 │ 184.4 │ NA │ -│ 18 │ -239.6 │ -195.7 │ -89.5 │ 26.4 │ 142.7 │ NA │ -│ 19 │ -241.6 │ -180.1 │ -65.0 │ 27.3 │ 
171.1 │ NA │ -│ 20 │ -243.6 │ -198.7 │ -85.2 │ 27.1 │ 161.3 │ NA │ -│ 21 │ -245.6 │ -217.3 │ -77.1 │ 27.6 │ 151.5 │ NA │ -│ 22 │ -247.6 │ -139.7 │ -15.8 │ 83.0 │ 215.5 │ NA │ -│ 23 │ -249.6 │ -132.8 │ -14.1 │ 78.1 │ 205.7 │ NA │ -│ 24 │ -247.86 │ -152.7 │ -36.9 │ 29.7 │ 149.8 │ NA │ -│ 25 │ -246.12 │ -224.1 │ -81.9 │ 29.1 │ 172.2 │ NA │ -│ 26 │ -244.38 │ -213.45 │ -235.8 │ 6.0 │ 144.4 │ NA │ -│ 27 │ -242.64 │ -202.8 │ -45.1 │ 84.0 │ 227.3 │ NA │ -│ 28 │ -240.9 │ -138.4 │ -21.5 │ 73.4 │ 210.6 │ NA │ -│ 29 │ -247.1 │ -128.2 │ -31.3 │ 29.2 │ 143.1 │ NA │ -│ 30 │ -247.093 │ -185.4 │ -80.3 │ 23.9 │ 115.8 │ 222.7 │ -│ 31 │ -247.086 │ -182.5 │ -75.8 │ 27.5 │ 165.2 │ 232.6 │ -│ 32 │ -247.079 │ -202.2 │ -99.1 │ 23.8 │ 136.3 │ 242.5 │ -│ 33 │ -247.071 │ -193.3 │ -82.6 │ 26.3 │ 160.5 │ 242.082 │ -│ 34 │ -247.064 │ -189.4 │ -63.3 │ 27.6 │ 136.8 │ 241.664 │ -│ 35 │ -247.057 │ -149.0 │ -31.0 │ 73.5 │ 187.8 │ 241.245 │ -│ 36 │ -247.05 │ -162.4 │ -26.5 │ 72.6 │ 173.15 │ 240.827 │ -│ 37 │ -247.043 │ -213.4 │ -107.2 │ 24.7 │ 158.5 │ 240.409 │ -⋮ -│ 432 │ -219.99 │ -156.2 │ -32.9 │ 63.3 │ 182.8 │ 232.0 │ -│ 433 │ -219.945 │ -220.6 │ -114.2 │ 9.7 │ 106.4 │ 227.9 │ -│ 434 │ -219.9 │ -120.9 │ -1.3 │ 99.5 │ 207.6 │ 225.629 │ -│ 435 │ -222.943 │ -240.5 │ -110.3 │ 26.1 │ 142.8 │ 223.357 │ -│ 436 │ -225.986 │ -239.6 │ -121.4 │ 2.9 │ 124.9 │ 221.086 │ -│ 437 │ -229.029 │ -139.8 │ -7.3 │ 121.0 │ 151.55 │ 218.814 │ -│ 438 │ -232.071 │ -212.0 │ -66.2 │ 50.4 │ 178.2 │ 216.543 │ -│ 439 │ -235.114 │ -232.7 │ -109.2 │ 18.4 │ 127.5 │ 214.271 │ -│ 440 │ -238.157 │ -236.3 │ -115.1 │ 5.1 │ 109.0 │ 212.0 │ -│ 441 │ -241.2 │ -107.1 │ -9.1 │ 95.1 │ 198.6 │ 216.617 │ -│ 442 │ -226.7 │ -143.8 │ -30.4 │ 75.8 │ 196.6 │ 221.233 │ -│ 443 │ -224.52 │ -131.8 │ -26.5 │ 64.7 │ 177.2 │ 225.85 │ -│ 444 │ -222.34 │ -144.9 │ -0.9 │ 105.3 │ 230.9 │ 230.467 │ -│ 445 │ -220.16 │ -214.0 │ -81.8 │ 66.1 │ 191.3 │ 235.083 │ -│ 446 │ -217.98 │ -210.6 │ -94.3 │ 16.7 │ 125.5 │ 239.7 │ -│ 447 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 
171.6 │ 249.7 │ -│ 448 │ -216.56 │ -156.0 │ -14.0 │ 113.7 │ 249.3 │ 248.7 │ -│ 449 │ -217.32 │ -210.5 │ -41.9 │ 85.25 │ 191.55 │ 247.7 │ -│ 450 │ -218.08 │ -189.2 │ -72.0 │ 56.8 │ 133.8 │ 246.7 │ -│ 451 │ -218.84 │ -214.2 │ -102.2 │ 5.5 │ 75.6 │ 154.3 │ -│ 452 │ -219.6 │ -107.9 │ -16.0 │ 101.7 │ 186.0 │ 200.1 │ -│ 453 │ -220.033 │ -153.0 │ -38.0 │ 61.3 │ 144.4 │ 245.9 │ -│ 454 │ -220.467 │ -179.8 │ -63.4 │ 56.0 │ 157.5 │ 234.033 │ -│ 455 │ -220.9 │ -174.5 │ -44.8 │ 73.3 │ 179.7 │ 222.167 │ -│ 456 │ -221.333 │ -206.8 │ -108.9 │ 3.7 │ 102.1 │ 210.3 │ -│ 457 │ -221.767 │ -169.5 │ -79.7 │ 27.9 │ 129.4 │ 242.8 │ -│ 458 │ -222.2 │ -104.6 │ -2.4 │ 84.3 │ 204.7 │ 237.84 │ -│ 459 │ -236.3 │ -124.0 │ -6.8 │ 95.7 │ 196.0 │ 232.88 │ -│ 460 │ -237.875 │ -216.5 │ -90.2 │ 27.8 │ 138.9 │ 227.92 │ -│ 461 │ -239.45 │ -163.2 │ -43.6 │ 69.5 │ 173.9 │ 222.96 │ -│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ -│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ -│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ -│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ -│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ -│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ -│ 468 │ NA │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ -│ 469 │ NA │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ +```@repl quickstart +Impute.interp(df) |> Impute.locf() |> Impute.nocb() ``` -Finally, we can also chain imputation methods together. -As we saw in the last example linear interpolation can interpolate missing values -at the head or tail of the array (or column). 
-```julia -julia> chain(df, Impute.Interpolate(), Impute.LOCF(), Impute.NOCB(); limit=1.0) -469×6 DataFrames.DataFrame -│ Row │ V1 │ V2 │ V3 │ V4 │ V5 │ V6 │ -├─────┼──────────┼─────────┼────────┼───────┼────────┼─────────┤ -│ 1 │ -233.6 │ -203.7 │ -84.1 │ 18.5 │ 134.7 │ 222.7 │ -│ 2 │ -233.6 │ -203.0 │ -97.8 │ 25.8 │ 134.7 │ 222.7 │ -│ 3 │ -233.6 │ -249.0 │ -92.1 │ 27.8 │ 177.1 │ 222.7 │ -│ 4 │ -233.6 │ -231.5 │ -97.5 │ 27.0 │ 150.3 │ 222.7 │ -│ 5 │ -233.6 │ -227.3 │ -130.1 │ 25.8 │ 160.0 │ 222.7 │ -│ 6 │ -233.6 │ -223.1 │ -70.7 │ 62.1 │ 197.5 │ 222.7 │ -│ 7 │ -233.6 │ -164.8 │ -12.2 │ 76.8 │ 202.8 │ 222.7 │ -│ 8 │ -233.6 │ -221.6 │ -81.9 │ 27.5 │ 144.5 │ 222.7 │ -│ 9 │ -233.6 │ -153.7 │ -17.0 │ 76.1 │ 222.4 │ 222.7 │ -│ 10 │ -233.6 │ -184.7 │ -47.3 │ 74.4 │ 208.9 │ 222.7 │ -│ 11 │ -233.6 │ -191.15 │ -148.8 │ 11.4 │ 137.7 │ 222.7 │ -│ 12 │ -233.6 │ -197.6 │ -6.4 │ 137.1 │ 183.6 │ 222.7 │ -│ 13 │ -233.6 │ -247.8 │ -35.4 │ 80.9 │ 229.5 │ 222.7 │ -│ 14 │ -233.6 │ -227.0 │ -104.7 │ 20.2 │ 140.2 │ 222.7 │ -│ 15 │ -233.6 │ -115.9 │ -10.5 │ 70.0 │ 202.6 │ 222.7 │ -│ 16 │ -235.6 │ -232.4 │ -100.6 │ 16.8 │ 145.1 │ 222.7 │ -│ 17 │ -237.6 │ -199.4 │ -58.2 │ 29.1 │ 184.4 │ 222.7 │ -│ 18 │ -239.6 │ -195.7 │ -89.5 │ 26.4 │ 142.7 │ 222.7 │ -│ 19 │ -241.6 │ -180.1 │ -65.0 │ 27.3 │ 171.1 │ 222.7 │ -│ 20 │ -243.6 │ -198.7 │ -85.2 │ 27.1 │ 161.3 │ 222.7 │ -│ 21 │ -245.6 │ -217.3 │ -77.1 │ 27.6 │ 151.5 │ 222.7 │ -│ 22 │ -247.6 │ -139.7 │ -15.8 │ 83.0 │ 215.5 │ 222.7 │ -│ 23 │ -249.6 │ -132.8 │ -14.1 │ 78.1 │ 205.7 │ 222.7 │ -│ 24 │ -247.86 │ -152.7 │ -36.9 │ 29.7 │ 149.8 │ 222.7 │ -│ 25 │ -246.12 │ -224.1 │ -81.9 │ 29.1 │ 172.2 │ 222.7 │ -│ 26 │ -244.38 │ -213.45 │ -235.8 │ 6.0 │ 144.4 │ 222.7 │ -│ 27 │ -242.64 │ -202.8 │ -45.1 │ 84.0 │ 227.3 │ 222.7 │ -│ 28 │ -240.9 │ -138.4 │ -21.5 │ 73.4 │ 210.6 │ 222.7 │ -│ 29 │ -247.1 │ -128.2 │ -31.3 │ 29.2 │ 143.1 │ 222.7 │ -│ 30 │ -247.093 │ -185.4 │ -80.3 │ 23.9 │ 115.8 │ 222.7 │ -│ 31 │ -247.086 │ -182.5 │ -75.8 │ 27.5 │ 165.2 │ 232.6 │ -│ 
32 │ -247.079 │ -202.2 │ -99.1 │ 23.8 │ 136.3 │ 242.5 │ -│ 33 │ -247.071 │ -193.3 │ -82.6 │ 26.3 │ 160.5 │ 242.082 │ -│ 34 │ -247.064 │ -189.4 │ -63.3 │ 27.6 │ 136.8 │ 241.664 │ -│ 35 │ -247.057 │ -149.0 │ -31.0 │ 73.5 │ 187.8 │ 241.245 │ -│ 36 │ -247.05 │ -162.4 │ -26.5 │ 72.6 │ 173.15 │ 240.827 │ -│ 37 │ -247.043 │ -213.4 │ -107.2 │ 24.7 │ 158.5 │ 240.409 │ -⋮ -│ 432 │ -219.99 │ -156.2 │ -32.9 │ 63.3 │ 182.8 │ 232.0 │ -│ 433 │ -219.945 │ -220.6 │ -114.2 │ 9.7 │ 106.4 │ 227.9 │ -│ 434 │ -219.9 │ -120.9 │ -1.3 │ 99.5 │ 207.6 │ 225.629 │ -│ 435 │ -222.943 │ -240.5 │ -110.3 │ 26.1 │ 142.8 │ 223.357 │ -│ 436 │ -225.986 │ -239.6 │ -121.4 │ 2.9 │ 124.9 │ 221.086 │ -│ 437 │ -229.029 │ -139.8 │ -7.3 │ 121.0 │ 151.55 │ 218.814 │ -│ 438 │ -232.071 │ -212.0 │ -66.2 │ 50.4 │ 178.2 │ 216.543 │ -│ 439 │ -235.114 │ -232.7 │ -109.2 │ 18.4 │ 127.5 │ 214.271 │ -│ 440 │ -238.157 │ -236.3 │ -115.1 │ 5.1 │ 109.0 │ 212.0 │ -│ 441 │ -241.2 │ -107.1 │ -9.1 │ 95.1 │ 198.6 │ 216.617 │ -│ 442 │ -226.7 │ -143.8 │ -30.4 │ 75.8 │ 196.6 │ 221.233 │ -│ 443 │ -224.52 │ -131.8 │ -26.5 │ 64.7 │ 177.2 │ 225.85 │ -│ 444 │ -222.34 │ -144.9 │ -0.9 │ 105.3 │ 230.9 │ 230.467 │ -│ 445 │ -220.16 │ -214.0 │ -81.8 │ 66.1 │ 191.3 │ 235.083 │ -│ 446 │ -217.98 │ -210.6 │ -94.3 │ 16.7 │ 125.5 │ 239.7 │ -│ 447 │ -215.8 │ -114.8 │ -18.4 │ 65.3 │ 171.6 │ 249.7 │ -│ 448 │ -216.56 │ -156.0 │ -14.0 │ 113.7 │ 249.3 │ 248.7 │ -│ 449 │ -217.32 │ -210.5 │ -41.9 │ 85.25 │ 191.55 │ 247.7 │ -│ 450 │ -218.08 │ -189.2 │ -72.0 │ 56.8 │ 133.8 │ 246.7 │ -│ 451 │ -218.84 │ -214.2 │ -102.2 │ 5.5 │ 75.6 │ 154.3 │ -│ 452 │ -219.6 │ -107.9 │ -16.0 │ 101.7 │ 186.0 │ 200.1 │ -│ 453 │ -220.033 │ -153.0 │ -38.0 │ 61.3 │ 144.4 │ 245.9 │ -│ 454 │ -220.467 │ -179.8 │ -63.4 │ 56.0 │ 157.5 │ 234.033 │ -│ 455 │ -220.9 │ -174.5 │ -44.8 │ 73.3 │ 179.7 │ 222.167 │ -│ 456 │ -221.333 │ -206.8 │ -108.9 │ 3.7 │ 102.1 │ 210.3 │ -│ 457 │ -221.767 │ -169.5 │ -79.7 │ 27.9 │ 129.4 │ 242.8 │ -│ 458 │ -222.2 │ -104.6 │ -2.4 │ 84.3 │ 204.7 │ 237.84 │ -│ 459 │ 
-236.3 │ -124.0 │ -6.8 │ 95.7 │ 196.0 │ 232.88 │ -│ 460 │ -237.875 │ -216.5 │ -90.2 │ 27.8 │ 138.9 │ 227.92 │ -│ 461 │ -239.45 │ -163.2 │ -43.6 │ 69.5 │ 173.9 │ 222.96 │ -│ 462 │ -241.025 │ -207.3 │ -88.3 │ 9.6 │ 104.1 │ 218.0 │ -│ 463 │ -242.6 │ -142.0 │ -21.8 │ 69.8 │ 148.7 │ 224.125 │ -│ 464 │ -235.9 │ -128.8 │ -33.1 │ 68.8 │ 177.1 │ 230.25 │ -│ 465 │ -239.8 │ -140.8 │ -38.7 │ 58.1 │ 186.3 │ 236.375 │ -│ 466 │ -243.7 │ -149.5 │ -40.3 │ 62.8 │ 139.7 │ 242.5 │ -│ 467 │ -247.6 │ -157.8 │ -53.3 │ 28.3 │ 122.9 │ 227.6 │ -│ 468 │ -247.6 │ -154.9 │ -50.8 │ 28.1 │ 119.9 │ 201.1 │ -│ 469 │ -247.6 │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ -``` +Warning: Your approach should depend on the properties of you data (e.g., MCAR, MAR, MNAR) From 3edef07ee1b67b4d84a2ab27927bc7ff4c91f8b6 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 11 Jul 2019 13:20:20 -0500 Subject: [PATCH 18/34] More PR review cleanup. --- Project.toml | 2 +- README.md | 2 +- docs/src/index.md | 2 +- src/context.jl | 27 +++++++++++++++++++++------ src/deprecated.jl | 9 ++++----- src/imputors.jl | 7 +++++-- src/imputors/drop.jl | 2 +- src/imputors/fill.jl | 2 +- test/deprecated.jl | 2 +- test/runtests.jl | 38 ++++++++++++++++++++++++++++++++++---- 10 files changed, 70 insertions(+), 23 deletions(-) diff --git a/Project.toml b/Project.toml index 328122e..fab3817 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,7 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -DataFrames = "0.17, 0.18" +DataFrames = ">= 0.16" IterTools = "1.2" Tables = "0.2" julia = "1" diff --git a/README.md b/README.md index 7d7e1d2..6943409 100644 --- a/README.md +++ b/README.md @@ -111,4 +111,4 @@ julia> Impute.interp(df) |> Impute.locf() |> Impute.nocb() │ 469 │ -247.6 │ -180.7 │ -70.9 │ 33.7 │ 114.8 │ 222.5 │ ``` -**Warning**: Your approach should depend on the properties of you data (e.g., MCAR, MAR, MNAR) +**Warning**: Your approach should depend on the properties 
of your data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). diff --git a/docs/src/index.md b/docs/src/index.md index 8e83549..31a547f 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -52,4 +52,4 @@ Finally, we can chain multiple simple methods together to give a complete datase Impute.interp(df) |> Impute.locf() |> Impute.nocb() ``` -Warning: Your approach should depend on the properties of you data (e.g., MCAR, MAR, MNAR) +Warning: Your approach should depend on the properties of your data (e.g., [MCAR, MAR, MNAR](https://en.wikipedia.org/wiki/Missing_data#Types_of_missing_data)). diff --git a/src/context.jl b/src/context.jl index e7bc9ee..aa0ffa1 100644 --- a/src/context.jl +++ b/src/context.jl @@ -2,6 +2,17 @@ AbstractContext An imputation context records summary information about missing data for an imputation algorithm. +All `AbstractContext`s are callable with a function, which allows us to write code like: + +```julia +context() do c + # My imputation code using a clean context +end +``` + +This do-block will pass a fresh context to your code and apply the `on_complete` function on +the resulting data and context state. By default, `on_complete` will throw an +[`ImputeError`](@ref) if we have too many missing values. """ abstract type AbstractContext end @@ -12,7 +23,7 @@ Base.copy(ctx::T) where {T <: AbstractContext} = T(fieldvalues(ctx)...) """ ismissing(ctx::AbstractContext, x) -> Bool -Uses `ctx.is_missing` to determine if x is missing. If x is a named tuple or an abstract array +Uses `ctx.is_missing` to determine if x is missing. If x is a `NamedTuple` or an `AbstractArray` then `ismissing` will return true if `ctx.is_missing` returns true for any element. The ctx.count is increased whenever whenever we return true and if `ctx.count / ctx.num` exceeds our `ctx.limit` we throw an `ImputeError` @@ -100,7 +111,7 @@ weighted.
# Keyword Arguments * `n::Int`: number of observations * `count::Int`: number of missing values found -* `limit::Float64`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). +* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `is_missing::Function`: returns a Bool if the value counts as missing * `on_complete::Function`: a function to run when imputation is complete """ @@ -163,10 +174,10 @@ This context type can be useful if some missing observation are more important t # Keyword Arguments * `num::Int`: number of observations -* `s::Float64`: sum of missing values weights -* `limit::Float64`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). +* `s::Float64`: sum of the weights of missing values +* `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). * `is_missing::Function`: returns a Bool if the value counts as missing -* `on_complete::Function`: allowable portion of total values allowed to be imputed (should be between 0.0 and 1.0). +* `on_complete::Function`: a function to run when imputation is complete """ function WeightedContext( wv::AbstractWeights; @@ -205,7 +216,11 @@ function complete(ctx::WeightedContext, data) return data end -for T in [Context, WeightedContext] +#= +Define our callable methods for each context. Once we drop 1.0 we should be able to just +define this on the `AbstractContext`. +=# +for T in (Context, WeightedContext) @eval begin function (ctx::$T)(f::Function) _ctx = empty(ctx) diff --git a/src/deprecated.jl b/src/deprecated.jl index 6f62a14..57a2e17 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -31,7 +31,7 @@ function impute!(data, method::Symbol, args...; limit::Float64=0.1) Base.depwarn( """ impute!(data, method) is deprecated. - Please use Impute.method!(data) or impute!(imputor, data). 
+ Please use Impute.method!(data) or impute!(imputor::Imputor, data). """, :impute! ) @@ -62,7 +62,7 @@ Creates the appropriate `Imputor` type and `Context` (using `missing` function) function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) Base.depwarn( """ - impute!(data, missing, method) is deprecated. Please use impute!(imputor, data). + impute!(data, missing, method) is deprecated. Please use impute!(imputor::Imputor, data). """, :impute! ) @@ -167,12 +167,11 @@ function _extract_context_kwargs(kwargs...) limit = 1.0 if haskey(d, :limit) + limit = d[:limit] @warn( "Passing `limit` directly to impute functions is deprecated. " * - "Please pass a `context` in the future." + "Please pass `context=Context(; limit=$limit)` in the future." ) - - limit = d[:limit] delete!(d, :limit) end diff --git a/src/imputors.jl b/src/imputors.jl index a9f3e07..f2cedb3 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -3,8 +3,7 @@ An imputor stores information about imputing values in `AbstractArray`s and `Tables.table`s. New imputation methods are expected to sutype `Imputor` and, at minimum, -implement the `impute!{T<:Any}(imp::, ctx::Context, data::AbstractArray{T, 1})` -method. +implement the `impute!(imp::<MyImputor>, data::AbstractVector)` method.
""" abstract type Imputor end @@ -40,6 +39,10 @@ function impute(imp::Imputor, data) return impute!(imp, deepcopy(data)) end + +# This is a necessary fallback because the tables method doesn't have a type declaration +impute!(imp::Imputor, data::AbstractVector) = MethodError(impute!, (imp, data)) + """ impute!(imp::Imputor, data::AbstractMatrix) diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 819c292..9d0ed07 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -124,7 +124,7 @@ function impute!(imp::DropVars, table) try imp.context() do c col = getproperty(cols, cname) - for i in 1:length(col) + for i in eachindex(col) ismissing(c, col[i]) end end diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index 03b4fd7..4bca9a2 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -44,7 +44,7 @@ function impute!(imp::Fill, data::AbstractVector) imp.value end - for i in 1:length(data) + for i in eachindex(data) if ismissing(c, data[i]) data[i] = fill_val end diff --git a/test/deprecated.jl b/test/deprecated.jl index 6c93d43..6f2661e 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -1,5 +1,5 @@ @testset "deprecated" begin - a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) + a = allowmissing(1.0:1.0:20.0) a[[2, 3, 7]] .= missing mask = map(!ismissing, a) diff --git a/test/runtests.jl b/test/runtests.jl index f1a972c..47d5ceb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,7 @@ import Impute: ImputeError @testset "Impute" begin - a = Vector{Union{Float64, Missing}}(1.0:1.0:20.0) + a = allowmissing(1.0:1.0:20.0) a[[2, 3, 7]] .= missing mask = map(!ismissing, a) ctx = Context(; limit=0.2) @@ -38,6 +38,10 @@ import Impute: @test a2 == expected end @testset "DropVars" begin + @testset "Vector" begin + @test_throws MethodError Impute.dropvars(a) + end + @testset "Matrix" begin m = reshape(a, 5, 4) @@ -55,8 +59,8 @@ import Impute: end @testset "DataFrame" begin df = DataFrame( - :sin => Vector{Union{Float64, 
Missing}}(sin.(1.0:1.0:20.0)), - :cos => Vector{Union{Float64, Missing}}(sin.(1.0:1.0:20.0)), + :sin => allowmissing(sin.(1.0:1.0:20.0)), + :cos => allowmissing(sin.(1.0:1.0:20.0)), ) df.sin[[2, 3, 7, 12, 19]] .= missing df.cos[[4, 9]] .= missing @@ -168,7 +172,7 @@ import Impute: df = DataFrame( :hod => hod, :obj => obj, - :val => Vector{Union{Float64, Missing}}( + :val => allowmissing( [sin(x) * cos(y) for (x, y) in zip(hod, obj)] ), ) @@ -307,5 +311,31 @@ import Impute: end end + @testset "Utils" begin + drop_dim1 = DropObs(; vardim=1) + drop_dim2 = DropObs(; vardim=2) + M = [1.0 2.0 3.0 4.0 5.0; 1.1 2.2 3.3 4.4 5.5] + + @testset "obswise" begin + @test map(sum, Impute.obswise(drop_dim1, M)) == [2.1, 4.2, 6.3, 8.4, 10.5] + @test map(sum, Impute.obswise(drop_dim2, M)) == [15, 16.5] + end + + @testset "varwise" begin + @test map(sum, Impute.varwise(drop_dim1, M)) == [15, 16.5] + @test map(sum, Impute.varwise(drop_dim2, M)) == [2.1, 4.2, 6.3, 8.4, 10.5] + end + + @testset "filterobs" begin + @test Impute.filterobs(x -> sum(x) > 5.0, drop_dim1, M) == M[:, 3:5] + @test Impute.filterobs(x -> sum(x) > 15.0, drop_dim2, M) == M[[false, true], :] + end + + @testset "filtervars" begin + @test Impute.filtervars(x -> sum(x) > 15.0, drop_dim1, M) == M[[false, true], :] + @test Impute.filtervars(x -> sum(x) > 5.0, drop_dim2, M) == M[:, 3:5] + end + end + include("deprecated.jl") end From 9254ebf145d466bd8cb1497bf96cb6fa6ad3736e Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 11 Jul 2019 14:14:13 -0500 Subject: [PATCH 19/34] Switched impute!(imp, data) -> impute!(data, imp) --- src/Impute.jl | 8 ++++---- src/deprecated.jl | 25 ++++++++++++++----------- src/imputors.jl | 26 +++++++++++++------------- src/imputors/chain.jl | 6 +++--- src/imputors/drop.jl | 16 ++++++++-------- src/imputors/fill.jl | 4 ++-- src/imputors/interp.jl | 4 ++-- src/imputors/locf.jl | 4 ++-- src/imputors/nocb.jl | 4 ++-- test/runtests.jl | 32 ++++++++++++++++---------------- 10 files changed, 66 
insertions(+), 63 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index 6ce0fcd..442339f 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -70,10 +70,10 @@ for (f, v) in pairs(imputation_methods) f! = Symbol(f, :!) @eval begin - $f(data; kwargs...) = impute($typename(; _extract_context_kwargs(kwargs...)...), data) - $f!(data; kwargs...) = impute!($typename(; _extract_context_kwargs(kwargs...)...), data) - $f(; kwargs...) = data -> impute($typename(; _extract_context_kwargs(kwargs...)...), data) - $f!(; kwargs...) = data -> impute!($typename(; _extract_context_kwargs(kwargs...)...), data) + $f(data; kwargs...) = impute(data, $typename(; _extract_context_kwargs(kwargs...)...)) + $f!(data; kwargs...) = impute!(data, $typename(; _extract_context_kwargs(kwargs...)...)) + $f(; kwargs...) = data -> impute(data, $typename(; _extract_context_kwargs(kwargs...)...)) + $f!(; kwargs...) = data -> impute!(data, $typename(; _extract_context_kwargs(kwargs...)...)) end end diff --git a/src/deprecated.jl b/src/deprecated.jl index 57a2e17..105c3b4 100644 --- a/src/deprecated.jl +++ b/src/deprecated.jl @@ -3,14 +3,17 @@ ############################################################################### Base.@deprecate( impute(imp::Imputor, context::AbstractContext, data; kwargs...), - impute(typeof(imp)(; context=context), data; kwargs...) + impute(data, typeof(imp)(; context=context, kwargs...)) ) Base.@deprecate( impute!(imp::Imputor, context::AbstractContext, data; kwargs...), - impute!(typeof(imp)(; context=context), data; kwargs...) 
+ impute!(data, typeof(imp)(; context=context, kwargs...)) ) +Base.@deprecate impute(imp::Imputor, data) impute(data, imp) +Base.@deprecate impute!(imp::Imputor, data) impute!(data, imp) + ##################################################################### # Deprecate all impute calls where the first argument is an Imputor # ##################################################################### @@ -18,7 +21,7 @@ Base.@deprecate( impute!(data, method::Symbol=:interp, args...; limit::Float64=0.1) Looks up the `Imputor` type for the `method`, creates it and calls -`impute!(imputor::Imputor, data, limit::Float64)` with it. +`impute!(data, imputor::Imputor)` with it. # Arguments * `data`: the datset containing missing elements we should impute. @@ -31,7 +34,7 @@ function impute!(data, method::Symbol, args...; limit::Float64=0.1) Base.depwarn( """ impute!(data, method) is deprecated. - Please use Impute.method!(data) or impute!(imputor::Imputor, data). + Please use Impute.method!(data) or impute!(data, imputor::Imputor). """, :impute! ) @@ -42,14 +45,14 @@ function impute!(data, method::Symbol, args...; limit::Float64=0.1) imputor_type(; context=Context(; limit=limit)) end - return impute!(imputor, data) + return impute!(data, imputor) end """ impute!(data, missing::Function, method::Symbol=:interp, args...; limit::Float64=0.1) Creates the appropriate `Imputor` type and `Context` (using `missing` function) in order to call -`impute!(imputor::Imputor, ctx::Context, data)` with them. +`impute!(data, imputor::Imputor)` with them. # Arguments * `data`: the datset containing missing elements we should impute. @@ -62,7 +65,7 @@ Creates the appropriate `Imputor` type and `Context` (using `missing` function) function impute!(data, missing::Function, method::Symbol, args...; limit::Float64=0.1) Base.depwarn( """ - impute!(data, missing, method) is deprecated. Please use impute!(imputor::Imputor, data). + impute!(data, missing, method) is deprecated. 
Please use impute!(data, imputor::Imputor). """, :impute! ) @@ -73,7 +76,7 @@ function impute!(data, missing::Function, method::Symbol, args...; limit::Float6 imputor_type(; context=Context(; is_missing=missing, limit=limit)) end - return impute!(imputor, data) + return impute!(data, imputor) end """ @@ -85,7 +88,7 @@ function impute(data, args...; kwargs...) Base.depwarn( """ impute(data, args...; kwargs...) is deprecated. - Please use Impute.method(data) or impute(imputor, data). + Please use Impute.method(data) or impute(data, imputor::Imputor). """, :impute ) @@ -115,7 +118,7 @@ end """ chain!(data, imputors::Imputor...; kwargs...) -Creates a `Chain` with `imputors` and calls `impute!(imputor, data; kwargs...)` +Creates a `Chain` with `imputors` and calls `impute!(data, imputor)` """ function chain!(data, imputors::Imputor...; kwargs...) Base.depwarn( @@ -131,7 +134,7 @@ function chain!(data, imputors::Imputor...; kwargs...) imp = typeof(imputor)( (isa(x, AbstractContext) ? ctx : x for x in fieldvalues(imputor))... ) - data = impute!(imp, data) + data = impute!(data, imp) end return data diff --git a/src/imputors.jl b/src/imputors.jl index f2cedb3..78fcf90 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -30,28 +30,28 @@ function filtervars(f::Function, imp::Imputor, data::AbstractMatrix) end """ - impute(imp::Imputor, data) + impute(data, imp::Imputor) Returns a new copy of the `data` with the missing data imputed by the imputor `imp`. """ -function impute(imp::Imputor, data) +function impute(data, imp::Imputor) # Call `deepcopy` because we can trust that it's available for all types. 
- return impute!(imp, deepcopy(data)) + return impute!(deepcopy(data), imp) end # This is a necessary fallback because the tables method doesn't have a type declaration -impute!(imp::Imputor, data::AbstractVector) = MethodError(impute!, (imp, data)) +impute!(data::AbstractVector, imp::Imputor) = MethodError(impute!, (data, imp)) """ - impute!(imp::Imputor, data::AbstractMatrix) + impute!(data::AbstractMatrix, imp::Imputor) Imputes the data in a matrix by imputing the values 1 variable at a time; if this is not the desired behaviour custom imputor methods should overload this method. # Arguments -* `imp::Imputor`: the Imputor method to use * `data::AbstractMatrix`: the data to impute +* `imp::Imputor`: the Imputor method to use # Returns * `AbstractMatrix`: the input `data` with values imputed @@ -65,21 +65,21 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), M) +julia> impute(M, Interpolate(; vardim=1, context=Context(; limit=1.0))) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 3.0 4.0 5.0 1.1 2.2 3.3 4.4 5.5 ``` """ -function impute!(imp::Imputor, data::AbstractMatrix) +function impute!(data::AbstractMatrix, imp::Imputor) for var in varwise(imp, data) - impute!(imp, var) + impute!(var, imp) end return data end """ - impute!(imp::Imputor, table) + impute!(table, imp::Imputor) Imputes the data in a table by imputing the values 1 column at a time; if this is not the desired behaviour custom imputor methods should overload this method. 
@@ -105,7 +105,7 @@ julia> df = DataFrame(:a => [1.0, 2.0, missing, missing, 5.0], :b => [1.1, 2.2, │ 4 │ missing │ missing │ │ 5 │ 5.0 │ 5.5 │ -julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), df) +julia> impute(df, Interpolate(; vardim=1, context=Context(; limit=1.0))) 5×2 DataFrame │ Row │ a │ b │ │ │ Float64⍰ │ Float64⍰ │ @@ -116,14 +116,14 @@ julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), df) │ 4 │ 4.0 │ 4.4 │ │ 5 │ 5.0 │ 5.5 │ """ -function impute!(imp::Imputor, table) +function impute!(table, imp::Imputor) @assert istable(table) # Extract a columns iterate that we should be able to use to mutate the data. # NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data columntable = Tables.columns(table) for cname in propertynames(columntable) - impute!(imp, getproperty(columntable, cname)) + impute!(getproperty(columntable, cname), imp) end return table diff --git a/src/imputors/chain.jl b/src/imputors/chain.jl index d9164a0..25c458d 100644 --- a/src/imputors/chain.jl +++ b/src/imputors/chain.jl @@ -18,7 +18,7 @@ Creates a Chain using the `Imputor`s provided (ordering matters). Chain(imputors::Imputor...) = Chain(collect(imputors)) """ - impute!(imp::Chain, data) + impute!(data, imp::Chain) Runs the `Imputor`s on the supplied data. @@ -29,9 +29,9 @@ Runs the `Imputor`s on the supplied data. 
# Returns * our imputed data """ -function impute!(imp::Chain, data) +function impute!(data, imp::Chain) for imputor in imp.imputors - data = impute!(imputor, data) + data = impute!(data, imputor) end return data diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 9d0ed07..9f618e3 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -22,7 +22,7 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(DropObs(; vardim=1, context=Context(; limit=1.0)), M) +julia> impute(M, DropObs(; vardim=1, context=Context(; limit=1.0))) 2×3 Array{Union{Missing, Float64},2}: 1.0 2.0 5.0 1.1 2.2 5.5 @@ -30,13 +30,13 @@ julia> impute(DropObs(; vardim=1, context=Context(; limit=1.0)), M) """ DropObs(; vardim=2, context=Context()) = DropObs(vardim, context) -function impute!(imp::DropObs, data::AbstractVector) +function impute!(data::AbstractVector, imp::DropObs) imp.context() do c filter!(x -> !ismissing(c, x), data) end end -function impute!(imp::DropObs, data::AbstractMatrix) +function impute!(data::AbstractMatrix, imp::DropObs) imp.context() do c return filterobs(imp, data) do obs !ismissing(c, obs) @@ -46,9 +46,9 @@ end # Deleting elements from subarrays doesn't work so we need to collect that data into # a separate array. 
-impute!(imp::DropObs, data::SubArray) = impute!(imp::DropObs, collect(data)) +impute!(data::SubArray, imp::DropObs) = impute!(collect(data), imp::DropObs) -function impute!(imp::DropObs, table) +function impute!(table, imp::DropObs) imp.context() do c @assert istable(table) rows = Tables.rows(table) @@ -90,14 +90,14 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(DropVars(; vardim=1, context=Context(; limit=0.2)), M) +julia> impute(M, DropVars(; vardim=1, context=Context(; limit=0.2))) 1×5 Array{Union{Missing, Float64},2}: 1.1 2.2 3.3 missing 5.5 ``` """ DropVars(; vardim=2, context=Context()) = DropVars(vardim, context) -function impute!(imp::DropVars, data::AbstractMatrix) +function impute!(data::AbstractMatrix, imp::DropVars) return filtervars(imp, data) do var try imp.context() do c @@ -116,7 +116,7 @@ function impute!(imp::DropVars, data::AbstractMatrix) end end -function impute!(imp::DropVars, table) +function impute!(table, imp::DropVars) @assert istable(table) cols = Tables.columns(table) diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index 4bca9a2..fc3685a 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -27,7 +27,7 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(Fill(; vardim=1, context=Context(; limit=1.0)), M) +julia> impute(M, Fill(; vardim=1, context=Context(; limit=1.0))) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 2.66667 2.66667 5.0 1.1 2.2 3.3 3.025 5.5 @@ -35,7 +35,7 @@ julia> impute(Fill(; vardim=1, context=Context(; limit=1.0)), M) """ Fill(; value=mean, vardim=2, context=Context()) = Fill(value, vardim, context) -function impute!(imp::Fill, data::AbstractVector) +function impute!(data::AbstractVector, imp::Fill) imp.context() do c fill_val = if isa(imp.value, Function) # Call `deepcopy` because we can trust that it's available for all types. 
diff --git a/src/imputors/interp.jl b/src/imputors/interp.jl index c13aad2..6e41398 100644 --- a/src/imputors/interp.jl +++ b/src/imputors/interp.jl @@ -28,7 +28,7 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), M) +julia> impute(M, Interpolate(; vardim=1, context=Context(; limit=1.0))) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 3.0 4.0 5.0 1.1 2.2 3.3 4.4 5.5 @@ -36,7 +36,7 @@ julia> impute(Interpolate(; vardim=1, context=Context(; limit=1.0)), M) """ Interpolate(; vardim=2, context=Context()) = Interpolate(vardim, context) -function impute!(imp::Interpolate, data::AbstractVector{<:Union{T, Missing}}) where T +function impute!(data::AbstractVector{<:Union{T, Missing}}, imp::Interpolate) where T imp.context() do c i = findfirst(c, data) + 1 diff --git a/src/imputors/locf.jl b/src/imputors/locf.jl index 768353f..02452ef 100644 --- a/src/imputors/locf.jl +++ b/src/imputors/locf.jl @@ -31,7 +31,7 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 -julia> impute(LOCF(; vardim=1, context=Context(; limit=1.0)), M) +julia> impute(M, LOCF(; vardim=1, context=Context(; limit=1.0))) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 2.0 2.0 5.0 1.1 2.2 3.3 3.3 5.5 @@ -39,7 +39,7 @@ julia> impute(LOCF(; vardim=1, context=Context(; limit=1.0)), M) """ LOCF(; vardim=2, context=Context()) = LOCF(vardim, context) -function impute!(imp::LOCF, data::AbstractVector) +function impute!(data::AbstractVector, imp::LOCF) imp.context() do c start_idx = findfirst(c, data) + 1 for i in start_idx:lastindex(data) diff --git a/src/imputors/nocb.jl b/src/imputors/nocb.jl index 49c8e71..aca5798 100644 --- a/src/imputors/nocb.jl +++ b/src/imputors/nocb.jl @@ -30,7 +30,7 @@ julia> M = [1.0 2.0 missing missing 5.0; 1.1 2.2 3.3 missing 5.5] 1.0 2.0 missing missing 5.0 1.1 2.2 3.3 missing 5.5 
-julia> impute(NOCB(; vardim=1, context=Context(; limit=1.0)), M) +julia> impute(M, NOCB(; vardim=1, context=Context(; limit=1.0))) 2×5 Array{Union{Missing, Float64},2}: 1.0 2.0 5.0 5.0 5.0 1.1 2.2 3.3 5.5 5.5 @@ -38,7 +38,7 @@ julia> impute(NOCB(; vardim=1, context=Context(; limit=1.0)), M) """ NOCB(; vardim=2, context=Context()) = NOCB(vardim, context) -function impute!(imp::NOCB, data::AbstractVector) +function impute!(data::AbstractVector, imp::NOCB) imp.context() do c end_idx = findlast(c, data) - 1 for i in end_idx:-1:firstindex(data) diff --git a/test/runtests.jl b/test/runtests.jl index 47d5ceb..094a657 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -26,7 +26,7 @@ import Impute: @testset "Drop" begin @testset "DropObs" begin - result = impute(DropObs(; context=ctx), a) + result = impute(a, DropObs(; context=ctx)) expected = copy(a) deleteat!(expected, [2, 3, 7]) @@ -45,7 +45,7 @@ import Impute: @testset "Matrix" begin m = reshape(a, 5, 4) - result = impute(DropVars(; context=ctx), m) + result = impute(m, DropVars(; context=ctx)) expected = copy(m)[:, 2:4] @test isequal(result, expected) @@ -65,7 +65,7 @@ import Impute: df.sin[[2, 3, 7, 12, 19]] .= missing df.cos[[4, 9]] .= missing - result = impute(DropVars(; context=ctx), df) + result = impute(df, DropVars(; context=ctx)) expected = df[[:cos]] @test isequal(result, expected) @@ -80,7 +80,7 @@ import Impute: end @testset "Interpolate" begin - result = impute(Interpolate(; context=ctx), a) + result = impute(a, Interpolate(; context=ctx)) @test result == collect(1.0:1.0:20) @test result == interp(a; context=ctx) @@ -105,7 +105,7 @@ import Impute: @testset "Fill" begin @testset "Value" begin fill_val = -1.0 - result = impute(Fill(; value=fill_val, context=ctx), a) + result = impute(a, Fill(; value=fill_val, context=ctx)) expected = copy(a) expected[[2, 3, 7]] .= fill_val @@ -114,7 +114,7 @@ import Impute: end @testset "Mean" begin - result = impute(Fill(; value=mean, context=ctx), a) + result = 
impute(a, Fill(; value=mean, context=ctx)) expected = copy(a) expected[[2, 3, 7]] .= mean(a[mask]) @@ -128,7 +128,7 @@ import Impute: end @testset "LOCF" begin - result = impute(LOCF(; context=ctx), a) + result = impute(a, LOCF(; context=ctx)) expected = copy(a) expected[2] = 1.0 expected[3] = 1.0 @@ -143,7 +143,7 @@ import Impute: end @testset "NOCB" begin - result = impute(NOCB(; context=ctx), a) + result = impute(a, NOCB(; context=ctx)) expected = copy(a) expected[2] = 4.0 expected[3] = 4.0 @@ -161,7 +161,7 @@ import Impute: ctx = Context(; limit=1.0) @testset "Single DataFrame" begin data = dataset("boot", "neuro") - df = impute(Interpolate(; context=ctx), data) + df = impute(data, Interpolate(; context=ctx)) @test isequal(df, Impute.interp(data; context=ctx)) end @testset "GroupedDataFrame" begin @@ -201,7 +201,7 @@ import Impute: data = Matrix(dataset("boot", "neuro")) @testset "Drop" begin - result = impute(DropObs(; context=ctx), data) + result = impute(data, DropObs(; context=ctx)) @test size(result, 1) == 4 @test result == Impute.dropobs(data; context=ctx) @@ -210,7 +210,7 @@ import Impute: end @testset "Fill" begin - result = impute(Fill(; value=0.0, context=ctx), data) + result = impute(data, Fill(; value=0.0, context=ctx)) @test size(result) == size(data) @test result == Impute.fill(data; value=0.0, context=ctx) @@ -222,7 +222,7 @@ import Impute: @testset "Not enough data" begin ctx = Context(; limit=0.1) - @test_throws ImputeError impute(DropObs(; context=ctx), a) + @test_throws ImputeError impute(a, DropObs(; context=ctx)) @test_throws ImputeError Impute.dropobs(a; context=ctx) end @@ -240,12 +240,12 @@ import Impute: # We can also use the Chain type with explicit Imputor types result2 = impute( + orig, Impute.Chain( Impute.Interpolate(; context=ctx), Impute.LOCF(), Impute.NOCB() ), - orig, ) @test result == result2 @@ -291,7 +291,7 @@ import Impute: @testset "Base" begin ctx = Context(; limit=0.1) @test_throws ImputeError Impute.dropobs(a; 
context=ctx) - @test_throws ImputeError impute(DropObs(; context=ctx), a) + @test_throws ImputeError impute(a, DropObs(; context=ctx)) end @testset "Weighted" begin @@ -299,7 +299,7 @@ import Impute: # because missing earlier observations is less important than later ones. ctx = WeightedContext(eweights(20, 0.3); limit=0.1) @test isa(ctx, WeightedContext) - result = impute(DropObs(), ctx, a) + result = impute(a, DropObs(; context=ctx)) expected = copy(a) deleteat!(expected, [2, 3, 7]) @test result == expected @@ -307,7 +307,7 @@ import Impute: # If we reverse the weights such that earlier observations are more important # then our previous limit of 0.2 won't be enough to succeed. ctx = WeightedContext(reverse!(eweights(20, 0.3)); limit=0.2) - @test_throws ImputeError impute(DropObs(), ctx, a) + @test_throws ImputeError impute(a, DropObs(; context=ctx)) end end From 2fece06dd293ca0c98aaa13ca590d273022be832 Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 11 Jul 2019 14:14:47 -0500 Subject: [PATCH 20/34] Remove matrix orientation deprecation. --- src/Impute.jl | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/Impute.jl b/src/Impute.jl index 442339f..79c97e1 100644 --- a/src/Impute.jl +++ b/src/Impute.jl @@ -25,15 +25,6 @@ function __init__() If you depend on a specific threshold please pass in an appropriate `AbstractContext`. """ ) - - @warn( - """ - All matrix imputation methods will be switching to the column-major convention - (e.g., each column corresponds to an observation, and each row corresponds to a variable). - To maintain the existing behaviour please pass `vardim=2` to the `Imputor` constructors - or impute functions (e.g., `fill`, `interp`, `locf`). 
- """ - ) end """ From f77421f009d94ab69fb9c067663f8b7925f9bde0 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Thu, 11 Jul 2019 14:16:42 -0500 Subject: [PATCH 21/34] Update test/runtests.jl Co-Authored-By: Nick Robinson --- test/runtests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 094a657..7222a68 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -237,7 +237,6 @@ import Impute: # Confirm that we don't have any more missing values @test !any(ismissing, Matrix(result)) - # We can also use the Chain type with explicit Imputor types result2 = impute( orig, From e3ddd0841c5f4dc2795f35e831021b8668b62284 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Thu, 11 Jul 2019 14:19:44 -0500 Subject: [PATCH 22/34] Update src/imputors.jl Co-Authored-By: Nick Robinson --- src/imputors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imputors.jl b/src/imputors.jl index 78fcf90..3db1937 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -118,7 +118,7 @@ julia> impute(df, Interpolate(; vardim=1, context=Context(; limit=1.0))) """ function impute!(table, imp::Imputor) @assert istable(table) - # Extract a columns iterate that we should be able to use to mutate the data. + # Extract a columns iterator that we should be able to use to mutate the data. 
# NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data columntable = Tables.columns(table) From cadd28df0bacf9c63bebf2aae28d868f0cf3bda9 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Thu, 11 Jul 2019 14:21:42 -0500 Subject: [PATCH 23/34] Update src/imputors.jl Co-Authored-By: Nick Robinson --- src/imputors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imputors.jl b/src/imputors.jl index 3db1937..bf143b6 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -46,7 +46,7 @@ impute!(data::AbstractVector, imp::Imputor) = MethodError(impute!, (data, imp)) """ impute!(data::AbstractMatrix, imp::Imputor) -Imputes the data in a matrix by imputing the values 1 variable at a time; +Impute the data in a matrix by imputing the values 1 variable at a time; if this is not the desired behaviour custom imputor methods should overload this method. # Arguments From 81fc7f8f85bc13d80e3d9a0fccdcdb6a79246dff Mon Sep 17 00:00:00 2001 From: rofinn Date: Thu, 11 Jul 2019 14:26:40 -0500 Subject: [PATCH 24/34] Missed PR review fixes. 
--- Project.toml | 1 - test/runtests.jl | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index fab3817..cc7dae5 100644 --- a/Project.toml +++ b/Project.toml @@ -5,7 +5,6 @@ version = "0.2.0" [deps] IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e" -Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" diff --git a/test/runtests.jl b/test/runtests.jl index 7222a68..68e55e9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -66,7 +66,7 @@ import Impute: df.cos[[4, 9]] .= missing result = impute(df, DropVars(; context=ctx)) - expected = df[[:cos]] + expected = select(df, :cos) @test isequal(result, expected) @test isequal(result, Impute.dropvars(df; context=ctx)) From 4b18a0d4dcc8dd4a2340398f97015f1e0220c07d Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Fri, 12 Jul 2019 16:25:51 -0500 Subject: [PATCH 25/34] Update src/imputors.jl Co-Authored-By: Lyndon White --- src/imputors.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imputors.jl b/src/imputors.jl index bf143b6..1d64113 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -46,7 +46,7 @@ impute!(data::AbstractVector, imp::Imputor) = MethodError(impute!, (data, imp)) """ impute!(data::AbstractMatrix, imp::Imputor) -Impute the data in a matrix by imputing the values 1 variable at a time; +Impute the data in a matrix by imputing the values one variable at a time; if this is not the desired behaviour custom imputor methods should overload this method. 
# Arguments From fafe2194e2d40cfd878ff96c90826939bb15fec5 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Fri, 12 Jul 2019 16:32:42 -0500 Subject: [PATCH 26/34] Update src/context.jl Co-Authored-By: Lyndon White --- src/context.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/context.jl b/src/context.jl index aa0ffa1..8f7d76e 100644 --- a/src/context.jl +++ b/src/context.jl @@ -181,7 +181,7 @@ This context type can be useful if some missing observation are more important t """ function WeightedContext( wv::AbstractWeights; - limit::Float64=1.0, + limit::Real=1.0, is_missing::Function=ismissing, on_complete::Function=complete ) From 8f0f4b632f5921697a66e63b16befdacd211cf7e Mon Sep 17 00:00:00 2001 From: rofinn Date: Fri, 12 Jul 2019 17:21:53 -0500 Subject: [PATCH 27/34] Throw MethodErrors in fallback table methods. --- src/imputors.jl | 7 ++----- src/imputors/drop.jl | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/imputors.jl b/src/imputors.jl index 1d64113..80eaedf 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -39,10 +39,6 @@ function impute(data, imp::Imputor) return impute!(deepcopy(data), imp) end - -# This is a necessary fallback because the tables method doesn't have a type declaration -impute!(data::AbstractVector, imp::Imputor) = MethodError(impute!, (data, imp)) - """ impute!(data::AbstractMatrix, imp::Imputor) @@ -117,7 +113,8 @@ julia> impute(df, Interpolate(; vardim=1, context=Context(; limit=1.0))) │ 5 │ 5.0 │ 5.5 │ """ function impute!(table, imp::Imputor) - @assert istable(table) + istable(table) || throw(MethodError(impute!, (table, imp))) + # Extract a columns iterator that we should be able to use to mutate the data. 
# NOTE: Mutation is not guaranteed for all table types, but it avoid copying the data columntable = Tables.columns(table) diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index 9f618e3..bd21b75 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -117,7 +117,7 @@ function impute!(data::AbstractMatrix, imp::DropVars) end function impute!(table, imp::DropVars) - @assert istable(table) + istable(table) || throw(MethodError(impute!, (table, imp))) cols = Tables.columns(table) cnames = Iterators.filter(propertynames(cols)) do cname From d8b51d49f3fdce1918df975f42a7b5d67d0dd0f3 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 15 Jul 2019 11:34:22 -0500 Subject: [PATCH 28/34] Update src/imputors/fill.jl Co-Authored-By: Nick Robinson --- src/imputors/fill.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/imputors/fill.jl b/src/imputors/fill.jl index fc3685a..1aad47b 100644 --- a/src/imputors/fill.jl +++ b/src/imputors/fill.jl @@ -12,7 +12,7 @@ The current implementation is univariate, so each variable in a table or matrix be handled independently. # Keyword Arguments -* `value::Any`: A scalar missing value or a function that returns the a scalar if +* `value::Any`: A scalar or a function that returns a scalar if passed the data with missing data removed (e.g, `mean`) * `vardim=2::Int`: Specify the dimension for variables in matrix input data * `context::AbstractContext`: A context which keeps track of missing data From 7c702272f32d667cf3577a75889f2cd28d993009 Mon Sep 17 00:00:00 2001 From: Rory Finnegan Date: Mon, 15 Jul 2019 11:34:40 -0500 Subject: [PATCH 29/34] Update src/context.jl Co-Authored-By: Nick Robinson --- src/context.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/context.jl b/src/context.jl index 8f7d76e..306acdd 100644 --- a/src/context.jl +++ b/src/context.jl @@ -112,7 +112,7 @@ weighted. 
* `n::Int`: number of observations * `count::Int`: number of missing values found * `limit::Float64`: portion of total values allowed to be imputed (should be between 0.0 and 1.0). -* `is_missing::Function`: returns a Bool if the value counts as missing +* `is_missing::Function`: must return a Bool indicating if the value counts as missing * `on_complete::Function`: a function to run when imputation is complete """ function Context(; From d5ff2c5dded96f0e3ea45bdc63d957da81b43af0 Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 15 Jul 2019 11:39:49 -0500 Subject: [PATCH 30/34] Use selectdim for obswise and varwise. --- src/imputors.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/imputors.jl b/src/imputors.jl index 80eaedf..4d4d471 100644 --- a/src/imputors.jl +++ b/src/imputors.jl @@ -12,11 +12,11 @@ obsdim(imp::Imputor) = imp.vardim == 1 ? 2 : 1 vardim(imp::Imputor) = imp.vardim function obswise(imp::Imputor, data::AbstractMatrix) - (imp.vardim == 1 ? view(data, :, i) : view(data, i, :) for i in axes(data, obsdim(imp))) + return (selectdim(data, obsdim(imp), i) for i in axes(data, obsdim(imp))) end function varwise(imp::Imputor, data::AbstractMatrix) - (imp.vardim == 1 ? 
view(data, i, :) : view(data, :, i) for i in axes(data, vardim(imp))) + return (selectdim(data, vardim(imp), i) for i in axes(data, vardim(imp))) end function filterobs(f::Function, imp::Imputor, data::AbstractMatrix) From 559107646cb1438eed118d833d6a65cc5fab1a52 Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 15 Jul 2019 11:41:18 -0500 Subject: [PATCH 31/34] =?UTF-8?q?Use=20=E2=88=98=20in=20tests=20to=20compo?= =?UTF-8?q?se=20imputor=20pipelines.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/runtests.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 68e55e9..2ce9fc7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -181,8 +181,8 @@ import Impute: gdf1 = groupby(deepcopy(df), [:hod, :obj]) gdf2 = groupby(df, [:hod, :obj]) - f1 = x -> Impute.interp(x; context=ctx) |> Impute.locf!() |> Impute.nocb!() - f2 = x -> Impute.interp!(x; context=ctx) |> Impute.locf!() |> Impute.nocb!() + f1 = Impute.interp(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() + f2 = Impute.interp!(; context=ctx) ∘ Impute.locf!() ∘ Impute.nocb!() result = vcat(f1.(gdf1)...) @test df != result From ec902fe091645b95153490502ed42f08e4d86cfc Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 15 Jul 2019 11:46:03 -0500 Subject: [PATCH 32/34] Change !any(ismissing, ...) tests to all(!ismissing, ...) 
--- test/deprecated.jl | 6 +++--- test/runtests.jl | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/deprecated.jl b/test/deprecated.jl index 6f2661e..961d86e 100644 --- a/test/deprecated.jl +++ b/test/deprecated.jl @@ -125,7 +125,7 @@ @test size(result) == size(orig) # Confirm that we don't have any more missing values - @test !any(ismissing, Matrix(result)) + @test all(!ismissing, Matrix(result)) end @testset "Column Table" begin @@ -140,7 +140,7 @@ @test size(result) == size(orig) # Confirm that we don't have any more missing values - @test !any(ismissing, result) + @test all(!ismissing, result) end @testset "Matrix" begin @@ -155,7 +155,7 @@ @test size(result) == size(data) # Confirm that we don't have any more missing values - @test !any(ismissing, result) + @test all(!ismissing, result) end end diff --git a/test/runtests.jl b/test/runtests.jl index 2ce9fc7..05e3002 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -187,7 +187,7 @@ import Impute: result = vcat(f1.(gdf1)...) 
@test df != result @test size(result) == (24 * 12 * 10, 3) - @test !any(ismissing, Tables.matrix(result)) + @test all(!ismissing, Tables.matrix(result)) # Test that we can also mutate the dataframe directly f2.(gdf2) @@ -235,7 +235,7 @@ import Impute: @test size(result) == size(orig) # Confirm that we don't have any more missing values - @test !any(ismissing, Matrix(result)) + @test all(!ismissing, Matrix(result)) # We can also use the Chain type with explicit Imputor types result2 = impute( @@ -259,7 +259,7 @@ import Impute: @test size(result) == size(orig) # Confirm that we don't have any more missing values - @test !any(ismissing, result) + @test all(!ismissing, result) end @testset "Matrix" begin @@ -268,7 +268,7 @@ import Impute: @test size(result) == size(data) # Confirm that we don't have any more missing values - @test !any(ismissing, result) + @test all(!ismissing, result) end end From e823cc24813c6a38daa41c6f559ae09bb4b06f4c Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 15 Jul 2019 12:00:43 -0500 Subject: [PATCH 33/34] Restrict RDatasets to >=0.6.2 --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index cc7dae5..b66b346 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] DataFrames = ">= 0.16" IterTools = "1.2" +RDatasets = ">= 0.6.2" Tables = "0.2" julia = "1" From 051d6ce01eb9e10bca2b6d67ec2ca1b464e9dda5 Mon Sep 17 00:00:00 2001 From: rofinn Date: Mon, 15 Jul 2019 12:04:28 -0500 Subject: [PATCH 34/34] Don't pipe to materializer. 
--- src/imputors/drop.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/imputors/drop.jl b/src/imputors/drop.jl index bd21b75..ad396e4 100644 --- a/src/imputors/drop.jl +++ b/src/imputors/drop.jl @@ -55,10 +55,11 @@ function impute!(table, imp::DropObs) # Unfortunately, we'll need to construct a new table # since Tables.rows is just an iterator - table = Iterators.filter(rows) do r + filtered = Iterators.filter(rows) do r !any(x -> ismissing(c, x), propertyvalues(r)) - end |> materializer(table) + end + table = materializer(table)(filtered) return table end end @@ -138,6 +139,7 @@ function impute!(table, imp::DropVars) end end - table = Tables.select(table, cnames...) |> materializer(table) + selected = Tables.select(table, cnames...) + table = materializer(table)(selected) return table end