Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pairwise #54

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ version = "0.4.0"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
NamedArrays = "86f7a689-2022-50b4-a561-43c23ac3c673"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
Expand All @@ -18,7 +19,9 @@ julia = "1"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "DataFrames"]
test = ["DataFrames", "LinearAlgebra", "Random", "Test"]
2 changes: 2 additions & 0 deletions src/FreqTables.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
module FreqTables
using Statistics
using CategoricalArrays
using Tables
using NamedArrays
using Missings

include("freqtable.jl")
include("pairwise.jl")

export freqtable, proptable, prop, Name
end # module
191 changes: 191 additions & 0 deletions src/pairwise.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
function _pairwise!(::Val{:none}, res::AbstractMatrix, f, x, y, symmetric::Bool)
m, n = size(res)
for j in 1:n, i in 1:m
symmetric && i > j && continue

# For performance, diagonal is special-cased
if f === cor && i == j && x[i] === y[j]
# If the type isn't concrete, 1 may not be converted to the right type
# and the final matrix will have an abstract eltype
# (missings are propagated via the second branch, but NaNs are ignored)
res[i, j] = isconcretetype(eltype(res)) ? 1 : one(f(x[i], y[j]))
else
res[i, j] = f(x[i], y[j])
end
end
if symmetric
for j in 1:n, i in (j+1):m
res[i, j] = res[j, i]
end
end
return res
end

function _pairwise!(::Val{:pairwise}, res::AbstractMatrix, f, x, y, symmetric::Bool)
m, n = size(res)
for j in 1:n
ynminds = .!ismissing.(y[j])
for i in 1:m
symmetric && i > j && continue

if x[i] === y[j]
ynm = view(y[j], ynminds)
# For performance, diagonal is special-cased
if f === cor && i == j
# If the type isn't concrete, 1 may not be converted to the right type
# and the final matrix will have an abstract eltype
# (missings and NaNs are ignored)
res[i, j] = isconcretetype(eltype(res)) ? 1 : one(f(ynm, ynm))
else
res[i, j] = f(ynm, ynm)
end
else
nminds = .!ismissing.(x[i]) .& ynminds
xnm = view(x[i], nminds)
ynm = view(y[j], nminds)
res[i, j] = f(xnm, ynm)
end
end
end
if symmetric
for j in 1:n, i in (j+1):m
res[i, j] = res[j, i]
end
end
return res
end

function _pairwise!(::Val{:listwise}, res::AbstractMatrix, f, x, y, symmetric::Bool)
m, n = size(res)
nminds = .!ismissing.(x[1])
for i in 2:m
nminds .&= .!ismissing.(x[i])
end
if x !== y
for j in 1:n
nminds .&= .!ismissing.(y[j])
end
end

# Computing integer indices once for all vectors is faster
nminds′ = findall(nminds)
# TODO: check whether wrapping views in a custom array type which asserts
# that entries cannot be `missing` (similar to `skipmissing`)
# could offer better performance
return _pairwise!(Val(:none), res, f,
[view(xi, nminds′) for xi in x],
[view(yi, nminds′) for yi in y],
symmetric)
end

function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing}
inds = keys(first(x))
for xi in x
keys(xi) == inds ||
throw(ArgumentError("All input vectors must have the same indices"))
end
for yi in y
keys(yi) == inds ||
throw(ArgumentError("All input vectors must have the same indices"))
end
x′ = collect(x)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe better use comprehension? It will narrow down the eltype of x'

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question. I'm not sure what's best. This will only make a difference for vector inputs, right? For these, if you write [x1, x2] you'll get a narrow type already (actually, narrower than what a comprehension would do alone thanks to promote_type as opposed to typejoin). So that would be useful mainly if you pass a vector that you allocated with an abstract type, in which case you may have reasons to do that (avoid unnecessary specialization...).

Anyway it shouldn't affect performance a lot since most of the time should be spent in f(xi, yi). Do you have a use case in mind?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no - I just know that collect does not narrow type.

y′ = collect(y)
m = length(x)
n = length(y)

T = Core.Compiler.return_type(f, Tuple{eltype(x′), eltype(y′)})
Tsm = Core.Compiler.return_type((x, y) -> f(disallowmissing(x), disallowmissing(y)),
Tuple{eltype(x′), eltype(y′)})

if skipmissing === :none
res = Matrix{T}(undef, m, n)
_pairwise!(Val(:none), res, f, x′, y′, symmetric)
elseif skipmissing === :pairwise
res = Matrix{Tsm}(undef, m, n)
_pairwise!(Val(:pairwise), res, f, x′, y′, symmetric)
elseif skipmissing === :listwise
res = Matrix{Tsm}(undef, m, n)
_pairwise!(Val(:listwise), res, f, x′, y′, symmetric)
else
throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise"))
end

# identity.(res) lets broadcasting compute a concrete element type
# TODO: using promote_type rather than typejoin (which broadcast uses) would make sense
# Once identity.(res) is inferred automatically (JuliaLang/julia#30485),
# the assertion can be removed
@static if isdefined(Base.Broadcast, :promote_typejoin_union) # Julia >= 1.6
U = Base.Broadcast.promote_typejoin_union(Union{T, Tsm})
return (isconcretetype(eltype(res)) ? res : identity.(res))::Matrix{<:U}
else
return (isconcretetype(eltype(res)) ? res : identity.(res))
end
end

function _pairwise_general(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing}
if symmetric && x !== y
throw(ArgumentError("symmetric=true only makes sense passing a single set of variables"))
end
if Tables.istable(x) && Tables.istable(y)
xcols = Tables.columns(x)
ycols = Tables.columns(y)
xcolnames = [String(nm) for nm in Tables.columnnames(xcols)]
ycolnames = [String(nm) for nm in Tables.columnnames(ycols)]
xcolsiter = (Tables.getcolumn(xcols, i) for i in 1:length(xcolnames))
ycolsiter = (Tables.getcolumn(ycols, i) for i in 1:length(ycolnames))
res = _pairwise(Val(skipmissing), f, xcolsiter, ycolsiter, symmetric)
return NamedArray(res, (xcolnames, ycolnames))
else
x′ = collect(x)
y′ = collect(y)
if all(xi -> xi isa AbstractArray, x′) && all(yi -> yi isa AbstractArray, y′)
return _pairwise(Val(skipmissing), f, x′, y′, symmetric)
else
throw(ArgumentError("x and y must be either iterators of AbstractArrays, " *
"or Tables.jl objects"))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reverse the order here I think.

end
end
end

"""
pairwise(f, x[, y], symmetric::Bool=false, skipmissing::Symbol=:none)

Return a matrix holding the result of applying `f` to all possible pairs
of vectors in iterators `x` and `y`. Rows correspond to
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why vectors? Cannot we say that x and y have to be iterables of iterables and we apply f to a their cross-product?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK - now I see why below. Maybe then require x and y to be AbstractVector{<:AbstractVector}?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I guess we could allow anything, f can take care of throwing an error if needed.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also maybe write exactly what you have in code - if both x and y are Table.jl compliant then they are treated as table. Otherwise BOTH are treated as iterators?

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, to skip missing values we rely on view so I guess for that case we need AbstractVector. In the future we could use skipmissings but I'd say it's not high priority.

vectors in `x` and columns to vectors in `y`. If `y` is omitted then a
square matrix crossing `x` with itself is returned.

Alternatively, if `x` and `y` are tables (in the Tables.jl sense), return
a `NamedMatrix` holding the result of applying `f` to all possible pairs
of columns in `x` and `y`.

As a special case, if `f` is `cor`, diagonal cells are set to 1 even in
the presence `NaN` or `Inf` entries (but `missing` is propagated unless
`skipmissing` is different from `:none`).

# Keyword arguments
- `symmetric::Bool=false`: If `true`, `f` is only called to compute
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

formally commutative, as symmetric is a property of a binary relation typically.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Symmetric" referred to the table. Not sure what's the best term.

for the lower triangle of the matrix, and these values are copied
to fill the upper triangle. Only possible when `y` is omitted.
This is automatically set to `true` when `f` is `cor` or `cov`.
- `skipmissing::Symbol=:none`: If `:none` (the default), missing values
in input vectors are passed to `f` without any modification.
Use `:pairwise` to skip entries with a `missing` value in either
of the two vectors passed to `f` for a given pair of vectors in `x` and `y`.
Use `:listwise` to skip entries with a `missing` value in any of the
vectors in `x` or `y`; note that this is likely to drop a large part of
entries.
"""
pairwise(f, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) =
_pairwise_general(Val(skipmissing), f, x, y, symmetric)

# cov(x) is faster than cov(x, x)
pairwise(::typeof(cov), x, y; symmetric::Bool=false, skipmissing::Symbol=:none) =
pairwise((x, y) -> x === y ? cov(x) : cov(x, y), x, y,
symmetric=symmetric, skipmissing=skipmissing)

pairwise(::typeof(cor), x; symmetric::Bool=true, skipmissing::Symbol=:none) =
pairwise(cor, x, x, symmetric=symmetric, skipmissing=skipmissing)

pairwise(::typeof(cov), x; symmetric::Bool=true, skipmissing::Symbol=:none) =
pairwise(cov, x, x, symmetric=symmetric, skipmissing=skipmissing)
167 changes: 167 additions & 0 deletions test/pairwise.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
using FreqTables
using FreqTables: pairwise
using Test, Random, Statistics, LinearAlgebra
using Missings, NamedArrays, DataFrames, Tables

const ≅ = isequal

Random.seed!(1)

# to avoid using specialized method
arbitrary_fun(x, y) = cor(x, y)

@testset "pairwise with $f" for f in (arbitrary_fun, cor, cov)
@testset "basic interface" begin
x = [rand(10) for _ in 1:4]
y = [rand(Float32, 10) for _ in 1:5]
# to test case where inference of returned eltype fails
z = [Vector{Any}(rand(Float32, 10)) for _ in 1:5]

res = @inferred pairwise(f, x, y)
@test res isa Matrix{Float64}
@test res == [f(xi, yi) for xi in x, yi in y]

res = pairwise(f, y, z)
@test res isa Matrix{Float32}
@test res == [f(yi, zi) for yi in y, zi in z]

res = pairwise(f, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]])
@test res isa Matrix{AbstractFloat}
@test res == [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]),
yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])]
@test typeof.(res) == [Float64 Float64
Float64 Float32]

@inferred pairwise(f, x, y)

@test_throws ArgumentError pairwise(f, [Int[]], [Int[]])
end

@testset "missing values handling interface" begin
xm = [ifelse.(rand(100) .> 0.9, missing, rand(100)) for _ in 1:4]
ym = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4]
zm = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4]

res = pairwise(f, xm, ym)
@test res isa Matrix{Missing}
@test res ≅ [missing for xi in xm, yi in ym]

res = pairwise(f, xm, ym, skipmissing=:pairwise)
@test res isa Matrix{Float64}
@test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym],
rtol=1e-6)

res = pairwise(f, ym, zm, skipmissing=:pairwise)
@test res isa Matrix{Float32}
@test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm],
rtol=1e-6)

nminds = mapreduce(x -> .!ismissing.(x),
(x, y) -> x .& y,
[xm; ym])
res = pairwise(f, xm, ym, skipmissing=:listwise)
@test res isa Matrix{Float64}
@test isapprox(res, [f(view(xi, nminds), view(yi, nminds)) for xi in xm, yi in ym],
rtol=1e-6)

if VERSION >= v"1.6.0-DEV"
# inference of cor fails so use an inferrable function
# to check that pairwise itself is inferrable
for skipmissing in (:none, :pairwise, :listwise)
g(x, y=x) = pairwise((x, y) -> x[1] * y[1], x, y, skipmissing=skipmissing)
@test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) ==
Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}},
Vector{Vector{Union{Float64, Missing}}}}) ==
Matrix{<: Union{Float64, Missing}}
if skipmissing in (:pairwise, :listwise)
@test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) ==
Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}},
Vector{Vector{Union{Float64, Missing}}}}) ==
Matrix{Float64}
end
end
end

@test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:something)

# variable with only missings
xm = [fill(missing, 10), rand(10)]
ym = [rand(10), rand(10)]

res = pairwise(f, xm, ym)
@test res isa Matrix{Union{Float64, Missing}}
@test res ≅ [f(xi, yi) for xi in xm, yi in ym]

@test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:pairwise)
@test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:listwise)
end

@testset "iterators" begin
x = (v for v in [rand(10) for _ in 1:4])
y = (v for v in [rand(10) for _ in 1:4])

@test pairwise(f, x, y) == pairwise(f, collect(x), collect(y))
@test pairwise(f, x) == pairwise(f, collect(x))
end

@testset "two-argument method" begin
x = [rand(10) for _ in 1:4]
@test pairwise(f, x) == pairwise(f, x, x)
end

@testset "symmetric" begin
x = [rand(10) for _ in 1:4]
y = [rand(10) for _ in 1:4]
@test pairwise(f, x, x, symmetric=true) ==
pairwise(f, x, symmetric=true) ==
Symmetric(pairwise(f, x, x), :U)
@test_throws ArgumentError pairwise(f, x, y, symmetric=true)
end

@testset "Tables method for $T" for T in (identity, DataFrame, Tables.rowtable)
x = rand(10)
y = rand(10)
z = rand(10)

res = pairwise(f, T((x=x, y=y)))
@test res isa NamedMatrix{Float64}
@test res == pairwise(f, [x, y])
@test names(res) == [["x", "y"], ["x", "y"]]

res = pairwise(f, T((x=x, y=y)), T((x=x, z=z, y=y)))
@test res isa NamedMatrix{Float64}
@test res == pairwise(f, [x, y], [x, z, y])
@test names(res) == [["x", "y"], ["x", "z", "y"]]

if T != DataFrame
@inferred pairwise(f, T((x=x, y=y)))
end
end

@testset "cor corner cases" begin
# Integer inputs must give a Float64 output
res = pairwise(cor, [[1, 2, 3], [1, 5, 2]])
@test res isa Matrix{Float64}
@test res == [cor(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]),
yi in ([1, 2, 3], [1, 5, 2])]

# NaNs are ignored for the diagonal
res = pairwise(cor, [[1, 2, NaN], [1, 5, 2]])
@test res isa Matrix{Float64}
@test res ≅ [1.0 NaN
NaN 1.0]

# missings are propagated even for the diagonal
res = pairwise(cor, [[1, 2, 7], [1, 5, missing]])
@test res isa Matrix{Union{Float64, Missing}}
@test res ≅ [1.0 missing
missing missing]

for sm in (:pairwise, :listwise)
res = pairwise(cor, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm)
@test res isa Matrix{Float64}
@test res ≅ [1.0 NaN
NaN 1.0]
end
end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
include("freqtable.jl")
include("pairwise.jl")