Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pairwise #627

Merged
merged 22 commits into from
May 2, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/src/misc.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ levelsmap
indexmap
indicatormat
StatsBase.midpoints
pairwise
```
2 changes: 2 additions & 0 deletions src/StatsBase.jl
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ export
levelsmap, # construct a map from n unique elements to [1, ..., n]
findat, # find the position within a for elements in b
indicatormat, # construct indicator matrix
pairwise, # pairwise application of functions

# statistical models
CoefTable,
Expand Down Expand Up @@ -232,6 +233,7 @@ include("signalcorr.jl")
include("partialcor.jl")
include("empirical.jl")
include("hist.jl")
include("pairwise.jl")
include("misc.jl")

include("sampling.jl")
Expand Down
166 changes: 166 additions & 0 deletions src/pairwise.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
function _pairwise!(::Val{:none}, res::AbstractMatrix, f, x, y, symmetric::Bool)
m, n = size(res)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
for j in 1:n, i in 1:m
symmetric && i > j && continue

# For performance, diagonal is special-cased
if f === cor && i == j && x[i] === y[j]
# If the type isn't concrete, 1 may not be converted to the right type
# and the final matrix will have an abstract eltype
# (missings are propagated via the second branch, but NaNs are ignored)
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
res[i, j] = isconcretetype(eltype(res)) ? 1 : one(f(x[i], y[j]))
else
res[i, j] = f(x[i], y[j])
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
end
end
if symmetric
for j in 1:n, i in (j+1):m
res[i, j] = res[j, i]
end
end
return res
end

function _pairwise!(::Val{:pairwise}, res::AbstractMatrix, f, x, y, symmetric::Bool)
m, n = size(res)
for j in 1:n
ynminds = .!ismissing.(y[j])
for i in 1:m
symmetric && i > j && continue

if x[i] === y[j]
ynm = view(y[j], ynminds)
# For performance, diagonal is special-cased
if f === cor && i == j
# If the type isn't concrete, 1 may not be converted to the right type
# and the final matrix will have an abstract eltype
# (missings and NaNs are ignored)
res[i, j] = isconcretetype(eltype(res)) ? 1 : one(f(ynm, ynm))
else
res[i, j] = f(ynm, ynm)
end
else
nminds = .!ismissing.(x[i]) .& ynminds
xnm = view(x[i], nminds)
ynm = view(y[j], nminds)
res[i, j] = f(xnm, ynm)
end
end
end
if symmetric
for j in 1:n, i in (j+1):m
res[i, j] = res[j, i]
end
end
return res
end

function _pairwise!(::Val{:listwise}, res::AbstractMatrix, f, x, y, symmetric::Bool)
m, n = size(res)
nminds = .!ismissing.(x[1])
for i in 2:m
nminds .&= .!ismissing.(x[i])
end
if x !== y
for j in 1:n
nminds .&= .!ismissing.(y[j])
end
end

# Computing integer indices once for all vectors is faster
nminds′ = findall(nminds)
# TODO: check whether wrapping views in a custom array type which asserts
# that entries cannot be `missing` (similar to `skipmissing`)
# could offer better performance
return _pairwise!(Val(:none), res, f,
[view(xi, nminds′) for xi in x],
[view(yi, nminds′) for yi in y],
symmetric)
end

function _pairwise(::Val{skipmissing}, f, x, y, symmetric::Bool) where {skipmissing}
inds = keys(first(x))
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
if symmetric && x !== y
throw(ArgumentError("symmetric=true only makes sense passing " *
"a single set of variables (x === y)"))
end
for xi in x
keys(xi) == inds ||
throw(ArgumentError("All input vectors must have the same indices"))
end
for yi in y
keys(yi) == inds ||
throw(ArgumentError("All input vectors must have the same indices"))
end
x′ = x isa Union{AbstractArray, Tuple, NamedTuple} ? x : collect(x)
y′ = y isa Union{AbstractArray, Tuple, NamedTuple} ? y : collect(y)
m = length(x)
n = length(y)

T = Core.Compiler.return_type(f, Tuple{eltype(x′), eltype(y′)})
Tsm = Core.Compiler.return_type((x, y) -> f(disallowmissing(x), disallowmissing(y)),
Tuple{eltype(x′), eltype(y′)})

if skipmissing === :none
res = Matrix{T}(undef, m, n)
_pairwise!(Val(:none), res, f, x′, y′, symmetric)
elseif skipmissing === :pairwise
res = Matrix{Tsm}(undef, m, n)
_pairwise!(Val(:pairwise), res, f, x′, y′, symmetric)
elseif skipmissing === :listwise
res = Matrix{Tsm}(undef, m, n)
_pairwise!(Val(:listwise), res, f, x′, y′, symmetric)
else
throw(ArgumentError("skipmissing must be one of :none, :pairwise or :listwise"))
end

# identity.(res) lets broadcasting compute a concrete element type
# TODO: using promote_type rather than typejoin (which broadcast uses) would make sense
# Once identity.(res) is inferred automatically (JuliaLang/julia#30485),
# the assertion can be removed
@static if VERSION >= v"1.6.0-DEV"
U = Base.Broadcast.promote_typejoin_union(Union{T, Tsm})
return (isconcretetype(eltype(res)) ? res : identity.(res))::Matrix{<:U}
else
return (isconcretetype(eltype(res)) ? res : identity.(res))
end
end

"""
pairwise(f, x[, y], symmetric::Bool=false, skipmissing::Symbol=:none)

Return a matrix holding the result of applying `f` to all possible pairs
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
of vectors in iterators `x` and `y`. Rows correspond to
vectors in `x` and columns to vectors in `y`. If `y` is omitted then a
square matrix crossing `x` with itself is returned.

As a special case, if `f` is `cor`, diagonal cells are set to 1 even in
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
the presence `NaN` or `Inf` entries (but `missing` is propagated unless
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
`skipmissing` is different from `:none`).

# Keyword arguments
- `symmetric::Bool=false`: If `true`, `f` is only called to compute
for the lower triangle of the matrix, and these values are copied
to fill the upper triangle. Only possible when `y` is omitted.
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
This is automatically set to `true` when `f` is `cor` or `cov`.
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
- `skipmissing::Symbol=:none`: If `:none` (the default), missing values
in input vectors are passed to `f` without any modification.
Use `:pairwise` to skip entries with a `missing` value in either
of the two vectors passed to `f` for a given pair of vectors in `x` and `y`.
Use `:listwise` to skip entries with a `missing` value in any of the
vectors in `x` or `y`; note that this is likely to drop a large part of
nalimilan marked this conversation as resolved.
Show resolved Hide resolved
entries.
"""
pairwise(f, x, y=x; symmetric::Bool=false, skipmissing::Symbol=:none) =
_pairwise(Val(skipmissing), f, x, y, symmetric)

# cov(x) is faster than cov(x, x)
pairwise(::typeof(cov), x, y; symmetric::Bool=false, skipmissing::Symbol=:none) =
pairwise((x, y) -> x === y ? cov(x) : cov(x, y), x, y,
symmetric=symmetric, skipmissing=skipmissing)

pairwise(::typeof(cor), x; symmetric::Bool=true, skipmissing::Symbol=:none) =
pairwise(cor, x, x, symmetric=symmetric, skipmissing=skipmissing)

pairwise(::typeof(cov), x; symmetric::Bool=true, skipmissing::Symbol=:none) =
pairwise(cov, x, x, symmetric=symmetric, skipmissing=skipmissing)
148 changes: 148 additions & 0 deletions test/pairwise.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
using StatsBase
using Test, Random, Statistics, LinearAlgebra
using Missings

const ≅ = isequal

Random.seed!(1)

# to avoid using specialized method
arbitrary_fun(x, y) = cor(x, y)

@testset "pairwise with $f" for f in (arbitrary_fun, cor, cov)
@testset "basic interface" begin
x = [rand(10) for _ in 1:4]
y = [rand(Float32, 10) for _ in 1:5]
# to test case where inference of returned eltype fails
z = [Vector{Any}(rand(Float32, 10)) for _ in 1:5]

res = @inferred pairwise(f, x, y)
@test res isa Matrix{Float64}
@test res == [f(xi, yi) for xi in x, yi in y]

res = pairwise(f, y, z)
@test res isa Matrix{Float32}
@test res == [f(yi, zi) for yi in y, zi in z]

res = pairwise(f, Any[[1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]])
@test res isa Matrix{AbstractFloat}
@test res == [f(xi, yi) for xi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0]),
yi in ([1.0, 2.0, 3.0], [1.0f0, 3.0f0, 10.5f0])]
@test typeof.(res) == [Float64 Float64
Float64 Float32]

@inferred pairwise(f, x, y)

@test_throws ArgumentError pairwise(f, [Int[]], [Int[]])
end

@testset "missing values handling interface" begin
xm = [ifelse.(rand(100) .> 0.9, missing, rand(100)) for _ in 1:4]
ym = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4]
zm = [ifelse.(rand(100) .> 0.9, missing, rand(Float32, 100)) for _ in 1:4]

res = pairwise(f, xm, ym)
@test res isa Matrix{Missing}
@test res ≅ [missing for xi in xm, yi in ym]

res = pairwise(f, xm, ym, skipmissing=:pairwise)
@test res isa Matrix{Float64}
@test isapprox(res, [f(collect.(skipmissings(xi, yi))...) for xi in xm, yi in ym],
rtol=1e-6)

res = pairwise(f, ym, zm, skipmissing=:pairwise)
@test res isa Matrix{Float32}
@test isapprox(res, [f(collect.(skipmissings(yi, zi))...) for yi in ym, zi in zm],
rtol=1e-6)

nminds = mapreduce(x -> .!ismissing.(x),
(x, y) -> x .& y,
[xm; ym])
res = pairwise(f, xm, ym, skipmissing=:listwise)
@test res isa Matrix{Float64}
@test isapprox(res, [f(view(xi, nminds), view(yi, nminds)) for xi in xm, yi in ym],
rtol=1e-6)

if VERSION >= v"1.6.0-DEV"
# inference of cor fails so use an inferrable function
# to check that pairwise itself is inferrable
for skipmissing in (:none, :pairwise, :listwise)
g(x, y=x) = pairwise((x, y) -> x[1] * y[1], x, y, skipmissing=skipmissing)
@test Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) ==
Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}},
Vector{Vector{Union{Float64, Missing}}}}) ==
Matrix{<: Union{Float64, Missing}}
if skipmissing in (:pairwise, :listwise)
@test_broken Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}}}) ==
Core.Compiler.return_type(g, Tuple{Vector{Vector{Union{Float64, Missing}}},
Vector{Vector{Union{Float64, Missing}}}}) ==
Matrix{Float64}
end
end
end

@test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:something)

# variable with only missings
xm = [fill(missing, 10), rand(10)]
ym = [rand(10), rand(10)]

res = pairwise(f, xm, ym)
@test res isa Matrix{Union{Float64, Missing}}
@test res ≅ [f(xi, yi) for xi in xm, yi in ym]

if VERSION >= v"1.5" # Fails with UndefVarError on Julia 1.0
@test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:pairwise)
@test_throws ArgumentError pairwise(f, xm, ym, skipmissing=:listwise)
end
end

@testset "iterators" begin
x = (v for v in [rand(10) for _ in 1:4])
y = (v for v in [rand(10) for _ in 1:4])

@test pairwise(f, x, y) == pairwise(f, collect(x), collect(y))
@test pairwise(f, x) == pairwise(f, collect(x))
end

@testset "two-argument method" begin
x = [rand(10) for _ in 1:4]
@test pairwise(f, x) == pairwise(f, x, x)
end

@testset "symmetric" begin
x = [rand(10) for _ in 1:4]
y = [rand(10) for _ in 1:4]
@test pairwise(f, x, x, symmetric=true) ==
pairwise(f, x, symmetric=true) ==
Symmetric(pairwise(f, x, x), :U)
@test_throws ArgumentError pairwise(f, x, y, symmetric=true)
end

@testset "cor corner cases" begin
# Integer inputs must give a Float64 output
res = pairwise(cor, [[1, 2, 3], [1, 5, 2]])
@test res isa Matrix{Float64}
@test res == [cor(xi, yi) for xi in ([1, 2, 3], [1, 5, 2]),
yi in ([1, 2, 3], [1, 5, 2])]

# NaNs are ignored for the diagonal
res = pairwise(cor, [[1, 2, NaN], [1, 5, 2]])
@test res isa Matrix{Float64}
@test res ≅ [1.0 NaN
NaN 1.0]

# missings are propagated even for the diagonal
res = pairwise(cor, [[1, 2, 7], [1, 5, missing]])
@test res isa Matrix{Union{Float64, Missing}}
@test res ≅ [1.0 missing
missing missing]

for sm in (:pairwise, :listwise)
res = pairwise(cor, [[1, 2, NaN, 4], [1, 5, 5, missing]], skipmissing=sm)
@test res isa Matrix{Float64}
@test res ≅ [1.0 NaN
NaN 1.0]
end
end
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ tests = ["ambiguous",
"rankcorr",
"signalcorr",
"misc",
"pairwise",
"robust",
"sampling",
"wsampling",
Expand Down