Skip to content

Commit ae17198

Browse files
jeffwong authored and andreasnoack committed
Faster Sparse Covariance Matrix (#22735)
* optimize sparse covariance matrix
* test for optimized sparse covariance matrix
* make spcov a function for the base method cov
* fix unit test
* use inbounds on for loop
* better comments and even faster implementation
* test for NaN and Inf
* robust testing for NaN, Inf
* NaN and Inf testing
* fix corrected keyword
* documentation about Inf
* style
* less allocations
* parens to prevent overflow
* simplify comments
1 parent f047602 commit ae17198

File tree

2 files changed

+146
-0
lines changed

2 files changed

+146
-0
lines changed

base/sparse/linalg.jl

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -941,3 +941,22 @@ end
941941
# `chol` is deliberately unsupported for sparse matrices: every call raises an error
# whose message points the caller at `cholfact()`.
function chol(A::SparseMatrixCSC)
    return error("Use cholfact() instead of chol() for sparse matrices.")
end
942942
# `lu` is deliberately unsupported for sparse matrices: every call raises an error
# whose message points the caller at `lufact()`.
function lu(A::SparseMatrixCSC)
    return error("Use lufact() instead of lu() for sparse matrices.")
end
943943
# `eig` is deliberately unsupported for sparse matrices: every call raises an error
# whose message points the caller at `eigs()`.
function eig(A::SparseMatrixCSC)
    return error("Use eigs() instead of eig() for sparse matrices.")
end
944+
945+
"""
    cov(X::SparseMatrixCSC, vardim::Int=1; corrected::Bool=true)

Compute the covariance matrix of the sparse matrix `X` without densifying `X` to
subtract the mean. Observations run along dimension `vardim`: with `vardim == 1` each
row is an observation and each column a variable, with `vardim == 2` the roles are
swapped. The result is a dense `p × p` matrix, where `p` is the number of variables.

If `corrected` is `true` the sum of cross-products is divided by `n - 1`, otherwise by
`n`, where `n` is the number of observations.

Throws an `ArgumentError` when `vardim` is neither `1` nor `2`.

NOTE(review): per the accompanying tests, an `Inf` entry in `X` can produce `±Inf`
entries here where the dense method produces `NaN`.
"""
function Base.cov(X::SparseMatrixCSC, vardim::Int=1; corrected::Bool=true)
    # Anything other than 1 or 2 would be treated as 2 below and then indexed under
    # @inbounds, so reject it up front.
    vardim in (1, 2) ||
        throw(ArgumentError("vardim must be 1 or 2, got $vardim"))

    a, b = size(X)
    n, p = vardim == 1 ? (a, b) : (b, a)

    # With x_k the k-th observation and s = ∑ x_k, the centred cross-product is
    #     ∑ (x_k - s/n) * (x_k - s/n)' = ∑ x_k * x_k' - s * s' / n
    # so only a sparse matrix product and a rank-one correction are needed.

    # Uncentred cross-product via sparse matrix operations. Promote the dense result
    # to the division result type so integer input does not fail on the in-place
    # updates below (a no-op for floating-point input).
    T = typeof(zero(eltype(X)) / n)
    out = convert(Matrix{T}, Matrix(Base.unscaled_covzm(X, vardim)))

    # Rank-one mean correction s * s' / n. `conj` keeps it consistent with the
    # Hermitian cross-product for complex input and is a no-op for real input.
    # Dividing before multiplying (the parentheses) guards against overflow.
    sums = sum(X, vardim)
    @inbounds for j in 1:p, i in 1:p
        out[i,j] -= sums[i] * (conj(sums[j]) / n)
    end

    # Scale by the sample size n, or by n - 1 when `corrected` is true.
    return scale!(out, inv(n - Int(corrected)))
end

test/sparse/sparse.jl

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1843,3 +1843,130 @@ end
18431843
B = A[5:-1:1, 5:-1:1]
18441844
@test issymmetric(B)
18451845
end
1846+
1847+
# Faster covariance function for sparse matrices
1848+
# Prevents densifying the input matrix when subtracting the mean
1849+
# Test against dense implementation
1850+
# PR https://github.com/JuliaLang/julia/pull/22735
1851+
# Part of this test needed to be hacked due to the treatment
1852+
# of Inf in sparse matrix algebra
1853+
# https://github.com/JuliaLang/julia/issues/22921
1854+
# The issue will be resolved in
1855+
# https://github.com/JuliaLang/julia/issues/22733
1856+
@testset "optimizing sparse covariance" begin
    n = 10
    p = 5
    srand(1)
    x_sparse = sprand(n, p, .50)
    x_dense = convert(Matrix{Float64}, x_sparse)

    # Every (vardim, corrected) combination, in the order the cases are exercised.
    configs = ((1, true), (2, true), (1, false), (2, false))

    # Finite data: the sparse method must agree with the dense implementation.
    for (vardim, corrected) in configs
        @test cov(x_sparse, vardim, corrected=corrected) ≈
              cov(x_dense, vardim, corrected=corrected)
    end

    # Compare the sparse and dense covariance of data whose first `nbad` variables
    # contain non-finite values. The uncontaminated trailing block must match
    # approximately; the contaminated leading rows/columns must match exactly under
    # `isequal` (which treats NaN as equal to NaN). With `maskinf=true`, infinite
    # entries of the sparse result are replaced by NaN before comparing, because
    # sparse matrix algebra generates Inf where dense matrix algebra generates NaN.
    function compare_nonfinite(xs, xd, vardim, corrected; nbad=1, maskinf=false)
        cov_sparse = cov(xs, vardim, corrected=corrected)
        if maskinf
            cov_sparse[isinf.(cov_sparse)] = NaN
        end
        cov_dense = cov(xd, vardim, corrected=corrected)
        clean = (nbad + 1):size(cov_dense, 1)
        @test cov_sparse[clean, clean] ≈ cov_dense[clean, clean]
        @test isequal(cov_sparse[:, 1:nbad], cov_dense[:, 1:nbad])
        @test isequal(cov_sparse[1:nbad, :], cov_dense[1:nbad, :])
    end

    # Test with NaN
    x_sparse[1,1] = NaN
    x_dense[1,1] = NaN
    for (vardim, corrected) in configs
        compare_nonfinite(x_sparse, x_dense, vardim, corrected)
    end

    # Test with Inf
    x_sparse[1,1] = Inf
    x_dense[1,1] = Inf

    # Sparse matrix algebra generates Inf, but dense matrix algebra generates NaN.
    # For vardim=1, corrected=true the sparse result is
    #  NaN      NaN           -Inf          -Inf           NaN
    #  NaN        0.124035       0.00830252   -0.0430049     0.021373
    # -Inf        0.00830252     0.111628     -0.0149783     0.00773125
    # -Inf       -0.0430049     -0.0149783     0.099782     -0.0496011
    #  NaN        0.021373       0.00773125   -0.0496011     0.126186
    # while the dense result is
    #  NaN      NaN            NaN           NaN           NaN
    #  NaN        0.124035       0.00830252   -0.0430049     0.021373
    #  NaN        0.00830252     0.111628     -0.0149783     0.00773125
    #  NaN       -0.0430049     -0.0149783     0.099782     -0.0496011
    #  NaN        0.021373       0.00773125   -0.0496011     0.126186
    # so infinite entries are masked to NaN before comparing.
    for (vardim, corrected) in configs
        compare_nonfinite(x_sparse, x_dense, vardim, corrected, maskinf=true)
    end

    # Test with NaN and Inf
    x_sparse[2,1] = NaN
    x_dense[2,1] = NaN
    for corrected in (true, false)
        # vardim=1: both non-finite values sit in column 1, so only the first
        # variable is contaminated and no Inf masking is applied.
        compare_nonfinite(x_sparse, x_dense, 1, corrected)
        # vardim=2: rows 1 and 2 are the contaminated variables.
        compare_nonfinite(x_sparse, x_dense, 2, corrected, nbad=2, maskinf=true)
    end
end

0 commit comments

Comments
 (0)