diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index a402de0a..05746ce7 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -13,6 +13,7 @@ using BlockArrays export length_to_ptrs! export rewind_ptrs! +export jagged_range export jagged_array export GenericJaggedArray export JaggedArray @@ -24,6 +25,12 @@ export compresscoo export indextype export sparse_matrix export sparse_matrix! +export index_array +export pointer_array +export halfperm +export halfperm! +export symbolic_halfperm +export symbolic_halfperm! include("sparse_utils.jl") export linear_indices @@ -169,9 +176,17 @@ export spmv! export spmtv! export spmm export spmm! +export spmmm +export spmmm! export spmtm export spmtm! +export spmtmm +export spmtmm! export centralize +export explicit_transpose +export explicit_transpose! +export add +export add! include("p_sparse_matrix.jl") export BRange @@ -193,6 +208,16 @@ export node_coordinates_unit_cube export nullspace_linear_elasticity export nullspace_linear_elasticity! export near_nullspace_linear_elasticity +export prolongator include("gallery.jl") +export add +export subtract +export mul +export matmul +export matmul! +export rap +export rap! +include("sequential_implementations.jl") + end # module diff --git a/src/gallery.jl b/src/gallery.jl index b48f9575..06933725 100644 --- a/src/gallery.jl +++ b/src/gallery.jl @@ -587,6 +587,3 @@ function nullspace_linear_elasticity!(B,x) end B end - - - diff --git a/src/jagged_array.jl b/src/jagged_array.jl index 60a74dec..4a58f48a 100644 --- a/src/jagged_array.jl +++ b/src/jagged_array.jl @@ -154,6 +154,15 @@ function JaggedArray{T,Ti}(a::AbstractArray{<:AbstractArray}) where {T,Ti} JaggedArray(data,ptrs) end +# New +function jagged_range(a::Union{JaggedArray,GenericJaggedArray},i::Integer) + u = one(eltype(a.ptrs)) + pini = a.ptrs[i] + pend = a.ptrs[i+1]-u + pini:pend +end + +########### Base.size(a::Union{JaggedArray,GenericJaggedArray}) = (length(a.ptrs)-1,) function Base.getindex(a::Union{JaggedArray,GenericJaggedArray},i::Int) diff --git a/src/p_range.jl b/src/p_range.jl index 2b3f05c0..81e391b8 100644 --- a/src/p_range.jl +++ b/src/p_range.jl @@ -408,7 +408,7 @@ end """ neigs_snd, neigs_rcv = assembly_neighbors(index_partition;kwargs...) -Return the ids of the neighbor parts from we send and receive data respectively +Return the ids of the neighbor parts from which we send and receive data respectively in the assembly of distributed vectors defined on the index partition `index_partition`. partition `index_partition`. `kwargs` are delegated to [`ExchangeGraph`](@ref) @@ -470,7 +470,7 @@ end function assembly_local_indices(indices,neighbors_snd,neighbors_rcv) cache = map(assembly_cache,indices) - mask = map(cache) do mycache + mask = map(cache) do mycache isassigned(mycache.local_indices_snd) && isassigned(mycache.local_indices_rcv) end if ! getany(mask) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index f6695eba..447d96b4 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -948,9 +948,9 @@ parallel implementations. # Properties -- `matrix_partition::A` -- `row_partition::B` -- `col_partition::C` +- `matrix_partition::B` +- `row_partition::C` +- `col_partition::D` - `assembled::Bool` `matrix_partition[i]` contains a (sparse) matrix with the local rows and the @@ -964,7 +964,7 @@ is fully contained in the own rows. # Supertype hierarchy - PSparseMatrix{V,A,B,C,T} <: AbstractMatrix{T} + PSparseMatrix{V,B,C,D,T} <: AbstractMatrix{T} with `T=eltype(V)`. 
""" @@ -1587,12 +1587,15 @@ function psparse_assemble_impl(A,::Type,rows;kwargs...) error("Case not implemented yet") end -function psparse_assemble_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows; - reuse=Val(false), - assembly_neighbors_options_cols=(;)) + +# New assemble +#################### + +function psparse_assemble_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, + ::Type{T}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) where {T<:AbstractSplitMatrix, Tv} function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) A_ghost_own = A.blocks.ghost_own @@ -1613,7 +1616,6 @@ function psparse_assemble_impl( ptrs[owner_to_p[owner]+1] += 1 end length_to_ptrs!(ptrs) - Tv = eltype(A_ghost_own) ndata = ptrs[end]-1 I_snd_data = zeros(Int,ndata) J_snd_data = zeros(Int,ndata) @@ -1646,11 +1648,13 @@ function psparse_assemble_impl( k_snd = JaggedArray(k_snd_data,ptrs) (;I_snd,J_snd,V_snd,k_snd,parts_snd) end + function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) k_rcv_data = zeros(Int32,length(I_rcv.data)) k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) end + function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) nz_own_own = findnz(A.blocks.own_own) nz_own_ghost = findnz(A.blocks.own_ghost) @@ -1687,6 +1691,7 @@ function psparse_assemble_impl( aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) triplets, own_ghost_J, aux end + function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) (own_own_triplet,own_ghost_triplet) = triplets (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux @@ -1698,7 +1703,6 @@ function psparse_assemble_impl( n_ghost_rows = ghost_length(rows_fa) n_ghost_cols = ghost_length(cols_fa) Ti = indextype(A.blocks.own_own) - Tv = eltype(A.blocks.own_own) own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) @@ -1708,12 +1712,12 @@ function psparse_assemble_impl( nnz_own_own = nnz(own_own) k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) - for p in 1:length(I_rcv_own) + for p in eachindex(I_rcv_own) i = I_rcv_own[p] j = J_rcv_own[p] k_rcv_own[p] = nzindex(own_own,i,j) end - for p in 1:length(I_rcv_ghost) + for p in eachindex(I_rcv_ghost) i = I_rcv_ghost[p] j = J_rcv_ghost[p] k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own @@ -1721,40 +1725,55 @@ function psparse_assemble_impl( cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) 
values, cache end - rows_sa = partition(axes(A,1)) - cols_sa = partition(axes(A,2)) - #rows = map(remove_ghost,rows_sa) - cols = map(remove_ghost,cols_sa) - parts_snd, parts_rcv = assembly_neighbors(rows_sa) - cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) - I_snd = map(i->i.I_snd,cache_snd) - J_snd = map(i->i.J_snd,cache_snd) - V_snd = map(i->i.V_snd,cache_snd) - graph = ExchangeGraph(parts_snd,parts_rcv) - t_I = exchange(I_snd,graph) - t_J = exchange(J_snd,graph) - t_V = exchange(V_snd,graph) - @fake_async begin - I_rcv = fetch(t_I) - J_rcv = fetch(t_J) - V_rcv = fetch(t_V) - cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) - triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays - J_owner = find_owner(cols_sa,J) - rows_fa = rows - cols_fa = map(union_ghost,cols,J,J_owner) - assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) - vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays - assembled = true - B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) - if val_parameter(reuse) == false - B - else - B, cache + + function _psparse_assemble_impl( + A, + ::Type{T}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix + + + rows_sa = partition(axes(A,1)) + cols_sa = partition(axes(A,2)) + cols = map(remove_ghost,cols_sa) + parts_snd, parts_rcv = assembly_neighbors(rows_sa) + cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) + + I_snd = map(i->i.I_snd,cache_snd) + J_snd = map(i->i.J_snd,cache_snd) + V_snd = map(i->i.V_snd,cache_snd) + graph = ExchangeGraph(parts_snd,parts_rcv) + t_I = exchange(I_snd,graph) + t_J = exchange(J_snd,graph) + t_V = exchange(V_snd,graph) + @fake_async begin + I_rcv = fetch(t_I) + J_rcv = fetch(t_J) + V_rcv = fetch(t_V) + cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) + triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays + J_owner = find_owner(cols_sa,J) + rows_fa = rows + cols_fa = map(union_ghost,cols,J,J_owner) + assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) 
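+            # Editor's note: the return value is discarded; presumably
+            # assembly_neighbors is invoked here only to compute and cache the
+            # neighbor graph of cols_fa before finalize_values runs.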
+ vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays + assembled = true + B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) + if val_parameter(reuse) == false + B + else + B, cache + end end end + + _psparse_assemble_impl(A,T,rows;reuse,assembly_neighbors_options_cols) end +# End new assemble +#################### + function psparse_assemble_impl!(B,A,::Type,cache) error("case not implemented") end @@ -1833,13 +1852,14 @@ function consistent!(B::PSparseMatrix,A::PSparseMatrix,cache) psparse_consistent_impl!(B,A,T,cache) end -function psparse_consistent_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows_co; - reuse=Val(false)) +# New consistent +#################### +function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, + ::Type{T}, + rows_co; + reuse=Val(false)) where {T<:AbstractSplitMatrix, Tv} - function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) + function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) own_to_local_row = own_to_local(rows_co) own_to_global_row = own_to_global(rows_co) own_to_global_col = own_to_global(cols_fa) @@ -1847,7 +1867,8 @@ function psparse_consistent_impl( nl = size(A,1) li_to_ps_ptrs = zeros(Int32,nl+1) for p in 1:length(lids_snd) - for li in lids_snd[p] + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] li_to_ps_ptrs[li+1] += 1 end end @@ -1855,37 +1876,42 @@ function psparse_consistent_impl( ndata = li_to_ps_ptrs[end]-1 li_to_ps_data = zeros(Int32,ndata) for p in 1:length(lids_snd) - for li in lids_snd[p] + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] q = li_to_ps_ptrs[li] li_to_ps_data[q] = p li_to_ps_ptrs[li] = q + 1 end end + rewind_ptrs!(li_to_ps_ptrs) li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) ptrs = zeros(Int32,length(parts_snd)+1) for (i,j,v) in nziterator(A.blocks.own_own) li = own_to_local_row[i] - for p in li_to_ps[li] + for li_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[li_ptr] ptrs[p+1] += 1 end end + for (i,j,v) in nziterator(A.blocks.own_ghost) li = own_to_local_row[i] - for p in li_to_ps[li] + for ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[ptr] ptrs[p+1] += 1 end end length_to_ptrs!(ptrs) ndata = ptrs[end]-1 - T = eltype(A) I_snd = JaggedArray(zeros(Int,ndata),ptrs) J_snd = JaggedArray(zeros(Int,ndata),ptrs) - V_snd = JaggedArray(zeros(T,ndata),ptrs) + V_snd = JaggedArray(zeros(Tv,ndata),ptrs) k_snd = JaggedArray(zeros(Int32,ndata),ptrs) for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) li = own_to_local_row[i] - for p in li_to_ps[li] + for p_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[p_ptr] q = ptrs[p] I_snd.data[q] = own_to_global_row[i] J_snd.data[q] = own_to_global_col[j] @@ -1894,10 +1920,12 @@ function psparse_consistent_impl( ptrs[p] += 1 end end + nnz_own_own = nnz(A.blocks.own_own) for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) li = own_to_local_row[i] - for p in li_to_ps[li] + for p_ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[p_ptr] q = ptrs[p] I_snd.data[q] = own_to_global_row[i] J_snd.data[q] = ghost_to_global_col[j] @@ -1910,18 +1938,21 @@ function psparse_consistent_impl( cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) cache_snd end - function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + + function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) cache_rcv end - function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) + + 
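+    # Editor's illustration of the `jagged_range` helper used in
+    # consistent_setup_snd above: it returns the range of positions that block
+    # `i` occupies inside the flat `data` vector of a JaggedArray, so hot loops
+    # can avoid the sub-array view that indexing like `lids_snd[p]` allocates.
+    # A minimal sketch:
+    #
+    #   a = JaggedArray([[10,20],[30],[40,50,60]])
+    #   for q in jagged_range(a,3)
+    #       println(a.data[q])   # prints 40, 50 and 60
+    #   end
+    #
+    # equivalent to `for v in a[3] ... end` but without allocating the view.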
function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) I_rcv_data = cache_rcv.I_rcv.data J_rcv_data = cache_rcv.J_rcv.data V_rcv_data = cache_rcv.V_rcv.data global_to_own_col = global_to_own(cols_co) - global_to_ghost_col = global_to_ghost(cols_co) - is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) - is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) + # global_to_ghost_col = global_to_ghost(cols_co) + is_own_condition = k -> global_to_own_col[k]!=0 + is_own = is_own_condition.(J_rcv_data) + is_ghost = map(!,is_own) # inverse is_own bitvector to effectively represent is_ghost mask I_rcv_own = I_rcv_data[is_own] J_rcv_own = J_rcv_data[is_own] V_rcv_own = V_rcv_data[is_own] @@ -1932,16 +1963,14 @@ function psparse_consistent_impl( map_global_to_ghost!(I_rcv_ghost,rows_co) map_global_to_own!(J_rcv_own,cols_co) map_global_to_ghost!(J_rcv_ghost,cols_co) - I2,J2,V2 = findnz(A.blocks.own_ghost) - map_ghost_to_global!(J2,cols_fa) - map_global_to_ghost!(J2,cols_co) - n_own_rows = own_length(rows_co) n_ghost_rows = ghost_length(rows_co) + n_own_rows = own_length(rows_co) n_own_cols = own_length(cols_co) n_ghost_cols = ghost_length(cols_co) TA = typeof(A.blocks.ghost_own) own_own = A.blocks.own_own - own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved + # New own_ghost shares index and value arrays with existing own_ghost block. Pointer arrays are newly allocated (in case of CSC and CSR). + own_ghost = expand_sparse_matrix(A.blocks.own_ghost,n_own_rows,n_ghost_cols) ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) @@ -1953,41 +1982,52 @@ function psparse_consistent_impl( V_rcv = cache_rcv.V_rcv parts_snd = cache_snd.parts_snd parts_rcv = cache_rcv.parts_rcv - cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) + cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_own,is_ghost,V_rcv_own,V_rcv_ghost,K_own,K_ghost) values,cache end - @assert matching_own_indices(axes(A,1),PRange(rows_co)) - rows_fa = partition(axes(A,1)) - cols_fa = partition(axes(A,2)) - # snd and rcv are swapped on purpose - parts_rcv,parts_snd = assembly_neighbors(rows_co) - lids_rcv,lids_snd = assembly_local_indices(rows_co) - cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) - I_snd = map(i->i.I_snd,cache_snd) - J_snd = map(i->i.J_snd,cache_snd) - V_snd = map(i->i.V_snd,cache_snd) - graph = ExchangeGraph(parts_snd,parts_rcv) - t_I = exchange(I_snd,graph) - t_J = exchange(J_snd,graph) - t_V = exchange(V_snd,graph) - @fake_async begin - I_rcv = fetch(t_I) - J_rcv = fetch(t_J) - V_rcv = fetch(t_V) - J_rcv_data = map(x->x.data,J_rcv) - J_rcv_owner = find_owner(cols_fa,J_rcv_data) - cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) - cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays - B = PSparseMatrix(values,rows_co,cols_co,A.assembled) - if val_parameter(reuse) == false - B - else - B,cache + + function _psparse_consistent_impl( + A, + ::Type{T}, + rows_co; + reuse=Val(false)) where T<:AbstractSplitMatrix + @assert matching_own_indices(axes(A,1),PRange(rows_co)) + cols_fa = partition(axes(A,2)) + # snd and rcv are swapped on purpose + parts_rcv,parts_snd = assembly_neighbors(rows_co) 
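+        # In the consistent direction data flows from owning parts to their
+        # ghost copies, i.e. opposite to assembly, hence the swap above.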
+ # assembly_neighbors is called again in assembly_local_indices? + lids_rcv,lids_snd = assembly_local_indices(rows_co,parts_rcv,parts_snd) + cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) + I_snd = map(i->i.I_snd,cache_snd) + J_snd = map(i->i.J_snd,cache_snd) + V_snd = map(i->i.V_snd,cache_snd) + graph = ExchangeGraph(parts_snd,parts_rcv) + t_I = exchange(I_snd,graph) + t_J = exchange(J_snd,graph) + t_V = exchange(V_snd,graph) + @fake_async begin + I_rcv = fetch(t_I) + J_rcv = fetch(t_J) + V_rcv = fetch(t_V) + J_rcv_data = map(x->x.data,J_rcv) + J_rcv_owner = find_owner(cols_fa,J_rcv_data) + cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) + cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays + B = PSparseMatrix(values,rows_co,cols_co,A.assembled) + if val_parameter(reuse) == false + B + else + B,cache + end end end + _psparse_consistent_impl(A,T,rows_co;reuse) end +# End new consistent +#################### + function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) function setup_snd(A,cache) k_snd_data = cache.k_snd.data @@ -2005,13 +2045,14 @@ function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) end end function setup_rcv(B,cache) - is_ghost = cache.is_ghost is_own = cache.is_own + is_ghost = cache.is_ghost V_rcv_data = cache.V_rcv.data K_own = cache.K_own K_ghost = cache.K_ghost + # Allocates memory, while cache.V_rcv_own/ghost could be reused. V_rcv_own = V_rcv_data[is_own] - V_rcv_ghost = V_rcv_data[is_ghost] + V_rcv_ghost = V_rcv_data[is_ghost] setcoofast!(B.blocks.ghost_own,V_rcv_own,K_own) setcoofast!(B.blocks.ghost_ghost,V_rcv_ghost,K_ghost) B @@ -2209,6 +2250,20 @@ function sparse_diag_matrix(d::PVector,shape) psparse(I,J,V,row_partition,col_partition;assembled=true) |> fetch end +# Version of sparse_diag_matrix for preserving local matrix type T (when default CSC is not wanted) +function sparse_diag_matrix(::Type{T},d::PVector,shape) where T + row_partition,col_partition = map(partition,shape) + function setup(own_d,rows,cols) + I = own_to_global(rows) |> collect + J = own_to_global(cols) |> collect + V = own_d + I,J,V + end + I,J,V = map(setup,own_values(d),row_partition,col_partition) |> tuple_of_arrays + psparse(T,I,J,V,row_partition,col_partition;assembled=true) |> fetch +end + +### OLD ### function rap(R,A,P;reuse=Val(false)) Ac = R*A*P if val_parameter(reuse) @@ -2217,6 +2272,16 @@ function rap(R,A,P;reuse=Val(false)) Ac end +### NEW ### +function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + Ac, cache = spmmm(R,A,P;reuse=true) + if val_parameter(reuse) + return Ac, cache + end + Ac +end + +### OLD ### function rap!(Ac,R,A,P,cache) # TODO improve performance tmp = R*A*P @@ -2224,6 +2289,30 @@ function rap!(Ac,R,A,P,cache) Ac end +### NEW ### +function rap!(Ac::PSparseMatrix,R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmmm!(Ac,R,A,P,cache) + Ac +end + +### NEW ### +function rap(Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + spmtmm(Pt.parent,A,P;reuse=reuse) +end + +function rap!(Ac::PSparseMatrix,Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(Ac,Pt.parent,A,P,cache) +end + +function rap(A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + spmtmm(P,A,P;reuse=reuse) +end + +function 
rap!(Ac::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(Ac,A,P,cache) +end +### End NEW ### + function spmm(A,B;reuse=Val(false)) C = A*B if val_parameter(reuse) @@ -2237,28 +2326,83 @@ function spmm!(C,A,B,state) C end +### OLD ### +# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# col_partition = partition(axes(A,2)) +# C,cacheC = consistent(B,col_partition;reuse=true) |> fetch +# D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays +# assembled = true +# D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) +# if val_parameter(reuse) +# cache = (C,cacheC,cacheD) +# return D,cache +# end +# D +# end + +# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (C,cacheC,cacheD)= cache +# consistent!(C,B,cacheC) |> wait +# map(spmm!,partition(D),partition(A),partition(C),cacheD) +# D +# end + +### NEW ### function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) - # TODO latency hiding @assert A.assembled @assert B.assembled - col_partition = partition(axes(A,2)) - C,cacheC = consistent(B,col_partition;reuse=true) |> fetch - D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays - assembled = true - D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) + t = consistent(B,partition(axes(A,2)),reuse=true) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) + + # Wait for consistent + B2, cacheB2 = fetch(t) + C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) + C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) + C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) + + C_own_own = map(add, C_own_own_1, C_own_own_2) + C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) + + Coo_cache = map(construct_spmm_cache, C_own_own) + Cog_cache = map(construct_spmm_cache, C_own_ghost) + + C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) + end + + C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) if val_parameter(reuse) - cache = (C,cacheC,cacheD) - return D,cache + cache = (B2,cacheB2,(Coo_cache,Cog_cache)) + return C,cache end - D + C end -function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (C,cacheC,cacheD)= cache - consistent!(C,B,cacheC) |> wait - map(spmm!,partition(D),partition(A),partition(C),cacheD) - D +function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + (B2,cacheB2,(Coo_cache,Cog_cache)) = cache + t = consistent!(B2,B,cacheB2) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own = own_own_values(C) + C_own_ghost = own_ghost_values(C) + + map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache) + wait(t) + map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache) + + map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) + map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) + C end +### End NEW ### function 
spmtm(A,B;reuse=Val(false)) C = transpose(A)*B @@ -2273,27 +2417,101 @@ function spmtm!(C,A,B,cache) C end +### OLD ### +# function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays +# assembled = false +# D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled) +# C,cacheC = assemble(D;reuse=true) |> fetch +# if val_parameter(reuse) +# cache = (D,cacheC,cacheD) +# return C,cache +# end +# C +# end + +# function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (D,cacheC,cacheD)= cache +# map(spmtm!,partition(D),partition(A),partition(B),cacheD) +# assemble!(C,D,cacheC) |> wait +# C +# end + +### NEW ### function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) - # TODO latency hiding @assert A.assembled @assert B.assembled - D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Bog = own_ghost_values(B) + + C1go = map((A,B)->matmul(transpose(A),B),Aog,Boo) + C1gg = map((A,B)->matmul(transpose(A),B),Aog,Bog) + + C1_values = map(C1go, C1gg, partition(A), partition(B)) do ghost_own, ghost_ghost, A_part, B_part + own_own = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_own, 2)) + own_ghost = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A_part.col_permutation,B_part.col_permutation) + end + assembled = false - D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled) - C,cacheC = assemble(D;reuse=true) |> fetch + C1_unassembled = PSparseMatrix(C1_values,partition(axes(A,2)),partition(axes(B,2)),assembled) + t = assemble(C1_unassembled,reuse=true) + + C2oo = map((A,B)->matmul(transpose(A),B),Aoo,Boo) + C2og = map((A,B)->matmul(transpose(A),B),Aoo,Bog) + + C2_values = map(C2oo, C2og, partition(A), partition(B)) do own_own, own_ghost, A_part, B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, A_part.col_permutation, B_part.col_permutation) + end + + # No cache returned by SparseArrays, so this is a workaround. 
+ Coo_cache = map(construct_spmtm_cache, C2oo) + Cog_cache = map(construct_spmtm_cache, C2og) + Cgo_cache = map(construct_spmtm_cache, C1go) + Cgg_cache = map(construct_spmtm_cache, C1gg) + + assembled = true + C2 = PSparseMatrix(C2_values,partition(axes(A,2)),partition(axes(B,2)),assembled) + C1, assemblyCache = fetch(t) + C, mergeCache = add(C1, C2) + if val_parameter(reuse) - cache = (D,cacheC,cacheD) + sequential_caches = (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) + cache = (C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches) return C,cache end C end function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (D,cacheC,cacheD)= cache - map(spmtm!,partition(D),partition(A),partition(B),cacheD) - assemble!(C,D,cacheC) |> wait + C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches = cache + (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) = sequential_caches + + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Bog = own_ghost_values(B) + + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache) + + t = assemble!(C1, C1_unassembled, assemblyCache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache) + wait(t) + add!(C, C1, C2, mergeCache) C end +### End NEW ### function Base.:*(A::PSparseMatrix,B::PSparseMatrix) C = spmm(A,B) @@ -2314,6 +2532,15 @@ function Base.:-(I::LinearAlgebra.UniformScaling,A::PSparseMatrix) D-A end +# Version of I-A for preserving local matrix type T (when default CSC is not wanted) +function Base.:-(T,I::LinearAlgebra.UniformScaling,A::PSparseMatrix) + Tv = eltype(A) + row_partition = partition(axes(A,1)) + d = pones(Tv,row_partition) + D = sparse_diag_matrix(T,d,axes(A)) + D-A +end + Base.similar(a::PSparseMatrix) = similar(a,eltype(a)) function Base.similar(a::PSparseMatrix,::Type{T}) where T matrix_partition = map(partition(a)) do values @@ -2400,6 +2627,74 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) end end +### NEW ### +# Repartition that follows local data layout of type T (some sparse matrix format) +function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T + @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" + function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) + I1,J1,V1 = findnz(A_own_own) + I2,J2,V2 = findnz(A_own_ghost) + map_own_to_global!(I1,A_rows) + map_own_to_global!(I2,A_rows) + map_own_to_global!(J1,A_cols) + map_ghost_to_global!(J2,A_cols) + I = vcat(I1,I2) + J = vcat(J1,J2) + V = vcat(V1,V2) + (I,J,V) + end + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + A_rows = partition(axes(A,1)) + A_cols = partition(axes(A,2)) + I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays + + t = psparse(T,I,J,V,new_rows,new_cols;reuse=true) + @fake_async begin + B,cacheB = fetch(t) + if val_parameter(reuse) == false + B + else + cache = (V,cacheB) + B, cache + end + end +end + +### NEW ### +# Repartition that follows local data layout by using sparse function "sparse" +function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) + @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" + function 
prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) + I1,J1,V1 = findnz(A_own_own) + I2,J2,V2 = findnz(A_own_ghost) + map_own_to_global!(I1,A_rows) + map_own_to_global!(I2,A_rows) + map_own_to_global!(J1,A_cols) + map_ghost_to_global!(J2,A_cols) + I = vcat(I1,I2) + J = vcat(J1,J2) + V = vcat(V1,V2) + (I,J,V) + end + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + A_rows = partition(axes(A,1)) + A_cols = partition(axes(A,2)) + I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays + t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true) + @fake_async begin + B,cacheB = fetch(t) + if val_parameter(reuse) == false + B + else + cache = (V,cacheB) + B, cache + end + end +end + + """ repartition!(B::PSparseMatrix,A::PSparseMatrix,cache) """ @@ -2469,6 +2764,28 @@ function centralize(A::PSparseMatrix) own_own_values(a_in_main) |> multicast |> getany end +### NEW ### +# Centralize function with local storage layout of type T (some sparse matrix format) +function centralize(::Type{T},A::PSparseMatrix) where T + m,n = size(A) + ranks = linear_indices(partition(A)) + rows_trivial = trivial_partition(ranks,m) + cols_trivial = trivial_partition(ranks,n) + a_in_main = repartition(T,A,rows_trivial,cols_trivial) |> fetch + own_own_values(a_in_main) |> multicast |> getany +end + +### NEW ### +# Centralize function that follows local data layout resulting from "sparse" +function centralize(sparse,A::PSparseMatrix) + m,n = size(A) + ranks = linear_indices(partition(A)) + rows_trivial = trivial_partition(ranks,m) + cols_trivial = trivial_partition(ranks,n) + a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch + own_own_values(a_in_main) |> multicast |> getany +end + """ psystem(I,J,V,I2,V2,rows,cols;kwargs...) 
""" @@ -2705,3 +3022,324 @@ function laplace_matrix(nodes_per_dir,parts_per_dir,ranks) I,J,V = map(setup,node_partition) |> tuple_of_arrays A = psparse(sparse,I,J,V,node_partition,node_partition) |> fetch end + + +################ NEW ################ + +# Locally transpose SplitMatrix +function explicit_transpose(A::AbstractSplitMatrix) + own_own = halfperm(A.blocks.own_own) + own_ghost = halfperm(A.blocks.ghost_own) + ghost_own = halfperm(A.blocks.own_ghost) + ghost_ghost = halfperm(A.blocks.ghost_ghost) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A.col_permutation,A.row_permutation) +end + +# Redistribute PSparseMatrix, returns unassembled transpose and a assmbly task when reuse is true, or only the assembly task otherwise +function explicit_transpose(A::PSparseMatrix;reuse=false) + mats = map(explicit_transpose,partition(A)) + rows, cols = axes(A) + B = PSparseMatrix(mats,partition(cols),partition(rows),false) + t = assemble(B,reuse=reuse) + if val_parameter(reuse) + B,t + else + t + end +end + +function explicit_transpose!(B::AbstractSplitMatrix,A::AbstractSplitMatrix) + halfperm!(B.blocks.own_own,A.blocks.own_own) + halfperm!(B.blocks.own_ghost,A.blocks.ghost_own) + halfperm!(B.blocks.ghost_own,A.blocks.own_ghost) + halfperm!(B.blocks.ghost_ghost,A.blocks.ghost_ghost) +end + +function explicit_transpose!(B::PSparseMatrix,B_local::PSparseMatrix,A::PSparseMatrix,cache) + map(explicit_transpose!,partition(B_local),partition(A)) + assemble!(B, B_local, cache) +end + +function add(A::PSparseMatrix,B::PSparseMatrix) + function add_own_own(A,B) + C = add(A,B) + # reuse IA/IB for cache + KA = precompute_nzindex(C,A) + KB = precompute_nzindex(C,B) + C,(KA,KB) + end + function add_own_ghost(own_ghost_A, own_ghost_B, colsA, colsB, cols) + # Minimize allocated memory, but could be replaced with findnz(...) 
+ iA,jA = find_indices(own_ghost_A) # local nonzero + vA = nonzeros(own_ghost_A) + iB,jB = find_indices(own_ghost_B) # local nonzero + vB = nonzeros(own_ghost_B) + jC = zeros(eltype(jA), (length(jA) + length(jB))) + ghostA_to_global = ghost_to_global(colsA) + ghostB_to_global = ghost_to_global(colsB) + global_to_ghostC = global_to_ghost(cols) + l = zero(eltype(jA)) + for k in eachindex(jA) + l += 1 + j = jA[k] + jC[l] = global_to_ghostC[ghostA_to_global[j]] + jA[k] = jC[l] + end + for k in eachindex(jB) + l += 1 + j = jB[k] + jC[l] = global_to_ghostC[ghostB_to_global[j]] + jB[k] = jC[l] + end + own_ghost = compresscoo(typeof(own_ghost_A), vcat(iA, iB), jC, vcat(vA, vB), size(own_ghost_A, 1), ghost_length(cols)) + # reuse auxiliary iA, iB arrays as caches + precompute_nzindex!(iA,own_ghost,iA,jA) + precompute_nzindex!(iB,own_ghost,iB,jB) + own_ghost, (iA, iB) + end + function _add(A,B) + colsA = partition(axes(A,2)) + colsB = partition(axes(B,2)) + J = map(ghost_to_global, colsB) + J_owner = map(ghost_to_owner, colsB) + cols = map(union_ghost, colsA, J, J_owner) + rows = partition(axes(A,1)) + Coo, Koo = map(add_own_own, own_own_values(A), own_own_values(B)) |> tuple_of_arrays + Cog, Kog = map(add_own_ghost, own_ghost_values(A), own_ghost_values(B), colsA, colsB, cols) |> tuple_of_arrays + C_vals = map(Coo,Cog,rows,cols) do Coo, Cog, rows, cols + Cgo = similar(Coo, 0, size(Coo,2)) + Cgg = similar(Coo, 0, size(Cog,2)) + blocks = split_matrix_blocks(Coo, Cog, Cgo, Cgg) + split_matrix(blocks, local_permutation(rows), local_permutation(cols)) + end + assembled = true + K = (Koo, Kog) + PSparseMatrix(C_vals,rows,cols,assembled), K + end + _add(A,B) +end + +function add!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + function add_blocks!(C, A, B, K) + K_A, K_B = K + sparse_matrix!(C, nonzeros(A), K_A) + sparse_matrix!(C, nonzeros(B), K_B, reset=false) + end + Koo, Kog = cache + map(add_blocks!, own_own_values(C), own_own_values(A), own_own_values(B), Koo) + map(add_blocks!, own_ghost_values(C), own_ghost_values(A), own_ghost_values(B), Kog) +end + +# Interpret A as if its transpose is needed +function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled + @assert C.assembled + consistency_task = consistent(C, partition(axes(B,2)),reuse=true) + + Aoo = own_own_values(A) + Boo = own_own_values(B) + Cog = own_own_values(C) + + Aog = own_ghost_values(A) + Bog = own_ghost_values(B) + + Doo1, Doo_cache = map((A,B,C)->rap(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays + Dgo1, Dgo_cache = map((A,B,C)->rap(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays + + # Collect ghost rows from P before continuing + C2, consistencyCache = fetch(consistency_task) + + Cog2 = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + Dgo2, Dgo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays + Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays + Dog2, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays + + Dgo = map(add,Dgo1,Dgo2) # different sparsity patterns so not in-place. 
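+    # (An in-place accumulation would need both summands to share one sparsity
+    # pattern; the local `add` kernels in sequential_implementations.jl instead
+    # merge the two sorted patterns into a freshly allocated matrix.)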
+ Dog = map(add,Dog1,Dog2) + + D1_values = map(Dgo, Dog, partition(C), partition(C2)) do ghost_own, ghost_ghost, C_part, C2_part + own_own = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_own, 2)) + own_ghost = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_ghost, 2)) + blocks = split_matrix_blocks(own_own, own_ghost, ghost_own, ghost_ghost) + split_matrix(blocks, C_part.col_permutation, C2_part.col_permutation) + end + D1_unassembled = PSparseMatrix(D1_values, partition(axes(C,2)), partition(axes(C2,2)), false) + assembly_task = assemble(D1_unassembled, reuse=true) + + Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays + Doo2,Doo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Dog2,Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + + Doo = map(add,Doo1,Doo2) + Dog = map(add,Dog1,Dog2) + + Doo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Doo_cache,Doo) + Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) + Dgo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dgo_cache,Dgo) + Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) + + D2_values = map(Doo, Dog, partition(C2)) do own_own, own_ghost, C_part + ghost_own = similar(own_own,0,size(own_own, 2)) + ghost_ghost = similar(own_ghost,0,size(own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, C_part.col_permutation, C_part.col_permutation) + end + + D1, assemblyCache = fetch(assembly_task) + D2 = PSparseMatrix(D2_values, partition(axes(D1,1)), partition(axes(C2,2)), true) + D, mergeCache = add(D1, D2) + sequential_caches = (Doo_cache_final, Dog_cache_final, Dgo_cache_final, Dog_cache_final) + if val_parameter(reuse) + cache = (C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches) + return D,cache + end + D +end + +function spmtmm(A::PSparseMatrix,P::PSparseMatrix;kwargs...) + @assert A.assembled + @assert P.assembled + spmtmm(transpose(P),A,P;kwargs...) 
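+    # Editor's usage sketch for the rap/spmtmm family defined above (assuming
+    # assembled inputs):
+    #
+    #   Ac, cache = rap(A,P;reuse=Val(true))  # coarse operator transpose(P)*A*P
+    #   # ... update the nonzero values of A ...
+    #   rap!(Ac,A,P,cache)                    # reuses sparsity and comm caches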
+end + +function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) + C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches = cache + Doo_cache, Dog_cache, Dgo_cache, Dgg_cache = sequential_caches + C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache = cache + + consistency_task = consistent!(C2, C, consistencyCache) + Doo = own_own_values(D2) + Dog = own_ghost_values(D2) + Dgo = ghost_own_values(D1_unassembled) + Dgg = ghost_ghost_values(D1_unassembled) + + Aoo = own_own_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + + Aog = own_ghost_values(A) + Bog = own_ghost_values(B) + + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache) + + # Collect ghost rows from P before continuing + wait(consistency_task) + Cog2 = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache) + + assembly_task = assemble!(D1, D1_unassembled, assemblyCache) + + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache) + + wait(assembly_task) + add!(D, D1, D2, mergeCache) + D +end + +function spmtmm!(C::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(C,P,A,P,cache) +end + +function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled + @assert C.assembled + B2_task = consistent(B,partition(axes(A,2)),reuse=true) + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + + Doo1,Doo_cache = map(rap,Aoo,Boo,Coo) |> tuple_of_arrays + B2, Bcache = fetch(B2_task) + C2_task = consistent(C,partition(axes(B2,2)),reuse=true) + + Bog = own_ghost_values(B2) + Bgo = ghost_own_values(B2) + Bgg = ghost_ghost_values(B2) + + Doo2,Doo_cache = map(rap,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays + Doo12 = map(add,Doo1,Doo2) + + C2, Ccache = fetch(C2_task) + + Cog = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + Doo3,Doo_cache = map(rap,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Doo4,Doo_cache = map(rap,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays + + Doo34 = map(add,Doo3,Doo4) + Doo = map(add,Doo12,Doo34) + + Dog1,Dog_cache = map(rap,Aoo,Boo,Cog) |> tuple_of_arrays + Dog2,Dog_cache = map(rap,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays + Dog3,Dog_cache = map(rap,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + Dog4,Dog_cache = map(rap,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays + + Dog12 = map(add,Dog1,Dog2) + Dog34 = map(add,Dog3,Dog4) + Dog = map(add,Dog12,Dog34) + + D_values = map(Doo, Dog, partition(A),partition(C2)) do own_own, own_ghost, A_part,C_part + ghost_own = similar(own_own,0,size(own_own, 2)) + ghost_ghost = similar(own_ghost,0,size(own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, A_part.row_permutation, C_part.col_permutation) + end + + D = PSparseMatrix(D_values, partition(axes(A,1)), partition(axes(C2,2)), true) + if 
val_parameter(reuse) + cache = B2,Bcache,C2,Ccache,(Doo_cache,Dog_cache) + return D,cache + end + D +end + +function spmmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) + B2,Bcache,C2,Ccache,sequential_caches = cache + Doo_cache, Dog_cache = sequential_caches + B2_task = consistent!(B2,B,Bcache) + + Doo = own_own_values(D) + Dog = own_ghost_values(D) + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + map(rap!,Doo,Aoo,Boo,Coo,Doo_cache) + wait(B2_task) + + C2_task = consistent!(C2,C,Ccache) + Bog = own_ghost_values(B2) + Bgo = ghost_own_values(B2) + Bgg = ghost_ghost_values(B2) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache) + + wait(C2_task) + Cog = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache) + map(rap!,Dog,Aoo,Boo,Cog,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache) + D +end \ No newline at end of file diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl new file mode 100644 index 00000000..ed952606 --- /dev/null +++ b/src/sequential_implementations.jl @@ -0,0 +1,1680 @@ +function matmul(A::Union{Transpose{TvA,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvA, + B::Union{Transpose{TvB,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvB) + A*B +end + +function matmul(A::SparseMatrixCSR,B::SparseMatrixCSR) + C = matmul(ascsc(B),ascsc(A)) + ascsr(C) +end + +function matmul(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR) + C = matmul(ascsc(B),transpose(ascsc(At.parent))) + ascsr(C) +end + +function matmul(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) + C = transpose(ascsc(Bt.parent))*ascsc(A) + ascsr(C) +end + +function matmul(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB) + C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent)) + ascsr(C) +end + +function mul(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> x*a, A.nzval)) +end + +function mul(A::SparseMatrixCSR,x::Number) mul(x,A) end + +# Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
+function add(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + IC = Vector{Ti}(undef, p+1) + JC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + for i in 1:p + IC[i] = pC + jpA_range = nzrange(A, i) + jpA, jpA_end = jpA_range.start, jpA_range.stop + jpB_range = nzrange(B, i) + jpB, jpB_end = jpB_range.start, jpB_range.stop + while jpA <= jpA_end && jpB <= jpB_end + jA = JA[jpA] + jB = JB[jpB] + if jA < jB + JC[pC] = jA + VC[pC] = VA[jpA] + jpA += 1 + elseif jB < jA + JC[pC] = jB + VC[pC] = VB[jpB] + jpB += 1 + else + JC[pC] = jA + VC[pC] = VA[jpA] + VB[jpB] + jpA += 1 + jpB += 1 + end + pC += 1 + end + while jpA <= jpA_end + JC[pC] = JA[jpA] + VC[pC] = VA[jpA] + jpA += 1 + pC += 1 + end + while jpB <= jpB_end + JC[pC] = JB[jpB] + VC[pC] = VB[jpB] + jpB += 1 + pC += 1 + end + end + IC[end] = pC + resize!(JC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSR{Bi}(p,q,IC,JC,VC) # A += B +end + +# Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. +function subtract(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + nnz_C_upperbound = nnz(A) + nnz(B) + p,r = size(A) + IC = Vector{Ti}(undef, p+1) + JC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + for i in 1:p + IC[i] = pC + jpA_range = nzrange(A, i) + jpA, jpA_end = jpA_range.start, jpA_range.stop + jpB_range = nzrange(B, i) + jpB, jpB_end = jpB_range.start, jpB_range.stop + while jpA <= jpA_end && jpB <= jpB_end + jA = JA[jpA] + jB = JB[jpB] + if jA < jB + JC[pC] = jA + VC[pC] = VA[jpA] + jpA += 1 + elseif jB < jA + JC[pC] = jB + VC[pC] = -VB[jpB] + jpB += 1 + else + JC[pC] = jA + VC[pC] = VA[jpA] - VB[jpB] + jpA += 1 + jpB += 1 + end + pC += 1 + end + while jpA <= jpA_end + JC[pC] = JA[jpA] + VC[pC] = VA[jpA] + jpA += 1 + pC += 1 + end + while jpB <= jpB_end + JC[pC] = JB[jpB] + VC[pC] = -VB[jpB] + jpB += 1 + pC += 1 + end + end + IC[end] = pC + resize!(JC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSR{Bi}(p,r,IC,JC,VC) # A += B +end + +function subtract(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a->-a, A.nzval)) +end + +# Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
+function add(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} + if size(A) != size(B) && throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + JC = Vector{Ti}(undef, q+1) + IC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + IA = rowvals(A) + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:q + JC[j] = pC + ipA_range = nzrange(A, j) + ipA, ipA_end = ipA_range.start, ipA_range.stop + ipB_range = nzrange(B, j) + ipB, ipB_end = ipB_range.start, ipB_range.stop + while ipA <= ipA_end && ipB <= ipB_end + iA = IA[ipA] + iB = IB[ipB] + if iA < iB + IC[pC] = iA + VC[pC] = VA[ipA] + ipA += 1 + elseif iB < iA + IC[pC] = iB + VC[pC] = VB[ipB] + ipB += 1 + else + IC[pC] = iA + VC[pC] = VA[ipA] + VB[ipB] + ipA += 1 + ipB += 1 + end + pC += 1 + end + while ipA <= ipA_end + IC[pC] = IA[ipA] + VC[pC] = VA[ipA] + ipA += 1 + pC += 1 + end + while ipB <= ipB_end + IC[pC] = IB[ipB] + VC[pC] = VB[ipB] + ipB += 1 + pC += 1 + end + end + JC[end] = pC + resize!(IC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC) +end + +# Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. +function subtract(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + JC = Vector{Ti}(undef, q+1) + IC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + IA = rowvals(A) + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:q + JC[j] = pC + ipA_range = nzrange(A, j) + ipA, ipA_end = ipA_range.start, ipA_range.stop + ipB_range = nzrange(B, j) + ipB, ipB_end = ipB_range.start, ipB_range.stop + while ipA <= ipA_end && ipB <= ipB_end + iA = IA[ipA] + iB = IB[ipB] + if iA < iB + IC[pC] = iA + VC[pC] = VA[ipA] + ipA += 1 + elseif iB < iA + IC[pC] = iB + VC[pC] = VB[ipB] + ipB += 1 + else + IC[pC] = iA + VC[pC] = VA[ipA] - VB[ipB] + ipA += 1 + ipB += 1 + end + pC += 1 + end + while ipA <= ipA_end + IC[pC] = IA[ipA] + VC[pC] = VA[ipA] + ipA += 1 + pC += 1 + end + while ipB <= ipB_end + IC[pC] = IB[ipB] + VC[pC] = -VB[ipB] + ipB += 1 + pC += 1 + end + end + JC[end] = pC + resize!(IC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC) +end + +function subtract(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval)) +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + cache) + matmul!(ascsr(C),ascsr(B),ascsr(A),cache) + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) + matmul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) + C +end + +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC) + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = 
At.parent + VC = nonzeros(C) + VC .= 0 + IC = rowvals(C) + JA = rowvals(A) # When virtually transposed rowvals represent colvals. + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:s + # loop over columns "j" in row i of A + Bj = nzrange(B, j) + ptrB_start = Bj.start + ptrB_stop = Bj.stop + for ip in nzrange(C, j) + i = IC[ip] + # loop over columns "k" in row j of B + Ai = nzrange(A, i) + ptrB = ptrB_start + ptrA = Ai.start + vC = 0 + while ptrA <= Ai.stop && ptrB <= ptrB_stop + jA = JA[ptrA] + iB = IB[ptrB] + if jA < iB + ptrA += 1 + elseif iB < jA + ptrB += 1 + else # jA == iB + vC += VA[ptrA]*VB[ptrB] + ptrA += 1 + ptrB += 1 + end + end + VC[ip] = vC + end + end + C +end + +function matmul!(C::SparseMatrixCSC{Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}, + α::Number, + β::Number) where {Tv,Ti} + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + IC = rowvals(C) + VC .*= β + JA = rowvals(A) # When virtually transposed rowvals represent colvals. + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:s + # loop over columns "j" in row i of A + Bj = nzrange(B, j) + for jp in nzrange(C, j) + i = IC[jp] + # loop over columns "k" in row j of B + Ai = nzrange(A, i) + ptrB = Bj.start + ptrA = Ai.start + vC = 0 + while ptrA <= Ai.stop && ptrB <= Bj.stop + jA = JA[ptrA] + iB = IB[ptrB] + if jA == iB + vC += VA[ptrA]*VB[ptrB] + ptrA += 1 + ptrB += 1 + elseif jA < iB + ptrA += 1 + else + ptrB += 1 + end + end + VC[jp] += α*vC + end + end + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A)) + C +end + +function matmul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + B::SparseMatrixCSR, + cache) + a,b = size(C) + p,q = size(A) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(eltype(C)) + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + # A cache here would remove need for allocating acumulating arrays + # xb = zeros(Ti, p) + xb,x = cache + xb .= 0 + # x = similar(xb, Tv) # sparse accumulator, can be zeros() to remove if statement in inner loop. + for i in 1:p # ! + # loop over rows Ai in col Bj + for jpa in nzrange(A, i) + ja = JA[jpa] + va = VA[jpa] + # loop over columns "k" in row j of B + for jpb in nzrange(B, ja) + jb = JB[jpb] + vb = VB[jpb] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[jb] != i + xb[jb] = i + x[jb] = va*vb + else + x[jb] += va*vb + end + end + end + for jpc in nzrange(C,i) + jc = JC[jpc] + # To support in-place products whose sparsity patterns are subsets of the sparsity of C, this check is required. 
+ if xb[jc] == i + VC[jpc] = x[jc] + end + end + end + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache) + C +end + +function matmul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + B::SparseMatrixCSR, + α::Number, + β::Number, + cache) + a,b = size(C) + p,q = size(A) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + # A cache here would remove need for allocating acumulating arrays + # xb = zeros(Ti, p) + xb,x = cache + xb .= 0 + # x = similar(xb, Tv) # sparse accumulator, can be zeros() to remove if statement in inner loop. + for i in 1:p # ! + # loop over rows Ai in col Bj + for jpa in nzrange(A, i) + ja = JA[jpa] + va = VA[jpa] + # loop over columns "k" in row j of B + for jpb in nzrange(B, ja) + jb = JB[jpb] + vb = VB[jpb] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[jb] != i + xb[jb] = i + x[jb] = va*vb + else + x[jb] += va*vb + end + end + end + for jpc in nzrange(C,i) + jc = JC[jpc] + # To support in-place products whose sparsity patterns are subsets of the sparsity of C, this check is required. + if xb[jc] == i + VC[jpc] += α * x[jc] + end + end + end + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + α::Number, + β::Number, + cache) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) + C +end + +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + cache) + matmul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) + C +end + +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) + matmul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) + C +end + +# Workaround to supply in-place matmul with auxiliary array, as these are not returned by multiply function exported by SparseArrays +function construct_spmm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} + q = size(A,2) + xb = zeros(Ti,q) + x = similar(xb,Tv) + xb,x +end +function construct_spmm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + construct_spmm_cache(ascsr(A)) +end + +function construct_spmtm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} + q = size(A,2) + xb = zeros(Ti,q) + x = similar(xb,Tv) + xb,x +end + +function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + construct_spmtm_cache(ascsr(A)) +end + +function matmul!(C::SparseMatrixCSR, + At::Transpose{Tv,<:SparseMatrixCSR} where Tv, + B::SparseMatrixCSR, + cache) + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + VC .= zero((eltype(C))) + JC = colvals(C) + JA = colvals(A) # When virtually transposed colvals represent rowvals. 
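+    # Editor's note: transpose(A)*B is accumulated as a sum of outer products.
+    # For each k, row k of B is scattered into the (xb,x) accumulator and then,
+    # for every stored entry A[k,i], added into row i of C, touching only
+    # positions already present in C's sparsity pattern.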
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    xb,x = cache
+    xb .= 0
+    for k in 1:q
+        # scatter row k of B into the dense accumulator x, stamping xb with k
+        for jpb in nzrange(B,k)
+            jb = JB[jpb]
+            vb = VB[jpb]
+            xb[jb] = k
+            x[jb] = vb
+        end
+        for ipa in nzrange(A,k)
+            ia = JA[ipa] # interpret column index of A as row index of A^T.
+            va = VA[ipa]
+            for jpc in nzrange(C, ia)
+                jc = JC[jpc]
+                # This check is required, as the outer product might not contribute to all nonzero entries in this row of C.
+                if xb[jc] == k
+                    VC[jpc] += va*x[jc]
+                end
+            end
+        end
+    end
+    C
+end
+
+function matmul!(C::SparseMatrixCSR,
+                 At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 B::SparseMatrixCSR,
+                 α::Number,
+                 β::Number,
+                 cache)
+    a,b = size(C)
+    p,q = size(At)
+    r,s = size(B)
+    if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"));end
+    if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end
+    A = At.parent
+    VC = nonzeros(C)
+    VC .*= β
+    JC = colvals(C)
+    JA = colvals(A) # When virtually transposed colvals represent rowvals.
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    xb,x = cache
+    xb .= 0
+    for k in 1:q
+        # scatter row k of B into the dense accumulator x, stamping xb with k
+        for jpb in nzrange(B,k)
+            jb = JB[jpb]
+            vb = VB[jpb]
+            xb[jb] = k
+            x[jb] = α*vb
+        end
+        for ipa in nzrange(A,k)
+            ia = JA[ipa] # interpret column index of A as row index of A^T.
+            va = VA[ipa]
+            for jpc in nzrange(C, ia)
+                jc = JC[jpc]
+                # This check is required, as the outer product might not contribute to all nonzero entries in this row of C.
+                if xb[jc] == k
+                    VC[jpc] += va*x[jc]
+                end
+            end
+        end
+    end
+    C
+end
+
+function matmul!(C::SparseMatrixCSR,
+                 A::SparseMatrixCSR,
+                 Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
+    matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A))
+    C
+end
+
+function matmul!(C::SparseMatrixCSR,
+                 A::SparseMatrixCSR,
+                 Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 α::Number,
+                 β::Number)
+    matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β)
+    C
+end
+
+function rap(A::Union{Transpose{TA,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TA,
+             B::M where M<:AbstractSparseMatrix,
+             C::Union{Transpose{TC,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TC
+             ;reuse=Val(true))
+    D,cache = rap(A,B,C)
+    if val_parameter(reuse)
+        return D,cache
+    end
+    D
+end
+
+# PtAP variants
+function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}},
+             A::SparseMatrixCSR{Bi,TvA,TiA},
+             P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP}
+    p,q = size(Rt)
+    m,r = size(A)
+    n,s = size(P)
+    if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s)"));end
+    if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
+    function rap_symbolic_count!(R,A,P)
+        Ti = promote_type(TiR,TiA,TiP)
+        Tv = promote_type(TvR,TvA,TvP)
+        JR = R.data
+        JA = colvals(A)
+        JP = colvals(P)
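+        # Symbolic phase (sketch of the idea): two Gustavson-style passes per row i.
+        # First the column pattern of row i of R*A is gathered into JRA, using xbRA
+        # as a row stamp; then the pattern of (R*A)*P is counted through xbC, which
+        # yields the row pointer array IC of C before JC is filled in.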
+ xbRA = zeros(Ti, r) + xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP + xbRA .= 0 + xbC .= 0 + cache = (xbRA,JRA,xbC,JAP) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized + end + function rap_symbolic_fill!(C,R,A,P,cache) + (xbRA,JRA,xbC,JAP) = cache + JC = colvals(C) + JR = R.data + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + pC = 0 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + pC += 1 + xbC[k] = i + JC[pC] = k + end + end + end + end + xbC .= 0 + outer_cache = (xbC,similar(xbC, eltype(C)),JAP) + C, outer_cache # values not yet initialized + end + function _rap(Rt,A,P) + R = symbolic_halfperm(Rt.parent) + C,symbolic_cache = rap_symbolic_count!(R,A,P) # precompute nz structure with a symbolic transpose + _,outer_cache = rap_symbolic_fill!(C,R,A,P,symbolic_cache) + Ct = symbolic_halfperm(C) + symbolic_halfperm!(C,Ct) + rap!(C,Rt,A,P,outer_cache),(outer_cache...,R) + end + _rap(Rt,A,P) +end + +function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}, + cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(Rt) + m,r = size(A) + n,s = size(P) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + + function rap_symbolic_count(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + JR = R.data + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ xbRA = zeros(Ti, r) + xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP + xbRA .= 0 + xbC .= 0 + SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized + end + function rap_symbolic_fill!(C,R,A,P,cache) + (xbRA,JRA,xbC,JAP) = cache + JC = colvals(C) + JR = R.data + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + pC = 0 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + pC += 1 + xbC[k] = i + JC[pC] = k + end + end + end + end + xbC .= 0 + C, (xbC,similar(xbC, eltype(C)),JAP) # values not yet initialized + end + function _rap(Rt,A,P,old_cache) + xb,x,JAP,R = old_cache + old_outer_cache = (xb,x,JAP) + C,symbolic_cache = rap_symbolic_count(R, A, P) + _,new_outer_cache = rap_symbolic_fill!(C,R, A, P, symbolic_cache) + Ct = symbolic_halfperm(C) + symbolic_halfperm!(C,Ct) + outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? 
c1 : c2, old_outer_cache,new_outer_cache) + rap!(C,Rt,A,P,outer_cache),(outer_cache...,R) + end + _rap(Rt,A,P,cache) +end + +function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) + (xb,x,JAP,_) = cache + (xb,x,JAP) +end + +function rap!(C::SparseMatrixCSR, + Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + cache) + (a,b) = size(C) + p,q = size(Rt) + m,r = size(A) + n,s = size(P) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end + R = Rt.parent + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(eltype(C)) + + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) + VP = nonzeros(P) + xb, x, JAP = cache + xb .= 0 + # loop over rows in A + for i in 1:m + lp = 0 + # loop over columns "j" in row i of A + for jp in nzrange(A, i) + j = JA[jp] + va = VA[jp] + # loop over columns "k" in row j of B + for kp in nzrange(P, j) + k = JP[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[k] != i + lp += 1 + JAP[lp] = k + xb[k] = i + x[k] = va * VP[kp] + else + x[k] += va * VP[kp] + end + end + end + for kp in nzrange(R, i) + k = colvals(R)[kp] # rowvals when transposed conceptually + v = nonzeros(R)[kp] + for jp in nzrange(C,k) + j = JC[jp] + if xb[j] == i + VC[jp] += v*x[j] + end + end + end + end + C +end + +function rap!(C::SparseMatrixCSR, + Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + α::Number, + β::Number, + cache) + (a,b) = size(C) + p,q = size(Rt) + m,r = size(A) + n,s = size(P) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end + R = Rt.parent + JC = colvals(C) + VC = nonzeros(C) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) + VP = nonzeros(P) + xb, x, JAP = cache + xb .= 0 + VC .*= β + # loop over rows in A + for i in 1:m + lp = 0 + # loop over columns "j" in row i of A + for jp in nzrange(A, i) + j = JA[jp] + va = α*VA[jp] + # loop over columns "k" in row j of B + for kp in nzrange(P, j) + k = JP[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
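+                # Since va was scaled as va = α*VA[jp] above, every product
+                # va*VP[kp] carries the factor α exactly once; the final gather
+                # over R below then just adds v*x into the β-scaled VC.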
+ if xb[k] != i + lp += 1 + JAP[lp] = k + xb[k] = i + x[k] = va*VP[kp] + else + x[k] += va*VP[kp] + end + end + end + for kp in nzrange(R, i) + k = colvals(R)[kp] # rowvals when transposed conceptually + vpl = nonzeros(R)[kp] + for jp in nzrange(C,k) + j = JC[jp] + if xb[j] == i + VC[jp] += vpl*x[j] + end + end + end + end + C +end + +# RAP variants +function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + + function rap_symbolic!(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + + JR = colvals(R) + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + xbRA = zeros(Ti, r) + xbC = zeros(Ti, s+1) # this vector will also serve as as colptr array in halfperm + xRA = similar(xbRA, Tv) # sparse accumulator + xC = similar(xbC, Tv) # sparse accumulator + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + cache = (xbRA,xRA,JRA,xbC,xC) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized + end + function rap_numeric!(C,R,A,P,cache) + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + (xbRA,xRA,JRA,xbC,xC) = cache + jpC = 1 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
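+                    # xRA accumulates the numeric values of row i of R*A alongside
+                    # the pattern stored in JRA; the following loop multiplies this
+                    # sparse row with P, scattering into xC, before the final copy
+                    # into VC over nzrange(C,i).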
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + JC[jpC] = k + jpC += 1 + xC[k] = xRA[j]*VP[kp] + else + xC[k] += xRA[j]*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + VC[ind] = xC[j] + end + end + end + function _rap(R,A,P) + C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(R,A,P) + xbRA .= 0 + xbC .= 0 + cache = (xbRA,xRA,JRA,xbC,xC) + rap_numeric!(C,R,A,P,cache) + Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) + halfperm!(C,Ct) + C,cache + end + _rap(R,A,P) +end + +function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSR) + (xb,x,JAP,_) = cache + (xb,x,JAP) +end + +function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSC) + reduce_spmmmt_cache(cache,SparseMatrixCSR) +end + +function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}, + cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + + function rap_symbolic!(R,A,P,cache) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + JR = colvals(R) + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + (xbRA,_,JRA,xbC,_) = cache + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized + end + function rap_numeric!(C,R,A,P,cache) + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + (xbRA,xRA,JRA,xbC,xC) = cache + jpC = 1 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + JC[jpC] = k + jpC += 1 + xC[k] = xRA[j]*VP[kp] + else + xC[k] += xRA[j]*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + VC[ind] = xC[j] + end + end + end + function _rap(R,A,P,old_cache) + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + (xbRA,xRA,JRA,xbC,xC) = old_cache + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + JRA2 = max_rC > length(JRA) ? similar(JRA,max_rC) : JRA + if r > length(xbRA) + xbRA2 = similar(xbRA,r) + xRA2 = similar(xRA,r) + else + xbRA2 = xbRA + xRA2 = xRA + end + + new_cache = (xbRA2,xRA2,JRA2,xbC,xC) + xbRA2 .= 0 + xbC .= 0 + C = rap_symbolic!(R,A,P,new_cache) + xbRA2 .= 0 + xbC .= 0 + rap_numeric!(C,R,A,P,new_cache) + Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) + halfperm!(C,Ct) + C,new_cache + end + _rap(R,A,P,cache) +end + +function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR) + (xbRA,xRA,JRA,_,_) = cache + (xbRA,xRA,JRA) +end + +function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC) + reduce_spmtmm_cache(cache,SparseMatrixCSR) +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(eltype(C)) + (xbRA,xRA,JRA,xbC,xC) = cache + xbRA .= 0 + xbC .= 0 + for i in 1:p + lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + va = VA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + lp += 1 + JRA[lp] = k + xbRA[k] = i + xRA[k] = vpl * va + else + xRA[k] += vpl * va + end + end + end + for jp in 1:lp + j = JRA[jp] + vra = xRA[j] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + xC[k] = vra*VP[kp] + else + xC[k] += vra*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + if xbC[j] == i + VC[ind] = xC[j] + end + end + end + C +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + α::Number, + β::Number, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
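+    # 5-argument BLAS-style semantics: VC is scaled by β below, and only entries
+    # reached by the pattern of R*A*P (assumed to be contained in the pattern of C)
+    # receive the α-scaled update; untouched entries of C correspond to zero
+    # entries of R*A*P, so overall C = β*C + α*R*A*P.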
+ VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + (xbRA,xRA,JRA,xbC,xC) = cache + xbRA .= 0 + xbC .= 0 + # xC .= zero(Tv) + for i in 1:p + lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + lp += 1 + JRA[lp] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:lp + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + xC[k] = xRA[j]*VP[kp] + else + xC[k] += xRA[j]*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + if xbC[j] == i + VC[ind] += α*xC[j] + end + end + end + C +end + +# RARt variants +function rap(R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end + rap(R,A,copy(Pt)) +end + +function rap(R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end + rap(R,A,copy(Pt),cache) +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + P = Pt.parent + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed + xb,x = cache + xb .= 0 + for i in 1:p + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
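+                    # In this RARt variant the accumulated row x of R*A is not
+                    # scattered through P; instead, for every stored column jP of
+                    # row i of C, the loop below gathers the dot product of x with
+                    # row jP of Pt.parent (a column of the virtual transpose).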
+ if xb[k] != i + xb[k] = i + x[k] = vpl * VA[kp] + else + x[k] += vpl * VA[kp] + end + end + end + for jpP in nzrange(C,i) + jP = JC[jpP] + v = zero(eltype(C)) + for ip in nzrange(P,jP) + iP = IP[ip] + if xb[iP] == i + v += x[iP]*VP[ip] + end + end + VC[jpP] = v + end + end + C +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + α::Number, + β::Number, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + P = Pt.parent + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed + xb,x = cache + xb .= 0 + for i in 1:p + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[k] != i + xb[k] = i + x[k] = vpl * VA[kp] + else + x[k] += vpl * VA[kp] + end + end + end + for jpP in nzrange(C,i) + jP = JC[jpP] + v = zero(eltype(C)) + for ip in nzrange(P,jP) + iP = IP[ip] + if xb[iP] == i + v += x[iP]*VP[ip] + end + end + VC[jpP] += α*v + end + end + C +end + +### CSC in terms of CSR +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC) + D,cache = rap(ascsr(C),ascsr(B),ascsr(A)) + ascsc(D),cache +end + +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + D,new_cache = rap(ascsr(C),ascsr(B),ascsr(A),cache) + ascsc(D),new_cache +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache) + D +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + α::Number, + β::Number, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache) + D +end + +# PtAP +function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC) + D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent))) + ascsc(D),cache +end + +function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + ascsc(D),cache +end + +function rap!(D::SparseMatrixCSC, + A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + D +end + +function rap!(D::SparseMatrixCSC, + A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + α::Number, + β::Number, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache) + D +end + +# RARt +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv) + D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A)) + ascsc(D),new_cache +end +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) + D,new_cache = 
rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + ascsc(D),new_cache +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) + rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + D +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + α::Number, + β::Number, + cache) + rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache) + D +end \ No newline at end of file diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 4d31a029..def57040 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -465,10 +465,9 @@ function sparse_matrix!(A,V,K;reset=true) A end - # Notation # csrr: csr with repeated and unsorted columns -# csru: csr witu unsorted columns +# csru: csr with unsorted columns # csc: csc with sorted columns struct SparseMatrixCSRR{Tv,Ti,A} @@ -689,3 +688,308 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end + +################ NEW ################ +# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array. +# Only use for read-only operations. +function findnz_minimal(A::SparseMatrixCSC) + J = ptr_to_coo(A.colptr) + rowvals(A),J,nonzeros(A) +end +function findnz_minimal(A::SparseMatrixCSR) + I = ptr_to_coo(A.rowptr) + I,colvals(A),nonzeros(A) +end + +# Behaves like findnz, but without the values. +function find_indices(A::SparseMatrixCSC) + I,J,_ = findnz_minimal(A) + copy(I),J +end +function find_indices(A::SparseMatrixCSR) + I,J,_ = findnz_minimal(A) + I,copy(J) +end + +# TODO Could be done without binary searches from nzindex(...), when it is known that A and C are ordered, and A is a guaranteed submatrix of C. +function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) + I,J,_ = findnz_minimal(A) + K = similar(I) + K .= 0 + for (p,(i,j)) in enumerate(zip(I,J)) + if i < 1 || j < 1 + continue + end + K[p] = nzindex(C,i,j) + end + K +end + +# General matrix expansion to a larger size, allocates new matrix with new size. +function expand_sparse_matrix(A,m,n) + compresscoo(typeof(A),findnz(A)...,m,n) +end + +# Expand matrix to a larger size without changing non-zero entries. +# Might allocate a new pointer array, but shares index and value arrays with A. +function expand_sparse_matrix(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti},m,n) where Bi + p,q = size(A) + @assert m >= p + @assert n >= q + if m > p + new_rowptr = similar(A.rowptr,m+1) + map!(identity,new_rowptr,A.rowptr) + last_index = A.rowptr[end] + for i in p+1:m+1 + new_rowptr[i] = last_index + end + else + new_rowptr = A.rowptr + end + SparseMatrixCSR{Bi}(m,n,new_rowptr,A.colval,A.nzval) +end + +# Expand matrix to a larger size without changing non-zero entries. +# Might allocate a new pointer array, but shares index and value arrays with A. 
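+# For example (illustrative): expanding a q-column CSC matrix to n > q columns
+# appends n-q copies of the last colptr entry, so the appended columns are empty
+# and nnz is unchanged, e.g.
+# expand_sparse_matrix(sparse([1],[1],[1.0],3,3), 5, 5) # 5x5, same single nonzero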
+function expand_sparse_matrix(A::SparseMatrixCSC{Tv,Ti},m,n) where {Tv,Ti}
+    p,q = size(A)
+    @assert m >= p
+    @assert n >= q
+    if n > q
+        new_colptr = similar(A.colptr,n+1)
+        map!(identity,new_colptr,A.colptr)
+        last_index = A.colptr[end]
+        for j in q+1:n+1
+            new_colptr[j] = last_index
+        end
+    else
+        new_colptr = A.colptr
+    end
+    SparseMatrixCSC{Tv,Ti}(m,n,new_colptr,A.rowval,A.nzval)
+end
+
+# Currently not implemented by the SparseMatricesCSR package
+# NB: returns an empty 1-based matrix regardless of Bi
+function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi
+    SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[])
+end
+
+# Currently not implemented by the SparseMatricesCSR package
+function Base.similar(A::SparseMatrixCSR{Bi}) where Bi
+    SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A)))
+end
+
+# Currently not implemented by the SparseMatricesCSR package
+function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti})
+    Acsc_T = copy(transpose(ascsc(At.parent))) # materialize SparseMatrixCSC transpose
+    ascsr(Acsc_T)
+end
+
+function pointer_array(A::SparseMatrixCSR)
+    A.rowptr
+end
+
+function pointer_array(A::SparseMatrixCSC)
+    A.colptr
+end
+
+function index_array(A::SparseMatrixCSR)
+    colvals(A)
+end
+
+function index_array(A::SparseMatrixCSC)
+    rowvals(A)
+end
+
+function ptr_to_coo(ptr_array)
+    K = zeros(Int32, (ptr_array[end]-1))
+    for i in 1:(length(ptr_array)-1)
+        for p in ptr_array[i]:ptr_array[i+1]-1
+            K[p] = i
+        end
+    end
+    K
+end
+
+function find_max_row_length(A::SparseMatrixCSR)
+    max_rA = 0
+    for i in 1:size(A,1)
+        l = length(nzrange(A,i))
+        max_rA = max_rA > l ? max_rA : l
+    end
+    max_rA
+end
+
+function find_max_row_length(A::JaggedArray)
+    max_rA = 0
+    for i in 1:length(A.ptrs)-1
+        l = length(jagged_range(A,i))
+        max_rA = max_rA > l ? max_rA : l
+    end
+    max_rA
+end
+
+function find_max_col_length(A::SparseMatrixCSC)
+    max_cA = 0
+    for j in 1:size(A,2)
+        l = length(nzrange(A,j))
+        max_cA = max_cA > l ? max_cA : l
+    end
+    max_cA
+end
+
+# Lazily convert a CSC matrix to a CSR matrix, by interpreting column pointers as row pointers,
+# and rowvals as colvals, effectively transposing it in the process.
+function ascsr(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    p,q = size(A)
+    SparseMatrixCSR{1}(q,p,A.colptr,rowvals(A),nonzeros(A))
+end
+
+# Lazily convert a CSR matrix to a CSC matrix, by interpreting row pointers as column pointers,
+# and colvals as rowvals, effectively transposing it in the process.
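+# No data is copied by these reinterpretations; the result aliases A. E.g. a CSR
+# matrix of size (p,q) with rowptr [1,3,4] reappears as a CSC matrix of size
+# (q,p) with colptr [1,3,4] that represents the transpose.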
+function ascsc(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + p,q = size(A) + SparseMatrixCSC{Tv,Ti}(q,p,A.rowptr,colvals(A),nonzeros(A)) +end + +function halfperm(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + q = size(A,2) + JA,VA = colvals(A),nonzeros(A) + IAt,JAt,VAt = similar(A.rowptr,q+1),similar(JA),similar(VA) + halfperm!(IAt,JAt,VAt,A) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function halfperm!(IAt,JAt,VAt,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + JA,VA = colvals(A),nonzeros(A) + p,q = size(A) + count_occurrences!(IAt,JA) + counts_to_ptrs!(IAt) + shift_by_one!(IAt) + for i in 1:p + for jp in nzrange(A,i) + j = JA[jp] + jpt = IAt[j+1] + JAt[jpt] = i + VAt[jpt] = VA[jp] + IAt[j+1] = jpt+1 + end + end + IAt[1] = 1 + SparseMatrixCSR{Bi}(q,p,IAt,JAt,VAt) +end + +# retranspose At back into A +function halfperm!(A::SparseMatrixCSR{Bi,Tv,Ti},At::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + IA,JA,VA = A.rowptr,colvals(A),nonzeros(A) + JAt,VAt = colvals(At),nonzeros(At) + p,q = size(At) + shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc. + IA[1] = 1 + for i in 1:p + for jpt in nzrange(At,i) + j = JAt[jpt] + jp = IA[j+1] + JA[jp] = i + VA[jp] = VAt[jpt] + IA[j+1] = jp+1 + end + end + At +end + +function halfperm!(A::SparseMatrixCSC,At::SparseMatrixCSC) + halfperm!(ascsr(A),ascsr(At)) + A +end + +function halfperm(A::SparseMatrixCSC) + At = halfperm(ascsr(A)) + ascsc(At) +end + +function count_occurrences!(v1::AbstractVector{<:Integer},v2::AbstractVector{<:Integer};set_zero=true) + if set_zero + v1 .= 0 + end + foreach(i->v1[i]+=1,v2) + v1 +end + +# shift all entries one element to the right in-place. Not circular. +function shift_by_one!(v) + l = length(v) + prev = v[1] + tmp = prev + for i in 1:l-1 + tmp = v[i+1] + v[i+1] = prev + prev = tmp + end +end + +function counts_to_ptrs!(v) + l = length(v) + v[1] += 1 + foreach(i->v[i]+=v[i-1],2:l) + shift_by_one!(v) + v[1] = 1 +end + +function symbolic_halfperm(A::SparseMatrixCSR) + q = size(A,2) + JA = colvals(A) + IAt,JAt = similar(A.rowptr,q+1),similar(JA) + symbolic_halfperm!(IAt,JAt,A) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function symbolic_halfperm!(IAt,JAt,A::SparseMatrixCSR) + JA= colvals(A) + p,q = size(A) + count_occurrences!(IAt,JA) + counts_to_ptrs!(IAt) + shift_by_one!(IAt) + for i in 1:p + for jp in nzrange(A,i) + j = JA[jp] + jpt = IAt[j+1] + JAt[jpt] = i + IAt[j+1] = jpt+1 + end + end + IAt[1] = 1 + JaggedArray(JAt,IAt) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function symbolic_halfperm!(JAt,IAt,A::SparseMatrixCSC) + symbolic_halfperm!(JAt,IAt,ascsr(A)) +end + +function symbolic_halfperm(A::SparseMatrixCSC) + symbolic_halfperm(ascsr(A)) +end + +# retranspose At back into A +function symbolic_halfperm!(A::SparseMatrixCSR,At::JaggedArray) + IA,JA = pointer_array(A),index_array(A) + JAt = At.data + # p = size(A,1) + shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc. 
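+    # Counting-sort style retransposition: after the shift, IA[j+1] serves as a
+    # running insertion cursor for row j of A while the entries of At are
+    # streamed in; once all entries are placed, IA holds valid row pointers again.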
+ IA[1] = 1 + for i in 1:size(A,2) + for jpt in jagged_range(At,i) + j = JAt[jpt] + jp = IA[j+1] + JA[jp] = i + IA[j+1] = jp+1 + end + end + A +end + +# retranspose At back into A +function symbolic_halfperm!(A::SparseMatrixCSC,At::JaggedArray) + symbolic_halfperm!(ascsr(A),At) + A +end \ No newline at end of file diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl index 2c1a61ab..a175b722 100644 --- a/test/debug_array/runtests.jl +++ b/test/debug_array/runtests.jl @@ -23,4 +23,6 @@ using PartitionedArrays @testset "fem_example" begin include("fem_example.jl") end +@testset "spmtmm_tests" begin include("spmtmm_tests.jl") end + end #module diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl new file mode 100644 index 00000000..1b154b59 --- /dev/null +++ b/test/debug_array/spmtmm_tests.jl @@ -0,0 +1,31 @@ +module DebugArraySpMtMMTests + +using PartitionedArrays +using Test + +include(joinpath("..","spmtmm_tests.jl")) + +v = 1:5 +A = sparse(v,v,v) +Z = subtract(A,A) +@test nnz(Z) == nnz(A) +display(Z) + +B = sparse(v,v,-v) +Z = add(A,B) +@test nnz(Z) == nnz(A) +display(Z) + +A = sparsecsr(v,v,v) +Z = subtract(A,A) +@test nnz(Z) == nnz(A) +display(Z) + +B = sparsecsr(v,v,-v) +Z = add(A,B) +@test nnz(Z) == nnz(A) +display(Z) + +with_debug(spmtmm_tests) + +end # module diff --git a/test/mpi_array/drivers/spmtmm_tests.jl b/test/mpi_array/drivers/spmtmm_tests.jl new file mode 100644 index 00000000..50c3668a --- /dev/null +++ b/test/mpi_array/drivers/spmtmm_tests.jl @@ -0,0 +1,10 @@ +module MPIArrayPrimitivesTests + +using PartitionedArrays + +include(joinpath("..","..","spmtmm_tests.jl")) + +with_mpi(spmtmm_tests) + +end # module + diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl index 26a3a5d3..ffdc1f1e 100644 --- a/test/mpi_array/runtests.jl +++ b/test/mpi_array/runtests.jl @@ -13,5 +13,6 @@ using PartitionedArrays @testset "p_timer_tests" begin include("p_timer_tests.jl") end @testset "fdm_example" begin include("fdm_example.jl") end @testset "fem_example" begin include("fem_example.jl") end +@testset "spmtmm_tests" begin include("spmtmm_tests.jl") end end #module diff --git a/test/mpi_array/spmtmm_tests.jl b/test/mpi_array/spmtmm_tests.jl new file mode 100644 index 00000000..c9063604 --- /dev/null +++ b/test/mpi_array/spmtmm_tests.jl @@ -0,0 +1,4 @@ +using MPI +include("run_mpi_driver.jl") +file = joinpath(@__DIR__,"drivers","spmtmm_tests.jl") +run_mpi_driver(file;procs=4) diff --git a/test/p_sparse_matrix_tests.jl b/test/p_sparse_matrix_tests.jl index bff0f963..9efe0093 100644 --- a/test/p_sparse_matrix_tests.jl +++ b/test/p_sparse_matrix_tests.jl @@ -496,7 +496,6 @@ function p_sparse_matrix_tests(distribute) A_seq = centralize(A) spmm!(B,Z,A,cacheB) @test centralize(B) ≈ Z_seq*(A_seq) - B = transpose(Z)*A @test centralize(B) ≈ transpose(Z_seq)*A_seq diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl new file mode 100644 index 00000000..bf80328a --- /dev/null +++ b/test/spmtmm_tests.jl @@ -0,0 +1,229 @@ +using SparseArrays +using SparseMatricesCSR +using PartitionedArrays +using LinearAlgebra +using Test + +function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + if A.colptr != B.colptr && return false; end + if rowvals(A) != rowvals(B) && return false; end + if !isapprox(nonzeros(A),nonzeros(B),args...) 
&& return false; end
+    true
+end
+
+# Structurally A and B must be equal, but numerically they can be approximately equal
+function approx_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR,args...)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    if A.rowptr != B.rowptr && return false; end
+    if colvals(A) != colvals(B) && return false; end
+    if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end
+    true
+end
+
+function parallel_tests(pA,pB,sparse_func)
+    A = centralize(sparse_func,pA)
+    B = centralize(sparse_func,pB)
+    # explicit parallel transpose
+
+    pBt = explicit_transpose(pB) |> fetch
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    B_struct = symbolic_halfperm(B)
+    @test pointer_array(hp_B) == B_struct.ptrs
+    @test index_array(hp_B) == B_struct.data
+    @test Bt == hp_B
+
+    pBt_local,t = explicit_transpose(pB,reuse=true)
+    pBt, transpose_cache = fetch(t)
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    t = explicit_transpose!(pBt,pBt_local,pB,transpose_cache)
+    wait(t)
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    AB0 = matmul(A,B)
+    C0 = matmul(transpose(B),AB0)
+    # compare the sequential csr implementations against the default sequential csc implementations.
+    pAB,cacheAB = spmm(pA,pB,reuse=true)
+    AB = centralize(sparse_func,pAB)
+    @test approx_equivalent(AB,AB0)
+
+    # pB will be transposed internally
+    pC,cacheC = spmtm(pB,pAB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmm!(pAB,pA,pB,cacheAB)
+    AB = centralize(sparse_func,pAB)
+    @test approx_equivalent(AB,AB0)
+    spmtm!(pC,pB,pAB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    pC,cacheC = spmtmm(pB,pA,pB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    spmtmm!(pC,pB,pA,pB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # compare the sequential csr implementations against the default sequential csc implementations.
+    pC,cacheC = spmm(pBt,pAB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmm!(pC,pBt,pAB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # pB will be transposed internally
+    pC,cacheC = spmmm(pBt,pA,pB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmmm!(pC,pBt,pA,pB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # unequal sizes backward (small to large)
+    if size(pA) != size(pB)
+        CB0 = matmul(C0,Bt)
+        D0 = matmul(transpose(Bt),CB0)
+        pCB,cacheCB = spmm(pC,pBt,reuse=true)
+        CB = centralize(sparse_func,pCB)
+        @test approx_equivalent(CB,CB0)
+
+        pD,cacheD = spmtm(pBt,pCB,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmm!(pCB,pC,pBt,cacheCB)
+        CB = centralize(sparse_func,pCB)
+        @test approx_equivalent(CB,CB0)
+        spmtm!(pD,pBt,pCB,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmtmm(pBt,pC,pBt,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmtmm!(pD,pBt,pC,pBt,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmm(pB,pCB,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmmm(pB,pC,pBt,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmmm!(pD,pB,pC,pBt,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+    end
+end
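+
+# A minimal sequential usage sketch of the rap/rap! API exercised above (the
+# helper name `rap_usage_sketch` is ours and not part of the library). It only
+# assumes SparseArrays and LinearAlgebra, both imported at the top of this file,
+# and the PtAP methods added in this PR.
+function rap_usage_sketch()
+    A = sprand(10,10,0.3) + 10.0I    # square operator
+    P = sprand(10,4,0.5)             # tall, prolongator-like factor
+    C,cache = rap(transpose(P),A,P)  # allocating triple product, returns a reuse cache
+    rap!(C,transpose(P),A,P,cache)   # in-place recompute, e.g. after the values of A change
+    C ≈ transpose(P)*A*P
+end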
+
+function spmtmm_tests(distribute)
+    nodes_per_dir = (5,5,5)
+    parts_per_dir = (1,2,2)
+    np = prod(parts_per_dir)
+    ranks = distribute(LinearIndices((np,)))
+    for (TiA,TiB,TvA,TvB) in [(Int32,Int32,Float32,Float32),(Int32,Int64,Float32,Float32),(Int32,Int32,Float32,Float64),(Int32,Int64,Float32,Float64),(Int32,Int64,Int64,Int64),(Int32,Int64,Int64,Float32),(Int32,Int64,Float64,Int32)]
+        pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch
+        pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch
+        parallel_tests(pA,pB,sparsecsr)
+
+        # Testing with a real prolongator requires PartitionedSolvers
+        # T = eltype(typeof(own_own_values(pA).items))
+        # pB = prolongator(T,pA)
+        # parallel_tests(pA,pB,sparsecsr)
+
+        #### CSC ####
+        pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch
+        pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) |> fetch
+        parallel_tests(pA,pB,sparse)
+
+        # Testing with a real prolongator requires PartitionedSolvers
+        # T = eltype(typeof(own_own_values(pA).items))
+        # pB = prolongator(T,pA)
+        # parallel_tests(pA,pB,sparse)
+    end
+end
\ No newline at end of file