diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index a402de0a..05746ce7 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -13,6 +13,7 @@ using BlockArrays export length_to_ptrs! export rewind_ptrs! +export jagged_range export jagged_array export GenericJaggedArray export JaggedArray @@ -24,6 +25,12 @@ export compresscoo export indextype export sparse_matrix export sparse_matrix! +export index_array +export pointer_array +export halfperm +export halfperm! +export symbolic_halfperm +export symbolic_halfperm! include("sparse_utils.jl") export linear_indices @@ -169,9 +176,17 @@ export spmv! export spmtv! export spmm export spmm! +export spmmm +export spmmm! export spmtm export spmtm! +export spmtmm +export spmtmm! export centralize +export explicit_transpose +export explicit_transpose! +export add +export add! include("p_sparse_matrix.jl") export BRange @@ -193,6 +208,16 @@ export node_coordinates_unit_cube export nullspace_linear_elasticity export nullspace_linear_elasticity! export near_nullspace_linear_elasticity +export prolongator include("gallery.jl") +export add +export subtract +export mul +export matmul +export matmul! +export rap +export rap! +include("sequential_implementations.jl") + end # module diff --git a/src/gallery.jl b/src/gallery.jl index b48f9575..06933725 100644 --- a/src/gallery.jl +++ b/src/gallery.jl @@ -587,6 +587,3 @@ function nullspace_linear_elasticity!(B,x) end B end - - - diff --git a/src/jagged_array.jl b/src/jagged_array.jl index 60a74dec..4a58f48a 100644 --- a/src/jagged_array.jl +++ b/src/jagged_array.jl @@ -154,6 +154,15 @@ function JaggedArray{T,Ti}(a::AbstractArray{<:AbstractArray}) where {T,Ti} JaggedArray(data,ptrs) end +# New +function jagged_range(a::Union{JaggedArray,GenericJaggedArray},i::Integer) + u = one(eltype(a.ptrs)) + pini = a.ptrs[i] + pend = a.ptrs[i+1]-u + pini:pend +end + +########### Base.size(a::Union{JaggedArray,GenericJaggedArray}) = (length(a.ptrs)-1,) function Base.getindex(a::Union{JaggedArray,GenericJaggedArray},i::Int) diff --git a/src/p_range.jl b/src/p_range.jl index 2b3f05c0..81e391b8 100644 --- a/src/p_range.jl +++ b/src/p_range.jl @@ -408,7 +408,7 @@ end """ neigs_snd, neigs_rcv = assembly_neighbors(index_partition;kwargs...) -Return the ids of the neighbor parts from we send and receive data respectively +Return the ids of the neighbor parts from which we send and receive data respectively in the assembly of distributed vectors defined on the index partition `index_partition`. partition `index_partition`. `kwargs` are delegated to [`ExchangeGraph`](@ref) @@ -470,7 +470,7 @@ end function assembly_local_indices(indices,neighbors_snd,neighbors_rcv) cache = map(assembly_cache,indices) - mask = map(cache) do mycache + mask = map(cache) do mycache isassigned(mycache.local_indices_snd) && isassigned(mycache.local_indices_rcv) end if ! getany(mask) diff --git a/src/p_sparse_matrix.jl b/src/p_sparse_matrix.jl index f6695eba..447d96b4 100644 --- a/src/p_sparse_matrix.jl +++ b/src/p_sparse_matrix.jl @@ -948,9 +948,9 @@ parallel implementations. # Properties -- `matrix_partition::A` -- `row_partition::B` -- `col_partition::C` +- `matrix_partition::B` +- `row_partition::C` +- `col_partition::D` - `assembled::Bool` `matrix_partition[i]` contains a (sparse) matrix with the local rows and the @@ -964,7 +964,7 @@ is fully contained in the own rows. # Supertype hierarchy - PSparseMatrix{V,A,B,C,T} <: AbstractMatrix{T} + PSparseMatrix{V,B,C,D,T} <: AbstractMatrix{T} with `T=eltype(V)`. 
""" @@ -1587,12 +1587,15 @@ function psparse_assemble_impl(A,::Type,rows;kwargs...) error("Case not implemented yet") end -function psparse_assemble_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows; - reuse=Val(false), - assembly_neighbors_options_cols=(;)) + +# New assemble +#################### + +function psparse_assemble_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, + ::Type{T}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) where {T<:AbstractSplitMatrix, Tv} function setup_cache_snd(A,parts_snd,rows_sa,cols_sa) A_ghost_own = A.blocks.ghost_own @@ -1613,7 +1616,6 @@ function psparse_assemble_impl( ptrs[owner_to_p[owner]+1] += 1 end length_to_ptrs!(ptrs) - Tv = eltype(A_ghost_own) ndata = ptrs[end]-1 I_snd_data = zeros(Int,ndata) J_snd_data = zeros(Int,ndata) @@ -1646,11 +1648,13 @@ function psparse_assemble_impl( k_snd = JaggedArray(k_snd_data,ptrs) (;I_snd,J_snd,V_snd,k_snd,parts_snd) end + function setup_cache_rcv(I_rcv,J_rcv,V_rcv,parts_rcv) k_rcv_data = zeros(Int32,length(I_rcv.data)) k_rcv = JaggedArray(k_rcv_data,I_rcv.ptrs) (;I_rcv,J_rcv,V_rcv,k_rcv,parts_rcv) end + function setup_own_triplets(A,cache_rcv,rows_sa,cols_sa) nz_own_own = findnz(A.blocks.own_own) nz_own_ghost = findnz(A.blocks.own_ghost) @@ -1687,6 +1691,7 @@ function psparse_assemble_impl( aux = (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) triplets, own_ghost_J, aux end + function finalize_values(A,rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) (own_own_triplet,own_ghost_triplet) = triplets (I_rcv_own,J_rcv_own,k_rcv_own,I_rcv_ghost,J_rcv_ghost,k_rcv_ghost,nz_own_own,nz_own_ghost) = aux @@ -1698,7 +1703,6 @@ function psparse_assemble_impl( n_ghost_rows = ghost_length(rows_fa) n_ghost_cols = ghost_length(cols_fa) Ti = indextype(A.blocks.own_own) - Tv = eltype(A.blocks.own_own) own_own = compresscoo(TA,own_own_triplet...,n_own_rows,n_own_cols) own_ghost = compresscoo(TA,own_ghost_triplet...,n_own_rows,n_ghost_cols) ghost_own = compresscoo(TA,Ti[],Ti[],Tv[],n_ghost_rows,n_own_cols) @@ -1708,12 +1712,12 @@ function psparse_assemble_impl( nnz_own_own = nnz(own_own) k_own_sa = precompute_nzindex(own_own,own_own_triplet[1:2]...) k_ghost_sa = precompute_nzindex(own_ghost,own_ghost_triplet[1:2]...) - for p in 1:length(I_rcv_own) + for p in eachindex(I_rcv_own) i = I_rcv_own[p] j = J_rcv_own[p] k_rcv_own[p] = nzindex(own_own,i,j) end - for p in 1:length(I_rcv_ghost) + for p in eachindex(I_rcv_ghost) i = I_rcv_ghost[p] j = J_rcv_ghost[p] k_rcv_ghost[p] = nzindex(own_ghost,i,j) + nnz_own_own @@ -1721,40 +1725,55 @@ function psparse_assemble_impl( cache = (;k_own_sa,k_ghost_sa,cache_snd...,cache_rcv...) 
values, cache end - rows_sa = partition(axes(A,1)) - cols_sa = partition(axes(A,2)) - #rows = map(remove_ghost,rows_sa) - cols = map(remove_ghost,cols_sa) - parts_snd, parts_rcv = assembly_neighbors(rows_sa) - cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) - I_snd = map(i->i.I_snd,cache_snd) - J_snd = map(i->i.J_snd,cache_snd) - V_snd = map(i->i.V_snd,cache_snd) - graph = ExchangeGraph(parts_snd,parts_rcv) - t_I = exchange(I_snd,graph) - t_J = exchange(J_snd,graph) - t_V = exchange(V_snd,graph) - @fake_async begin - I_rcv = fetch(t_I) - J_rcv = fetch(t_J) - V_rcv = fetch(t_V) - cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) - triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays - J_owner = find_owner(cols_sa,J) - rows_fa = rows - cols_fa = map(union_ghost,cols,J,J_owner) - assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) - vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays - assembled = true - B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) - if val_parameter(reuse) == false - B - else - B, cache + + function _psparse_assemble_impl( + A, + ::Type{T}, + rows; + reuse=Val(false), + assembly_neighbors_options_cols=(;)) where T<:AbstractSplitMatrix + + + rows_sa = partition(axes(A,1)) + cols_sa = partition(axes(A,2)) + cols = map(remove_ghost,cols_sa) + parts_snd, parts_rcv = assembly_neighbors(rows_sa) + cache_snd = map(setup_cache_snd,partition(A),parts_snd,rows_sa,cols_sa) + + I_snd = map(i->i.I_snd,cache_snd) + J_snd = map(i->i.J_snd,cache_snd) + V_snd = map(i->i.V_snd,cache_snd) + graph = ExchangeGraph(parts_snd,parts_rcv) + t_I = exchange(I_snd,graph) + t_J = exchange(J_snd,graph) + t_V = exchange(V_snd,graph) + @fake_async begin + I_rcv = fetch(t_I) + J_rcv = fetch(t_J) + V_rcv = fetch(t_V) + cache_rcv = map(setup_cache_rcv,I_rcv,J_rcv,V_rcv,parts_rcv) + triplets,J,aux = map(setup_own_triplets,partition(A),cache_rcv,rows_sa,cols_sa) |> tuple_of_arrays + J_owner = find_owner(cols_sa,J) + rows_fa = rows + cols_fa = map(union_ghost,cols,J,J_owner) + assembly_neighbors(cols_fa;assembly_neighbors_options_cols...) 
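+            # Editor's note: the return value is discarded; presumably
+            # assembly_neighbors is invoked here only to compute and cache the
+            # neighbor graph of cols_fa before finalize_values runs.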
+ vals_fa, cache = map(finalize_values,partition(A),rows_fa,cols_fa,cache_snd,cache_rcv,triplets,aux) |> tuple_of_arrays + assembled = true + B = PSparseMatrix(vals_fa,rows_fa,cols_fa,assembled) + if val_parameter(reuse) == false + B + else + B, cache + end end end + + _psparse_assemble_impl(A,T,rows;reuse,assembly_neighbors_options_cols) end +# End new assemble +#################### + function psparse_assemble_impl!(B,A,::Type,cache) error("case not implemented") end @@ -1833,13 +1852,14 @@ function consistent!(B::PSparseMatrix,A::PSparseMatrix,cache) psparse_consistent_impl!(B,A,T,cache) end -function psparse_consistent_impl( - A, - ::Type{<:AbstractSplitMatrix}, - rows_co; - reuse=Val(false)) +# New consistent +#################### +function psparse_consistent_impl(A::PSparseMatrix{V,B,C,D,Tv} where {V,B,C,D}, + ::Type{T}, + rows_co; + reuse=Val(false)) where {T<:AbstractSplitMatrix, Tv} - function setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) + function consistent_setup_snd(A,parts_snd,lids_snd,rows_co,cols_fa) own_to_local_row = own_to_local(rows_co) own_to_global_row = own_to_global(rows_co) own_to_global_col = own_to_global(cols_fa) @@ -1847,7 +1867,8 @@ function psparse_consistent_impl( nl = size(A,1) li_to_ps_ptrs = zeros(Int32,nl+1) for p in 1:length(lids_snd) - for li in lids_snd[p] + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] li_to_ps_ptrs[li+1] += 1 end end @@ -1855,37 +1876,42 @@ function psparse_consistent_impl( ndata = li_to_ps_ptrs[end]-1 li_to_ps_data = zeros(Int32,ndata) for p in 1:length(lids_snd) - for li in lids_snd[p] + for li_ptr in jagged_range(lids_snd,p) + li = lids_snd.data[li_ptr] q = li_to_ps_ptrs[li] li_to_ps_data[q] = p li_to_ps_ptrs[li] = q + 1 end end + rewind_ptrs!(li_to_ps_ptrs) li_to_ps = JaggedArray(li_to_ps_data,li_to_ps_ptrs) ptrs = zeros(Int32,length(parts_snd)+1) for (i,j,v) in nziterator(A.blocks.own_own) li = own_to_local_row[i] - for p in li_to_ps[li] + for li_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[li_ptr] ptrs[p+1] += 1 end end + for (i,j,v) in nziterator(A.blocks.own_ghost) li = own_to_local_row[i] - for p in li_to_ps[li] + for ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[ptr] ptrs[p+1] += 1 end end length_to_ptrs!(ptrs) ndata = ptrs[end]-1 - T = eltype(A) I_snd = JaggedArray(zeros(Int,ndata),ptrs) J_snd = JaggedArray(zeros(Int,ndata),ptrs) - V_snd = JaggedArray(zeros(T,ndata),ptrs) + V_snd = JaggedArray(zeros(Tv,ndata),ptrs) k_snd = JaggedArray(zeros(Int32,ndata),ptrs) for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_own)) li = own_to_local_row[i] - for p in li_to_ps[li] + for p_ptr in jagged_range(li_to_ps,li) + p = li_to_ps.data[p_ptr] q = ptrs[p] I_snd.data[q] = own_to_global_row[i] J_snd.data[q] = own_to_global_col[j] @@ -1894,10 +1920,12 @@ function psparse_consistent_impl( ptrs[p] += 1 end end + nnz_own_own = nnz(A.blocks.own_own) for (k,(i,j,v)) in enumerate(nziterator(A.blocks.own_ghost)) li = own_to_local_row[i] - for p in li_to_ps[li] + for p_ptr in jagged_range(li_to_ps,li) + p=li_to_ps.data[p_ptr] q = ptrs[p] I_snd.data[q] = own_to_global_row[i] J_snd.data[q] = ghost_to_global_col[j] @@ -1910,18 +1938,21 @@ function psparse_consistent_impl( cache_snd = (;parts_snd,lids_snd,I_snd,J_snd,V_snd,k_snd) cache_snd end - function setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + + function consistent_setup_rcv(parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) cache_rcv = (;parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) cache_rcv end - function finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) + + 
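+    # Editor's illustration of the `jagged_range` helper used in
+    # consistent_setup_snd above: it returns the range of positions that block
+    # `i` occupies inside the flat `data` vector of a JaggedArray, so hot loops
+    # can avoid the sub-array view that indexing like `lids_snd[p]` allocates.
+    # A minimal sketch:
+    #
+    #   a = JaggedArray([[10,20],[30],[40,50,60]])
+    #   for q in jagged_range(a,3)
+    #       println(a.data[q])   # prints 40, 50 and 60
+    #   end
+    #
+    # equivalent to `for v in a[3] ... end` but without allocating the view.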
function consistent_finalize(A,cache_snd,cache_rcv,rows_co,cols_fa,cols_co) I_rcv_data = cache_rcv.I_rcv.data J_rcv_data = cache_rcv.J_rcv.data V_rcv_data = cache_rcv.V_rcv.data global_to_own_col = global_to_own(cols_co) - global_to_ghost_col = global_to_ghost(cols_co) - is_own = findall(j->global_to_own_col[j]!=0,J_rcv_data) - is_ghost = findall(j->global_to_ghost_col[j]!=0,J_rcv_data) + # global_to_ghost_col = global_to_ghost(cols_co) + is_own_condition = k -> global_to_own_col[k]!=0 + is_own = is_own_condition.(J_rcv_data) + is_ghost = map(!,is_own) # inverse is_own bitvector to effectively represent is_ghost mask I_rcv_own = I_rcv_data[is_own] J_rcv_own = J_rcv_data[is_own] V_rcv_own = V_rcv_data[is_own] @@ -1932,16 +1963,14 @@ function psparse_consistent_impl( map_global_to_ghost!(I_rcv_ghost,rows_co) map_global_to_own!(J_rcv_own,cols_co) map_global_to_ghost!(J_rcv_ghost,cols_co) - I2,J2,V2 = findnz(A.blocks.own_ghost) - map_ghost_to_global!(J2,cols_fa) - map_global_to_ghost!(J2,cols_co) - n_own_rows = own_length(rows_co) n_ghost_rows = ghost_length(rows_co) + n_own_rows = own_length(rows_co) n_own_cols = own_length(cols_co) n_ghost_cols = ghost_length(cols_co) TA = typeof(A.blocks.ghost_own) own_own = A.blocks.own_own - own_ghost = compresscoo(TA,I2,J2,V2,n_own_rows,n_ghost_cols) # TODO this can be improved + # New own_ghost shares index and value arrays with existing own_ghost block. Pointer arrays are newly allocated (in case of CSC and CSR). + own_ghost = expand_sparse_matrix(A.blocks.own_ghost,n_own_rows,n_ghost_cols) ghost_own = compresscoo(TA,I_rcv_own,J_rcv_own,V_rcv_own,n_ghost_rows,n_own_cols) ghost_ghost = compresscoo(TA,I_rcv_ghost,J_rcv_ghost,V_rcv_ghost,n_ghost_rows,n_ghost_cols) K_own = precompute_nzindex(ghost_own,I_rcv_own,J_rcv_own) @@ -1953,41 +1982,52 @@ function psparse_consistent_impl( V_rcv = cache_rcv.V_rcv parts_snd = cache_snd.parts_snd parts_rcv = cache_rcv.parts_rcv - cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_ghost,is_own,V_rcv_own,V_rcv_ghost,K_own,K_ghost) + cache = (;parts_snd,parts_rcv,k_snd,V_snd,V_rcv,is_own,is_ghost,V_rcv_own,V_rcv_ghost,K_own,K_ghost) values,cache end - @assert matching_own_indices(axes(A,1),PRange(rows_co)) - rows_fa = partition(axes(A,1)) - cols_fa = partition(axes(A,2)) - # snd and rcv are swapped on purpose - parts_rcv,parts_snd = assembly_neighbors(rows_co) - lids_rcv,lids_snd = assembly_local_indices(rows_co) - cache_snd = map(setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) - I_snd = map(i->i.I_snd,cache_snd) - J_snd = map(i->i.J_snd,cache_snd) - V_snd = map(i->i.V_snd,cache_snd) - graph = ExchangeGraph(parts_snd,parts_rcv) - t_I = exchange(I_snd,graph) - t_J = exchange(J_snd,graph) - t_V = exchange(V_snd,graph) - @fake_async begin - I_rcv = fetch(t_I) - J_rcv = fetch(t_J) - V_rcv = fetch(t_V) - J_rcv_data = map(x->x.data,J_rcv) - J_rcv_owner = find_owner(cols_fa,J_rcv_data) - cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) - cache_rcv = map(setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) - values,cache = map(finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays - B = PSparseMatrix(values,rows_co,cols_co,A.assembled) - if val_parameter(reuse) == false - B - else - B,cache + + function _psparse_consistent_impl( + A, + ::Type{T}, + rows_co; + reuse=Val(false)) where T<:AbstractSplitMatrix + @assert matching_own_indices(axes(A,1),PRange(rows_co)) + cols_fa = partition(axes(A,2)) + # snd and rcv are swapped on purpose + parts_rcv,parts_snd = assembly_neighbors(rows_co) 
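+        # In the consistent direction data flows from owning parts to their
+        # ghost copies, i.e. opposite to assembly, hence the swap above.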
+ # assembly_neighbors is called again in assembly_local_indices? + lids_rcv,lids_snd = assembly_local_indices(rows_co,parts_rcv,parts_snd) + cache_snd = map(consistent_setup_snd,partition(A),parts_snd,lids_snd,rows_co,cols_fa) + I_snd = map(i->i.I_snd,cache_snd) + J_snd = map(i->i.J_snd,cache_snd) + V_snd = map(i->i.V_snd,cache_snd) + graph = ExchangeGraph(parts_snd,parts_rcv) + t_I = exchange(I_snd,graph) + t_J = exchange(J_snd,graph) + t_V = exchange(V_snd,graph) + @fake_async begin + I_rcv = fetch(t_I) + J_rcv = fetch(t_J) + V_rcv = fetch(t_V) + J_rcv_data = map(x->x.data,J_rcv) + J_rcv_owner = find_owner(cols_fa,J_rcv_data) + cols_co = map(union_ghost,cols_fa,J_rcv_data,J_rcv_owner) + cache_rcv = map(consistent_setup_rcv,parts_rcv,lids_rcv,I_rcv,J_rcv,V_rcv) + values,cache = map(consistent_finalize,partition(A),cache_snd,cache_rcv,rows_co,cols_fa,cols_co) |> tuple_of_arrays + B = PSparseMatrix(values,rows_co,cols_co,A.assembled) + if val_parameter(reuse) == false + B + else + B,cache + end end end + _psparse_consistent_impl(A,T,rows_co;reuse) end +# End new consistent +#################### + function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) function setup_snd(A,cache) k_snd_data = cache.k_snd.data @@ -2005,13 +2045,14 @@ function psparse_consistent_impl!(B,A,::Type{<:AbstractSplitMatrix},cache) end end function setup_rcv(B,cache) - is_ghost = cache.is_ghost is_own = cache.is_own + is_ghost = cache.is_ghost V_rcv_data = cache.V_rcv.data K_own = cache.K_own K_ghost = cache.K_ghost + # Allocates memory, while cache.V_rcv_own/ghost could be reused. V_rcv_own = V_rcv_data[is_own] - V_rcv_ghost = V_rcv_data[is_ghost] + V_rcv_ghost = V_rcv_data[is_ghost] setcoofast!(B.blocks.ghost_own,V_rcv_own,K_own) setcoofast!(B.blocks.ghost_ghost,V_rcv_ghost,K_ghost) B @@ -2209,6 +2250,20 @@ function sparse_diag_matrix(d::PVector,shape) psparse(I,J,V,row_partition,col_partition;assembled=true) |> fetch end +# Version of sparse_diag_matrix for preserving local matrix type T (when default CSC is not wanted) +function sparse_diag_matrix(::Type{T},d::PVector,shape) where T + row_partition,col_partition = map(partition,shape) + function setup(own_d,rows,cols) + I = own_to_global(rows) |> collect + J = own_to_global(cols) |> collect + V = own_d + I,J,V + end + I,J,V = map(setup,own_values(d),row_partition,col_partition) |> tuple_of_arrays + psparse(T,I,J,V,row_partition,col_partition;assembled=true) |> fetch +end + +### OLD ### function rap(R,A,P;reuse=Val(false)) Ac = R*A*P if val_parameter(reuse) @@ -2217,6 +2272,16 @@ function rap(R,A,P;reuse=Val(false)) Ac end +### NEW ### +function rap(R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + Ac, cache = spmmm(R,A,P;reuse=true) + if val_parameter(reuse) + return Ac, cache + end + Ac +end + +### OLD ### function rap!(Ac,R,A,P,cache) # TODO improve performance tmp = R*A*P @@ -2224,6 +2289,30 @@ function rap!(Ac,R,A,P,cache) Ac end +### NEW ### +function rap!(Ac::PSparseMatrix,R::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmmm!(Ac,R,A,P,cache) + Ac +end + +### NEW ### +function rap(Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + spmtmm(Pt.parent,A,P;reuse=reuse) +end + +function rap!(Ac::PSparseMatrix,Pt::Transpose{Tv,<:PSparseMatrix} where Tv, A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(Ac,Pt.parent,A,P,cache) +end + +function rap(A::PSparseMatrix,P::PSparseMatrix;reuse=Val(false)) + spmtmm(P,A,P;reuse=reuse) +end + +function 
rap!(Ac::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(Ac,A,P,cache) +end +### End NEW ### + function spmm(A,B;reuse=Val(false)) C = A*B if val_parameter(reuse) @@ -2237,28 +2326,83 @@ function spmm!(C,A,B,state) C end +### OLD ### +# function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# col_partition = partition(axes(A,2)) +# C,cacheC = consistent(B,col_partition;reuse=true) |> fetch +# D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays +# assembled = true +# D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) +# if val_parameter(reuse) +# cache = (C,cacheC,cacheD) +# return D,cache +# end +# D +# end + +# function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (C,cacheC,cacheD)= cache +# consistent!(C,B,cacheC) |> wait +# map(spmm!,partition(D),partition(A),partition(C),cacheD) +# D +# end + +### NEW ### function spmm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) - # TODO latency hiding @assert A.assembled @assert B.assembled - col_partition = partition(axes(A,2)) - C,cacheC = consistent(B,col_partition;reuse=true) |> fetch - D_partition,cacheD = map((args...)->spmm(args...;reuse=true),partition(A),partition(C)) |> tuple_of_arrays - assembled = true - D = PSparseMatrix(D_partition,partition(axes(A,1)),partition(axes(C,2)),assembled) + t = consistent(B,partition(axes(A,2)),reuse=true) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own_1 = map(matmul,A_own_own,own_own_values(B)) + + # Wait for consistent + B2, cacheB2 = fetch(t) + C_own_ghost_1 = map(matmul,A_own_own,own_ghost_values(B2)) + C_own_own_2 = map(matmul,A_own_ghost,ghost_own_values(B2)) + C_own_ghost_2 = map(matmul,A_own_ghost,ghost_ghost_values(B2)) + + C_own_own = map(add, C_own_own_1, C_own_own_2) + C_own_ghost = map(add, C_own_ghost_1, C_own_ghost_2) + + Coo_cache = map(construct_spmm_cache, C_own_own) + Cog_cache = map(construct_spmm_cache, C_own_ghost) + + C_values = map(C_own_own,C_own_ghost,partition(A),partition(B2)) do own_own,own_ghost,A_part,B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A_part.row_permutation,B_part.col_permutation) + end + + C = PSparseMatrix(C_values,partition(axes(A,1)),partition(axes(B2,2)),true) if val_parameter(reuse) - cache = (C,cacheC,cacheD) - return D,cache + cache = (B2,cacheB2,(Coo_cache,Cog_cache)) + return C,cache end - D + C end -function spmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (C,cacheC,cacheD)= cache - consistent!(C,B,cacheC) |> wait - map(spmm!,partition(D),partition(A),partition(C),cacheD) - D +function spmm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + (B2,cacheB2,(Coo_cache,Cog_cache)) = cache + t = consistent!(B2,B,cacheB2) + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + C_own_own = own_own_values(C) + C_own_ghost = own_ghost_values(C) + + map(matmul!,C_own_own,A_own_own,own_own_values(B),Coo_cache) + wait(t) + map(matmul!,C_own_ghost,A_own_own,own_ghost_values(B2),Cog_cache) + + map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_own,A_own_ghost,ghost_own_values(B2),Coo_cache) + map((C,A,B,cache)->matmul!(C,A,B,1,1,cache),C_own_ghost,A_own_ghost,ghost_ghost_values(B2),Cog_cache) + C end +### End NEW ### function 
spmtm(A,B;reuse=Val(false)) C = transpose(A)*B @@ -2273,27 +2417,101 @@ function spmtm!(C,A,B,cache) C end +### OLD ### +# function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) +# # TODO latency hiding +# @assert A.assembled +# @assert B.assembled +# D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays +# assembled = false +# D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled) +# C,cacheC = assemble(D;reuse=true) |> fetch +# if val_parameter(reuse) +# cache = (D,cacheC,cacheD) +# return C,cache +# end +# C +# end + +# function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) +# (D,cacheC,cacheD)= cache +# map(spmtm!,partition(D),partition(A),partition(B),cacheD) +# assemble!(C,D,cacheC) |> wait +# C +# end + +### NEW ### function spmtm(A::PSparseMatrix,B::PSparseMatrix;reuse=Val(false)) - # TODO latency hiding @assert A.assembled @assert B.assembled - D_partition,cacheD = map((args...)->spmtm(args...;reuse=true),partition(A),partition(B)) |> tuple_of_arrays + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Bog = own_ghost_values(B) + + C1go = map((A,B)->matmul(transpose(A),B),Aog,Boo) + C1gg = map((A,B)->matmul(transpose(A),B),Aog,Bog) + + C1_values = map(C1go, C1gg, partition(A), partition(B)) do ghost_own, ghost_ghost, A_part, B_part + own_own = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_own, 2)) + own_ghost = similar(ghost_ghost, size(A_part.blocks.own_own, 2), size(B_part.blocks.own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A_part.col_permutation,B_part.col_permutation) + end + assembled = false - D = PSparseMatrix(D_partition,partition(axes(A,2)),partition(axes(B,2)),assembled) - C,cacheC = assemble(D;reuse=true) |> fetch + C1_unassembled = PSparseMatrix(C1_values,partition(axes(A,2)),partition(axes(B,2)),assembled) + t = assemble(C1_unassembled,reuse=true) + + C2oo = map((A,B)->matmul(transpose(A),B),Aoo,Boo) + C2og = map((A,B)->matmul(transpose(A),B),Aoo,Bog) + + C2_values = map(C2oo, C2og, partition(A), partition(B)) do own_own, own_ghost, A_part, B_part + ghost_own = similar(own_own,0,size(own_own,2)) + ghost_ghost = similar(own_own,0,size(own_ghost,2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, A_part.col_permutation, B_part.col_permutation) + end + + # No cache returned by SparseArrays, so this is a workaround. 
+ Coo_cache = map(construct_spmtm_cache, C2oo) + Cog_cache = map(construct_spmtm_cache, C2og) + Cgo_cache = map(construct_spmtm_cache, C1go) + Cgg_cache = map(construct_spmtm_cache, C1gg) + + assembled = true + C2 = PSparseMatrix(C2_values,partition(axes(A,2)),partition(axes(B,2)),assembled) + C1, assemblyCache = fetch(t) + C, mergeCache = add(C1, C2) + if val_parameter(reuse) - cache = (D,cacheC,cacheD) + sequential_caches = (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) + cache = (C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches) return C,cache end C end function spmtm!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) - (D,cacheC,cacheD)= cache - map(spmtm!,partition(D),partition(A),partition(B),cacheD) - assemble!(C,D,cacheC) |> wait + C1, C1_unassembled, assemblyCache, C2, mergeCache, sequential_caches = cache + (Coo_cache,Cog_cache,Cgo_cache,Cgg_cache) = sequential_caches + + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Bog = own_ghost_values(B) + + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_own_values(C1_unassembled),Aog,Boo,Cgo_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),ghost_ghost_values(C1_unassembled),Aog,Bog,Cgg_cache) + + t = assemble!(C1, C1_unassembled, assemblyCache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_own_values(C2),Aoo,Boo,Coo_cache) + map((C,A,B,cache)->matmul!(C,transpose(A),B,cache),own_ghost_values(C2),Aoo,Bog,Cog_cache) + wait(t) + add!(C, C1, C2, mergeCache) C end +### End NEW ### function Base.:*(A::PSparseMatrix,B::PSparseMatrix) C = spmm(A,B) @@ -2314,6 +2532,15 @@ function Base.:-(I::LinearAlgebra.UniformScaling,A::PSparseMatrix) D-A end +# Version of I-A for preserving local matrix type T (when default CSC is not wanted) +function Base.:-(T,I::LinearAlgebra.UniformScaling,A::PSparseMatrix) + Tv = eltype(A) + row_partition = partition(axes(A,1)) + d = pones(Tv,row_partition) + D = sparse_diag_matrix(T,d,axes(A)) + D-A +end + Base.similar(a::PSparseMatrix) = similar(a,eltype(a)) function Base.similar(a::PSparseMatrix,::Type{T}) where T matrix_partition = map(partition(a)) do values @@ -2400,6 +2627,74 @@ function repartition(A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) end end +### NEW ### +# Repartition that follows local data layout of type T (some sparse matrix format) +function repartition(::Type{T},A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) where T + @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" + function prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) + I1,J1,V1 = findnz(A_own_own) + I2,J2,V2 = findnz(A_own_ghost) + map_own_to_global!(I1,A_rows) + map_own_to_global!(I2,A_rows) + map_own_to_global!(J1,A_cols) + map_ghost_to_global!(J2,A_cols) + I = vcat(I1,I2) + J = vcat(J1,J2) + V = vcat(V1,V2) + (I,J,V) + end + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + A_rows = partition(axes(A,1)) + A_cols = partition(axes(A,2)) + I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays + + t = psparse(T,I,J,V,new_rows,new_cols;reuse=true) + @fake_async begin + B,cacheB = fetch(t) + if val_parameter(reuse) == false + B + else + cache = (V,cacheB) + B, cache + end + end +end + +### NEW ### +# Repartition that follows local data layout by using sparse function "sparse" +function repartition(sparse,A::PSparseMatrix,new_rows,new_cols;reuse=Val(false)) + @assert A.assembled "repartition on a sub-assembled matrix not implemented yet" + function 
prepare_triplets(A_own_own,A_own_ghost,A_rows,A_cols) + I1,J1,V1 = findnz(A_own_own) + I2,J2,V2 = findnz(A_own_ghost) + map_own_to_global!(I1,A_rows) + map_own_to_global!(I2,A_rows) + map_own_to_global!(J1,A_cols) + map_ghost_to_global!(J2,A_cols) + I = vcat(I1,I2) + J = vcat(J1,J2) + V = vcat(V1,V2) + (I,J,V) + end + A_own_own = own_own_values(A) + A_own_ghost = own_ghost_values(A) + A_rows = partition(axes(A,1)) + A_cols = partition(axes(A,2)) + I,J,V = map(prepare_triplets,A_own_own,A_own_ghost,A_rows,A_cols) |> tuple_of_arrays + t = psparse(sparse,I,J,V,new_rows,new_cols;reuse=true) + @fake_async begin + B,cacheB = fetch(t) + if val_parameter(reuse) == false + B + else + cache = (V,cacheB) + B, cache + end + end +end + + """ repartition!(B::PSparseMatrix,A::PSparseMatrix,cache) """ @@ -2469,6 +2764,28 @@ function centralize(A::PSparseMatrix) own_own_values(a_in_main) |> multicast |> getany end +### NEW ### +# Centralize function with local storage layout of type T (some sparse matrix format) +function centralize(::Type{T},A::PSparseMatrix) where T + m,n = size(A) + ranks = linear_indices(partition(A)) + rows_trivial = trivial_partition(ranks,m) + cols_trivial = trivial_partition(ranks,n) + a_in_main = repartition(T,A,rows_trivial,cols_trivial) |> fetch + own_own_values(a_in_main) |> multicast |> getany +end + +### NEW ### +# Centralize function that follows local data layout resulting from "sparse" +function centralize(sparse,A::PSparseMatrix) + m,n = size(A) + ranks = linear_indices(partition(A)) + rows_trivial = trivial_partition(ranks,m) + cols_trivial = trivial_partition(ranks,n) + a_in_main = repartition(sparse,A,rows_trivial,cols_trivial) |> fetch + own_own_values(a_in_main) |> multicast |> getany +end + """ psystem(I,J,V,I2,V2,rows,cols;kwargs...) 
""" @@ -2705,3 +3022,324 @@ function laplace_matrix(nodes_per_dir,parts_per_dir,ranks) I,J,V = map(setup,node_partition) |> tuple_of_arrays A = psparse(sparse,I,J,V,node_partition,node_partition) |> fetch end + + +################ NEW ################ + +# Locally transpose SplitMatrix +function explicit_transpose(A::AbstractSplitMatrix) + own_own = halfperm(A.blocks.own_own) + own_ghost = halfperm(A.blocks.ghost_own) + ghost_own = halfperm(A.blocks.own_ghost) + ghost_ghost = halfperm(A.blocks.ghost_ghost) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks,A.col_permutation,A.row_permutation) +end + +# Redistribute PSparseMatrix, returns unassembled transpose and a assmbly task when reuse is true, or only the assembly task otherwise +function explicit_transpose(A::PSparseMatrix;reuse=false) + mats = map(explicit_transpose,partition(A)) + rows, cols = axes(A) + B = PSparseMatrix(mats,partition(cols),partition(rows),false) + t = assemble(B,reuse=reuse) + if val_parameter(reuse) + B,t + else + t + end +end + +function explicit_transpose!(B::AbstractSplitMatrix,A::AbstractSplitMatrix) + halfperm!(B.blocks.own_own,A.blocks.own_own) + halfperm!(B.blocks.own_ghost,A.blocks.ghost_own) + halfperm!(B.blocks.ghost_own,A.blocks.own_ghost) + halfperm!(B.blocks.ghost_ghost,A.blocks.ghost_ghost) +end + +function explicit_transpose!(B::PSparseMatrix,B_local::PSparseMatrix,A::PSparseMatrix,cache) + map(explicit_transpose!,partition(B_local),partition(A)) + assemble!(B, B_local, cache) +end + +function add(A::PSparseMatrix,B::PSparseMatrix) + function add_own_own(A,B) + C = add(A,B) + # reuse IA/IB for cache + KA = precompute_nzindex(C,A) + KB = precompute_nzindex(C,B) + C,(KA,KB) + end + function add_own_ghost(own_ghost_A, own_ghost_B, colsA, colsB, cols) + # Minimize allocated memory, but could be replaced with findnz(...) 
+ iA,jA = find_indices(own_ghost_A) # local nonzero + vA = nonzeros(own_ghost_A) + iB,jB = find_indices(own_ghost_B) # local nonzero + vB = nonzeros(own_ghost_B) + jC = zeros(eltype(jA), (length(jA) + length(jB))) + ghostA_to_global = ghost_to_global(colsA) + ghostB_to_global = ghost_to_global(colsB) + global_to_ghostC = global_to_ghost(cols) + l = zero(eltype(jA)) + for k in eachindex(jA) + l += 1 + j = jA[k] + jC[l] = global_to_ghostC[ghostA_to_global[j]] + jA[k] = jC[l] + end + for k in eachindex(jB) + l += 1 + j = jB[k] + jC[l] = global_to_ghostC[ghostB_to_global[j]] + jB[k] = jC[l] + end + own_ghost = compresscoo(typeof(own_ghost_A), vcat(iA, iB), jC, vcat(vA, vB), size(own_ghost_A, 1), ghost_length(cols)) + # reuse auxiliary iA, iB arrays as caches + precompute_nzindex!(iA,own_ghost,iA,jA) + precompute_nzindex!(iB,own_ghost,iB,jB) + own_ghost, (iA, iB) + end + function _add(A,B) + colsA = partition(axes(A,2)) + colsB = partition(axes(B,2)) + J = map(ghost_to_global, colsB) + J_owner = map(ghost_to_owner, colsB) + cols = map(union_ghost, colsA, J, J_owner) + rows = partition(axes(A,1)) + Coo, Koo = map(add_own_own, own_own_values(A), own_own_values(B)) |> tuple_of_arrays + Cog, Kog = map(add_own_ghost, own_ghost_values(A), own_ghost_values(B), colsA, colsB, cols) |> tuple_of_arrays + C_vals = map(Coo,Cog,rows,cols) do Coo, Cog, rows, cols + Cgo = similar(Coo, 0, size(Coo,2)) + Cgg = similar(Coo, 0, size(Cog,2)) + blocks = split_matrix_blocks(Coo, Cog, Cgo, Cgg) + split_matrix(blocks, local_permutation(rows), local_permutation(cols)) + end + assembled = true + K = (Koo, Kog) + PSparseMatrix(C_vals,rows,cols,assembled), K + end + _add(A,B) +end + +function add!(C::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,cache) + function add_blocks!(C, A, B, K) + K_A, K_B = K + sparse_matrix!(C, nonzeros(A), K_A) + sparse_matrix!(C, nonzeros(B), K_B, reset=false) + end + Koo, Kog = cache + map(add_blocks!, own_own_values(C), own_own_values(A), own_own_values(B), Koo) + map(add_blocks!, own_ghost_values(C), own_ghost_values(A), own_ghost_values(B), Kog) +end + +# Interpret A as if its transpose is needed +function spmtmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled + @assert C.assembled + consistency_task = consistent(C, partition(axes(B,2)),reuse=true) + + Aoo = own_own_values(A) + Boo = own_own_values(B) + Cog = own_own_values(C) + + Aog = own_ghost_values(A) + Bog = own_ghost_values(B) + + Doo1, Doo_cache = map((A,B,C)->rap(transpose(A),B,C), Aoo,Boo,Cog) |> tuple_of_arrays + Dgo1, Dgo_cache = map((A,B,C)->rap(transpose(A),B,C), Aog,Boo,Cog) |> tuple_of_arrays + + # Collect ghost rows from P before continuing + C2, consistencyCache = fetch(consistency_task) + + Cog2 = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + Dgo2, Dgo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgo,Dgo_cache) |> tuple_of_arrays + Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Boo,Cog2,Dgo_cache) |> tuple_of_arrays + Dog2, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aog,Bog,Cgg,Dog_cache) |> tuple_of_arrays + + Dgo = map(add,Dgo1,Dgo2) # different sparsity patterns so not in-place. 
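+    # (An in-place accumulation would need both summands to share one sparsity
+    # pattern; the local `add` kernels in sequential_implementations.jl instead
+    # merge the two sorted patterns into a freshly allocated matrix.)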
+ Dog = map(add,Dog1,Dog2) + + D1_values = map(Dgo, Dog, partition(C), partition(C2)) do ghost_own, ghost_ghost, C_part, C2_part + own_own = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_own, 2)) + own_ghost = similar(ghost_ghost, size(C_part.blocks.own_own, 2), size(C2_part.blocks.own_ghost, 2)) + blocks = split_matrix_blocks(own_own, own_ghost, ghost_own, ghost_ghost) + split_matrix(blocks, C_part.col_permutation, C2_part.col_permutation) + end + D1_unassembled = PSparseMatrix(D1_values, partition(axes(C,2)), partition(axes(C2,2)), false) + assembly_task = assemble(D1_unassembled, reuse=true) + + Dog1, Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Boo,Cog2,Doo_cache) |> tuple_of_arrays + Doo2,Doo_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Dog2,Dog_cache = map((A,B,C,cache)->rap(transpose(A),B,C,cache), Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + + Doo = map(add,Doo1,Doo2) + Dog = map(add,Dog1,Dog2) + + Doo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Doo_cache,Doo) + Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) + Dgo_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dgo_cache,Dgo) + Dog_cache_final = map((cache,D)->reduce_spmtmm_cache(cache,typeof(D)),Dog_cache,Dog) + + D2_values = map(Doo, Dog, partition(C2)) do own_own, own_ghost, C_part + ghost_own = similar(own_own,0,size(own_own, 2)) + ghost_ghost = similar(own_ghost,0,size(own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, C_part.col_permutation, C_part.col_permutation) + end + + D1, assemblyCache = fetch(assembly_task) + D2 = PSparseMatrix(D2_values, partition(axes(D1,1)), partition(axes(C2,2)), true) + D, mergeCache = add(D1, D2) + sequential_caches = (Doo_cache_final, Dog_cache_final, Dgo_cache_final, Dog_cache_final) + if val_parameter(reuse) + cache = (C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches) + return D,cache + end + D +end + +function spmtmm(A::PSparseMatrix,P::PSparseMatrix;kwargs...) + @assert A.assembled + @assert P.assembled + spmtmm(transpose(P),A,P;kwargs...) 
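+    # Editor's usage sketch for the rap/spmtmm family defined above (assuming
+    # assembled inputs):
+    #
+    #   Ac, cache = rap(A,P;reuse=Val(true))  # coarse operator transpose(P)*A*P
+    #   # ... update the nonzero values of A ...
+    #   rap!(Ac,A,P,cache)                    # reuses sparsity and comm caches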
+end + +function spmtmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) + C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache, sequential_caches = cache + Doo_cache, Dog_cache, Dgo_cache, Dgg_cache = sequential_caches + C2, consistencyCache, D1, D1_unassembled, assemblyCache, D2, mergeCache = cache + + consistency_task = consistent!(C2, C, consistencyCache) + Doo = own_own_values(D2) + Dog = own_ghost_values(D2) + Dgo = ghost_own_values(D1_unassembled) + Dgg = ghost_ghost_values(D1_unassembled) + + Aoo = own_own_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + + Aog = own_ghost_values(A) + Bog = own_ghost_values(B) + + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Doo,Aoo,Boo,Coo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgo,Aog,Boo,Coo,Dgo_cache) + + # Collect ghost rows from P before continuing + wait(consistency_task) + Cog2 = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dgg,Aog,Boo,Cog2,Dgg_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgo,Aog,Bog,Cgo,Dgo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dgg,Aog,Bog,Cgg,Dgg_cache) + + assembly_task = assemble!(D1, D1_unassembled, assemblyCache) + + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,cache), Dog,Aoo,Boo,Cog2,Dog_cache) + map((D,A,B,C,cache)->rap!(D,transpose(A),B,C,1,1,cache), Dog,Aoo,Bog,Cgg,Dog_cache) + + wait(assembly_task) + add!(D, D1, D2, mergeCache) + D +end + +function spmtmm!(C::PSparseMatrix,A::PSparseMatrix,P::PSparseMatrix,cache) + spmtmm!(C,P,A,P,cache) +end + +function spmmm(A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix;reuse=Val(false)) + @assert A.assembled + @assert B.assembled + @assert C.assembled + B2_task = consistent(B,partition(axes(A,2)),reuse=true) + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + + Doo1,Doo_cache = map(rap,Aoo,Boo,Coo) |> tuple_of_arrays + B2, Bcache = fetch(B2_task) + C2_task = consistent(C,partition(axes(B2,2)),reuse=true) + + Bog = own_ghost_values(B2) + Bgo = ghost_own_values(B2) + Bgg = ghost_ghost_values(B2) + + Doo2,Doo_cache = map(rap,Aog,Bgo,Coo,Doo_cache) |> tuple_of_arrays + Doo12 = map(add,Doo1,Doo2) + + C2, Ccache = fetch(C2_task) + + Cog = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + Doo3,Doo_cache = map(rap,Aoo,Bog,Cgo,Doo_cache) |> tuple_of_arrays + Doo4,Doo_cache = map(rap,Aog,Bgg,Cgo,Doo_cache) |> tuple_of_arrays + + Doo34 = map(add,Doo3,Doo4) + Doo = map(add,Doo12,Doo34) + + Dog1,Dog_cache = map(rap,Aoo,Boo,Cog) |> tuple_of_arrays + Dog2,Dog_cache = map(rap,Aog,Bgo,Cog,Dog_cache) |> tuple_of_arrays + Dog3,Dog_cache = map(rap,Aoo,Bog,Cgg,Dog_cache) |> tuple_of_arrays + Dog4,Dog_cache = map(rap,Aog,Bgg,Cgg,Dog_cache) |> tuple_of_arrays + + Dog12 = map(add,Dog1,Dog2) + Dog34 = map(add,Dog3,Dog4) + Dog = map(add,Dog12,Dog34) + + D_values = map(Doo, Dog, partition(A),partition(C2)) do own_own, own_ghost, A_part,C_part + ghost_own = similar(own_own,0,size(own_own, 2)) + ghost_ghost = similar(own_ghost,0,size(own_ghost, 2)) + blocks = split_matrix_blocks(own_own,own_ghost,ghost_own,ghost_ghost) + split_matrix(blocks, A_part.row_permutation, C_part.col_permutation) + end + + D = PSparseMatrix(D_values, partition(axes(A,1)), partition(axes(C2,2)), true) + if 
val_parameter(reuse) + cache = B2,Bcache,C2,Ccache,(Doo_cache,Dog_cache) + return D,cache + end + D +end + +function spmmm!(D::PSparseMatrix,A::PSparseMatrix,B::PSparseMatrix,C::PSparseMatrix,cache) + B2,Bcache,C2,Ccache,sequential_caches = cache + Doo_cache, Dog_cache = sequential_caches + B2_task = consistent!(B2,B,Bcache) + + Doo = own_own_values(D) + Dog = own_ghost_values(D) + Aoo = own_own_values(A) + Aog = own_ghost_values(A) + Boo = own_own_values(B) + Coo = own_own_values(C) + map(rap!,Doo,Aoo,Boo,Coo,Doo_cache) + wait(B2_task) + + C2_task = consistent!(C2,C,Ccache) + Bog = own_ghost_values(B2) + Bgo = ghost_own_values(B2) + Bgg = ghost_ghost_values(B2) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgo,Coo,Doo_cache) + + wait(C2_task) + Cog = own_ghost_values(C2) + Cgo = ghost_own_values(C2) + Cgg = ghost_ghost_values(C2) + + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aoo,Bog,Cgo,Doo_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Doo,Aog,Bgg,Cgo,Doo_cache) + map(rap!,Dog,Aoo,Boo,Cog,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgo,Cog,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aoo,Bog,Cgg,Dog_cache) + map((D,A,B,C,cache)->rap!(D,A,B,C,1,1,cache),Dog,Aog,Bgg,Cgg,Dog_cache) + D +end \ No newline at end of file diff --git a/src/sequential_implementations.jl b/src/sequential_implementations.jl new file mode 100644 index 00000000..ed952606 --- /dev/null +++ b/src/sequential_implementations.jl @@ -0,0 +1,1680 @@ +function matmul(A::Union{Transpose{TvA,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvA, + B::Union{Transpose{TvB,<:SparseMatrixCSC},<:SparseMatrixCSC} where TvB) + A*B +end + +function matmul(A::SparseMatrixCSR,B::SparseMatrixCSR) + C = matmul(ascsc(B),ascsc(A)) + ascsr(C) +end + +function matmul(At::Transpose{Tv,<:SparseMatrixCSR} where Tv,B::SparseMatrixCSR) + C = matmul(ascsc(B),transpose(ascsc(At.parent))) + ascsr(C) +end + +function matmul(A::SparseMatrixCSR,Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv) + C = transpose(ascsc(Bt.parent))*ascsc(A) + ascsr(C) +end + +function matmul(At::Transpose{TvA,<:SparseMatrixCSR} where TvA,Bt::Transpose{TvB,<:SparseMatrixCSR} where TvB) + C = transpose(ascsc(Bt.parent))*transpose(ascsc(At.parent)) + ascsr(C) +end + +function mul(x::Number,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a -> x*a, A.nzval)) +end + +function mul(A::SparseMatrixCSR,x::Number) mul(x,A) end + +# Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
+function add(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + IC = Vector{Ti}(undef, p+1) + JC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + for i in 1:p + IC[i] = pC + jpA_range = nzrange(A, i) + jpA, jpA_end = jpA_range.start, jpA_range.stop + jpB_range = nzrange(B, i) + jpB, jpB_end = jpB_range.start, jpB_range.stop + while jpA <= jpA_end && jpB <= jpB_end + jA = JA[jpA] + jB = JB[jpB] + if jA < jB + JC[pC] = jA + VC[pC] = VA[jpA] + jpA += 1 + elseif jB < jA + JC[pC] = jB + VC[pC] = VB[jpB] + jpB += 1 + else + JC[pC] = jA + VC[pC] = VA[jpA] + VB[jpB] + jpA += 1 + jpB += 1 + end + pC += 1 + end + while jpA <= jpA_end + JC[pC] = JA[jpA] + VC[pC] = VA[jpA] + jpA += 1 + pC += 1 + end + while jpB <= jpB_end + JC[pC] = JB[jpB] + VC[pC] = VB[jpB] + jpB += 1 + pC += 1 + end + end + IC[end] = pC + resize!(JC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSR{Bi}(p,q,IC,JC,VC) # A += B +end + +# Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. +function subtract(A::SparseMatrixCSR{Bi,TvA,TiA},B::SparseMatrixCSR{Bi,TvB,TiB}) where {Bi,TvA,TvB,TiA,TiB} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + nnz_C_upperbound = nnz(A) + nnz(B) + p,r = size(A) + IC = Vector{Ti}(undef, p+1) + JC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + for i in 1:p + IC[i] = pC + jpA_range = nzrange(A, i) + jpA, jpA_end = jpA_range.start, jpA_range.stop + jpB_range = nzrange(B, i) + jpB, jpB_end = jpB_range.start, jpB_range.stop + while jpA <= jpA_end && jpB <= jpB_end + jA = JA[jpA] + jB = JB[jpB] + if jA < jB + JC[pC] = jA + VC[pC] = VA[jpA] + jpA += 1 + elseif jB < jA + JC[pC] = jB + VC[pC] = -VB[jpB] + jpB += 1 + else + JC[pC] = jA + VC[pC] = VA[jpA] - VB[jpB] + jpA += 1 + jpB += 1 + end + pC += 1 + end + while jpA <= jpA_end + JC[pC] = JA[jpA] + VC[pC] = VA[jpA] + jpA += 1 + pC += 1 + end + while jpB <= jpB_end + JC[pC] = JB[jpB] + VC[pC] = -VB[jpB] + jpB += 1 + pC += 1 + end + end + IC[end] = pC + resize!(JC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSR{Bi}(p,r,IC,JC,VC) # A += B +end + +function subtract(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(A.colval), map(a->-a, A.nzval)) +end + +# Alternative to lazy csr to csc for matrix addition that does not drop structural zeros. 
+function add(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} + if size(A) != size(B) && throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + JC = Vector{Ti}(undef, q+1) + IC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + IA = rowvals(A) + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:q + JC[j] = pC + ipA_range = nzrange(A, j) + ipA, ipA_end = ipA_range.start, ipA_range.stop + ipB_range = nzrange(B, j) + ipB, ipB_end = ipB_range.start, ipB_range.stop + while ipA <= ipA_end && ipB <= ipB_end + iA = IA[ipA] + iB = IB[ipB] + if iA < iB + IC[pC] = iA + VC[pC] = VA[ipA] + ipA += 1 + elseif iB < iA + IC[pC] = iB + VC[pC] = VB[ipB] + ipB += 1 + else + IC[pC] = iA + VC[pC] = VA[ipA] + VB[ipB] + ipA += 1 + ipB += 1 + end + pC += 1 + end + while ipA <= ipA_end + IC[pC] = IA[ipA] + VC[pC] = VA[ipA] + ipA += 1 + pC += 1 + end + while ipB <= ipB_end + IC[pC] = IB[ipB] + VC[pC] = VB[ipB] + ipB += 1 + pC += 1 + end + end + JC[end] = pC + resize!(IC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC) +end + +# Alternative to lazy csr to csc for matrix subtraction that does not drop structural zeros. Subtracts B from A, i.e. A - B. +function subtract(A::SparseMatrixCSC{TvA,TiA},B::SparseMatrixCSC{TvB,TiB}) where {TvA,TvB,TiA,TiB} + if size(A) == size(B) || throw(DimensionMismatch("Size of B $(size(B)) must match size of A $(size(A))"));end + Ti = promote_type(TiA,TiB) + Tv = promote_type(TvA,TvB) + p,q = size(A) + nnz_C_upperbound = nnz(A) + nnz(B) + JC = Vector{Ti}(undef, q+1) + IC = Vector{Ti}(undef, nnz_C_upperbound) + VC = Vector{Tv}(undef, nnz_C_upperbound) + + pC = 1 + IA = rowvals(A) + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:q + JC[j] = pC + ipA_range = nzrange(A, j) + ipA, ipA_end = ipA_range.start, ipA_range.stop + ipB_range = nzrange(B, j) + ipB, ipB_end = ipB_range.start, ipB_range.stop + while ipA <= ipA_end && ipB <= ipB_end + iA = IA[ipA] + iB = IB[ipB] + if iA < iB + IC[pC] = iA + VC[pC] = VA[ipA] + ipA += 1 + elseif iB < iA + IC[pC] = iB + VC[pC] = VB[ipB] + ipB += 1 + else + IC[pC] = iA + VC[pC] = VA[ipA] - VB[ipB] + ipA += 1 + ipB += 1 + end + pC += 1 + end + while ipA <= ipA_end + IC[pC] = IA[ipA] + VC[pC] = VA[ipA] + ipA += 1 + pC += 1 + end + while ipB <= ipB_end + IC[pC] = IB[ipB] + VC[pC] = -VB[ipB] + ipB += 1 + pC += 1 + end + end + JC[end] = pC + resize!(IC, (pC-1)) + resize!(VC, (pC-1)) + SparseMatrixCSC{Tv,Ti}(p,q,JC,IC,VC) +end + +function subtract(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + SparseMatrixCSC{Tv,Ti}(size(A)..., copy(A.colptr), copy(A.rowval), map(a->-a, A.nzval)) +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + cache) + matmul!(ascsr(C),ascsr(B),ascsr(A),cache) + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) + matmul!(ascsr(C),ascsr(B),ascsr(A),α,β,cache) + C +end + +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC) + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = 
At.parent + VC = nonzeros(C) + VC .= 0 + IC = rowvals(C) + JA = rowvals(A) # When virtually transposed rowvals represent colvals. + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:s + # loop over columns "j" in row i of A + Bj = nzrange(B, j) + ptrB_start = Bj.start + ptrB_stop = Bj.stop + for ip in nzrange(C, j) + i = IC[ip] + # loop over columns "k" in row j of B + Ai = nzrange(A, i) + ptrB = ptrB_start + ptrA = Ai.start + vC = 0 + while ptrA <= Ai.stop && ptrB <= ptrB_stop + jA = JA[ptrA] + iB = IB[ptrB] + if jA < iB + ptrA += 1 + elseif iB < jA + ptrB += 1 + else # jA == iB + vC += VA[ptrA]*VB[ptrB] + ptrA += 1 + ptrB += 1 + end + end + VC[ip] = vC + end + end + C +end + +function matmul!(C::SparseMatrixCSC{Tv,Ti}, + At::Transpose{Tv,SparseMatrixCSC{Tv,Ti}}, + B::SparseMatrixCSC{Tv,Ti}, + α::Number, + β::Number) where {Tv,Ti} + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + IC = rowvals(C) + VC .*= β + JA = rowvals(A) # When virtually transposed rowvals represent colvals. + VA = nonzeros(A) + IB = rowvals(B) + VB = nonzeros(B) + for j in 1:s + # loop over columns "j" in row i of A + Bj = nzrange(B, j) + for jp in nzrange(C, j) + i = IC[jp] + # loop over columns "k" in row j of B + Ai = nzrange(A, i) + ptrB = Bj.start + ptrA = Ai.start + vC = 0 + while ptrA <= Ai.stop && ptrB <= Bj.stop + jA = JA[ptrA] + iB = IB[ptrB] + if jA == iB + vC += VA[ptrA]*VB[ptrB] + ptrA += 1 + ptrB += 1 + elseif jA < iB + ptrA += 1 + else + ptrB += 1 + end + end + VC[jp] += α*vC + end + end + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A)) + C +end + +function matmul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + B::SparseMatrixCSR, + cache) + a,b = size(C) + p,q = size(A) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(eltype(C)) + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + # A cache here would remove need for allocating acumulating arrays + # xb = zeros(Ti, p) + xb,x = cache + xb .= 0 + # x = similar(xb, Tv) # sparse accumulator, can be zeros() to remove if statement in inner loop. + for i in 1:p # ! + # loop over rows Ai in col Bj + for jpa in nzrange(A, i) + ja = JA[jpa] + va = VA[jpa] + # loop over columns "k" in row j of B + for jpb in nzrange(B, ja) + jb = JB[jpb] + vb = VB[jpb] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[jb] != i + xb[jb] = i + x[jb] = va*vb + else + x[jb] += va*vb + end + end + end + for jpc in nzrange(C,i) + jc = JC[jpc] + # To support in-place products whose sparsity patterns are subsets of the sparsity of C, this check is required. 
+ if xb[jc] == i + VC[jpc] = x[jc] + end + end + end + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),cache) + C +end + +function matmul!(C::SparseMatrixCSR, + A::SparseMatrixCSR, + B::SparseMatrixCSR, + α::Number, + β::Number, + cache) + a,b = size(C) + p,q = size(A) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + JA = colvals(A) + VA = nonzeros(A) + JB = colvals(B) + VB = nonzeros(B) + # A cache here would remove need for allocating acumulating arrays + # xb = zeros(Ti, p) + xb,x = cache + xb .= 0 + # x = similar(xb, Tv) # sparse accumulator, can be zeros() to remove if statement in inner loop. + for i in 1:p # ! + # loop over rows Ai in col Bj + for jpa in nzrange(A, i) + ja = JA[jpa] + va = VA[jpa] + # loop over columns "k" in row j of B + for jpb in nzrange(B, ja) + jb = JB[jpb] + vb = VB[jpb] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[jb] != i + xb[jb] = i + x[jb] = va*vb + else + x[jb] += va*vb + end + end + end + for jpc in nzrange(C,i) + jc = JC[jpc] + # To support in-place products whose sparsity patterns are subsets of the sparsity of C, this check is required. + if xb[jc] == i + VC[jpc] += α * x[jc] + end + end + end + C +end + +function matmul!(C::SparseMatrixCSC, + A::SparseMatrixCSC, + Bt::Transpose{Tv,<:SparseMatrixCSC} where Tv, + α::Number, + β::Number, + cache) + matmul!(ascsr(C),transpose(ascsr(Bt.parent)),ascsr(A),α,β,cache) + C +end + +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + cache) + matmul!(ascsr(C),ascsr(B),transpose(ascsr(At.parent))) + C +end + +function matmul!(C::SparseMatrixCSC, + At::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + α::Number, + β::Number, + cache) + matmul!(ascsr(C),ascsr(A),transpose(ascsr(At.parent)),α,β) + C +end + +# Workaround to supply in-place matmul with auxiliary array, as these are not returned by multiply function exported by SparseArrays +function construct_spmm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} + q = size(A,2) + xb = zeros(Ti,q) + x = similar(xb,Tv) + xb,x +end +function construct_spmm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + construct_spmm_cache(ascsr(A)) +end + +function construct_spmtm_cache(A::SparseMatrixCSR{Bi,Tv,Ti} where Bi) where {Tv,Ti} + q = size(A,2) + xb = zeros(Ti,q) + x = similar(xb,Tv) + xb,x +end + +function construct_spmtm_cache(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti} + construct_spmtm_cache(ascsr(A)) +end + +function matmul!(C::SparseMatrixCSR, + At::Transpose{Tv,<:SparseMatrixCSR} where Tv, + B::SparseMatrixCSR, + cache) + a,b = size(C) + p,q = size(At) + r,s = size(B) + if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(p),$(q))"));end + if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end + A = At.parent + VC = nonzeros(C) + VC .= zero((eltype(C))) + JC = colvals(C) + JA = colvals(A) # When virtually transposed colvals represent rowvals. 
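+    # Editor's note: transpose(A)*B is accumulated as a sum of outer products.
+    # For each k, row k of B is scattered into the (xb,x) accumulator and then,
+    # for every stored entry A[k,i], added into row i of C, touching only
+    # positions already present in C's sparsity pattern.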
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    xb,x = cache
+    xb .= 0
+    for k in 1:q
+        # scatter row k of B into the dense accumulator x, stamping xb with k
+        for jpb in nzrange(B,k)
+            jb = JB[jpb]
+            vb = VB[jpb]
+            xb[jb] = k
+            x[jb] = vb
+        end
+        for ipa in nzrange(A,k)
+            ia = JA[ipa] # interpret column index of A as row index of A^T.
+            va = VA[ipa]
+            for jpc in nzrange(C, ia)
+                jc = JC[jpc]
+                # This check is required, as the outer product might not contribute to all nonzero entries in this row of C.
+                if xb[jc] == k
+                    VC[jpc] += va*x[jc]
+                end
+            end
+        end
+    end
+    C
+end
+
+function matmul!(C::SparseMatrixCSR,
+                 At::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 B::SparseMatrixCSR,
+                 α::Number,
+                 β::Number,
+                 cache)
+    a,b = size(C)
+    p,q = size(At)
+    r,s = size(B)
+    if q != r && throw(DimensionMismatch("A has dimensions ($(p),$(q)) but B has dimensions ($(r),$(s))"));end
+    if (a,b) != (p,s) && throw(DimensionMismatch("C has dimensions $((a,b)) but AB will have dimensions ($(p),$(s))"));end
+    A = At.parent
+    VC = nonzeros(C)
+    VC .*= β
+    JC = colvals(C)
+    JA = colvals(A) # When virtually transposed colvals represent rowvals.
+    VA = nonzeros(A)
+    JB = colvals(B)
+    VB = nonzeros(B)
+    xb,x = cache
+    xb .= 0
+    for k in 1:q
+        # scatter row k of B into the dense accumulator x, stamping xb with k
+        for jpb in nzrange(B,k)
+            jb = JB[jpb]
+            vb = VB[jpb]
+            xb[jb] = k
+            x[jb] = α*vb
+        end
+        for ipa in nzrange(A,k)
+            ia = JA[ipa] # interpret column index of A as row index of A^T.
+            va = VA[ipa]
+            for jpc in nzrange(C, ia)
+                jc = JC[jpc]
+                # This check is required, as the outer product might not contribute to all nonzero entries in this row of C.
+                if xb[jc] == k
+                    VC[jpc] += va*x[jc]
+                end
+            end
+        end
+    end
+    C
+end
+
+function matmul!(C::SparseMatrixCSR,
+                 A::SparseMatrixCSR,
+                 Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv)
+    matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A))
+    C
+end
+
+function matmul!(C::SparseMatrixCSR,
+                 A::SparseMatrixCSR,
+                 Bt::Transpose{Tv,<:SparseMatrixCSR} where Tv,
+                 α::Number,
+                 β::Number)
+    matmul!(ascsc(C), transpose(ascsc(Bt.parent)), ascsc(A), α, β)
+    C
+end
+
+function rap(A::Union{Transpose{TA,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TA,
+             B::M where M<:AbstractSparseMatrix,
+             C::Union{Transpose{TC,<:AbstractSparseMatrix},<:AbstractSparseMatrix} where TC
+             ;reuse=Val(true))
+    D,cache = rap(A,B,C)
+    if val_parameter(reuse)
+        return D,cache
+    end
+    D
+end
+
+# PtAP variants
+function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}},
+             A::SparseMatrixCSR{Bi,TvA,TiA},
+             P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP}
+    p,q = size(Rt)
+    m,r = size(A)
+    n,s = size(P)
+    if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s)"));end
+    if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end
+    function rap_symbolic_count!(R,A,P)
+        Ti = promote_type(TiR,TiA,TiP)
+        Tv = promote_type(TvR,TvA,TvP)
+        JR = R.data
+        JA = colvals(A)
+        JP = colvals(P)
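+        # Symbolic phase (sketch of the idea): two Gustavson-style passes per row i.
+        # First the column pattern of row i of R*A is gathered into JRA, using xbRA
+        # as a row stamp; then the pattern of (R*A)*P is counted through xbC, which
+        # yields the row pointer array IC of C before JC is filled in.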
+ xbRA = zeros(Ti, r) + xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP + xbRA .= 0 + xbC .= 0 + cache = (xbRA,JRA,xbC,JAP) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized + end + function rap_symbolic_fill!(C,R,A,P,cache) + (xbRA,JRA,xbC,JAP) = cache + JC = colvals(C) + JR = R.data + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + pC = 0 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + pC += 1 + xbC[k] = i + JC[pC] = k + end + end + end + end + xbC .= 0 + outer_cache = (xbC,similar(xbC, eltype(C)),JAP) + C, outer_cache # values not yet initialized + end + function _rap(Rt,A,P) + R = symbolic_halfperm(Rt.parent) + C,symbolic_cache = rap_symbolic_count!(R,A,P) # precompute nz structure with a symbolic transpose + _,outer_cache = rap_symbolic_fill!(C,R,A,P,symbolic_cache) + Ct = symbolic_halfperm(C) + symbolic_halfperm!(C,Ct) + rap!(C,Rt,A,P,outer_cache),(outer_cache...,R) + end + _rap(Rt,A,P) +end + +function rap(Rt::Transpose{TvR,SparseMatrixCSR{Bi,TvR,TiR}}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}, + cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(Rt) + m,r = size(A) + n,s = size(P) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + + function rap_symbolic_count(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + JR = R.data + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
+ xbRA = zeros(Ti, r) + xbC = zeros(Ti, s) # this vector will also serve as as colptr array in halfperm + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + JAP = Vector{Ti}(undef,min(max_rA*max_rP,s)) # upper bound estimate for length of virtual row of AP + xbRA .= 0 + xbC .= 0 + SparseMatrixCSR{Bi}(p,s,IC,JC,VC),(xbRA,JRA,xbC,JAP) # values in CSR matrix not yet initialized + end + function rap_symbolic_fill!(C,R,A,P,cache) + (xbRA,JRA,xbC,JAP) = cache + JC = colvals(C) + JR = R.data + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + pC = 0 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in jagged_range(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + pC += 1 + xbC[k] = i + JC[pC] = k + end + end + end + end + xbC .= 0 + C, (xbC,similar(xbC, eltype(C)),JAP) # values not yet initialized + end + function _rap(Rt,A,P,old_cache) + xb,x,JAP,R = old_cache + old_outer_cache = (xb,x,JAP) + C,symbolic_cache = rap_symbolic_count(R, A, P) + _,new_outer_cache = rap_symbolic_fill!(C,R, A, P, symbolic_cache) + Ct = symbolic_halfperm(C) + symbolic_halfperm!(C,Ct) + outer_cache = map((c1,c2) -> length(c1) >= length(c2) ? 
c1 : c2, old_outer_cache,new_outer_cache) + rap!(C,Rt,A,P,outer_cache),(outer_cache...,R) + end + _rap(Rt,A,P,cache) +end + +function reduce_spmtmm_cache(cache,::Type{SparseMatrixCSR}) + (xb,x,JAP,_) = cache + (xb,x,JAP) +end + +function rap!(C::SparseMatrixCSR, + Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + cache) + (a,b) = size(C) + p,q = size(Rt) + m,r = size(A) + n,s = size(P) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end + R = Rt.parent + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(eltype(C)) + + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) + VP = nonzeros(P) + xb, x, JAP = cache + xb .= 0 + # loop over rows in A + for i in 1:m + lp = 0 + # loop over columns "j" in row i of A + for jp in nzrange(A, i) + j = JA[jp] + va = VA[jp] + # loop over columns "k" in row j of B + for kp in nzrange(P, j) + k = JP[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[k] != i + lp += 1 + JAP[lp] = k + xb[k] = i + x[k] = va * VP[kp] + else + x[k] += va * VP[kp] + end + end + end + for kp in nzrange(R, i) + k = colvals(R)[kp] # rowvals when transposed conceptually + v = nonzeros(R)[kp] + for jp in nzrange(C,k) + j = JC[jp] + if xb[j] == i + VC[jp] += v*x[j] + end + end + end + end + C +end + +function rap!(C::SparseMatrixCSR, + Rt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + α::Number, + β::Number, + cache) + (a,b) = size(C) + p,q = size(Rt) + m,r = size(A) + n,s = size(P) + if r != n && throw(DimensionMismatch("Invalid dimensions for A*P: ($m,$r)*($n,$s),"));end + if q != m && throw(DimensionMismatch("Invalid dimensions: R*AP: ($p,$q)*($m,$s)"));end + if (a,b) != (p,s) && throw(DimensionMismatch("Dimensions of C $(size(C)) don't match dimensions of R*A*P ($p,$q)*($m,$r)*($n,$s)."));end + R = Rt.parent + JC = colvals(C) + VC = nonzeros(C) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) + VP = nonzeros(P) + xb, x, JAP = cache + xb .= 0 + VC .*= β + # loop over rows in A + for i in 1:m + lp = 0 + # loop over columns "j" in row i of A + for jp in nzrange(A, i) + j = JA[jp] + va = α*VA[jp] + # loop over columns "k" in row j of B + for kp in nzrange(P, j) + k = JP[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
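+                # Since va was scaled as va = α*VA[jp] above, every product
+                # va*VP[kp] carries the factor α exactly once; the final gather
+                # over R below then just adds v*x into the β-scaled VC.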
+ if xb[k] != i + lp += 1 + JAP[lp] = k + xb[k] = i + x[k] = va*VP[kp] + else + x[k] += va*VP[kp] + end + end + end + for kp in nzrange(R, i) + k = colvals(R)[kp] # rowvals when transposed conceptually + vpl = nonzeros(R)[kp] + for jp in nzrange(C,k) + j = JC[jp] + if xb[j] == i + VC[jp] += vpl*x[j] + end + end + end + end + C +end + +# RAP variants +function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + + function rap_symbolic!(R,A,P) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + + JR = colvals(R) + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + xbRA = zeros(Ti, r) + xbC = zeros(Ti, s+1) # this vector will also serve as as colptr array in halfperm + xRA = similar(xbRA, Tv) # sparse accumulator + xC = similar(xbC, Tv) # sparse accumulator + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + + JRA = Vector{Ti}(undef,max_rC) + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + cache = (xbRA,xRA,JRA,xbC,xC) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC), cache # values not yet initialized + end + function rap_numeric!(C,R,A,P,cache) + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + (xbRA,xRA,JRA,xbC,xC) = cache + jpC = 1 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
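+                    # xRA accumulates the numeric values of row i of R*A alongside
+                    # the pattern stored in JRA; the following loop multiplies this
+                    # sparse row with P, scattering into xC, before the final copy
+                    # into VC over nzrange(C,i).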
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + JC[jpC] = k + jpC += 1 + xC[k] = xRA[j]*VP[kp] + else + xC[k] += xRA[j]*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + VC[ind] = xC[j] + end + end + end + function _rap(R,A,P) + C,(xbRA,xRA,JRA,xbC,xC) = rap_symbolic!(R,A,P) + xbRA .= 0 + xbC .= 0 + cache = (xbRA,xRA,JRA,xbC,xC) + rap_numeric!(C,R,A,P,cache) + Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) + halfperm!(C,Ct) + C,cache + end + _rap(R,A,P) +end + +function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSR) + (xb,x,JAP,_) = cache + (xb,x,JAP) +end + +function reduce_spmtmm_cache(cache,::Type{M} where M <: SparseMatrixCSC) + reduce_spmmmt_cache(cache,SparseMatrixCSR) +end + +function rap(R::SparseMatrixCSR{Bi,TvR,TiR}, + A::SparseMatrixCSR{Bi,TvA,TiA}, + P::SparseMatrixCSR{Bi,TvP,TiP}, + cache) where {Bi,TvR,TvA,TvP,TiR,TiA,TiP} + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + + function rap_symbolic!(R,A,P,cache) + Ti = promote_type(TiR,TiA,TiP) + Tv = promote_type(TvR,TvA,TvP) + JR = colvals(R) + JA = colvals(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + (xbRA,_,JRA,xbC,_) = cache + IC = Vector{Ti}(undef,p+1) + nnz_C = 1 + IC[1] = nnz_C + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + end + end + end + ccC = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + ccC += 1 + end + end + end + nnz_C += ccC + IC[i+1] = nnz_C + end + JC = Vector{Ti}(undef, nnz_C-1) + VC = zeros(Tv,nnz_C-1) + SparseMatrixCSR{Bi}(p,s,IC,JC,VC) # values not yet initialized + end + function rap_numeric!(C,R,A,P,cache) + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + (xbRA,xRA,JRA,xbC,xC) = cache + jpC = 1 + for i in 1:p + ccRA = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
+ if xbRA[k] != i + ccRA += 1 + JRA[ccRA] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:ccRA + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + JC[jpC] = k + jpC += 1 + xC[k] = xRA[j]*VP[kp] + else + xC[k] += xRA[j]*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + VC[ind] = xC[j] + end + end + end + function _rap(R,A,P,old_cache) + max_rR = find_max_row_length(R) + max_rA = find_max_row_length(A) + max_rP = find_max_row_length(P) + (xbRA,xRA,JRA,xbC,xC) = old_cache + max_rC = max((max_rR*max_rA*max_rP),(max_rA*max_rR)) + JRA2 = max_rC > length(JRA) ? similar(JRA,max_rC) : JRA + if r > length(xbRA) + xbRA2 = similar(xbRA,r) + xRA2 = similar(xRA,r) + else + xbRA2 = xbRA + xRA2 = xRA + end + + new_cache = (xbRA2,xRA2,JRA2,xbC,xC) + xbRA2 .= 0 + xbC .= 0 + C = rap_symbolic!(R,A,P,new_cache) + xbRA2 .= 0 + xbC .= 0 + rap_numeric!(C,R,A,P,new_cache) + Ct = halfperm!(xbC,similar(colvals(C)),similar(nonzeros(C)),C) + halfperm!(C,Ct) + C,new_cache + end + _rap(R,A,P,cache) +end + +function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSR) + (xbRA,xRA,JRA,_,_) = cache + (xbRA,xRA,JRA) +end + +function reduce_spmmmt_cache(cache,::Type{M} where M <: SparseMatrixCSC) + reduce_spmtmm_cache(cache,SparseMatrixCSR) +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + VC .= zero(eltype(C)) + (xbRA,xRA,JRA,xbC,xC) = cache + xbRA .= 0 + xbC .= 0 + for i in 1:p + lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + va = VA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + lp += 1 + JRA[lp] = k + xbRA[k] = i + xRA[k] = vpl * va + else + xRA[k] += vpl * va + end + end + end + for jp in 1:lp + j = JRA[jp] + vra = xRA[j] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + xC[k] = vra*VP[kp] + else + xC[k] += vra*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + if xbC[j] == i + VC[ind] = xC[j] + end + end + end + C +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + P::SparseMatrixCSR, + α::Number, + β::Number, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(P) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + JP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. 
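+    # 5-argument BLAS-style semantics: VC is scaled by β below, and only entries
+    # reached by the pattern of R*A*P (assumed to be contained in the pattern of C)
+    # receive the α-scaled update; untouched entries of C correspond to zero
+    # entries of R*A*P, so overall C = β*C + α*R*A*P.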
+ VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + (xbRA,xRA,JRA,xbC,xC) = cache + xbRA .= 0 + xbC .= 0 + # xC .= zero(Tv) + for i in 1:p + lp = 0 # local column pointer, refresh every row, start at 0 to allow empty rows + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xbRA[k] != i + lp += 1 + JRA[lp] = k + xbRA[k] = i + xRA[k] = vpl * VA[kp] + else + xRA[k] += vpl * VA[kp] + end + end + end + for jp in 1:lp + j = JRA[jp] + for kp in nzrange(P,j) + k = JP[kp] + if xbC[k] != i + xbC[k] = i + xC[k] = xRA[j]*VP[kp] + else + xC[k] += xRA[j]*VP[kp] + end + end + end + for ind in nzrange(C,i) + j = JC[ind] + if xbC[j] == i + VC[ind] += α*xC[j] + end + end + end + C +end + +# RARt variants +function rap(R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end + rap(R,A,copy(Pt)) +end + +function rap(R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions for RA*P: ($p,$r)*($n,$s)"));end + rap(R,A,copy(Pt),cache) +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + P = Pt.parent + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed + xb,x = cache + xb .= 0 + for i in 1:p + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. 
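+                    # In this RARt variant the accumulated row x of R*A is not
+                    # scattered through P; instead, for every stored column jP of
+                    # row i of C, the loop below gathers the dot product of x with
+                    # row jP of Pt.parent (a column of the virtual transpose).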
+ if xb[k] != i + xb[k] = i + x[k] = vpl * VA[kp] + else + x[k] += vpl * VA[kp] + end + end + end + for jpP in nzrange(C,i) + jP = JC[jpP] + v = zero(eltype(C)) + for ip in nzrange(P,jP) + iP = IP[ip] + if xb[iP] == i + v += x[iP]*VP[ip] + end + end + VC[jpP] = v + end + end + C +end + +function rap!(C::SparseMatrixCSR, + R::SparseMatrixCSR, + A::SparseMatrixCSR, + Pt::Transpose{Tv,<:SparseMatrixCSR} where Tv, + α::Number, + β::Number, + cache) + p,q = size(R) + m,r = size(A) + n,s = size(Pt) + if q == m || throw(DimensionMismatch("Invalid dimensions for R*A: ($p,$q)*($m,$r),"));end + if r == n || throw(DimensionMismatch("Invalid dimensions: RA*P: ($p,$r)*($n,$s)"));end + P = Pt.parent + JR = colvals(R) + VR = nonzeros(R) + JA = colvals(A) + VA = nonzeros(A) + IP = colvals(P) # colvals can be interpreted as rowvals when P is virtually transposed. + VP = nonzeros(P) + JC = colvals(C) + VC = nonzeros(C) + VC .*= β + # some cache items are present with the regular rap product in mind, which is how the allocating verison is performed + xb,x = cache + xb .= 0 + for i in 1:p + # loop over columns "j" in row i of A + for jp in nzrange(R, i) + j = JR[jp] + vpl = VR[jp] + # loop over columns "k" in row j of B + for kp in nzrange(A, j) + k = JA[kp] + # since C is constructed rowwise, xb tracks if a column index is present in a new row in C. + if xb[k] != i + xb[k] = i + x[k] = vpl * VA[kp] + else + x[k] += vpl * VA[kp] + end + end + end + for jpP in nzrange(C,i) + jP = JC[jpP] + v = zero(eltype(C)) + for ip in nzrange(P,jP) + iP = IP[ip] + if xb[iP] == i + v += x[iP]*VP[ip] + end + end + VC[jpP] += α*v + end + end + C +end + +### CSC in terms of CSR +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC) + D,cache = rap(ascsr(C),ascsr(B),ascsr(A)) + ascsc(D),cache +end + +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + D,new_cache = rap(ascsr(C),ascsr(B),ascsr(A),cache) + ascsc(D),new_cache +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),cache) + D +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + α::Number, + β::Number, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),ascsr(A),α,β,cache) + D +end + +# PtAP +function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC) + D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent))) + ascsc(D),cache +end + +function rap(A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + D,cache = rap(ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + ascsc(D),cache +end + +function rap!(D::SparseMatrixCSC, + A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),cache) + D +end + +function rap!(D::SparseMatrixCSC, + A::Transpose{Tv,<:SparseMatrixCSC} where Tv, + B::SparseMatrixCSC, + C::SparseMatrixCSC, + α::Number, + β::Number, + cache) + rap!(ascsr(D),ascsr(C),ascsr(B),transpose(ascsr(A.parent)),α,β,cache) + D +end + +# RARt +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv) + D,new_cache = rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A)) + ascsc(D),new_cache +end +function rap(A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) + D,new_cache = 
rap(transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + ascsc(D),new_cache +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + cache) + rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),cache) + D +end + +function rap!(D::SparseMatrixCSC, + A::SparseMatrixCSC, + B::SparseMatrixCSC, + C::Transpose{Tv,<:SparseMatrixCSC} where Tv, + α::Number, + β::Number, + cache) + rap!(ascsr(D),transpose(ascsr(C.parent)),ascsr(B),ascsr(A),α,β,cache) + D +end \ No newline at end of file diff --git a/src/sparse_utils.jl b/src/sparse_utils.jl index 4d31a029..def57040 100644 --- a/src/sparse_utils.jl +++ b/src/sparse_utils.jl @@ -465,10 +465,9 @@ function sparse_matrix!(A,V,K;reset=true) A end - # Notation # csrr: csr with repeated and unsorted columns -# csru: csr witu unsorted columns +# csru: csr with unsorted columns # csc: csc with sorted columns struct SparseMatrixCSRR{Tv,Ti,A} @@ -689,3 +688,308 @@ function spmv_csc!(b,x,colptr_A,rowval_A,nzval_A) b end + +################ NEW ################ +# Variants for findnz() that only allocates memory for the conversion of the pointer array to an index array. +# Only use for read-only operations. +function findnz_minimal(A::SparseMatrixCSC) + J = ptr_to_coo(A.colptr) + rowvals(A),J,nonzeros(A) +end +function findnz_minimal(A::SparseMatrixCSR) + I = ptr_to_coo(A.rowptr) + I,colvals(A),nonzeros(A) +end + +# Behaves like findnz, but without the values. +function find_indices(A::SparseMatrixCSC) + I,J,_ = findnz_minimal(A) + copy(I),J +end +function find_indices(A::SparseMatrixCSR) + I,J,_ = findnz_minimal(A) + I,copy(J) +end + +# TODO Could be done without binary searches from nzindex(...), when it is known that A and C are ordered, and A is a guaranteed submatrix of C. +function precompute_nzindex(C::AbstractSparseArray,A::AbstractSparseArray) + I,J,_ = findnz_minimal(A) + K = similar(I) + K .= 0 + for (p,(i,j)) in enumerate(zip(I,J)) + if i < 1 || j < 1 + continue + end + K[p] = nzindex(C,i,j) + end + K +end + +# General matrix expansion to a larger size, allocates new matrix with new size. +function expand_sparse_matrix(A,m,n) + compresscoo(typeof(A),findnz(A)...,m,n) +end + +# Expand matrix to a larger size without changing non-zero entries. +# Might allocate a new pointer array, but shares index and value arrays with A. +function expand_sparse_matrix(A::SparseMatrixCSR{Bi,Tv,Ti} where {Tv, Ti},m,n) where Bi + p,q = size(A) + @assert m >= p + @assert n >= q + if m > p + new_rowptr = similar(A.rowptr,m+1) + map!(identity,new_rowptr,A.rowptr) + last_index = A.rowptr[end] + for i in p+1:m+1 + new_rowptr[i] = last_index + end + else + new_rowptr = A.rowptr + end + SparseMatrixCSR{Bi}(m,n,new_rowptr,A.colval,A.nzval) +end + +# Expand matrix to a larger size without changing non-zero entries. +# Might allocate a new pointer array, but shares index and value arrays with A. 
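+# For example (illustrative): expanding a q-column CSC matrix to n > q columns
+# appends n-q copies of the last colptr entry, so the appended columns are empty
+# and nnz is unchanged, e.g.
+# expand_sparse_matrix(sparse([1],[1],[1.0],3,3), 5, 5) # 5x5, same single nonzero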
+function expand_sparse_matrix(A::SparseMatrixCSC{Tv,Ti},m,n) where {Tv,Ti}
+    p,q = size(A)
+    @assert m >= p
+    @assert n >= q
+    if n > q
+        new_colptr = similar(A.colptr,n+1)
+        map!(identity,new_colptr,A.colptr)
+        last_index = A.colptr[end]
+        for j in q+1:n+1
+            new_colptr[j] = last_index
+        end
+    else
+        new_colptr = A.colptr
+    end
+    SparseMatrixCSC{Tv,Ti}(m,n,new_colptr,A.rowval,A.nzval)
+end
+
+# Currently not implemented by the SparseMatricesCSR package
+# NB: returns an empty 1-based matrix regardless of Bi
+function Base.similar(A::SparseMatrixCSR{Bi}, m::Integer, n::Integer) where Bi
+    SparseMatrixCSR{1}(m, n, ones(eltype(A.rowptr), m+1), eltype(A.colval)[], eltype(A.nzval)[])
+end
+
+# Currently not implemented by the SparseMatricesCSR package
+function Base.similar(A::SparseMatrixCSR{Bi}) where Bi
+    SparseMatrixCSR{Bi}(size(A)..., copy(A.rowptr), copy(colvals(A)), similar(nonzeros(A)))
+end
+
+# Currently not implemented by the SparseMatricesCSR package
+function Base.copy(At::Transpose{Tv,SparseMatrixCSR{Bi,Tv,Ti}} where {Bi,Tv,Ti})
+    Acsc_T = copy(transpose(ascsc(At.parent))) # materialize SparseMatrixCSC transpose
+    ascsr(Acsc_T)
+end
+
+function pointer_array(A::SparseMatrixCSR)
+    A.rowptr
+end
+
+function pointer_array(A::SparseMatrixCSC)
+    A.colptr
+end
+
+function index_array(A::SparseMatrixCSR)
+    colvals(A)
+end
+
+function index_array(A::SparseMatrixCSC)
+    rowvals(A)
+end
+
+function ptr_to_coo(ptr_array)
+    K = zeros(Int32, (ptr_array[end]-1))
+    for i in 1:(length(ptr_array)-1)
+        for p in ptr_array[i]:ptr_array[i+1]-1
+            K[p] = i
+        end
+    end
+    K
+end
+
+function find_max_row_length(A::SparseMatrixCSR)
+    max_rA = 0
+    for i in 1:size(A,1)
+        l = length(nzrange(A,i))
+        max_rA = max_rA > l ? max_rA : l
+    end
+    max_rA
+end
+
+function find_max_row_length(A::JaggedArray)
+    max_rA = 0
+    for i in 1:length(A.ptrs)-1
+        l = length(jagged_range(A,i))
+        max_rA = max_rA > l ? max_rA : l
+    end
+    max_rA
+end
+
+function find_max_col_length(A::SparseMatrixCSC)
+    max_cA = 0
+    for j in 1:size(A,2)
+        l = length(nzrange(A,j))
+        max_cA = max_cA > l ? max_cA : l
+    end
+    max_cA
+end
+
+# Lazily convert a CSC matrix to a CSR matrix, by interpreting column pointers as row pointers,
+# and rowvals as colvals, effectively transposing it in the process.
+function ascsr(A::SparseMatrixCSC{Tv,Ti}) where {Tv,Ti}
+    p,q = size(A)
+    SparseMatrixCSR{1}(q,p,A.colptr,rowvals(A),nonzeros(A))
+end
+
+# Lazily convert a CSR matrix to a CSC matrix, by interpreting row pointers as column pointers,
+# and colvals as rowvals, effectively transposing it in the process.
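+# No data is copied by these reinterpretations; the result aliases A. E.g. a CSR
+# matrix of size (p,q) with rowptr [1,3,4] reappears as a CSC matrix of size
+# (q,p) with colptr [1,3,4] that represents the transpose.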
+function ascsc(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + p,q = size(A) + SparseMatrixCSC{Tv,Ti}(q,p,A.rowptr,colvals(A),nonzeros(A)) +end + +function halfperm(A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + q = size(A,2) + JA,VA = colvals(A),nonzeros(A) + IAt,JAt,VAt = similar(A.rowptr,q+1),similar(JA),similar(VA) + halfperm!(IAt,JAt,VAt,A) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function halfperm!(IAt,JAt,VAt,A::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + JA,VA = colvals(A),nonzeros(A) + p,q = size(A) + count_occurrences!(IAt,JA) + counts_to_ptrs!(IAt) + shift_by_one!(IAt) + for i in 1:p + for jp in nzrange(A,i) + j = JA[jp] + jpt = IAt[j+1] + JAt[jpt] = i + VAt[jpt] = VA[jp] + IAt[j+1] = jpt+1 + end + end + IAt[1] = 1 + SparseMatrixCSR{Bi}(q,p,IAt,JAt,VAt) +end + +# retranspose At back into A +function halfperm!(A::SparseMatrixCSR{Bi,Tv,Ti},At::SparseMatrixCSR{Bi,Tv,Ti}) where {Bi,Tv,Ti} + IA,JA,VA = A.rowptr,colvals(A),nonzeros(A) + JAt,VAt = colvals(At),nonzeros(At) + p,q = size(At) + shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc. + IA[1] = 1 + for i in 1:p + for jpt in nzrange(At,i) + j = JAt[jpt] + jp = IA[j+1] + JA[jp] = i + VA[jp] = VAt[jpt] + IA[j+1] = jp+1 + end + end + At +end + +function halfperm!(A::SparseMatrixCSC,At::SparseMatrixCSC) + halfperm!(ascsr(A),ascsr(At)) + A +end + +function halfperm(A::SparseMatrixCSC) + At = halfperm(ascsr(A)) + ascsc(At) +end + +function count_occurrences!(v1::AbstractVector{<:Integer},v2::AbstractVector{<:Integer};set_zero=true) + if set_zero + v1 .= 0 + end + foreach(i->v1[i]+=1,v2) + v1 +end + +# shift all entries one element to the right in-place. Not circular. +function shift_by_one!(v) + l = length(v) + prev = v[1] + tmp = prev + for i in 1:l-1 + tmp = v[i+1] + v[i+1] = prev + prev = tmp + end +end + +function counts_to_ptrs!(v) + l = length(v) + v[1] += 1 + foreach(i->v[i]+=v[i-1],2:l) + shift_by_one!(v) + v[1] = 1 +end + +function symbolic_halfperm(A::SparseMatrixCSR) + q = size(A,2) + JA = colvals(A) + IAt,JAt = similar(A.rowptr,q+1),similar(JA) + symbolic_halfperm!(IAt,JAt,A) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function symbolic_halfperm!(IAt,JAt,A::SparseMatrixCSR) + JA= colvals(A) + p,q = size(A) + count_occurrences!(IAt,JA) + counts_to_ptrs!(IAt) + shift_by_one!(IAt) + for i in 1:p + for jp in nzrange(A,i) + j = JA[jp] + jpt = IAt[j+1] + JAt[jpt] = i + IAt[j+1] = jpt+1 + end + end + IAt[1] = 1 + JaggedArray(JAt,IAt) +end + +# transpose A into At using vectors IAt,JAt, and VAt +function symbolic_halfperm!(JAt,IAt,A::SparseMatrixCSC) + symbolic_halfperm!(JAt,IAt,ascsr(A)) +end + +function symbolic_halfperm(A::SparseMatrixCSC) + symbolic_halfperm(ascsr(A)) +end + +# retranspose At back into A +function symbolic_halfperm!(A::SparseMatrixCSR,At::JaggedArray) + IA,JA = pointer_array(A),index_array(A) + JAt = At.data + # p = size(A,1) + shift_by_one!(IA) # pointer to row 1 must be located at IA[2], row 2 at IA[3] etc. 
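+    # Counting-sort style retransposition: after the shift, IA[j+1] serves as a
+    # running insertion cursor for row j of A while the entries of At are
+    # streamed in; once all entries are placed, IA holds valid row pointers again.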
+ IA[1] = 1 + for i in 1:size(A,2) + for jpt in jagged_range(At,i) + j = JAt[jpt] + jp = IA[j+1] + JA[jp] = i + IA[j+1] = jp+1 + end + end + A +end + +# retranspose At back into A +function symbolic_halfperm!(A::SparseMatrixCSC,At::JaggedArray) + symbolic_halfperm!(ascsr(A),At) + A +end \ No newline at end of file diff --git a/test/debug_array/runtests.jl b/test/debug_array/runtests.jl index 2c1a61ab..a175b722 100644 --- a/test/debug_array/runtests.jl +++ b/test/debug_array/runtests.jl @@ -23,4 +23,6 @@ using PartitionedArrays @testset "fem_example" begin include("fem_example.jl") end +@testset "spmtmm_tests" begin include("spmtmm_tests.jl") end + end #module diff --git a/test/debug_array/spmtmm_tests.jl b/test/debug_array/spmtmm_tests.jl new file mode 100644 index 00000000..1b154b59 --- /dev/null +++ b/test/debug_array/spmtmm_tests.jl @@ -0,0 +1,31 @@ +module DebugArraySpMtMMTests + +using PartitionedArrays +using Test + +include(joinpath("..","spmtmm_tests.jl")) + +v = 1:5 +A = sparse(v,v,v) +Z = subtract(A,A) +@test nnz(Z) == nnz(A) +display(Z) + +B = sparse(v,v,-v) +Z = add(A,B) +@test nnz(Z) == nnz(A) +display(Z) + +A = sparsecsr(v,v,v) +Z = subtract(A,A) +@test nnz(Z) == nnz(A) +display(Z) + +B = sparsecsr(v,v,-v) +Z = add(A,B) +@test nnz(Z) == nnz(A) +display(Z) + +with_debug(spmtmm_tests) + +end # module diff --git a/test/mpi_array/drivers/spmtmm_tests.jl b/test/mpi_array/drivers/spmtmm_tests.jl new file mode 100644 index 00000000..50c3668a --- /dev/null +++ b/test/mpi_array/drivers/spmtmm_tests.jl @@ -0,0 +1,10 @@ +module MPIArrayPrimitivesTests + +using PartitionedArrays + +include(joinpath("..","..","spmtmm_tests.jl")) + +with_mpi(spmtmm_tests) + +end # module + diff --git a/test/mpi_array/runtests.jl b/test/mpi_array/runtests.jl index 26a3a5d3..ffdc1f1e 100644 --- a/test/mpi_array/runtests.jl +++ b/test/mpi_array/runtests.jl @@ -13,5 +13,6 @@ using PartitionedArrays @testset "p_timer_tests" begin include("p_timer_tests.jl") end @testset "fdm_example" begin include("fdm_example.jl") end @testset "fem_example" begin include("fem_example.jl") end +@testset "spmtmm_tests" begin include("spmtmm_tests.jl") end end #module diff --git a/test/mpi_array/spmtmm_tests.jl b/test/mpi_array/spmtmm_tests.jl new file mode 100644 index 00000000..c9063604 --- /dev/null +++ b/test/mpi_array/spmtmm_tests.jl @@ -0,0 +1,4 @@ +using MPI +include("run_mpi_driver.jl") +file = joinpath(@__DIR__,"drivers","spmtmm_tests.jl") +run_mpi_driver(file;procs=4) diff --git a/test/p_sparse_matrix_tests.jl b/test/p_sparse_matrix_tests.jl index bff0f963..9efe0093 100644 --- a/test/p_sparse_matrix_tests.jl +++ b/test/p_sparse_matrix_tests.jl @@ -496,7 +496,6 @@ function p_sparse_matrix_tests(distribute) A_seq = centralize(A) spmm!(B,Z,A,cacheB) @test centralize(B) ≈ Z_seq*(A_seq) - B = transpose(Z)*A @test centralize(B) ≈ transpose(Z_seq)*A_seq diff --git a/test/spmtmm_tests.jl b/test/spmtmm_tests.jl new file mode 100644 index 00000000..bf80328a --- /dev/null +++ b/test/spmtmm_tests.jl @@ -0,0 +1,229 @@ +using SparseArrays +using SparseMatricesCSR +using PartitionedArrays +using LinearAlgebra +using Test + +function approx_equivalent(A::SparseMatrixCSC, B::SparseMatrixCSC,args...) + if size(A) != size(B) && return false; end + if length(nonzeros(A)) != length(nonzeros(B)) && return false; end + if A.colptr != B.colptr && return false; end + if rowvals(A) != rowvals(B) && return false; end + if !isapprox(nonzeros(A),nonzeros(B),args...) 
&& return false; end
+    true
+end
+
+# Structurally A and B must be equal, but numerically they can be approximately equal
+function approx_equivalent(A::SparseMatrixCSR, B::SparseMatrixCSR,args...)
+    if size(A) != size(B) && return false; end
+    if length(nonzeros(A)) != length(nonzeros(B)) && return false; end
+    if A.rowptr != B.rowptr && return false; end
+    if colvals(A) != colvals(B) && return false; end
+    if !isapprox(nonzeros(A),nonzeros(B),args...) && return false; end
+    true
+end
+
+function parallel_tests(pA,pB,sparse_func)
+    A = centralize(sparse_func,pA)
+    B = centralize(sparse_func,pB)
+    # explicit parallel transpose
+
+    pBt = explicit_transpose(pB) |> fetch
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    B_struct = symbolic_halfperm(B)
+    @test pointer_array(hp_B) == B_struct.ptrs
+    @test index_array(hp_B) == B_struct.data
+    @test Bt == hp_B
+
+    pBt_local,t = explicit_transpose(pB,reuse=true)
+    pBt, transpose_cache = fetch(t)
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    t = explicit_transpose!(pBt,pBt_local,pB,transpose_cache)
+    wait(t)
+    Bt = centralize(sparse_func,pBt)
+    @test Bt == copy(transpose(B))
+    hp_B = halfperm(B)
+    @test Bt == hp_B
+
+    AB0 = matmul(A,B)
+    C0 = matmul(transpose(B),AB0)
+    # compare the sequential csr implementations against the default sequential csc implementations.
+    pAB,cacheAB = spmm(pA,pB,reuse=true)
+    AB = centralize(sparse_func,pAB)
+    @test approx_equivalent(AB,AB0)
+
+    # pB will be transposed internally
+    pC,cacheC = spmtm(pB,pAB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmm!(pAB,pA,pB,cacheAB)
+    AB = centralize(sparse_func,pAB)
+    @test approx_equivalent(AB,AB0)
+    spmtm!(pC,pB,pAB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    pC,cacheC = spmtmm(pB,pA,pB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    spmtmm!(pC,pB,pA,pB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # compare the sequential csr implementations against the default sequential csc implementations.
+    pC,cacheC = spmm(pBt,pAB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmm!(pC,pBt,pAB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # pB will be transposed internally
+    pC,cacheC = spmmm(pBt,pA,pB,reuse=true)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+    spmmm!(pC,pBt,pA,pB,cacheC)
+    C = centralize(sparse_func,pC)
+    @test approx_equivalent(C,C0)
+
+    # unequal sizes backward (small to large)
+    if size(pA) != size(pB)
+        CB0 = matmul(C0,Bt)
+        D0 = matmul(transpose(Bt),CB0)
+        pCB,cacheCB = spmm(pC,pBt,reuse=true)
+        CB = centralize(sparse_func,pCB)
+        @test approx_equivalent(CB,CB0)
+
+        pD,cacheD = spmtm(pBt,pCB,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmm!(pCB,pC,pBt,cacheCB)
+        CB = centralize(sparse_func,pCB)
+        @test approx_equivalent(CB,CB0)
+        spmtm!(pD,pBt,pCB,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmtmm(pBt,pC,pBt,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmtmm!(pD,pBt,pC,pBt,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmm(pB,pCB,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+
+        pD,cacheD = spmmm(pB,pC,pBt,reuse=true)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+        spmmm!(pD,pB,pC,pBt,cacheD)
+        D = centralize(sparse_func,pD)
+        @test approx_equivalent(D,D0)
+    end
+end
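+
+# A minimal sequential usage sketch of the rap/rap! API exercised above (the
+# helper name `rap_usage_sketch` is ours and not part of the library). It only
+# assumes SparseArrays and LinearAlgebra, both imported at the top of this file,
+# and the PtAP methods added in this PR.
+function rap_usage_sketch()
+    A = sprand(10,10,0.3) + 10.0I    # square operator
+    P = sprand(10,4,0.5)             # tall, prolongator-like factor
+    C,cache = rap(transpose(P),A,P)  # allocating triple product, returns a reuse cache
+    rap!(C,transpose(P),A,P,cache)   # in-place recompute, e.g. after the values of A change
+    C ≈ transpose(P)*A*P
+end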
+
+function spmtmm_tests(distribute)
+    nodes_per_dir = (5,5,5)
+    parts_per_dir = (1,2,2)
+    np = prod(parts_per_dir)
+    ranks = distribute(LinearIndices((np,)))
+    for (TiA,TiB,TvA,TvB) in [(Int32,Int32,Float32,Float32),(Int32,Int64,Float32,Float32),(Int32,Int32,Float32,Float64),(Int32,Int64,Float32,Float64),(Int32,Int64,Int64,Int64),(Int32,Int64,Int64,Float32),(Int32,Int64,Float64,Int32)]
+        pA = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiA,value_type=TvA)...) |> fetch
+        pB = psparse(sparsecsr,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks;index_type=TiB,value_type=TvB)...) |> fetch
+        parallel_tests(pA,pB,sparsecsr)
+
+        # Testing with a real prolongator requires PartitionedSolvers
+        # T = eltype(typeof(own_own_values(pA).items))
+        # pB = prolongator(T,pA)
+        # parallel_tests(pA,pB,sparsecsr)
+
+        #### CSC ####
+        pA = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiA, value_type=TvA)...) |> fetch
+        pB = psparse(sparse,laplacian_fdm(nodes_per_dir,parts_per_dir,ranks; index_type = TiB, value_type=TvB)...) |> fetch
+        parallel_tests(pA,pB,sparse)
+
+        # Testing with a real prolongator requires PartitionedSolvers
+        # T = eltype(typeof(own_own_values(pA).items))
+        # pB = prolongator(T,pA)
+        # parallel_tests(pA,pB,sparse)
+    end
+end
\ No newline at end of file