From 4dbdc803de6913be162c44d27038e535219ba836 Mon Sep 17 00:00:00 2001
From: Jishnu Bhattacharya <jishnub.github@gmail.com>
Date: Tue, 10 Sep 2024 12:09:54 +0530
Subject: [PATCH] Avoid materializing arrays in bidiag matmul (#55450)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, small `Bidiagonal`/`Tridiagonal` matrices are materialized in
matrix multiplications, but this is wasteful and unnecessary. This PR
changes this to use a naive matrix multiplication for small matrices,
and fall back to the banded multiplication for larger ones.
Right-multiplication of a non-`Bidiagonal` matrix by a `Bidiagonal` now
uses the banded multiplication for all sizes, and loops over the
non-`Bidiagonal` matrix column by column so that memory is accessed in a
cache-friendly order.

In certain cases, the matrices were being materialized if the
non-structured matrix was small, even if the structured matrix was
large. This is changed as well in this PR.

Some improvements in performance:
```julia
julia> B = Bidiagonal(rand(3), rand(2), :U); A = rand(size(B)...); C = similar(A);

julia> @btime mul!($C, $A, $B);
  193.152 ns (6 allocations: 352 bytes) # nightly v"1.12.0-DEV.1034"
  18.826 ns (0 allocations: 0 bytes) # This PR

julia> T = Tridiagonal(rand(99), rand(100), rand(99)); A = rand(2, size(T,2)); C = similar(A);

julia> @btime mul!($C, $A, $T);
  9.398 μs (8 allocations: 79.94 KiB) # nightly
  416.407 ns (0 allocations: 0 bytes) # This PR

julia> B = Bidiagonal(rand(300), rand(299), :U); A = rand(20000, size(B,2)); C = similar(A);

julia> @btime mul!($C, $A, $B);
  33.395 ms (0 allocations: 0 bytes) # nightly
  6.695 ms (0 allocations: 0 bytes) # This PR (cache-friendly)
```

Closes https://github.com/JuliaLang/julia/pull/55414

---------

Co-authored-by: Daniel Karrasch <daniel.karrasch@posteo.de>
---
 stdlib/LinearAlgebra/src/LinearAlgebra.jl |   4 +-
 stdlib/LinearAlgebra/src/bidiag.jl        | 330 +++++++++++++++++++---
 stdlib/LinearAlgebra/test/bidiag.jl       |  85 ++++--
 stdlib/LinearAlgebra/test/tridiag.jl      |  71 +++++
 4 files changed, 422 insertions(+), 68 deletions(-)

diff --git a/stdlib/LinearAlgebra/src/LinearAlgebra.jl b/stdlib/LinearAlgebra/src/LinearAlgebra.jl
index 27d4255fb656b8..17216845b350c8 100644
--- a/stdlib/LinearAlgebra/src/LinearAlgebra.jl
+++ b/stdlib/LinearAlgebra/src/LinearAlgebra.jl
@@ -673,7 +673,9 @@ matprod_dest(A::Diagonal, B::Diagonal, TS) = _matprod_dest_diag(B, TS)
 _matprod_dest_diag(A, TS) = similar(A, TS)
 function _matprod_dest_diag(A::SymTridiagonal, TS)
     n = size(A, 1)
-    Tridiagonal(similar(A, TS, n-1), similar(A, TS, n), similar(A, TS, n-1))
+    ev = similar(A, TS, max(0, n-1))  # clamp: n-1 would be negative when n == 0
+    dv = similar(A, TS, n)
+    Tridiagonal(ev, dv, similar(ev))  # second off-diagonal: separate buffer shaped like ev
 end
 
 # Special handling for adj/trans vec
diff --git a/stdlib/LinearAlgebra/src/bidiag.jl b/stdlib/LinearAlgebra/src/bidiag.jl
index d86bad7e41435a..8bc5b1c47f3668 100644
--- a/stdlib/LinearAlgebra/src/bidiag.jl
+++ b/stdlib/LinearAlgebra/src/bidiag.jl
@@ -557,7 +557,8 @@ end
 # function to get the internally stored vectors for Bidiagonal and [Sym]Tridiagonal
 # to avoid allocations in _mul! below (#24324, #24578)
 _diag(A::Tridiagonal, k) = k == -1 ? A.dl : k == 0 ? A.d : A.du
-_diag(A::SymTridiagonal, k) = k == 0 ? A.dv : A.ev
+_diag(A::SymTridiagonal{<:Number}, k) = k == 0 ? A.dv : A.ev
+_diag(A::SymTridiagonal, k) = k == 0 ? view(A, diagind(A, IndexStyle(A))) : view(A, diagind(A, k, IndexStyle(A)))  # k, not 1: sub-/super-diagonal blocks differ
 function _diag(A::Bidiagonal, k)
     if k == 0
         return A.dv
@@ -577,12 +578,45 @@ function _bibimul!(C, A, B, _add)
     check_A_mul_B!_sizes(size(C), size(A), size(B))
     n = size(A,1)
     iszero(n) && return C
-    n <= 3 && return mul!(C, Array(A), Array(B), _add.alpha, _add.beta)
+    if n <= 3
+        # naive multiplication
+        for I in CartesianIndices(C)
+            _modify!(_add, sum(A[I[1], k] * B[k, I[2]] for k in axes(A,2)), C, I)
+        end
+        return C
+    end
     # We use `_rmul_or_fill!` instead of `_modify!` here since using
     # `_modify!` in the following loop will not update the
     # off-diagonal elements for non-zero beta.
     _rmul_or_fill!(C, _add.beta)
     iszero(_add.alpha) && return C
+    @inbounds begin
+        # first column of C
+        C[1,1] += _add(A[1,1]*B[1,1] + A[1, 2]*B[2,1])
+        C[2,1] += _add(A[2,1]*B[1,1] + A[2,2]*B[2,1])
+        C[3,1] += _add(A[3,2]*B[2,1])
+        # second column of C
+        C[1,2] += _add(A[1,1]*B[1,2] + A[1,2]*B[2,2])
+        C[2,2] += _add(A[2,1]*B[1,2] + A[2,2]*B[2,2] + A[2,3]*B[3,2])
+        C[3,2] += _add(A[3,2]*B[2,2] + A[3,3]*B[3,2])
+        C[4,2] += _add(A[4,3]*B[3,2])
+    end # inbounds
+    # middle columns
+    __bibimul!(C, A, B, _add)
+    @inbounds begin
+        C[n-3,n-1] += _add(A[n-3,n-2]*B[n-2,n-1])
+        C[n-2,n-1] += _add(A[n-2,n-2]*B[n-2,n-1] + A[n-2,n-1]*B[n-1,n-1])
+        C[n-1,n-1] += _add(A[n-1,n-2]*B[n-2,n-1] + A[n-1,n-1]*B[n-1,n-1] + A[n-1,n]*B[n,n-1])
+        C[n,  n-1] += _add(A[n,n-1]*B[n-1,n-1] + A[n,n]*B[n,n-1])
+        # last column of C
+        C[n-2,  n] += _add(A[n-2,n-1]*B[n-1,n])
+        C[n-1,  n] += _add(A[n-1,n-1]*B[n-1,n  ] + A[n-1,n]*B[n,n  ])
+        C[n,    n] += _add(A[n,n-1]*B[n-1,n  ] + A[n,n]*B[n,n  ])
+    end # inbounds
+    C
+end
+function __bibimul!(C, A, B, _add)
+    n = size(A,1)
     Al = _diag(A, -1)
     Ad = _diag(A, 0)
     Au = _diag(A, 1)
@@ -590,44 +624,198 @@ function _bibimul!(C, A, B, _add)
     Bd = _diag(B, 0)
     Bu = _diag(B, 1)
     @inbounds begin
-        # first row of C
-        C[1,1] += _add(A[1,1]*B[1,1] + A[1, 2]*B[2, 1])
-        C[1,2] += _add(A[1,1]*B[1,2] + A[1,2]*B[2,2])
-        C[1,3] += _add(A[1,2]*B[2,3])
-        # second row of C
-        C[2,1] += _add(A[2,1]*B[1,1] + A[2,2]*B[2,1])
-        C[2,2] += _add(A[2,1]*B[1,2] + A[2,2]*B[2,2] + A[2,3]*B[3,2])
-        C[2,3] += _add(A[2,2]*B[2,3] + A[2,3]*B[3,3])
-        C[2,4] += _add(A[2,3]*B[3,4])
         for j in 3:n-2
-            Ajj₋1   = Al[j-1]
-            Ajj     = Ad[j]
+            Aj₋2j₋1 = Au[j-2]
+            Aj₋1j   = Au[j-1]
             Ajj₊1   = Au[j]
-            Bj₋1j₋2 = Bl[j-2]
-            Bj₋1j₋1 = Bd[j-1]
+            Aj₋1j₋1 = Ad[j-1]
+            Ajj     = Ad[j]
+            Aj₊1j₊1 = Ad[j+1]
+            Ajj₋1   = Al[j-1]
+            Aj₊1j   = Al[j]
+            Aj₊2j₊1 = Al[j+1]
             Bj₋1j   = Bu[j-1]
-            Bjj₋1   = Bl[j-1]
             Bjj     = Bd[j]
-            Bjj₊1   = Bu[j]
             Bj₊1j   = Bl[j]
-            Bj₊1j₊1 = Bd[j+1]
-            Bj₊1j₊2 = Bu[j+1]
-            C[j,j-2]  += _add( Ajj₋1*Bj₋1j₋2)
-            C[j, j-1] += _add(Ajj₋1*Bj₋1j₋1 + Ajj*Bjj₋1)
-            C[j, j  ] += _add(Ajj₋1*Bj₋1j   + Ajj*Bjj       + Ajj₊1*Bj₊1j)
-            C[j, j+1] += _add(Ajj  *Bjj₊1   + Ajj₊1*Bj₊1j₊1)
-            C[j, j+2] += _add(Ajj₊1*Bj₊1j₊2)
+
+            C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j)
+            C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj)
+            C[j,   j] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj + Ajj₊1*Bj₊1j)
+            C[j+1, j] += _add(Aj₊1j*Bjj + Aj₊1j₊1*Bj₊1j)
+            C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j)
         end
-        # row before last of C
-        C[n-1,n-3] += _add(A[n-1,n-2]*B[n-2,n-3])
-        C[n-1,n-2] += _add(A[n-1,n-1]*B[n-1,n-2] + A[n-1,n-2]*B[n-2,n-2])
-        C[n-1,n-1] += _add(A[n-1,n-2]*B[n-2,n-1] + A[n-1,n-1]*B[n-1,n-1] + A[n-1,n]*B[n,n-1])
-        C[n-1,n  ] += _add(A[n-1,n-1]*B[n-1,n  ] + A[n-1,  n]*B[n  ,n  ])
-        # last row of C
-        C[n,n-2] += _add(A[n,n-1]*B[n-1,n-2])
-        C[n,n-1] += _add(A[n,n-1]*B[n-1,n-1] + A[n,n]*B[n,n-1])
-        C[n,n  ] += _add(A[n,n-1]*B[n-1,n  ] + A[n,n]*B[n,n  ])
-    end # inbounds
+    end
+    C
+end
+function __bibimul!(C, A, B::Bidiagonal, _add)  # adds _add(A*B) into columns 3:n-2 of C; B bidiagonal
+    n = size(A,1)
+    Al = _diag(A, -1)  # A is read through its sub-, main and super-diagonal
+    Ad = _diag(A, 0)
+    Au = _diag(A, 1)
+    Bd = _diag(B, 0)
+    if B.uplo == 'U'  # column j of B contributes only B[j-1,j] and B[j,j]
+        Bu = _diag(B, 1)
+        @inbounds begin
+            for j in 3:n-2  # one column of C per iteration, rows j-2:j+1
+                Aj₋2j₋1 = Au[j-2]
+                Aj₋1j   = Au[j-1]
+                Aj₋1j₋1 = Ad[j-1]
+                Ajj     = Ad[j]
+                Ajj₋1   = Al[j-1]
+                Aj₊1j   = Al[j]
+                Bj₋1j   = Bu[j-1]
+                Bjj     = Bd[j]
+
+                C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j)
+                C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj)
+                C[j,   j] += _add(Ajj₋1*Bj₋1j + Ajj*Bjj)
+                C[j+1, j] += _add(Aj₊1j*Bjj)
+            end
+        end
+    else # B.uplo == 'L': column j of B contributes only B[j,j] and B[j+1,j]
+        Bl = _diag(B, -1)
+        @inbounds begin
+            for j in 3:n-2  # one column of C per iteration, rows j-1:j+2
+                Aj₋1j   = Au[j-1]
+                Ajj₊1   = Au[j]
+                Ajj     = Ad[j]
+                Aj₊1j₊1 = Ad[j+1]
+                Aj₊1j   = Al[j]
+                Aj₊2j₊1 = Al[j+1]
+                Bjj     = Bd[j]
+                Bj₊1j   = Bl[j]
+
+                C[j-1, j] += _add(Aj₋1j*Bjj)
+                C[j,   j] += _add(Ajj*Bjj + Ajj₊1*Bj₊1j)
+                C[j+1, j] += _add(Aj₊1j*Bjj + Aj₊1j₊1*Bj₊1j)
+                C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j)
+            end
+        end
+    end
+    C
+end
+function __bibimul!(C, A::Bidiagonal, B, _add)  # adds _add(A*B) into columns 3:n-2 of C; A bidiagonal
+    n = size(A,1)
+    Bl = _diag(B, -1)  # B is read through its sub-, main and super-diagonal
+    Bd = _diag(B, 0)
+    Bu = _diag(B, 1)
+    Ad = _diag(A, 0)
+    if A.uplo == 'U'  # A has no sub-diagonal: column j of C touches rows j-2:j+1
+        Au = _diag(A, 1)
+        @inbounds begin
+            for j in 3:n-2
+                Aj₋2j₋1 = Au[j-2]
+                Aj₋1j   = Au[j-1]
+                Ajj₊1   = Au[j]
+                Aj₋1j₋1 = Ad[j-1]
+                Ajj     = Ad[j]
+                Aj₊1j₊1 = Ad[j+1]
+                Bj₋1j   = Bu[j-1]
+                Bjj     = Bd[j]
+                Bj₊1j   = Bl[j]
+
+                C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j)
+                C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj)
+                C[j,   j] += _add(Ajj*Bjj       + Ajj₊1*Bj₊1j)
+                C[j+1, j] += _add(Aj₊1j₊1*Bj₊1j)
+            end
+        end
+    else # A.uplo == 'L': A has no super-diagonal, column j of C touches rows j-1:j+2
+        Al = _diag(A, -1)
+        @inbounds begin
+            for j in 3:n-2
+                Aj₋1j₋1 = Ad[j-1]
+                Ajj     = Ad[j]
+                Aj₊1j₊1 = Ad[j+1]
+                Ajj₋1   = Al[j-1]
+                Aj₊1j   = Al[j]
+                Aj₊2j₊1 = Al[j+1]
+                Bj₋1j   = Bu[j-1]
+                Bjj     = Bd[j]
+                Bj₊1j   = Bl[j]
+
+                C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j)
+                C[j,   j] += _add(Ajj₋1*Bj₋1j   + Ajj*Bjj)
+                C[j+1, j] += _add(Aj₊1j*Bjj   + Aj₊1j₊1*Bj₊1j)
+                C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j)
+            end
+        end
+    end
+    C
+end
+function __bibimul!(C, A::Bidiagonal, B::Bidiagonal, _add)  # both bidiagonal: adds _add(A*B) into columns 3:n-2 of C
+    n = size(A,1)
+    Ad = _diag(A, 0)
+    Bd = _diag(B, 0)
+    if A.uplo == 'U' && B.uplo == 'U'  # upper * upper: column j of C touches rows j-2:j
+        Au = _diag(A, 1)
+        Bu = _diag(B, 1)
+        @inbounds begin
+            for j in 3:n-2
+                Aj₋2j₋1 = Au[j-2]
+                Aj₋1j   = Au[j-1]
+                Aj₋1j₋1 = Ad[j-1]
+                Ajj     = Ad[j]
+                Bj₋1j   = Bu[j-1]
+                Bjj     = Bd[j]
+
+                C[j-2, j] += _add(Aj₋2j₋1*Bj₋1j)
+                C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j + Aj₋1j*Bjj)
+                C[j,   j] += _add(Ajj*Bjj)
+            end
+        end
+    elseif A.uplo == 'U' && B.uplo == 'L'  # upper * lower: column j of C touches rows j-1:j+1
+        Au = _diag(A, 1)
+        Bl = _diag(B, -1)
+        @inbounds begin
+            for j in 3:n-2
+                Aj₋1j   = Au[j-1]
+                Ajj₊1   = Au[j]
+                Ajj     = Ad[j]
+                Aj₊1j₊1 = Ad[j+1]
+                Bjj     = Bd[j]
+                Bj₊1j   = Bl[j]
+
+                C[j-1, j] += _add(Aj₋1j*Bjj)
+                C[j,   j] += _add(Ajj*Bjj + Ajj₊1*Bj₊1j)
+                C[j+1, j] += _add(Aj₊1j₊1*Bj₊1j)
+            end
+        end
+    elseif A.uplo == 'L' && B.uplo == 'U'  # lower * upper: column j of C touches rows j-1:j+1
+        Al = _diag(A, -1)
+        Bu = _diag(B, 1)
+        @inbounds begin
+            for j in 3:n-2
+                Aj₋1j₋1 = Ad[j-1]
+                Ajj     = Ad[j]
+                Ajj₋1   = Al[j-1]
+                Aj₊1j   = Al[j]
+                Bj₋1j   = Bu[j-1]
+                Bjj     = Bd[j]
+
+                C[j-1, j] += _add(Aj₋1j₋1*Bj₋1j)
+                C[j,   j] += _add(Ajj₋1*Bj₋1j   + Ajj*Bjj)
+                C[j+1, j] += _add(Aj₊1j*Bjj)
+            end
+        end
+    else # A.uplo == 'L' && B.uplo == 'L': column j of C touches rows j:j+2
+        Al = _diag(A, -1)
+        Bl = _diag(B, -1)
+        @inbounds begin
+            for j in 3:n-2
+                Ajj     = Ad[j]
+                Aj₊1j₊1 = Ad[j+1]
+                Aj₊1j   = Al[j]
+                Aj₊2j₊1 = Al[j+1]
+                Bjj     = Bd[j]
+                Bj₊1j   = Bl[j]
+
+                C[j,   j] += _add(Ajj*Bjj)
+                C[j+1, j] += _add(Aj₊1j*Bjj   + Aj₊1j₊1*Bj₊1j)
+                C[j+2, j] += _add(Aj₊2j₊1*Bj₊1j)
+            end
+        end
+    end
     C
 end
 
@@ -744,7 +932,52 @@ function _mul!(C::AbstractVecOrMat, A::BiTriSym, B::AbstractVecOrMat, _add::MulA
     nB = size(B,2)
     (iszero(nA) || iszero(nB)) && return C
     iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta)
-    nA <= 3 && return mul!(C, Array(A), Array(B), _add.alpha, _add.beta)
+    if nA <= 3
+        # naive multiplication
+        for I in CartesianIndices(C)
+            col = Base.tail(Tuple(I))
+            _modify!(_add, sum(A[I[1], k] * B[k, col...] for k in axes(A,2)), C, I)
+        end
+        return C
+    end
+    _mul_bitrisym!(C, A, B, _add)
+end
+function _mul_bitrisym!(C::AbstractVecOrMat, A::Bidiagonal, B::AbstractVecOrMat, _add::MulAddMul)  # bidiagonal A times B, one column of B at a time
+    nA = size(A,1)
+    nB = size(B,2)
+    d = A.dv
+    if A.uplo == 'U'  # row i of A holds d[i] at (i,i) and u[i] at (i,i+1)
+        u = A.ev
+        @inbounds begin  # NOTE: B[2, j] and u[1] are read unchecked, so this needs nA >= 2; the _mul! call site handles nA <= 3 itself
+            for j = 1:nB
+                b₀, b₊ = B[1, j], B[2, j]  # two-entry window sliding down column j of B
+                _modify!(_add, d[1]*b₀ + u[1]*b₊, C, (1, j))
+                for i = 2:nA - 1
+                    b₀, b₊ = b₊, B[i + 1, j]  # shift the window down by one row
+                    _modify!(_add, d[i]*b₀ + u[i]*b₊, C, (i, j))
+                end
+                _modify!(_add, d[nA]*b₊, C, (nA, j))  # last row has only the diagonal term; b₊ is B[nA, j] here
+            end
+        end
+    else  # lower: row i of A holds l[i-1] at (i,i-1) and d[i] at (i,i)
+        l = A.ev
+        @inbounds begin
+            for j = 1:nB
+                b₀, b₊ = B[1, j], B[2, j]
+                _modify!(_add, d[1]*b₀, C, (1, j))  # first row has only the diagonal term
+                for i = 2:nA - 1
+                    b₋, b₀, b₊ = b₀, b₊, B[i + 1, j]  # b₋, b₀ are B[i-1, j], B[i, j]; b₊ is prefetched for the next row
+                    _modify!(_add, l[i - 1]*b₋ + d[i]*b₀, C, (i, j))
+                end
+                _modify!(_add, l[nA - 1]*b₀ + d[nA]*b₊, C, (nA, j))  # after the loop b₀, b₊ are B[nA-1, j], B[nA, j]
+            end
+        end
+    end
+    C
+end
+function _mul_bitrisym!(C::AbstractVecOrMat, A::TriSym, B::AbstractVecOrMat, _add::MulAddMul)
+    nA = size(A,1)
+    nB = size(B,2)
     l = _diag(A, -1)
     d = _diag(A, 0)
     u = _diag(A, 1)
@@ -769,8 +1002,9 @@ function _mul!(C::AbstractMatrix, A::AbstractMatrix, B::TriSym, _add::MulAddMul)
     m = size(B,2)
     (iszero(m) || iszero(n)) && return C
     iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta)
-    if n <= 3 || m <= 1
-        return mul!(C, Array(A), Array(B), _add.alpha, _add.beta)
+    if m == 1
+        B11 = B[1,1]
+        return mul!(C, A, B11, _add.alpha, _add.beta)
     end
     Bl = _diag(B, -1)
     Bd = _diag(B, 0)
@@ -804,21 +1038,18 @@ function _mul!(C::AbstractMatrix, A::AbstractMatrix, B::Bidiagonal, _add::MulAdd
     m, n = size(A)
     (iszero(m) || iszero(n)) && return C
     iszero(_add.alpha) && return _rmul_or_fill!(C, _add.beta)
-    if size(A, 1) <= 3 || size(B, 2) <= 1
-        return mul!(C, Array(A), Array(B), _add.alpha, _add.beta)
-    end
     @inbounds if B.uplo == 'U'
+        for j in n:-1:2, i in 1:m
+            _modify!(_add, A[i,j] * B.dv[j] + A[i,j-1] * B.ev[j-1], C, (i, j))
+        end
         for i in 1:m
-            for j in n:-1:2
-                _modify!(_add, A[i,j] * B.dv[j] + A[i,j-1] * B.ev[j-1], C, (i, j))
-            end
             _modify!(_add, A[i,1] * B.dv[1], C, (i, 1))
         end
     else # uplo == 'L'
+        for j in 1:n-1, i in 1:m
+            _modify!(_add, A[i,j] * B.dv[j] + A[i,j+1] * B.ev[j], C, (i, j))
+        end
         for i in 1:m
-            for j in 1:n-1
-                _modify!(_add, A[i,j] * B.dv[j] + A[i,j+1] * B.ev[j], C, (i, j))
-            end
             _modify!(_add, A[i,n] * B.dv[n], C, (i, n))
         end
     end
@@ -834,7 +1065,12 @@ function _dibimul!(C, A, B, _add)
     check_A_mul_B!_sizes(size(C), size(A), size(B))
     n = size(A,1)
     iszero(n) && return C
-    n <= 3 && return mul!(C, Array(A), Array(B), _add.alpha, _add.beta)
+    if n <= 3
+        for I in CartesianIndices(C)
+            _modify!(_add, A.diag[I[1]] * B[I[1], I[2]], C, I)
+        end
+        return C
+    end
     _rmul_or_fill!(C, _add.beta)  # see the same use above
     iszero(_add.alpha) && return C
     Ad = A.diag
diff --git a/stdlib/LinearAlgebra/test/bidiag.jl b/stdlib/LinearAlgebra/test/bidiag.jl
index ef50658a642fb4..58c228e39e2264 100644
--- a/stdlib/LinearAlgebra/test/bidiag.jl
+++ b/stdlib/LinearAlgebra/test/bidiag.jl
@@ -1026,26 +1026,71 @@ end
     @test_throws "cannot set entry" B[1,2] = 4
 end
 
-@testset "mul with empty arrays" begin
-    A = zeros(5,0)
-    B = Bidiagonal(zeros(0), zeros(0), :U)
-    BL = Bidiagonal(zeros(5), zeros(4), :U)
-    @test size(A * B) == size(A)
-    @test size(BL * A) == size(A)
-    @test size(B * B) == size(B)
-    C = similar(A)
-    @test mul!(C, A, B) == A * B
-    @test mul!(C, BL, A) == BL * A
-    @test mul!(similar(B), B, B) == B * B
-    @test mul!(similar(B, size(B)), B, B) == B * B
-
-    v = zeros(size(B,2))
-    @test size(B * v) == size(v)
-    @test mul!(similar(v), B, v) == B * v
-
-    D = Diagonal(zeros(size(B,2)))
-    @test size(B * D) == size(D * B) == size(D)
-    @test mul!(similar(D), B, D) == mul!(similar(D), D, B) == B * D
+@testset "mul for small matrices" begin  # structured products are checked against dense Matrix products
+    @testset for n in 0:6  # sizes on both sides of the n <= 3 naive-multiplication cutoff
+        D = Diagonal(rand(n))
+        v = rand(n)
+        @testset for uplo in (:L, :U)
+            B = Bidiagonal(rand(n), rand(max(n-1,0)), uplo)
+            M = Matrix(B)  # dense reference
+
+            @test B * v ≈ M * v
+            @test mul!(similar(v), B, v) ≈ M * v
+            @test mul!(ones(size(v)), B, v, 2, 3) ≈ M * v * 2 .+ 3  # 5-arg mul! into a destination of ones
+
+            @test B * B ≈ M * M
+            @test mul!(similar(B, size(B)), B, B) ≈ M * M
+            @test mul!(ones(size(B)), B, B, 2, 4) ≈ M * M * 2 .+ 4
+
+            for m in 0:6  # rectangular dense operands on either side, including empty ones
+                AL = rand(m,n)
+                AR = rand(n,m)
+                @test AL * B ≈ AL * M
+                @test B * AR ≈ M * AR
+                @test mul!(similar(AL), AL, B) ≈ AL * M
+                @test mul!(similar(AR), B, AR) ≈ M * AR
+                @test mul!(ones(size(AL)), AL, B, 2, 4) ≈ AL * M * 2 .+ 4
+                @test mul!(ones(size(AR)), B, AR, 2, 4) ≈ M * AR * 2 .+ 4
+            end
+
+            @test B * D ≈ M * D
+            @test D * B ≈ D * M
+            @test mul!(similar(B), B, D) ≈ M * D
+            @test mul!(similar(B), B, D) ≈ M * D  # NOTE(review): identical to the previous line — maybe mul!(similar(B), D, B) ≈ D * M was intended; confirm
+            @test mul!(similar(B, size(B)), D, B) ≈ D * M
+            @test mul!(similar(B, size(B)), B, D) ≈ M * D
+            @test mul!(ones(size(B)), D, B, 2, 4) ≈ D * M * 2 .+ 4
+            @test mul!(ones(size(B)), B, D, 2, 4) ≈ M * D * 2 .+ 4
+        end
+        BL = Bidiagonal(rand(n), rand(max(0, n-1)), :L)
+        ML = Matrix(BL)
+        BU = Bidiagonal(rand(n), rand(max(0, n-1)), :U)
+        MU = Matrix(BU)
+        T = Tridiagonal(zeros(max(0, n-1)), zeros(n), zeros(max(0, n-1)))  # Tridiagonal destination for mixed lower/upper products
+        @test mul!(T, BL, BU) ≈ ML * MU
+        @test mul!(T, BU, BL) ≈ MU * ML
+        T = Tridiagonal(ones(max(0, n-1)), ones(n), ones(max(0, n-1)))
+        @test mul!(copy(T), BL, BU, 2, 3) ≈ ML * MU * 2 + T * 3
+        @test mul!(copy(T), BU, BL, 2, 3) ≈ MU * ML * 2 + T * 3
+    end
+
+    n = 4  # matrix-valued (block) elements from here on
+    arr = SizedArrays.SizedArray{(2,2)}(reshape([1:4;],2,2))  # non-symmetric 2x2 block
+    for B in (
+            Bidiagonal(fill(arr,n), fill(arr,n-1), :L),
+            Bidiagonal(fill(arr,n), fill(arr,n-1), :U),
+            )
+        @test B * B ≈ Matrix(B) * Matrix(B)
+        BL = Bidiagonal(fill(arr,n), fill(arr,n-1), :L)
+        BU = Bidiagonal(fill(arr,n), fill(arr,n-1), :U)
+        @test BL * B ≈ Matrix(BL) * Matrix(B)
+        @test BU * B ≈ Matrix(BU) * Matrix(B)
+        @test B * BL ≈ Matrix(B) * Matrix(BL)
+        @test B * BU ≈ Matrix(B) * Matrix(BU)
+        D = Diagonal(fill(arr,n))
+        @test D * B ≈ Matrix(D) * Matrix(B)
+        @test B * D ≈ Matrix(B) * Matrix(D)
+    end
 end
 
 end # module TestBidiagonal
diff --git a/stdlib/LinearAlgebra/test/tridiag.jl b/stdlib/LinearAlgebra/test/tridiag.jl
index 3330fa682fe5e6..15ac7f9f2147fe 100644
--- a/stdlib/LinearAlgebra/test/tridiag.jl
+++ b/stdlib/LinearAlgebra/test/tridiag.jl
@@ -970,4 +970,75 @@ end
     @test sprint(show, S) == "SymTridiagonal($(repr(diag(S))), $(repr(diag(S,1))))"
 end
 
+@testset "mul for small matrices" begin  # structured products are checked against dense Matrix products
+    @testset for n in 0:6  # includes the empty case n == 0
+        for T in (
+                Tridiagonal(rand(max(n-1,0)), rand(n), rand(max(n-1,0))),
+                SymTridiagonal(rand(n), rand(max(n-1,0))),
+                )
+            M = Matrix(T)  # dense reference
+            @test T * T ≈ M * M
+            @test mul!(similar(T, size(T)), T, T) ≈ M * M
+            @test mul!(ones(size(T)), T, T, 2, 4) ≈ M * M * 2 .+ 4  # 5-arg mul! into a destination of ones
+
+            for m in 0:6  # rectangular dense operands on either side, including empty ones
+                AR = rand(n,m)
+                AL = rand(m,n)
+                @test AL * T ≈ AL * M
+                @test T * AR ≈ M * AR
+                @test mul!(similar(AL), AL, T) ≈ AL * M
+                @test mul!(similar(AR), T, AR) ≈ M * AR
+                @test mul!(ones(size(AL)), AL, T, 2, 4) ≈ AL * M * 2 .+ 4
+                @test mul!(ones(size(AR)), T, AR, 2, 4) ≈ M * AR * 2 .+ 4
+            end
+
+            v = rand(n)
+            @test T * v ≈ M * v
+            @test mul!(similar(v), T, v) ≈ M * v
+
+            D = Diagonal(rand(n))
+            @test T * D ≈ M * D
+            @test D * T ≈ D * M
+            @test mul!(Tridiagonal(similar(T)), D, T) ≈ D * M  # structured (Tridiagonal) destination
+            @test mul!(Tridiagonal(similar(T)), T, D) ≈ M * D
+            @test mul!(similar(T, size(T)), D, T) ≈ D * M
+            @test mul!(similar(T, size(T)), T, D) ≈ M * D
+            @test mul!(ones(size(T)), D, T, 2, 4) ≈ D * M * 2 .+ 4
+            @test mul!(ones(size(T)), T, D, 2, 4) ≈ M * D * 2 .+ 4
+
+            for uplo in (:U, :L)
+                B = Bidiagonal(rand(n), rand(max(0, n-1)), uplo)
+                @test T * B ≈ M * B
+                @test B * T ≈ B * M
+                if n <= 2  # NOTE(review): presumably because for n > 2 the product no longer fits in three bands — confirm
+                    @test mul!(Tridiagonal(similar(T)), B, T) ≈ B * M
+                    @test mul!(Tridiagonal(similar(T)), T, B) ≈ M * B
+                end
+                @test mul!(similar(T, size(T)), B, T) ≈ B * M
+                @test mul!(similar(T, size(T)), T, B) ≈ M * B
+                @test mul!(ones(size(T)), B, T, 2, 4) ≈ B * M * 2 .+ 4
+                @test mul!(ones(size(T)), T, B, 2, 4) ≈ M * B * 2 .+ 4
+            end
+        end
+    end
+
+    n = 4  # matrix-valued (block) elements from here on
+    arr = SizedArrays.SizedArray{(2,2)}(reshape([1:4;],2,2))  # non-symmetric 2x2 block
+    for T in (
+            SymTridiagonal(fill(arr,n), fill(arr,n-1)),
+            Tridiagonal(fill(arr,n-1), fill(arr,n), fill(arr,n-1)),
+            )
+        @test T * T ≈ Matrix(T) * Matrix(T)
+        BL = Bidiagonal(fill(arr,n), fill(arr,n-1), :L)
+        BU = Bidiagonal(fill(arr,n), fill(arr,n-1), :U)
+        @test BL * T ≈ Matrix(BL) * Matrix(T)
+        @test BU * T ≈ Matrix(BU) * Matrix(T)
+        @test T * BL ≈ Matrix(T) * Matrix(BL)
+        @test T * BU ≈ Matrix(T) * Matrix(BU)
+        D = Diagonal(fill(arr,n))
+        @test D * T ≈ Matrix(D) * Matrix(T)
+        @test T * D ≈ Matrix(T) * Matrix(D)
+    end
+end
+
 end # module TestTridiagonal