format

SciML · Sep 23, 2023 · daf7212 · daf7212
1 parent 974a36f
commit daf7212
Show file tree

Hide file tree

Showing 15 changed files with 180 additions and 87 deletions.
diff --git a/benchmarks/applelu.jl b/benchmarks/applelu.jl
@@ -18,7 +18,13 @@ function luflop(m, n = m; innerflop = 2)
     end
 end
 
-algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), AppleAccelerateLUFactorization(), MetalLUFactorization()]
+algs = [
+    LUFactorization(),
+    GenericLUFactorization(),
+    RFLUFactorization(),
+    AppleAccelerateLUFactorization(),
+    MetalLUFactorization(),
+]
 res = [Float32[] for i in 1:length(algs)]
 
 ns = 4:8:500
@@ -28,10 +34,14 @@ for i in 1:length(ns)
     rng = MersenneTwister(123)
     global A = rand(rng, Float32, n, n)
     global b = rand(rng, Float32, n)
-    global u0= rand(rng, Float32, n)
-    
+    global u0 = rand(rng, Float32, n)
+
     for j in 1:length(algs)
-        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
+        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
+            copy(b);
+            u0 = copy(u0),
+            alias_A = true,
+            alias_b = true))
         push!(res[j], luflop(n) / bt / 1e9)
     end
 end
@@ -41,11 +51,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
 parameterless_type(x) = __parameterless_type(typeof(x))
 parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
 
-p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
+p = plot(ns,
+    res[1];
+    ylabel = "GFLOPs",
+    xlabel = "N",
+    title = "GFLOPs for NxN LU Factorization",
+    label = string(Symbol(parameterless_type(algs[1]))),
+    legend = :outertopright)
 for i in 2:length(res)
     plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
 end
 p
 
 savefig("metallubench.png")
-savefig("metallubench.pdf")
+savefig("metallubench.pdf")
diff --git a/benchmarks/cudalu.jl b/benchmarks/cudalu.jl
@@ -28,10 +28,14 @@ for i in 1:length(ns)
     rng = MersenneTwister(123)
     global A = rand(rng, Float32, n, n)
     global b = rand(rng, Float32, n)
-    global u0= rand(rng, Float32, n)
-    
+    global u0 = rand(rng, Float32, n)
+
     for j in 1:length(algs)
-        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
+        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
+            copy(b);
+            u0 = copy(u0),
+            alias_A = true,
+            alias_b = true))
         push!(res[j], luflop(n) / bt / 1e9)
     end
 end
@@ -41,11 +45,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
 parameterless_type(x) = __parameterless_type(typeof(x))
 parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
 
-p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
+p = plot(ns,
+    res[1];
+    ylabel = "GFLOPs",
+    xlabel = "N",
+    title = "GFLOPs for NxN LU Factorization",
+    label = string(Symbol(parameterless_type(algs[1]))),
+    legend = :outertopright)
 for i in 2:length(res)
     plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
 end
 p
 
 savefig("cudaoffloadlubench.png")
-savefig("cudaoffloadlubench.pdf")
+savefig("cudaoffloadlubench.pdf")
diff --git a/benchmarks/lu.jl b/benchmarks/lu.jl
@@ -18,7 +18,14 @@ function luflop(m, n = m; innerflop = 2)
     end
 end
 
-algs = [LUFactorization(), GenericLUFactorization(), RFLUFactorization(), MKLLUFactorization(), FastLUFactorization(), SimpleLUFactorization()]
+algs = [
+    LUFactorization(),
+    GenericLUFactorization(),
+    RFLUFactorization(),
+    MKLLUFactorization(),
+    FastLUFactorization(),
+    SimpleLUFactorization(),
+]
 res = [Float64[] for i in 1:length(algs)]
 
 ns = 4:8:500
@@ -28,10 +35,14 @@ for i in 1:length(ns)
     rng = MersenneTwister(123)
     global A = rand(rng, n, n)
     global b = rand(rng, n)
-    global u0= rand(rng, n)
-    
+    global u0 = rand(rng, n)
+
     for j in 1:length(algs)
-        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
+        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
+            copy(b);
+            u0 = copy(u0),
+            alias_A = true,
+            alias_b = true))
         push!(res[j], luflop(n) / bt / 1e9)
     end
 end
@@ -41,11 +52,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
 parameterless_type(x) = __parameterless_type(typeof(x))
 parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
 
-p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
+p = plot(ns,
+    res[1];
+    ylabel = "GFLOPs",
+    xlabel = "N",
+    title = "GFLOPs for NxN LU Factorization",
+    label = string(Symbol(parameterless_type(algs[1]))),
+    legend = :outertopright)
 for i in 2:length(res)
     plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
 end
 p
 
 savefig("lubench.png")
-savefig("lubench.pdf")
+savefig("lubench.pdf")
diff --git a/benchmarks/metallu.jl b/benchmarks/metallu.jl
@@ -28,10 +28,14 @@ for i in 1:length(ns)
     rng = MersenneTwister(123)
     global A = rand(rng, Float32, n, n)
     global b = rand(rng, Float32, n)
-    global u0= rand(rng, Float32, n)
-    
+    global u0 = rand(rng, Float32, n)
+
     for j in 1:length(algs)
-        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A), copy(b); u0 = copy(u0), alias_A=true, alias_b=true))
+        bt = @belapsed solve(prob, $(algs[j])).u setup=(prob = LinearProblem(copy(A),
+            copy(b);
+            u0 = copy(u0),
+            alias_A = true,
+            alias_b = true))
         GC.gc()
         push!(res[j], luflop(n) / bt / 1e9)
     end
@@ -42,11 +46,17 @@ __parameterless_type(T) = Base.typename(T).wrapper
 parameterless_type(x) = __parameterless_type(typeof(x))
 parameterless_type(::Type{T}) where {T} = __parameterless_type(T)
 
-p = plot(ns, res[1]; ylabel = "GFLOPs", xlabel = "N", title = "GFLOPs for NxN LU Factorization", label = string(Symbol(parameterless_type(algs[1]))), legend=:outertopright)
+p = plot(ns,
+    res[1];
+    ylabel = "GFLOPs",
+    xlabel = "N",
+    title = "GFLOPs for NxN LU Factorization",
+    label = string(Symbol(parameterless_type(algs[1]))),
+    legend = :outertopright)
 for i in 2:length(res)
     plot!(p, ns, res[i]; label = string(Symbol(parameterless_type(algs[i]))))
 end
 p
 
 savefig("metal_large_lubench.png")
-savefig("metal_large_lubench.pdf")
+savefig("metal_large_lubench.pdf")
diff --git a/docs/src/solvers/solvers.md b/docs/src/solvers/solvers.md
@@ -14,15 +14,15 @@ but one may need to change this to receive more performance or precision. If
 more precision is necessary, `QRFactorization()` and `SVDFactorization()` are
 the best choices, with SVD being the slowest but most precise.
 
-For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around 
+For efficiency, `RFLUFactorization` is the fastest for dense LU-factorizations until around
 150x150 matrices, though this can be dependent on the exact details of the hardware. After this
 point, `MKLLUFactorization` is usually faster on most hardware. Note that on Mac computers
 that `AppleAccelerateLUFactorization` is generally always the fastest. `LUFactorization` will
-use your base system BLAS which can be fast or slow depending on the hardware configuration. 
+use your base system BLAS which can be fast or slow depending on the hardware configuration.
 `SimpleLUFactorization` will be fast only on very small matrices but can cut down on compile times.
 
 For very large dense factorizations, offloading to the GPU can be preferred. Metal.jl can be used
-on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000 
+on Mac hardware to offload, and has a cutoff point of being faster at around size 20,000 x 20,000
 matrices (and only supports Float32). `CudaOffloadFactorization` can be more efficient at a
 much smaller cutoff, possibly around size 1,000 x 1,000 matrices, though this is highly dependent
 on the chosen GPU hardware. `CudaOffloadFactorization` requires a CUDA-compatible NVIDIA GPU.
@@ -31,9 +31,9 @@ CUDA offload supports Float64 but most consumer GPU hardware will be much faster
 this is only recommended for Float32 matrices.
 
 !!! note
-
-    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration. 
-    For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357). 
+    
+    Performance details for dense LU-factorizations can be highly dependent on the hardware configuration.
+    For details see [this issue](https://github.com/SciML/LinearSolve.jl/issues/357).
     If one is looking to best optimize their system, we suggest running the performance
     tuning benchmark.
 
@@ -65,19 +65,19 @@ The interface is detailed [here](@ref custom).
 ### Lazy SciMLOperators
 
 If the linear operator is given as a lazy non-concrete operator, such as a `FunctionOperator`,
-then using a Krylov method is preferred in order to not concretize the matrix. 
+then using a Krylov method is preferred in order to not concretize the matrix.
 Krylov.jl generally outperforms IterativeSolvers.jl and KrylovKit.jl, and is compatible
 with CPUs and GPUs, and thus is the generally preferred form for Krylov methods. The
 choice of Krylov method should be the one most constrained to the type of operator one
 has, for example if positive definite then `Krylov_CG()`, but if no good properties then
 use `Krylov_GMRES()`.
 
 !!! tip
-
+    
     If your materialized operator is a uniform block diagonal matrix, then you can use
     `SimpleGMRES(; blocksize = <known block size>)` to further improve performance.
     This often shows up in Neural Networks where the Jacobian wrt the Inputs (almost always)
-    is a Uniform Block Diagonal matrix of Block Size = size of the input divided by the 
+    is a Uniform Block Diagonal matrix of Block Size = size of the input divided by the
     batch size.
 
 ## Full List of Methods

diff --git a/ext/LinearSolveBlockDiagonalsExt.jl b/ext/LinearSolveBlockDiagonalsExt.jl
@@ -4,7 +4,7 @@ using LinearSolve, BlockDiagonals
 
 function LinearSolve.init_cacheval(alg::SimpleGMRES{false}, A::BlockDiagonal, b, args...;
     kwargs...)
-    @assert ndims(A) == 2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
+    @assert ndims(A)==2 "ndims(A) == $(ndims(A)). `A` must have ndims == 2."
     # We need to perform this check even when `zeroinit == true`, since the type of the
     # cache is dependent on whether we are able to use the specialized dispatch.
     bsizes = blocksizes(A)

diff --git a/ext/LinearSolveKernelAbstractionsExt.jl b/ext/LinearSolveKernelAbstractionsExt.jl
@@ -9,7 +9,7 @@ using GPUArraysCore
 function LinearSolve._fast_sym_givens!(c, s, R, nr::Int, inner_iter::Int, bsize::Int, Hbis)
     backend = get_backend(Hbis)
     kernel! = __fast_sym_givens_kernel!(backend)
-    kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange=bsize)
+    kernel!(c[inner_iter], s[inner_iter], R[nr + inner_iter], Hbis; ndrange = bsize)
     return c, s, R
 end
 

diff --git a/ext/LinearSolveMKLExt.jl b/ext/LinearSolveMKLExt.jl
@@ -2,49 +2,60 @@ module LinearSolveMKLExt
 
 using MKL_jll
 using LinearAlgebra: BlasInt, LU
-using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1, 
-                            @blasfunc, chkargsok
+using LinearAlgebra.LAPACK: require_one_based_indexing,
+    chkfinite, chkstride1,
+    @blasfunc, chkargsok
 using LinearAlgebra
 const usemkl = MKL_jll.is_available()
 
 using LinearSolve
 using LinearSolve: ArrayInterface, MKLLUFactorization, @get_cacheval, LinearCache, SciMLBase
 
-function getrf!(A::AbstractMatrix{<:Float64}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
+function getrf!(A::AbstractMatrix{<:Float64};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
     m, n = size(A)
-    lda  = max(1,stride(A, 2))
+    lda = max(1, stride(A, 2))
     if isempty(ipiv)
-        ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
     end
     ccall((@blasfunc(dgetrf_), MKL_jll.libmkl_rt), Cvoid,
-            (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64},
             Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
-            m, n, A, lda, ipiv, info)
+        m, n, A, lda, ipiv, info)
     chkargsok(info[])
     A, ipiv, info[], info #Error code is stored in LU factorization type
 end
 
-function getrf!(A::AbstractMatrix{<:Float32}; ipiv = similar(A, BlasInt, min(size(A,1),size(A,2))), info = Ref{BlasInt}(), check = false)
+function getrf!(A::AbstractMatrix{<:Float32};
+    ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2))),
+    info = Ref{BlasInt}(),
+    check = false)
     require_one_based_indexing(A)
     check && chkfinite(A)
     chkstride1(A)
     m, n = size(A)
-    lda  = max(1,stride(A, 2))
+    lda = max(1, stride(A, 2))
     if isempty(ipiv)
-        ipiv = similar(A, BlasInt, min(size(A,1),size(A,2)))
+        ipiv = similar(A, BlasInt, min(size(A, 1), size(A, 2)))
     end
     ccall((@blasfunc(sgetrf_), MKL_jll.libmkl_rt), Cvoid,
-            (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
+        (Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32},
             Ref{BlasInt}, Ptr{BlasInt}, Ptr{BlasInt}),
-            m, n, A, lda, ipiv, info)
+        m, n, A, lda, ipiv, info)
     chkargsok(info[])
     A, ipiv, info[], info #Error code is stored in LU factorization type
 end
 
-function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float64}; info = Ref{BlasInt}())
+function getrs!(trans::AbstractChar,
+    A::AbstractMatrix{<:Float64},
+    ipiv::AbstractVector{BlasInt},
+    B::AbstractVecOrMat{<:Float64};
+    info = Ref{BlasInt}())
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
     chkstride1(A, B, ipiv)
@@ -57,14 +68,19 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float64}, ipiv::Abstrac
     end
     nrhs = size(B, 2)
     ccall(("dgetrs_", MKL_jll.libmkl_rt), Cvoid,
-          (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
-           Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
-          trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float64}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{Float64}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
+        1)
     LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
     B
 end
 
-function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::AbstractVector{BlasInt}, B::AbstractVecOrMat{<:Float32}; info = Ref{BlasInt}())
+function getrs!(trans::AbstractChar,
+    A::AbstractMatrix{<:Float32},
+    ipiv::AbstractVector{BlasInt},
+    B::AbstractVecOrMat{<:Float32};
+    info = Ref{BlasInt}())
     require_one_based_indexing(A, ipiv, B)
     LinearAlgebra.LAPACK.chktrans(trans)
     chkstride1(A, B, ipiv)
@@ -77,9 +93,10 @@ function getrs!(trans::AbstractChar, A::AbstractMatrix{<:Float32}, ipiv::Abstrac
     end
     nrhs = size(B, 2)
     ccall(("sgetrs_", MKL_jll.libmkl_rt), Cvoid,
-          (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
-           Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
-          trans, n, size(B,2), A, max(1,stride(A,2)), ipiv, B, max(1,stride(B,2)), info, 1)
+        (Ref{UInt8}, Ref{BlasInt}, Ref{BlasInt}, Ptr{Float32}, Ref{BlasInt},
+            Ptr{BlasInt}, Ptr{Float32}, Ref{BlasInt}, Ptr{BlasInt}, Clong),
+        trans, n, size(B, 2), A, max(1, stride(A, 2)), ipiv, B, max(1, stride(B, 2)), info,
+        1)
     LinearAlgebra.LAPACK.chklapackerror(BlasInt(info[]))
     B
 end
@@ -125,4 +142,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MKLLUFactorization;
     =#
 end
 
-end
+end
diff --git a/ext/LinearSolveMetalExt.jl b/ext/LinearSolveMetalExt.jl
@@ -28,4 +28,4 @@ function SciMLBase.solve!(cache::LinearCache, alg::MetalLUFactorization;
     SciMLBase.build_linear_solution(alg, y, nothing, cache)
 end
 
-end
+end
diff --git a/src/LinearSolve.jl b/src/LinearSolve.jl
@@ -28,8 +28,9 @@ PrecompileTools.@recompile_invalidations begin
     import InteractiveUtils
 
     using LinearAlgebra: BlasInt, LU
-    using LinearAlgebra.LAPACK: require_one_based_indexing, chkfinite, chkstride1,
-                                @blasfunc, chkargsok
+    using LinearAlgebra.LAPACK: require_one_based_indexing,
+        chkfinite, chkstride1,
+        @blasfunc, chkargsok
 
     import GPUArraysCore
     import Preferences