diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 9c7935911..320e0c073 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,2 +1,3 @@
 style = "sciml"
-format_markdown = true
\ No newline at end of file
+format_markdown = true
+annotate_untyped_fields_with_any = false
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 4955b36c8..75f12a2e9 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -15,7 +15,6 @@ jobs:
           - Core
         version:
           - '1'
-          - '1.6'
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v1
diff --git a/.github/workflows/Downstream.yml b/.github/workflows/Downstream.yml
index 7196cc747..a7fe43d4b 100644
--- a/.github/workflows/Downstream.yml
+++ b/.github/workflows/Downstream.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        julia-version: [1,1.6]
+        julia-version: [1]
         os: [ubuntu-latest]
         package:
           - {user: SciML, repo: ModelingToolkit.jl, group: All}
diff --git a/.gitignore b/.gitignore
index aa4ff57e3..2f8d95920 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,4 @@ Manifest.toml
 docs/src/assets/Project.toml
 
 .vscode
+wip
diff --git a/Project.toml b/Project.toml
index ed0b27f95..4d474db44 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,14 +1,17 @@
 name = "NonlinearSolve"
 uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
 authors = ["SciML"]
-version = "1.10.0"
+version = "2.0.0"
 
 [deps]
+ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
+ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+LineSearches = "d3d80556-e9d4-5f37-9878-2ab0fcc64255"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
@@ -22,33 +25,41 @@ StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
 UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
 [compat]
+ADTypes = "0.2"
 ArrayInterface = "6.0.24, 7"
-DiffEqBase = "6"
+ConcreteStructs = "0.2"
+DiffEqBase = "6.130"
 EnumX = "1"
+Enzyme = "0.11"
 FiniteDiff = "2"
 ForwardDiff = "0.10.3"
 LinearSolve = "2"
+LineSearches = "7"
 PrecompileTools = "1"
 RecursiveArrayTools = "2"
 Reexport = "0.2, 1"
-SciMLBase = "1.92.4"
+SciMLBase = "1.97"
 SimpleNonlinearSolve = "0.1"
-SparseDiffTools = "1, 2"
+SparseDiffTools = "2.6"
 StaticArraysCore = "1.4"
 UnPack = "1.0"
-julia = "1.6"
+Zygote = "0.6"
+julia = "1.9"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["BenchmarkTools", "SafeTestsets", "Pkg", "Test", "ForwardDiff", "StaticArrays", "Symbolics", "LinearSolve", "Random", "LinearAlgebra"]
+test = ["Enzyme", "BenchmarkTools", "SafeTestsets", "Pkg", "Test", "ForwardDiff", "StaticArrays", "Symbolics", "LinearSolve", "Random", "LinearAlgebra", "Zygote", "SparseDiffTools"]
diff --git a/docs/Project.toml b/docs/Project.toml
index f6889de90..df765bb1d 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -14,7 +14,7 @@ Sundials = "c3572dad-4567-51f8-b174-8c6c989267f4"
 BenchmarkTools = "1"
 Documenter = "0.27"
 LinearSolve = "2"
-NonlinearSolve = "1"
+NonlinearSolve = "1, 2"
 NonlinearSolveMINPACK = "0.1"
 SciMLNLSolve = "0.1"
 SimpleNonlinearSolve = "0.1.5"
diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index cae730bc3..615f96c03 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -1,39 +1,41 @@
 module NonlinearSolve
-if isdefined(Base, :Experimental) &&
-   isdefined(Base.Experimental, Symbol("@max_methods"))
+
+if isdefined(Base, :Experimental) && isdefined(Base.Experimental, Symbol("@max_methods"))
     @eval Base.Experimental.@max_methods 1
 end
-using Reexport
-using UnPack: @unpack
-using FiniteDiff, ForwardDiff
-using ForwardDiff: Dual
-using LinearAlgebra
-using StaticArraysCore
-using RecursiveArrayTools
-import EnumX
-import ArrayInterface
-import LinearSolve
-using DiffEqBase
-using SparseDiffTools
-
-@reexport using SciMLBase
-using SciMLBase: NLStats
-@reexport using SimpleNonlinearSolve
-
-import SciMLBase: _unwrap_val
-
-abstract type AbstractNonlinearSolveAlgorithm <: SciMLBase.AbstractNonlinearAlgorithm end
-abstract type AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ} <:
-              AbstractNonlinearSolveAlgorithm end
-
-function SciMLBase.__solve(prob::NonlinearProblem,
-    alg::AbstractNonlinearSolveAlgorithm, args...;
-    kwargs...)
+
+using DiffEqBase, LinearAlgebra, LinearSolve, SparseDiffTools
+import ForwardDiff
+
+import ADTypes: AbstractFiniteDifferencesMode
+import ArrayInterface: undefmatrix, matrix_colors, parameterless_type, ismutable
+import ConcreteStructs: @concrete
+import EnumX: @enumx
+import ForwardDiff: Dual
+import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A
+import RecursiveArrayTools: ArrayPartition,
+    AbstractVectorOfArray, recursivecopy!, recursivefill!
+import Reexport: @reexport
+import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace
+import StaticArraysCore: StaticArray, SVector, SArray, MArray
+import UnPack: @unpack
+
+@reexport using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve
+
+const AbstractSparseADType = Union{ADTypes.AbstractSparseFiniteDifferences,
+    ADTypes.AbstractSparseForwardMode, ADTypes.AbstractSparseReverseMode}
+
+abstract type AbstractNonlinearSolveAlgorithm <: AbstractNonlinearAlgorithm end
+abstract type AbstractNewtonAlgorithm{CJ, AD} <: AbstractNonlinearSolveAlgorithm end
+
+function SciMLBase.__solve(prob::NonlinearProblem, alg::AbstractNonlinearSolveAlgorithm,
+    args...; kwargs...)
     cache = init(prob, alg, args...; kwargs...)
-    sol = solve!(cache)
+    return solve!(cache)
 end
 
 include("utils.jl")
+include("linesearch.jl")
 include("raphson.jl")
 include("trustRegion.jl")
 include("levenberg.jl")
@@ -46,7 +48,7 @@ PrecompileTools.@compile_workload begin
     for T in (Float32, Float64)
         prob = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
 
-        precompile_algs = if VERSION >= v"1.7"
+        precompile_algs = if VERSION ≥ v"1.7"
             (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
         else
             (NewtonRaphson(),)
@@ -68,4 +70,6 @@ export RadiusUpdateSchemes
 
 export NewtonRaphson, TrustRegion, LevenbergMarquardt
 
+export LineSearch
+
 end # module
diff --git a/src/ad.jl b/src/ad.jl
index 0dad74c56..05fd8bfa9 100644
--- a/src/ad.jl
+++ b/src/ad.jl
@@ -1,44 +1,63 @@
 function scalar_nlsolve_ad(prob, alg, args...; kwargs...)
     f = prob.f
     p = value(prob.p)
-
     u0 = value(prob.u0)
     newprob = NonlinearProblem(f, u0, p; prob.kwargs...)
 
     sol = solve(newprob, alg, args...; kwargs...)
 
     uu = sol.u
-    if p isa Number
-        f_p = ForwardDiff.derivative(Base.Fix1(f, uu), p)
-    else
-        f_p = ForwardDiff.gradient(Base.Fix1(f, uu), p)
-    end
+    f_p = scalar_nlsolve_∂f_∂p(f, uu, p)
+    f_x = scalar_nlsolve_∂f_∂u(f, uu, p)
+
+    z_arr = -inv(f_x) * f_p
 
-    f_x = ForwardDiff.derivative(Base.Fix2(f, p), uu)
     pp = prob.p
-    sumfun = let f_x′ = -f_x
-        ((fp, p),) -> (fp / f_x′) * ForwardDiff.partials(p)
+    sumfun = ((z, p),) -> map(zᵢ -> zᵢ * ForwardDiff.partials(p), z)
+    if uu isa Number
+        partials = sum(sumfun, zip(z_arr, pp))
+    elseif p isa Number
+        partials = sumfun((z_arr, pp))
+    else
+        partials = sum(sumfun, zip(eachcol(z_arr), pp))
     end
-    partials = sum(sumfun, zip(f_p, pp))
+
     return sol, partials
 end
 
-function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, StaticArraysCore.SVector},
-        iip,
-        <:Dual{T, V, P}},
-    alg::AbstractNewtonAlgorithm,
-    args...; kwargs...) where {iip, T, V, P}
+function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector, <:AbstractArray},
+        iip, <:Dual{T, V, P}}, alg::AbstractNewtonAlgorithm, args...;
+    kwargs...) where {iip, T, V, P}
     sol, partials = scalar_nlsolve_ad(prob, alg, args...; kwargs...)
-    return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
-        retcode = sol.retcode)
+    dual_soln = scalar_nlsolve_dual_soln(sol.u, partials, prob.p)
+    return SciMLBase.build_solution(prob, alg, dual_soln, sol.resid; sol.retcode)
 end
-function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, StaticArraysCore.SVector},
-        iip,
-        <:AbstractArray{<:Dual{T, V, P}}},
-    alg::AbstractNewtonAlgorithm,
-    args...;
+
+function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, SVector, <:AbstractArray},
+        iip, <:AbstractArray{<:Dual{T, V, P}}}, alg::AbstractNewtonAlgorithm, args...;
     kwargs...) where {iip, T, V, P}
     sol, partials = scalar_nlsolve_ad(prob, alg, args...; kwargs...)
-    return SciMLBase.build_solution(prob, alg, Dual{T, V, P}(sol.u, partials), sol.resid;
-        retcode = sol.retcode)
+    dual_soln = scalar_nlsolve_dual_soln(sol.u, partials, prob.p)
+    return SciMLBase.build_solution(prob, alg, dual_soln, sol.resid; sol.retcode)
+end
+
+function scalar_nlsolve_∂f_∂p(f, u, p)
+    ff = p isa Number ? ForwardDiff.derivative :
+         (u isa Number ? ForwardDiff.gradient : ForwardDiff.jacobian)
+    return ff(Base.Fix1(f, u), p)
+end
+
+function scalar_nlsolve_∂f_∂u(f, u, p)
+    ff = u isa Number ? ForwardDiff.derivative : ForwardDiff.jacobian
+    return ff(Base.Fix2(f, p), u)
+end
+
+function scalar_nlsolve_dual_soln(u::Number, partials,
+    ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P}
+    return Dual{T, V, P}(u, partials)
+end
+
+function scalar_nlsolve_dual_soln(u::AbstractArray, partials,
+    ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P}
+    return map(((uᵢ, pᵢ),) -> Dual{T, V, P}(uᵢ, pᵢ), zip(u, partials))
 end
diff --git a/src/jacobian.jl b/src/jacobian.jl
index 8296069e0..83d26fee6 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -1,185 +1,107 @@
-struct JacobianWrapper{fType, pType}
-    f::fType
-    p::pType
+@concrete struct JacobianWrapper{iip}
+    f
+    p
 end
 
-(uf::JacobianWrapper)(u) = uf.f(u, uf.p)
-(uf::JacobianWrapper)(res, u) = uf.f(res, u, uf.p)
+# Previous Implementation did not hold onto `iip`, but this causes problems in packages
+# where we check for the presence of function signatures to check which dispatch to call
+(uf::JacobianWrapper{false})(u) = uf.f(u, uf.p)
+(uf::JacobianWrapper{false})(res, u) = (vec(res) .= vec(uf.f(u, uf.p)))
+(uf::JacobianWrapper{true})(res, u) = uf.f(res, u, uf.p)
 
-struct NonlinearSolveTag end
-
-function sparsity_colorvec(f, x)
-    sparsity = f.sparsity
-    colorvec = DiffEqBase.has_colorvec(f) ? f.colorvec :
-               (isnothing(sparsity) ? (1:length(x)) : matrix_colors(sparsity))
-    sparsity, colorvec
-end
+sparsity_detection_alg(_, _) = NoSparsityDetection()
+function sparsity_detection_alg(f, ad::AbstractSparseADType)
+    if f.sparsity === nothing
+        if f.jac_prototype === nothing
+            return SymbolicsSparsityDetection()
+        else
+            jac_prototype = f.jac_prototype
+        end
+    else
+        jac_prototype = f.sparsity
+    end
 
-function jacobian_finitediff_forward!(J, f, x, jac_config, forwardcache, cache)
-    (FiniteDiff.finite_difference_jacobian!(J, f, x, jac_config, forwardcache);
-    maximum(jac_config.colorvec))
-end
-function jacobian_finitediff!(J, f, x, jac_config, cache)
-    (FiniteDiff.finite_difference_jacobian!(J, f, x, jac_config);
-    2 * maximum(jac_config.colorvec))
+    if SciMLBase.has_colorvec(f)
+        return PrecomputedJacobianColorvec(; jac_prototype, f.colorvec,
+            partition_by_rows = ad isa ADTypes.AbstractSparseReverseMode)
+    else
+        return JacPrototypeSparsityDetection(; jac_prototype)
+    end
 end
 
 # NoOp for Jacobian if it is not a Abstract Array -- For eg, JacVec Operator
-jacobian!(J, cache) = J
-function jacobian!(J::AbstractMatrix{<:Number}, cache)
-    f = cache.f
-    uf = cache.uf
-    x = cache.u
-    fx = cache.fu
-    jac_config = cache.jac_config
-    alg = cache.alg
-
-    if SciMLBase.has_jac(f)
-        f.jac(J, x, cache.p)
-    elseif alg_autodiff(alg)
-        forwarddiff_color_jacobian!(J, uf, x, jac_config)
-        #cache.destats.nf += 1
+jacobian!!(J, _) = J
+# `!!` notation is from BangBang.jl since J might be jacobian in case of oop `f.jac`
+# and we don't want wasteful `copyto!`
+function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache)
+    @unpack f, uf, u, p, jac_cache, alg, fu2 = cache
+    iip = isinplace(cache)
+    if iip
+        has_jac(f) ? f.jac(J, u, p) :
+        sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, _maybe_mutable(u, alg.ad))
     else
-        isforward = alg_difftype(alg) === Val{:forward}
-        if isforward
-            uf(fx, x)
-            #cache.destats.nf += 1
-            tmp = jacobian_finitediff_forward!(J, uf, x, jac_config, fx,
-                cache)
-        else # not forward difference
-            tmp = jacobian_finitediff!(J, uf, x, jac_config, cache)
-        end
-        #cache.destats.nf += tmp
+        return has_jac(f) ? f.jac(u, p) :
+               sparse_jacobian!(J, alg.ad, jac_cache, uf, _maybe_mutable(u, alg.ad))
     end
-    nothing
+    return J
 end
+# Scalar case
+jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u))
+
+# Build Jacobian Caches
+function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p, ::Val{iip};
+    linsolve_kwargs=(;)) where {iip}
+    uf = JacobianWrapper{iip}(f, p)
 
-function build_jac_and_jac_config(alg, f::F1, uf::F2, du1, u, tmp, du2) where {F1, F2}
     haslinsolve = hasfield(typeof(alg), :linsolve)
 
-    has_analytic_jac = SciMLBase.has_jac(f)
+    has_analytic_jac = has_jac(f)
     linsolve_needs_jac = (concrete_jac(alg) === nothing &&
                           (!haslinsolve || (haslinsolve && (alg.linsolve === nothing ||
-                             LinearSolve.needs_concrete_A(alg.linsolve)))))
+                             needs_concrete_A(alg.linsolve)))))
     alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg))
 
+    # NOTE: The deepcopy is needed here since we are using the resid_prototype elsewhere
+    fu = f.resid_prototype === nothing ? (iip ? _mutable_zero(u) : _mutable(f(u, p))) :
+         (iip ? deepcopy(f.resid_prototype) : f.resid_prototype)
     if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac)
-        sparsity, colorvec = sparsity_colorvec(f, u)
-
-        if alg_autodiff(alg)
-            _chunksize = get_chunksize(alg) === Val(0) ? nothing : get_chunksize(alg) # SparseDiffEq uses different convection...
-
-            T = if standardtag(alg)
-                typeof(ForwardDiff.Tag(NonlinearSolveTag(), eltype(u)))
-            else
-                typeof(ForwardDiff.Tag(uf, eltype(u)))
-            end
-            jac_config = ForwardColorJacCache(uf, u, _chunksize; colorvec, sparsity,
-                tag = T)
-        else
-            if alg_difftype(alg) !== Val{:complex}
-                jac_config = FiniteDiff.JacobianCache(tmp, du1, du2, alg_difftype(alg);
-                    colorvec, sparsity)
-            else
-                jac_config = FiniteDiff.JacobianCache(Complex{eltype(tmp)}.(tmp),
-                    Complex{eltype(du1)}.(du1), nothing, alg_difftype(alg), eltype(u);
-                    colorvec, sparsity)
-            end
-        end
+        sd = sparsity_detection_alg(f, alg.ad)
+        ad = alg.ad
+        jac_cache = iip ? sparse_jacobian_cache(ad, sd, uf, fu, _maybe_mutable(u, ad)) :
+                    sparse_jacobian_cache(ad, sd, uf, _maybe_mutable(u, ad); fx = fu)
     else
-        jac_config = nothing
+        jac_cache = nothing
     end
 
     J = if !linsolve_needs_jac
         # We don't need to construct the Jacobian
-        JacVec(uf, u; autodiff = alg_autodiff(alg) ? AutoForwardDiff() : AutoFiniteDiff())
+        JacVec(uf, u; autodiff = alg.ad)
     else
-        if f.jac_prototype === nothing
-            ArrayInterface.undefmatrix(u)
+        if has_analytic_jac
+            f.jac_prototype === nothing ? undefmatrix(u) : f.jac_prototype
         else
-            f.jac_prototype
+            f.jac_prototype === nothing ? init_jacobian(jac_cache) : f.jac_prototype
         end
     end
 
-    return J, jac_config
-end
-
-# Build Jacobian Caches
-function jacobian_caches(alg::Union{NewtonRaphson, LevenbergMarquardt, TrustRegion}, f, u,
-    p, ::Val{true})
-    uf = JacobianWrapper(f, p)
-
-    du1 = zero(u)
-    du2 = zero(u)
-    tmp = zero(u)
-    J, jac_config = build_jac_and_jac_config(alg, f, uf, du1, u, tmp, du2)
+    du = _mutable_zero(u)
+    linprob = LinearProblem(J, _vec(fu); u0 = _vec(du))
 
-    linprob = LinearProblem(J, _vec(zero(u)); u0 = _vec(zero(u)))
     weight = similar(u)
     recursivefill!(weight, true)
 
     Pl, Pr = wrapprecs(alg.precs(J, nothing, u, p, nothing, nothing, nothing, nothing,
             nothing)..., weight)
-    linsolve = init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr)
+    linsolve = init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr,
+        linsolve_kwargs...)
 
-    uf, linsolve, J, du1, jac_config
-end
-
-function get_chunksize(jac_config::ForwardDiff.JacobianConfig{
-    T,
-    V,
-    N,
-    D,
-}) where {T, V, N, D
-}
-    Val(N)
-end # don't degrade compile time information to runtime information
-
-function jacobian_finitediff(f, x, ::Type{diff_type}, dir, colorvec, sparsity,
-    jac_prototype) where {diff_type}
-    (FiniteDiff.finite_difference_derivative(f, x, diff_type, eltype(x), dir = dir), 2)
-end
-function jacobian_finitediff(f, x::AbstractArray, ::Type{diff_type}, dir, colorvec,
-    sparsity, jac_prototype) where {diff_type}
-    f_in = diff_type === Val{:forward} ? f(x) : similar(x)
-    ret_eltype = eltype(f_in)
-    J = FiniteDiff.finite_difference_jacobian(f, x, diff_type, ret_eltype, f_in,
-        dir = dir, colorvec = colorvec,
-        sparsity = sparsity,
-        jac_prototype = jac_prototype)
-    return J, _nfcount(maximum(colorvec), diff_type)
-end
-function jacobian(cache, f::F) where {F}
-    x = cache.u
-    alg = cache.alg
-    uf = cache.uf
-    local tmp
-
-    if DiffEqBase.has_jac(cache.f)
-        J = f.jac(cache.u, cache.p)
-    elseif alg_autodiff(alg)
-        J, tmp = jacobian_autodiff(uf, x, cache.f, alg)
-    else
-        jac_prototype = cache.f.jac_prototype
-        sparsity, colorvec = sparsity_colorvec(cache.f, x)
-        dir = true
-        J, tmp = jacobian_finitediff(uf, x, alg_difftype(alg), dir, colorvec, sparsity,
-            jac_prototype)
-    end
-    J
+    return uf, linsolve, J, fu, jac_cache, du
 end
 
-jacobian_autodiff(f, x, nonlinfun, alg) = (ForwardDiff.derivative(f, x), 1, alg)
-function jacobian_autodiff(f, x::AbstractArray, nonlinfun, alg)
-    jac_prototype = nonlinfun.jac_prototype
-    sparsity, colorvec = sparsity_colorvec(nonlinfun, x)
-    maxcolor = maximum(colorvec)
-    chunk_size = get_chunksize(alg) === Val(0) ? nothing : get_chunksize(alg)
-    num_of_chunks = chunk_size === nothing ?
-                    Int(ceil(maxcolor /
-                             SparseDiffTools.getsize(ForwardDiff.pickchunksize(maxcolor)))) :
-                    Int(ceil(maxcolor / _unwrap_val(chunk_size)))
-    (forwarddiff_color_jacobian(f, x, colorvec = colorvec, sparsity = sparsity,
-            jac_prototype = jac_prototype, chunksize = chunk_size),
-        num_of_chunks)
+## Special Handling for Scalars
+function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u::Number, p,
+    ::Val{false}; kwargs...)
+    # NOTE: Scalar `u` assumes scalar output from `f`
+    uf = JacobianWrapper{false}(f, p)
+    return uf, nothing, u, nothing, nothing, u
 end
diff --git a/src/levenberg.jl b/src/levenberg.jl
index db8955f4a..17f61475f 100644
--- a/src/levenberg.jl
+++ b/src/levenberg.jl
@@ -1,113 +1,82 @@
 """
-```julia
-LevenbergMarquardt(; chunk_size = Val{0}(),
-                    autodiff = Val{true}(),
-                    standardtag = Val{true}(),
-                    concrete_jac = nothing,
-                    diff_type = Val{:forward},
-                    linsolve = nothing, precs = DEFAULT_PRECS,
-                    damping_initial::Real = 1.0,
-                    damping_increase_factor::Real = 2.0,
-                    damping_decrease_factor::Real = 3.0,
-                    finite_diff_step_geodesic::Real = 0.1,
-                    α_geodesic::Real = 0.75,
-                    b_uphill::Real = 1.0,
-                    min_damping_D::AbstractFloat = 1e-8)
-```
+    LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+        precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
+        damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
+        finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
+        b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
 
 An advanced Levenberg-Marquardt implementation with the improvements suggested in the
 [paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
 algorithm for nonlinear least-squares minimization". Designed for large-scale and
 numerically-difficult nonlinear systems.
 
-
 ### Keyword Arguments
 
-- `chunk_size`: the chunk size used by the internal ForwardDiff.jl automatic differentiation
-  system. This allows for multiple derivative columns to be computed simultaneously,
-  improving performance. Defaults to `0`, which is equivalent to using ForwardDiff.jl's
-  default chunk size mechanism. For more details, see the documentation for
-  [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/).
-- `autodiff`: whether to use forward-mode automatic differentiation for the Jacobian.
-  Note that this argument is ignored if an analytical Jacobian is passed, as that will be
-  used instead. Defaults to `Val{true}`, which means ForwardDiff.jl via
-  SparseDiffTools.jl is used by default. If `Val{false}`, then FiniteDiff.jl is used for
-  finite differencing.
-- `standardtag`: whether to use a standardized tag definition for the purposes of automatic
-  differentiation. Defaults to true, which thus uses the `NonlinearSolveTag`. If `Val{false}`,
-  then ForwardDiff's default function naming tag is used, which results in larger stack
-  traces.
-- `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
-  then the Jacobian will not be constructed and instead direct Jacobian-vector products
-  `J*v` are computed using forward-mode automatic differentiation or finite differencing
-  tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
-  for example for a preconditioner, `concrete_jac = true` can be passed in order to force
-  the construction of the Jacobian.
-- `diff_type`: the type of finite differencing used if `autodiff = false`. Defaults to
-  `Val{:forward}` for forward finite differences. For more details on the choices, see the
-  [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) documentation.
-- `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
-  linear solves within the Newton method. Defaults to `nothing`, which means it uses the
-  LinearSolve.jl default algorithm choice. For more information on available algorithm
-  choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-- `precs`: the choice of preconditioners for the linear solver. Defaults to using no
-  preconditioners. For more information on specifying preconditioners for LinearSolve
-  algorithms, consult the
-  [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-- `damping_initial`: the starting value for the damping factor. The damping factor is
-  inversely proportional to the step size. The damping factor is adjusted during each
-  iteration. Defaults to `1.0`. For more details, see section 2.1 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `damping_increase_factor`: the factor by which the damping is increased if a step is
-  rejected. Defaults to `2.0`. For more details, see section 2.1 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `damping_decrease_factor`: the factor by which the damping is decreased if a step is
-  accepted. Defaults to `3.0`. For more details, see section 2.1 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
-  the geodesic acceleration. Defaults to `0.1` which means that the step size is
-  approximately 10% of the first-order step. For more details, see section 3 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
-  geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
-  that acceptable steps meet the condition
-  ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
-  acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
-  path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
-  is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
-  effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
-  choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
-  and reject all steps that increase the cost. Although this is a natural and safe choice,
-  it is often not the most efficient. Therefore downhill moves are always accepted, but
-  uphill moves are only conditionally accepted. To decide whether an uphill move will be
-  accepted at each iteration ``i``, we compute
-  ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
-  between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
-  step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
-  specify, uphill moves are accepted if
-  ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
-  iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
-  allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
-  will be accepted. Defaults to `1.0`. For more details, see section 4 of
-  [this paper](https://arxiv.org/abs/1201.5885).
-- `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
-  `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
-  where `J` is the Jacobian. It is suggested by
-  [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
-  `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
-
-
-!!! note
-
-    Currently, the linear solver and chunk size choice only applies to in-place defined
-    `NonlinearProblem`s. That is expected to change in the future.
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+      ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+      `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
+  - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
+    then the Jacobian will not be constructed and instead direct Jacobian-vector products
+    `J*v` are computed using forward-mode automatic differentiation or finite differencing
+    tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
+    for example for a preconditioner, `concrete_jac = true` can be passed in order to force
+    the construction of the Jacobian.
+  - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
+    linear solves within the Newton method. Defaults to `nothing`, which means it uses the
+    LinearSolve.jl default algorithm choice. For more information on available algorithm
+    choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
+    preconditioners. For more information on specifying preconditioners for LinearSolve
+    algorithms, consult the
+    [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `damping_initial`: the starting value for the damping factor. The damping factor is
+    inversely proportional to the step size. The damping factor is adjusted during each
+    iteration. Defaults to `1.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_increase_factor`: the factor by which the damping is increased if a step is
+    rejected. Defaults to `2.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
+    accepted. Defaults to `3.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
+    the geodesic acceleration. Defaults to `0.1` which means that the step size is
+    approximately 10% of the first-order step. For more details, see section 3 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
+    geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
+    that acceptable steps meet the condition
+    ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
+    acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
+    path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
+    is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
+    effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
+    choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
+    and reject all steps that increase the cost. Although this is a natural and safe choice,
+    it is often not the most efficient. Therefore downhill moves are always accepted, but
+    uphill moves are only conditionally accepted. To decide whether an uphill move will be
+    accepted at each iteration ``i``, we compute
+    ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
+    between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
+    step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
+    specify, uphill moves are accepted if
+    ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
+    iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
+    allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
+    will be accepted. Defaults to `1.0`. For more details, see section 4 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
+    `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
+    where `J` is the Jacobian. It is suggested by
+    [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
+    `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
 """
-struct LevenbergMarquardt{CS, AD, FDT, L, P, ST, CJ, T} <:
-       AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ}
-    linsolve::L
-    precs::P
+@concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
     damping_initial::T
     damping_increase_factor::T
     damping_decrease_factor::T
@@ -117,54 +86,36 @@ struct LevenbergMarquardt{CS, AD, FDT, L, P, ST, CJ, T} <:
     min_damping_D::T
 end
 
-function LevenbergMarquardt(; chunk_size = Val{0}(),
-    autodiff = Val{true}(),
-    standardtag = Val{true}(),
-    concrete_jac = nothing,
-    diff_type = Val{:forward},
-    linsolve = nothing,
-    precs = DEFAULT_PRECS,
-    damping_initial::Real = 1.0,
-    damping_increase_factor::Real = 2.0,
-    damping_decrease_factor::Real = 3.0,
-    finite_diff_step_geodesic::Real = 0.1,
-    α_geodesic::Real = 0.75,
-    b_uphill::Real = 1.0,
-    min_damping_D::AbstractFloat = 1e-8)
-    LevenbergMarquardt{_unwrap_val(chunk_size), _unwrap_val(autodiff), diff_type,
-        typeof(linsolve), typeof(precs), _unwrap_val(standardtag),
-        _unwrap_val(concrete_jac),
-        typeof(min_damping_D)}(linsolve, precs,
-        damping_initial,
-        damping_increase_factor,
-        damping_decrease_factor,
-        finite_diff_step_geodesic,
-        α_geodesic,
-        b_uphill,
-        min_damping_D)
+function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+    precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
+    damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
+    α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
+    adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
+    return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
+        damping_initial, damping_increase_factor, damping_decrease_factor,
+        finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
 end
 
-mutable struct LevenbergMarquardtCache{iip, fType, algType, uType, duType, resType, pType,
-    INType, tolType, probType, ufType, L, jType, JC,
-    DᵀDType, λType, lossType,
-}
-    f::fType
-    alg::algType
+@concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
+    f
+    alg
     u::uType
-    fu::resType
-    p::pType
-    uf::ufType
-    linsolve::L
+    fu1
+    fu2
+    du
+    p
+    uf
+    linsolve
     J::jType
-    du_tmp::duType
-    jac_config::JC
+    jac_cache
     force_stop::Bool
     maxiters::Int
-    internalnorm::INType
-    retcode::SciMLBase.ReturnCode.T
-    abstol::tolType
-    prob::probType
-    DᵀD::DᵀDType
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
+    DᵀD
     JᵀJ::jType
     λ::λType
     λ_factor::λType
@@ -182,75 +133,21 @@ mutable struct LevenbergMarquardtCache{iip, fType, algType, uType, duType, resTy
     δ::uType
     loss_old::lossType
     make_new_J::Bool
-    fu_tmp::resType
+    fu_tmp
     mat_tmp::jType
     stats::NLStats
-
-    function LevenbergMarquardtCache{iip}(f::fType, alg::algType, u::uType, fu::resType,
-        p::pType, uf::ufType, linsolve::L, J::jType,
-        du_tmp::duType, jac_config::JC,
-        force_stop::Bool, maxiters::Int,
-        internalnorm::INType,
-        retcode::SciMLBase.ReturnCode.T, abstol::tolType,
-        prob::probType, DᵀD::DᵀDType, JᵀJ::jType,
-        λ::λType, λ_factor::λType,
-        damping_increase_factor::λType,
-        damping_decrease_factor::λType, h::λType,
-        α_geodesic::λType, b_uphill::λType,
-        min_damping_D::λType, v::uType,
-        a::uType, tmp_vec::uType, v_old::uType,
-        norm_v_old::lossType, δ::uType,
-        loss_old::lossType, make_new_J::Bool,
-        fu_tmp::resType,
-        mat_tmp::jType,
-        stats::NLStats) where {
-        iip, fType, algType,
-        uType, duType, resType,
-        pType, INType, tolType,
-        probType, ufType, L,
-        jType, JC, DᵀDType,
-        λType, lossType,
-    }
-        new{iip, fType, algType, uType, duType, resType,
-            pType, INType, tolType, probType, ufType, L,
-            jType, JC, DᵀDType, λType, lossType}(f, alg, u, fu, p, uf, linsolve, J, du_tmp,
-            jac_config, force_stop, maxiters,
-            internalnorm, retcode, abstol, prob, DᵀD,
-            JᵀJ, λ, λ_factor,
-            damping_increase_factor,
-            damping_decrease_factor, h,
-            α_geodesic, b_uphill, min_damping_D,
-            v, a, tmp_vec, v_old,
-            norm_v_old, δ, loss_old, make_new_J,
-            fu_tmp, mat_tmp, stats)
-    end
 end
 
-function jacobian_caches(alg::LevenbergMarquardt, f, u, p, ::Val{false})
-    JacobianWrapper(f, p), nothing, ArrayInterface.undefmatrix(u), nothing, nothing
-end
+isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
 
 function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
-    args...;
-    alias_u0 = false,
-    maxiters = 1000,
-    abstol = 1e-6,
-    internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
-    if alias_u0
-        u = prob.u0
-    else
-        u = deepcopy(prob.u0)
-    end
-    f = prob.f
-    p = prob.p
-    if iip
-        fu = zero(u)
-        f(fu, u, p)
-    else
-        fu = f(u, p)
-    end
-    uf, linsolve, J, du_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
+    args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
+    linsolve_kwargs=(;), kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
+    fu1 = evaluate_f(prob, u)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip);
+        linsolve_kwargs)
 
     λ = convert(eltype(u), alg.damping_initial)
     λ_factor = convert(eltype(u), alg.damping_increase_factor)
@@ -269,7 +166,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarq
         DᵀD = Diagonal(d)
     end
 
-    loss = internalnorm(fu)
+    loss = internalnorm(fu1)
     JᵀJ = zero(J)
     v = zero(u)
     a = zero(u)
@@ -277,26 +174,25 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarq
     v_old = zero(u)
     δ = zero(u)
     make_new_J = true
-    fu_tmp = zero(fu)
+    fu_tmp = zero(fu1)
     mat_tmp = zero(J)
 
-    return LevenbergMarquardtCache{iip}(f, alg, u, fu, p, uf, linsolve, J, du_tmp,
-        jac_config, false, maxiters, internalnorm,
-        ReturnCode.Default, abstol, prob, DᵀD, JᵀJ,
-        λ, λ_factor, damping_increase_factor,
-        damping_decrease_factor, h,
-        α_geodesic, b_uphill, min_damping_D,
-        v, a, tmp_vec, v_old, loss, δ, loss, make_new_J,
-        fu_tmp, mat_tmp, NLStats(1, 0, 0, 0, 0))
+    return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, du, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
+        JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
+        b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
+        mat_tmp, NLStats(1, 0, 0, 0, 0))
 end
+
 function perform_step!(cache::LevenbergMarquardtCache{true})
-    @unpack fu, f, make_new_J = cache
-    if iszero(fu)
+    @unpack fu1, f, make_new_J = cache
+    if _iszero(fu1)
         cache.force_stop = true
         return nothing
     end
+
     if make_new_J
-        jacobian!(cache.J, cache)
+        jacobian!!(cache.J, cache)
         mul!(cache.JᵀJ, cache.J', cache.J)
         cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
         cache.make_new_J = false
@@ -306,24 +202,24 @@ function perform_step!(cache::LevenbergMarquardtCache{true})
 
     # Usual Levenberg-Marquardt step ("velocity").
     # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
-    mul!(cache.fu_tmp, J', fu)
+    mul!(cache.fu_tmp, J', fu1)
     @. cache.mat_tmp = JᵀJ + λ * DᵀD
-    linres = dolinsolve(alg.precs, linsolve, A = cache.mat_tmp, b = _vec(cache.fu_tmp),
-        linu = _vec(cache.du_tmp), p = p, reltol = cache.abstol)
+    linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, b = _vec(cache.fu_tmp),
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. cache.v = -cache.du_tmp
+    @. cache.v = -cache.du
 
     # Geodesic acceleration (step_size = v + a / 2).
     @unpack v, α_geodesic, h = cache
     f(cache.fu_tmp, u .+ h .* v, p)
 
     # The following lines do: cache.a = -J \ cache.fu_tmp
-    mul!(cache.du_tmp, J, v)
-    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu) / h - cache.du_tmp)
-    linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(cache.fu_tmp),
-        linu = _vec(cache.du_tmp), p = p, reltol = cache.abstol)
+    mul!(cache.du, J, v)
+    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
+    linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(cache.fu_tmp),
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. cache.a = -cache.du_tmp
+    @. cache.a = -cache.du
     cache.stats.nsolve += 2
     cache.stats.nfactors += 2
 
@@ -345,7 +241,7 @@ function perform_step!(cache::LevenbergMarquardtCache{true})
                 cache.force_stop = true
                 return nothing
             end
-            cache.fu .= cache.fu_tmp
+            cache.fu1 .= cache.fu_tmp
             cache.v_old .= v
             cache.norm_v_old = norm_v
             cache.loss_old = loss
@@ -359,13 +255,14 @@ function perform_step!(cache::LevenbergMarquardtCache{true})
 end
 
 function perform_step!(cache::LevenbergMarquardtCache{false})
-    @unpack fu, f, make_new_J = cache
-    if iszero(fu)
+    @unpack fu1, f, make_new_J = cache
+    if _iszero(fu1)
         cache.force_stop = true
         return nothing
     end
+
     if make_new_J
-        cache.J = jacobian(cache, f)
+        cache.J = jacobian!!(cache.J, cache)
         cache.JᵀJ = cache.J' * cache.J
         if cache.JᵀJ isa Number
             cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
@@ -378,11 +275,11 @@ function perform_step!(cache::LevenbergMarquardtCache{false})
     @unpack u, p, λ, JᵀJ, DᵀD, J = cache
 
     # Usual Levenberg-Marquardt step ("velocity").
-    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu)
+    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
 
     @unpack v, h, α_geodesic = cache
     # Geodesic acceleration (step_size = v + a / 2).
-    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu) ./ h .- J * v))
+    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
     cache.stats.nsolve += 1
     cache.stats.nfactors += 1
 
@@ -404,7 +301,7 @@ function perform_step!(cache::LevenbergMarquardtCache{false})
                 cache.force_stop = true
                 return nothing
             end
-            cache.fu = fu_new
+            cache.fu1 = fu_new
             cache.v_old = v
             cache.norm_v_old = norm_v
             cache.loss_old = loss
@@ -429,6 +326,6 @@ function SciMLBase.solve!(cache::LevenbergMarquardtCache)
         cache.retcode = ReturnCode.Success
     end
 
-    SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-        retcode = cache.retcode, stats = cache.stats)
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+        cache.retcode, cache.stats)
 end
diff --git a/src/linesearch.jl b/src/linesearch.jl
new file mode 100644
index 000000000..30861e14b
--- /dev/null
+++ b/src/linesearch.jl
@@ -0,0 +1,156 @@
+"""
+    LineSearch(method = Static(), autodiff = AutoFiniteDiff(), alpha = true)
+
+Wrapper over algorithms from
+[LineSeaches.jl](https://github.com/JuliaNLSolvers/LineSearches.jl/). Allows automatic
+construction of the objective functions for the line search algorithms utilizing automatic
+differentiation for fast Vector Jacobian Products.
+
+### Arguments
+
+  - `method`: the line search algorithm to use. Defaults to `Static()`, which means that the
+    step size is fixed to the value of `alpha`.
+  - `autodiff`: the automatic differentiation backend to use for the line search. Defaults to
+    `AutoFiniteDiff()`, which means that finite differencing is used to compute the VJP.
+    `AutoZygote()` will be faster in most cases, but it requires `Zygote.jl` to be manually
+    installed and loaded
+  - `alpha`: the initial step size to use. Defaults to `true` (which is equivalent to `1`).
+"""
+@concrete struct LineSearch
+    method
+    autodiff
+    α
+end
+
+function LineSearch(; method = Static(), autodiff = AutoFiniteDiff(), alpha = true)
+    return LineSearch(method, autodiff, alpha)
+end
+
+@concrete mutable struct LineSearchCache
+    f
+    ϕ
+    dϕ
+    ϕdϕ
+    α
+    ls
+end
+
+function LineSearchCache(ls::LineSearch, f, u::Number, p, _, ::Val{false})
+    eval_f(u, du, α) = eval_f(u - α * du)
+    eval_f(u) = f(u, p)
+
+    ls.method isa Static && return LineSearchCache(eval_f, nothing, nothing, nothing,
+        convert(typeof(u), ls.α), ls)
+
+    g(u, fu) = last(value_derivative(Base.Fix2(f, p), u)) * fu
+
+    function ϕ(u, du)
+        function ϕ_internal(α)
+            u_ = u - α * du
+            _fu = eval_f(u_)
+            return dot(_fu, _fu) / 2
+        end
+        return ϕ_internal
+    end
+
+    function dϕ(u, du)
+        function dϕ_internal(α)
+            u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g(u_, _fu)
+            return dot(g₀, -du)
+        end
+        return dϕ_internal
+    end
+
+    function ϕdϕ(u, du)
+        function ϕdϕ_internal(α)
+            u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g(u_, _fu)
+            return dot(_fu, _fu) / 2, dot(g₀, -du)
+        end
+        return ϕdϕ_internal
+    end
+
+    return LineSearchCache(eval_f, ϕ, dϕ, ϕdϕ, convert(eltype(u), ls.α), ls)
+end
+
+function LineSearchCache(ls::LineSearch, f, u, p, fu1, IIP::Val{iip}) where {iip}
+    fu = iip ? fu1 : nothing
+    u_ = _mutable_zero(u)
+
+    function eval_f(u, du, α)
+        @. u_ = u - α * du
+        return eval_f(u_)
+    end
+    eval_f(u) = evaluate_f(f, u, p, IIP; fu)
+
+    ls.method isa Static && return LineSearchCache(eval_f, nothing, nothing, nothing,
+        convert(eltype(u), ls.α), ls)
+
+    g₀ = _mutable_zero(u)
+
+    autodiff = if iip && (ls.autodiff isa AutoZygote || ls.autodiff isa AutoSparseZygote)
+        @warn "Attempting to use Zygote.jl for linesearch on an in-place problem. Falling back to finite differencing."
+        AutoFiniteDiff()
+    else
+        ls.autodiff
+    end
+
+    function g!(u, fu)
+        op = VecJac((args...) -> f(args..., p), u; autodiff)
+        if iip
+            mul!(g₀, op, fu)
+            return g₀
+        else
+            return op * fu
+        end
+    end
+
+    function ϕ(u, du)
+        function ϕ_internal(α)
+            @. u_ = u - α * du
+            _fu = eval_f(u_)
+            return dot(_fu, _fu) / 2
+        end
+        return ϕ_internal
+    end
+
+    function dϕ(u, du)
+        function dϕ_internal(α)
+            @. u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g!(u_, _fu)
+            return dot(g₀, -du)
+        end
+        return dϕ_internal
+    end
+
+    function ϕdϕ(u, du)
+        function ϕdϕ_internal(α)
+            @. u_ = u - α * du
+            _fu = eval_f(u_)
+            g₀ = g!(u_, _fu)
+            return dot(_fu, _fu) / 2, dot(g₀, -du)
+        end
+        return ϕdϕ_internal
+    end
+
+    return LineSearchCache(eval_f, ϕ, dϕ, ϕdϕ, convert(eltype(u), ls.α), ls)
+end
+
+function perform_linesearch!(cache::LineSearchCache, u, du)
+    cache.ls.method isa Static && return cache.α
+
+    ϕ = cache.ϕ(u, du)
+    dϕ = cache.dϕ(u, du)
+    ϕdϕ = cache.ϕdϕ(u, du)
+
+    ϕ₀, dϕ₀ = ϕdϕ(zero(eltype(u)))
+
+    # This case is sometimes possible for large optimization problems
+    dϕ₀ ≥ 0 && return cache.α
+
+    return first(cache.ls.method(ϕ, cache.dϕ(u, du), cache.ϕdϕ(u, du), cache.α, ϕ₀, dϕ₀))
+end
diff --git a/src/raphson.jl b/src/raphson.jl
index 24e5799fd..8297f92fe 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -1,9 +1,6 @@
 """
-```julia
-NewtonRaphson(; chunk_size = Val{0}(), autodiff = Val{true}(),
-              standardtag = Val{true}(), concrete_jac = nothing,
-              diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS)
-```
+    NewtonRaphson(; concrete_jac = nothing, linsolve = nothing,
+        precs = DEFAULT_PRECS, adkwargs...)
 
 An advanced NewtonRaphson implementation with support for efficient handling of sparse
 matrices via colored automatic differentiation and preconditioned linear solvers. Designed
@@ -11,29 +8,15 @@ for large-scale and numerically-difficult nonlinear systems.
 
 ### Keyword Arguments
 
-  - `chunk_size`: the chunk size used by the internal ForwardDiff.jl automatic differentiation
-    system. This allows for multiple derivative columns to be computed simultaneously,
-    improving performance. Defaults to `0`, which is equivalent to using ForwardDiff.jl's
-    default chunk size mechanism. For more details, see the documentation for
-    [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/).
-  - `autodiff`: whether to use forward-mode automatic differentiation for the Jacobian.
-    Note that this argument is ignored if an analytical Jacobian is passed, as that will be
-    used instead. Defaults to `Val{true}`, which means ForwardDiff.jl via
-    SparseDiffTools.jl is used by default. If `Val{false}`, then FiniteDiff.jl is used for
-    finite differencing.
-  - `standardtag`: whether to use a standardized tag definition for the purposes of automatic
-    differentiation. Defaults to true, which thus uses the `NonlinearSolveTag`. If `Val{false}`,
-    then ForwardDiff's default function naming tag is used, which results in larger stack
-    traces.
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+    ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+    `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
     then the Jacobian will not be constructed and instead direct Jacobian-vector products
     `J*v` are computed using forward-mode automatic differentiation or finite differencing
     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
     the construction of the Jacobian.
-  - `diff_type`: the type of finite differencing used if `autodiff = false`. Defaults to
-    `Val{:forward}` for forward finite differences. For more details on the choices, see the
-    [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) documentation.
   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
     LinearSolve.jl default algorithm choice. For more information on available algorithm
@@ -42,114 +25,79 @@ for large-scale and numerically-difficult nonlinear systems.
     preconditioners. For more information on specifying preconditioners for LinearSolve
     algorithms, consult the
     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-
-!!! note
-
-    Currently, the linear solver and chunk size choice only applies to in-place defined
-    `NonlinearProblem`s. That is expected to change in the future.
+  - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref),
+    which means that no line search is performed. Algorithms from `LineSearches.jl` can be
+    used here directly, and they will be converted to the correct `LineSearch`.
 """
-struct NewtonRaphson{CS, AD, FDT, L, P, ST, CJ} <:
-       AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ}
-    linsolve::L
-    precs::P
+@concrete struct NewtonRaphson{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
+    linesearch
 end
 
-function NewtonRaphson(; chunk_size = Val{0}(), autodiff = Val{true}(),
-    standardtag = Val{true}(), concrete_jac = nothing,
-    diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS)
-    NewtonRaphson{_unwrap_val(chunk_size), _unwrap_val(autodiff), diff_type,
-        typeof(linsolve), typeof(precs), _unwrap_val(standardtag),
-        _unwrap_val(concrete_jac)}(linsolve,
-        precs)
+concrete_jac(::NewtonRaphson{CJ}) where {CJ} = CJ
+
+function NewtonRaphson(; concrete_jac = nothing, linsolve = nothing,
+    linesearch = LineSearch(), precs = DEFAULT_PRECS, adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
+    linesearch = linesearch isa LineSearch ? linesearch : LineSearch(; method=linesearch)
+    return NewtonRaphson{_unwrap_val(concrete_jac)}(ad, linsolve, precs, linesearch)
 end
 
-mutable struct NewtonRaphsonCache{iip, fType, algType, uType, duType, resType, pType,
-    INType, tolType,
-    probType, ufType, L, jType, JC}
-    f::fType
-    alg::algType
-    u::uType
-    fu::resType
-    p::pType
-    uf::ufType
-    linsolve::L
-    J::jType
-    du1::duType
-    jac_config::JC
-    force_stop::Bool
+@concrete mutable struct NewtonRaphsonCache{iip}
+    f
+    alg
+    u
+    fu1
+    fu2
+    du
+    p
+    uf
+    linsolve
+    J
+    jac_cache
+    force_stop
     maxiters::Int
-    internalnorm::INType
-    retcode::SciMLBase.ReturnCode.T
-    abstol::tolType
-    prob::probType
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
     stats::NLStats
-
-    function NewtonRaphsonCache{iip}(f::fType, alg::algType, u::uType, fu::resType,
-        p::pType, uf::ufType, linsolve::L, J::jType,
-        du1::duType,
-        jac_config::JC, force_stop::Bool, maxiters::Int,
-        internalnorm::INType,
-        retcode::SciMLBase.ReturnCode.T, abstol::tolType,
-        prob::probType,
-        stats::NLStats) where {
-        iip, fType, algType, uType,
-        duType, resType, pType, INType,
-        tolType,
-        probType, ufType, L, jType, JC}
-        new{iip, fType, algType, uType, duType, resType, pType, INType, tolType,
-            probType, ufType, L, jType, JC}(f, alg, u, fu, p,
-            uf, linsolve, J, du1, jac_config,
-            force_stop, maxiters, internalnorm,
-            retcode, abstol, prob, stats)
-    end
+    lscache
 end
 
-function jacobian_caches(alg::NewtonRaphson, f, u, p, ::Val{false})
-    JacobianWrapper(f, p), nothing, nothing, nothing, nothing
-end
+isinplace(::NewtonRaphsonCache{iip}) where {iip} = iip
 
-function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson,
-    args...;
-    alias_u0 = false,
-    maxiters = 1000,
-    abstol = 1e-6,
-    internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
-    if alias_u0
-        u = prob.u0
-    else
-        u = deepcopy(prob.u0)
-    end
-    f = prob.f
-    p = prob.p
-    if iip
-        fu = zero(u)
-        f(fu, u, p)
-    else
-        fu = f(u, p)
-    end
-    uf, linsolve, J, du1, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson, args...;
+    alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
+    linsolve_kwargs=(;), kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
+    fu1 = evaluate_f(prob, u)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip);
+        linsolve_kwargs)
 
-    return NewtonRaphsonCache{iip}(f, alg, u, fu, p, uf, linsolve, J, du1, jac_config,
-        false, maxiters, internalnorm,
-        ReturnCode.Default, abstol, prob, NLStats(1, 0, 0, 0, 0))
+    return NewtonRaphsonCache{iip}(f, alg, u, fu1, fu2, du, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
+        NLStats(1, 0, 0, 0, 0), LineSearchCache(alg.linesearch, f, u, p, fu1, Val(iip)))
 end
 
 function perform_step!(cache::NewtonRaphsonCache{true})
-    @unpack u, fu, f, p, alg = cache
-    @unpack J, linsolve, du1 = cache
-    jacobian!(J, cache)
+    @unpack u, fu1, f, p, alg, J, linsolve, du = cache
+    jacobian!!(J, cache)
 
     # u = u - J \ fu
-    linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(fu), linu = _vec(du1),
-        p = p, reltol = cache.abstol)
+    linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(fu1), linu = _vec(du),
+        p, reltol = cache.abstol)
     cache.linsolve = linres.cache
-    @. u = u - du1
-    f(fu, u, p)
 
-    if cache.internalnorm(cache.fu) < cache.abstol
-        cache.force_stop = true
-    end
+    # Line Search
+    α = perform_linesearch!(cache.lscache, u, du)
+    @. u = u - α * du
+    f(cache.fu1, u, p)
+
+    cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
     cache.stats.njacs += 1
     cache.stats.nsolve += 1
@@ -158,13 +106,24 @@ function perform_step!(cache::NewtonRaphsonCache{true})
 end
 
 function perform_step!(cache::NewtonRaphsonCache{false})
-    @unpack u, fu, f, p = cache
-    J = jacobian(cache, f)
-    cache.u = u - J \ fu
-    cache.fu = f(cache.u, p)
-    if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-        cache.force_stop = true
+    @unpack u, fu1, f, p, alg, linsolve = cache
+
+    cache.J = jacobian!!(cache.J, cache)
+    # u = u - J \ fu
+    if linsolve === nothing
+        cache.du = fu1 / cache.J
+    else
+        linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1),
+            linu = _vec(cache.du), p, reltol = cache.abstol)
+        cache.linsolve = linres.cache
     end
+
+    # Line Search
+    α = perform_linesearch!(cache.lscache, u, cache.du)
+    cache.u = @. u - α * cache.du  # `u` might not support mutation
+    cache.fu1 = f(cache.u, p)
+
+    cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
     cache.stats.njacs += 1
     cache.stats.nsolve += 1
@@ -184,8 +143,8 @@ function SciMLBase.solve!(cache::NewtonRaphsonCache)
         cache.retcode = ReturnCode.Success
     end
 
-    SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-        retcode = cache.retcode, stats = cache.stats)
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+        cache.retcode, cache.stats)
 end
 
 function SciMLBase.reinit!(cache::NewtonRaphsonCache{iip}, u0 = cache.u; p = cache.p,
@@ -193,11 +152,11 @@ function SciMLBase.reinit!(cache::NewtonRaphsonCache{iip}, u0 = cache.u; p = cac
     cache.p = p
     if iip
         recursivecopy!(cache.u, u0)
-        cache.f(cache.fu, cache.u, p)
+        cache.f(cache.fu1, cache.u, p)
     else
         # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
         cache.u = u0
-        cache.fu = cache.f(cache.u, p)
+        cache.fu1 = cache.f(cache.u, p)
     end
     cache.abstol = abstol
     cache.maxiters = maxiters
diff --git a/src/trustRegion.jl b/src/trustRegion.jl
index 6e867699c..e0892a4da 100644
--- a/src/trustRegion.jl
+++ b/src/trustRegion.jl
@@ -8,13 +8,13 @@ scheme are provided below.
 
 ## Using `RadiusUpdateSchemes`
 
-`RadiusUpdateSchemes` uses the standard EnumX interface (https://github.com/fredrikekre/EnumX.jl), 
+`RadiusUpdateSchemes` uses the standard EnumX interface (https://github.com/fredrikekre/EnumX.jl),
 and hence inherits all properties of being an EnumX, including the type of each constituent enum
 states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows:
 `TrustRegion(radius_update_scheme = your desired update scheme)`. For example,
 `sol = solve(prob, alg=TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))`.
 """
-EnumX.@enumx RadiusUpdateSchemes begin
+@enumx RadiusUpdateSchemes begin
     """
     `RadiusUpdateSchemes.Simple`
 
@@ -68,19 +68,12 @@ end
 
 """
 ```julia
-TrustRegion(; chunk_size = Val{0}(), autodiff = Val{true}(),
-            standardtag = Val{true}(), concrete_jac = nothing,
-            diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS,
-            radius_update_scheme = RadiusUpdateSchemes.Simple,
-            max_trust_radius::Real = 0 // 1,
-            initial_trust_radius::Real = 0 // 1,
-            step_threshold::Real = 1 // 10,
-            shrink_threshold::Real = 1 // 4,
-            expand_threshold::Real = 3 // 4,
-            shrink_factor::Real = 1 // 4,
-            expand_factor::Real = 2 // 1,
-            max_shrink_times::Int = 32)
-```
+    TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS,
+        radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple,
+        max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1,
+        step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4,
+        expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4,
+        expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...)
 
 An advanced TrustRegion implementation with support for efficient handling of sparse
 matrices via colored automatic differentiation and preconditioned linear solvers. Designed
@@ -88,29 +81,15 @@ for large-scale and numerically-difficult nonlinear systems.
 
 ### Keyword Arguments
 
-  - `chunk_size`: the chunk size used by the internal ForwardDiff.jl automatic differentiation
-    system. This allows for multiple derivative columns to be computed simultaneously,
-    improving performance. Defaults to `0`, which is equivalent to using ForwardDiff.jl's
-    default chunk size mechanism. For more details, see the documentation for
-    [ForwardDiff.jl](https://juliadiff.org/ForwardDiff.jl/stable/).
-  - `autodiff`: whether to use forward-mode automatic differentiation for the Jacobian.
-    Note that this argument is ignored if an analytical Jacobian is passed, as that will be
-    used instead. Defaults to `Val{true}`, which means ForwardDiff.jl via
-    SparseDiffTools.jl is used by default. If `Val{false}`, then FiniteDiff.jl is used for
-    finite differencing.
-  - `standardtag`: whether to use a standardized tag definition for the purposes of automatic
-    differentiation. Defaults to true, which thus uses the `NonlinearSolveTag`. If `Val{false}`,
-    then ForwardDiff's default function naming tag is used, which results in larger stack
-    traces.
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+    ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+    `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
     then the Jacobian will not be constructed and instead direct Jacobian-vector products
     `J*v` are computed using forward-mode automatic differentiation or finite differencing
     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
     the construction of the Jacobian.
-  - `diff_type`: the type of finite differencing used if `autodiff = false`. Defaults to
-    `Val{:forward}` for forward finite differences. For more details on the choices, see the
-    [FiniteDiff.jl](https://github.com/JuliaDiff/FiniteDiff.jl) documentation.
   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
     LinearSolve.jl default algorithm choice. For more information on available algorithm
@@ -120,7 +99,7 @@ for large-scale and numerically-difficult nonlinear systems.
     algorithms, consult the
     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
   - `radius_update_scheme`: the choice of radius update scheme to be used. Defaults to `RadiusUpdateSchemes.Simple`
-    which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`, 
+    which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`,
     `RadiusUpdateSchemes.Yuan`, `RadiusUpdateSchemes.Bastin`, `RadiusUpdateSchemes.Fan`. These schemes
     have the trust region radius converging to zero that is seen to improve convergence. For more details, see the
     [Yuan, Yx](https://link.springer.com/article/10.1007/s10107-015-0893-2#Sec4).
@@ -148,18 +127,13 @@ for large-scale and numerically-difficult nonlinear systems.
     `expand_threshold < r` (with `r` defined in `shrink_threshold`). Defaults to `2.0`.
   - `max_shrink_times`: the maximum number of times to shrink the trust region radius in a
     row, `max_shrink_times` is exceeded, the algorithm returns. Defaults to `32`.
-
-!!! note
-
-    Currently, the linear solver and chunk size choice only applies to in-place defined
-    `NonlinearProblem`s. That is expected to change in the future.
 """
-struct TrustRegion{CS, AD, FDT, L, P, ST, CJ, MTR} <:
-       AbstractNewtonAlgorithm{CS, AD, FDT, ST, CJ}
-    linsolve::L
-    precs::P
+@concrete struct TrustRegion{CJ, AD, MTR} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
     radius_update_scheme::RadiusUpdateSchemes.T
-    max_trust_radius::MTR
+    max_trust_radius
     initial_trust_radius::MTR
     step_threshold::MTR
     shrink_threshold::MTR
@@ -169,68 +143,53 @@ struct TrustRegion{CS, AD, FDT, L, P, ST, CJ, MTR} <:
     max_shrink_times::Int
 end
 
-function TrustRegion(; chunk_size = Val{0}(),
-    autodiff = Val{true}(),
-    standardtag = Val{true}(), concrete_jac = nothing,
-    diff_type = Val{:forward}, linsolve = nothing, precs = DEFAULT_PRECS,
+function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS,
     radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple, #defaults to conventional radius update
-    max_trust_radius::Real = 0 // 1,
-    initial_trust_radius::Real = 0 // 1,
-    step_threshold::Real = 1 // 10,
-    shrink_threshold::Real = 1 // 4,
-    expand_threshold::Real = 3 // 4,
-    shrink_factor::Real = 1 // 4,
-    expand_factor::Real = 2 // 1,
-    max_shrink_times::Int = 32)
-    TrustRegion{_unwrap_val(chunk_size), _unwrap_val(autodiff), diff_type,
-        typeof(linsolve), typeof(precs), _unwrap_val(standardtag),
-        _unwrap_val(concrete_jac), typeof(max_trust_radius),
-    }(linsolve, precs, radius_update_scheme, max_trust_radius,
-        initial_trust_radius,
-        step_threshold,
-        shrink_threshold,
-        expand_threshold,
-        shrink_factor,
-        expand_factor,
-        max_shrink_times)
+    max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1,
+    step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4,
+    expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4,
+    expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
+    return TrustRegion{_unwrap_val(concrete_jac)}(ad, linsolve, precs, radius_update_scheme,
+        max_trust_radius, initial_trust_radius, step_threshold, shrink_threshold,
+        expand_threshold, shrink_factor, expand_factor, max_shrink_times)
 end
 
-mutable struct TrustRegionCache{iip, fType, algType, uType, resType, pType,
-    INType, tolType, probType, ufType, L, jType, JC, floatType,
-    trustType, suType, su2Type, tmpType}
-    f::fType
-    alg::algType
-    u_prev::uType
-    u::uType
-    fu_prev::resType
-    fu::resType
-    p::pType
-    uf::ufType
-    linsolve::L
-    J::jType
-    jac_config::JC
+@concrete mutable struct TrustRegionCache{iip, trustType, floatType}
+    f
+    alg
+    u_prev
+    u
+    fu_prev
+    fu
+    fu2
+    p
+    uf
+    linsolve
+    J
+    jac_cache
     force_stop::Bool
     maxiters::Int
-    internalnorm::INType
-    retcode::SciMLBase.ReturnCode.T
-    abstol::tolType
-    prob::probType
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
     radius_update_scheme::RadiusUpdateSchemes.T
     trust_r::trustType
     max_trust_r::trustType
-    step_threshold::suType
+    step_threshold
     shrink_threshold::trustType
     expand_threshold::trustType
     shrink_factor::trustType
     expand_factor::trustType
     loss::floatType
     loss_new::floatType
-    H::jType
-    g::resType
+    H
+    g
     shrink_counter::Int
-    step_size::su2Type
-    u_tmp::tmpType
-    fu_new::resType
+    step_size
+    u_tmp
+    fu_new
     make_new_J::Bool
     r::floatType
     p1::floatType
@@ -239,75 +198,19 @@ mutable struct TrustRegionCache{iip, fType, algType, uType, resType, pType,
     p4::floatType
     ϵ::floatType
     stats::NLStats
-
-    function TrustRegionCache{iip}(f::fType, alg::algType, u_prev::uType, u::uType,
-        fu_prev::resType, fu::resType, p::pType,
-        uf::ufType, linsolve::L, J::jType, jac_config::JC,
-        force_stop::Bool, maxiters::Int, internalnorm::INType,
-        retcode::SciMLBase.ReturnCode.T, abstol::tolType,
-        prob::probType,
-        radius_update_scheme::RadiusUpdateSchemes.T,
-        trust_r::trustType,
-        max_trust_r::trustType, step_threshold::suType,
-        shrink_threshold::trustType, expand_threshold::trustType,
-        shrink_factor::trustType, expand_factor::trustType,
-        loss::floatType, loss_new::floatType, H::jType,
-        g::resType, shrink_counter::Int, step_size::su2Type,
-        u_tmp::tmpType, fu_new::resType, make_new_J::Bool,
-        r::floatType, p1::floatType, p2::floatType,
-        p3::floatType, p4::floatType, ϵ::floatType,
-        stats::NLStats) where {iip, fType, algType, uType,
-        resType, pType, INType,
-        tolType, probType, ufType, L,
-        jType, JC, floatType, trustType,
-        suType, su2Type, tmpType}
-        new{iip, fType, algType, uType, resType, pType,
-            INType, tolType, probType, ufType, L, jType, JC, floatType,
-            trustType, suType, su2Type, tmpType}(f, alg, u_prev, u, fu_prev, fu, p, uf,
-            linsolve, J,
-            jac_config, force_stop,
-            maxiters, internalnorm, retcode,
-            abstol, prob, radius_update_scheme,
-            trust_r, max_trust_r,
-            step_threshold, shrink_threshold,
-            expand_threshold, shrink_factor,
-            expand_factor, loss,
-            loss_new, H, g, shrink_counter,
-            step_size, u_tmp, fu_new,
-            make_new_J, r, p1, p2, p3, p4, ϵ, stats)
-    end
-end
-
-function jacobian_caches(alg::TrustRegion, f, u, p, ::Val{false})
-    J = ArrayInterface.undefmatrix(u)
-    JacobianWrapper(f, p), nothing, J, zero(u), nothing
 end
 
-function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
-    args...;
-    alias_u0 = false,
-    maxiters = 1000,
-    abstol = 1e-8,
-    internalnorm = DEFAULT_NORM,
-    kwargs...) where {uType, iip}
-    if alias_u0
-        u = prob.u0
-    else
-        u = deepcopy(prob.u0)
-    end
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion, args...;
+    alias_u0 = false, maxiters = 1000, abstol = 1e-8, internalnorm = DEFAULT_NORM,
+    linsolve_kwargs=(;), kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
     u_prev = zero(u)
-    f = prob.f
-    p = prob.p
-    if iip
-        fu = zero(u)
-        f(fu, u, p)
-    else
-        fu = f(u, p)
-    end
-    fu_prev = zero(fu)
+    fu1 = evaluate_f(prob, u)
+    fu_prev = zero(fu1)
 
-    loss = get_loss(fu)
-    uf, linsolve, J, u_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
+    loss = get_loss(fu1)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); linsolve_kwargs)
 
     radius_update_scheme = alg.radius_update_scheme
     max_trust_radius = convert(eltype(u), alg.max_trust_radius)
@@ -319,18 +222,18 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
     expand_factor = convert(eltype(u), alg.expand_factor)
     # Set default trust region radius if not specified
     if iszero(max_trust_radius)
-        max_trust_radius = convert(eltype(u), max(norm(fu), maximum(u) - minimum(u)))
+        max_trust_radius = convert(eltype(u), max(norm(fu1), maximum(u) - minimum(u)))
     end
     if iszero(initial_trust_radius)
         initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
     end
 
     loss_new = loss
-    H = ArrayInterface.undefmatrix(u)
-    g = zero(fu)
+    H = zero(J)
+    g = _mutable_zero(fu1)
     shrink_counter = 0
     step_size = zero(u)
-    fu_new = zero(fu)
+    fu_new = zero(fu1)
     make_new_J = true
     r = loss
 
@@ -358,12 +261,12 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
         p3 = convert(eltype(u), 6.0) # c6
         p4 = convert(eltype(u), 0.0)
         if iip
-            auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
+            auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu1)
         else
             if isa(u, Number)
                 g = ForwardDiff.derivative(x -> f(x, p), u)
             else
-                g = auto_jacvec(x -> f(x, p), u, fu)
+                g = auto_jacvec(x -> f(x, p), u, fu1)
             end
         end
         initial_trust_radius = convert(eltype(u), p1 * norm(g))
@@ -375,7 +278,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
         p2 = convert(eltype(u), 1 / 4) # c5
         p3 = convert(eltype(u), 12) # c6
         p4 = convert(eltype(u), 1.0e18) # M
-        initial_trust_radius = convert(eltype(u), p1 * (norm(fu)^0.99))
+        initial_trust_radius = convert(eltype(u), p1 * (norm(fu1)^0.99))
     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
         step_threshold = convert(eltype(u), 0.05)
         shrink_threshold = convert(eltype(u), 0.05)
@@ -387,29 +290,27 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
         initial_trust_radius = convert(eltype(u), 1.0)
     end
 
-    return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu, p, uf, linsolve, J,
-        jac_config,
-        false, maxiters, internalnorm,
-        ReturnCode.Default, abstol, prob, radius_update_scheme,
-        initial_trust_radius,
-        max_trust_radius, step_threshold, shrink_threshold,
-        expand_threshold, shrink_factor, expand_factor, loss,
-        loss_new, H, g, shrink_counter, step_size, u_tmp, fu_new,
-        make_new_J, r, p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0))
+    return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu1, fu2, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
+        radius_update_scheme, initial_trust_radius, max_trust_radius, step_threshold,
+        shrink_threshold, expand_threshold, shrink_factor, expand_factor, loss, loss_new,
+        H, g, shrink_counter, step_size, du, fu_new, make_new_J, r, p1, p2, p3, p4, ϵ,
+        NLStats(1, 0, 0, 0, 0))
 end
 
+isinplace(::TrustRegionCache{iip}) where {iip} = iip
+
 function perform_step!(cache::TrustRegionCache{true})
     @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
     if cache.make_new_J
-        jacobian!(J, cache)
+        jacobian!!(J, cache)
         mul!(cache.H, J, J)
         mul!(cache.g, J, fu)
         cache.stats.njacs += 1
     end
 
-    linres = dolinsolve(alg.precs, linsolve, A = cache.H, b = _vec(cache.g),
-        linu = _vec(u_tmp),
-        p = p, reltol = cache.abstol)
+    linres = dolinsolve(alg.precs, linsolve; A = cache.H, b = _vec(cache.g),
+        linu = _vec(u_tmp), p, reltol = cache.abstol)
     cache.linsolve = linres.cache
     cache.u_tmp .= -1 .* u_tmp
     dogleg!(cache)
@@ -428,7 +329,7 @@ function perform_step!(cache::TrustRegionCache{false})
     @unpack make_new_J, fu, f, u, p = cache
 
     if make_new_J
-        J = jacobian(cache, f)
+        J = jacobian!!(cache.J, cache)
         cache.H = J * J
         cache.g = J * fu
         cache.stats.njacs += 1
@@ -449,23 +350,16 @@ function perform_step!(cache::TrustRegionCache{false})
     return nothing
 end
 
-function retrospective_step!(cache::TrustRegionCache{true})
+function retrospective_step!(cache::TrustRegionCache)
     @unpack J, fu_prev, fu, u_prev, u = cache
-    jacobian!(J, cache)
-    mul!(cache.H, J, J)
-    mul!(cache.g, J, fu)
-    cache.stats.njacs += 1
-    @unpack H, g, step_size = cache
-
-    return -(get_loss(fu_prev) - get_loss(fu)) /
-           (step_size' * g + step_size' * H * step_size / 2)
-end
-
-function retrospective_step!(cache::TrustRegionCache{false})
-    @unpack J, fu_prev, fu, u_prev, u, f = cache
-    J = jacobian(cache, f)
-    cache.H = J * J
-    cache.g = J * fu
+    J = jacobian!!(deepcopy(J), cache)
+    if J isa Number
+        cache.H = J * J
+        cache.g = J * fu
+    else
+        mul!(cache.H, J, J)
+        mul!(cache.g, J, fu)
+    end
     cache.stats.njacs += 1
     @unpack H, g, step_size = cache
 
@@ -642,20 +536,20 @@ function take_step!(cache::TrustRegionCache{false})
 end
 
 function jvp!(cache::TrustRegionCache{false})
-    @unpack f, u, fu, p = cache
+    @unpack f, u, fu, uf = cache
     if isa(u, Number)
-        return value_derivative(x -> f(x, p), u)
+        return value_derivative(uf, u)
     end
-    return auto_jacvec(x -> f(x, p), u, fu)
+    return auto_jacvec(uf, u, fu)
 end
 
 function jvp!(cache::TrustRegionCache{true})
-    @unpack g, f, u, fu, p = cache
+    @unpack g, f, u, fu, uf = cache
     if isa(u, Number)
-        return value_derivative(x -> f(x, p), u)
+        return value_derivative(uf, u)
     end
-    auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-    g
+    auto_jacvec!(g, uf, u, fu)
+    return g
 end
 
 function SciMLBase.solve!(cache::TrustRegionCache)
@@ -671,8 +565,8 @@ function SciMLBase.solve!(cache::TrustRegionCache)
         cache.retcode = ReturnCode.Success
     end
 
-    SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-        retcode = cache.retcode, stats = cache.stats)
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu; cache.retcode,
+        cache.stats)
 end
 
 function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
diff --git a/src/utils.jl b/src/utils.jl
index 9d72e230f..7498d5afa 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -2,25 +2,41 @@
 @inline UNITLESS_ABS2(x) = real(abs2(x))
 @inline DEFAULT_NORM(u::Union{AbstractFloat, Complex}) = @fastmath abs(u)
 @inline function DEFAULT_NORM(u::Array{T}) where {T <: Union{AbstractFloat, Complex}}
-    sqrt(real(sum(abs2, u)) / length(u))
+    return sqrt(real(sum(abs2, u)) / length(u))
 end
-@inline function DEFAULT_NORM(u::StaticArraysCore.StaticArray{
-    T,
-}) where {
-    T <: Union{
-        AbstractFloat,
-        Complex}}
-    sqrt(real(sum(abs2, u)) / length(u))
+@inline function DEFAULT_NORM(u::StaticArray{<:Union{AbstractFloat, Complex}})
+    return sqrt(real(sum(abs2, u)) / length(u))
 end
-@inline function DEFAULT_NORM(u::RecursiveArrayTools.AbstractVectorOfArray)
-    sum(sqrt(real(sum(UNITLESS_ABS2, _u)) / length(_u)) for _u in u.u)
+@inline function DEFAULT_NORM(u::AbstractVectorOfArray)
+    return sum(sqrt(real(sum(UNITLESS_ABS2, _u)) / length(_u)) for _u in u.u)
 end
 @inline DEFAULT_NORM(u::AbstractArray) = sqrt(real(sum(UNITLESS_ABS2, u)) / length(u))
 @inline DEFAULT_NORM(u) = norm(u)
 
-alg_autodiff(alg::AbstractNewtonAlgorithm{CS, AD}) where {CS, AD} = AD
+alg_autodiff(alg::AbstractNewtonAlgorithm{<:AbstractFiniteDifferencesMode}) = false
+alg_autodiff(alg::AbstractNewtonAlgorithm) = true
 alg_autodiff(alg) = false
 
+"""
+    default_adargs_to_adtype(; chunk_size = Val{0}(), autodiff = Val{true}(),
+        standardtag = Val{true}(), diff_type = Val{:forward})
+
+Construct the AD type from the arguments. This is mostly needed for compatibility with older
+code.
+"""
+function default_adargs_to_adtype(; chunk_size = Val{0}(), autodiff = Val{true}(),
+    standardtag = Val{true}(), diff_type = Val{:forward}())
+    ad = _unwrap_val(autodiff)
+    # Old API
+    if ad isa Bool
+        # FIXME: standardtag is not the Tag
+        ad && return AutoForwardDiff(; chunksize = _unwrap_val(chunk_size),
+            tag = _unwrap_val(standardtag))
+        return AutoFiniteDiff(; fdtype = diff_type)
+    end
+    return ad
+end
+
 """
 value_derivative(f, x)
 
@@ -33,57 +49,17 @@ function value_derivative(f::F, x::R) where {F, R}
 end
 
 # Todo: improve this dispatch
-function value_derivative(f::F, x::StaticArraysCore.SVector) where {F}
+function value_derivative(f::F, x::SVector) where {F}
     f(x), ForwardDiff.jacobian(f, x)
 end
 
-value(x) = x
-value(x::Dual) = ForwardDiff.value(x)
-value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x)
-
-_vec(v) = vec(v)
-_vec(v::Number) = v
-_vec(v::AbstractVector) = v
-
-function alg_difftype(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    FDT
-end
+@inline value(x) = x
+@inline value(x::Dual) = ForwardDiff.value(x)
+@inline value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x)
 
-function concrete_jac(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    CJ
-end
-
-function get_chunksize(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    Val(CS)
-end
-
-function standardtag(alg::AbstractNewtonAlgorithm{
-    CS,
-    AD,
-    FDT,
-    ST,
-    CJ,
-}) where {CS, AD, FDT, ST, CJ}
-    ST
-end
+@inline _vec(v) = vec(v)
+@inline _vec(v::Number) = v
+@inline _vec(v::AbstractVector) = v
 
 DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing
 
@@ -94,10 +70,8 @@ function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing
     b !== nothing && (linsolve.b = b)
     linu !== nothing && (linsolve.u = linu)
 
-    Plprev = linsolve.Pl isa LinearSolve.ComposePreconditioner ? linsolve.Pl.outer :
-             linsolve.Pl
-    Prprev = linsolve.Pr isa LinearSolve.ComposePreconditioner ? linsolve.Pr.outer :
-             linsolve.Pr
+    Plprev = linsolve.Pl isa ComposePreconditioner ? linsolve.Pl.outer : linsolve.Pl
+    Prprev = linsolve.Pr isa ComposePreconditioner ? linsolve.Pr.outer : linsolve.Pr
 
     _Pl, _Pr = precs(linsolve.A, du, u, p, nothing, A !== nothing, Plprev, Prprev,
         cachedata)
@@ -110,29 +84,25 @@ function dolinsolve(precs::P, linsolve; A = nothing, linu = nothing, b = nothing
         linsolve.Pr = Pr
     end
 
-    linres = if reltol === nothing
-        solve!(linsolve)
-    else
-        solve!(linsolve; reltol)
-    end
+    linres = reltol === nothing ? solve!(linsolve) : solve!(linsolve; reltol)
 
     return linres
 end
 
 function wrapprecs(_Pl, _Pr, weight)
     if _Pl !== nothing
-        Pl = LinearSolve.ComposePreconditioner(LinearSolve.InvPreconditioner(Diagonal(_vec(weight))),
-            _Pl)
+        Pl = ComposePreconditioner(InvPreconditioner(Diagonal(_vec(weight))), _Pl)
     else
-        Pl = LinearSolve.InvPreconditioner(Diagonal(_vec(weight)))
+        Pl = InvPreconditioner(Diagonal(_vec(weight)))
     end
 
     if _Pr !== nothing
-        Pr = LinearSolve.ComposePreconditioner(Diagonal(_vec(weight)), _Pr)
+        Pr = ComposePreconditioner(Diagonal(_vec(weight)), _Pr)
     else
         Pr = Diagonal(_vec(weight))
     end
-    Pl, Pr
+
+    return Pl, Pr
 end
 
 function _nfcount(N, ::Type{diff_type}) where {diff_type}
@@ -143,17 +113,55 @@ function _nfcount(N, ::Type{diff_type}) where {diff_type}
     else
         tmp = 2N
     end
-    tmp
+    return tmp
 end
 
-function get_loss(fu)
-    return norm(fu)^2 / 2
-end
+get_loss(fu) = norm(fu)^2 / 2
 
 function rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} # R-function for adaptive trust region method
-    if (r >= c2)
+    if (r ≥ c2)
         return (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / π
     else
         return (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β))
     end
 end
+
+concrete_jac(_) = nothing
+concrete_jac(::AbstractNewtonAlgorithm{CJ}) where {CJ} = CJ
+
+# Circumventing https://github.com/SciML/RecursiveArrayTools.jl/issues/277
+_iszero(x) = iszero(x)
+_iszero(x::ArrayPartition) = all(_iszero, x.x)
+
+_mutable_zero(x) = zero(x)
+_mutable_zero(x::SArray) = MArray(x)
+
+_mutable(x) = x
+_mutable(x::SArray) = MArray(x)
+_maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x)
+# The shadow allocated for Enzyme needs to be mutable
+_maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x)
+_maybe_mutable(x, _) = x
+
+# Helper function to get value of `f(u, p)`
+function evaluate_f(prob::NonlinearProblem{uType, iip}, u) where {uType, iip}
+    @unpack f, u0, p = prob
+    if iip
+        fu = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
+        f(fu, u, p)
+    else
+        fu = _mutable(f(u, p))
+    end
+    return fu
+end
+
+evaluate_f(cache, u; fu = nothing) = evaluate_f(cache.f, u, cache.p, Val(cache.iip); fu)
+
+function evaluate_f(f, u, p, ::Val{iip}; fu = nothing) where {iip}
+    if iip
+        f(fu, u, p)
+        return fu
+    else
+        return f(u, p)
+    end
+end
diff --git a/test/23_test_cases.jl b/test/23_test_cases.jl
deleted file mode 100644
index 3cb0eb310..000000000
--- a/test/23_test_cases.jl
+++ /dev/null
@@ -1,510 +0,0 @@
-using NonlinearSolve, NLsolve, LinearAlgebra
-
-# Implementation of the 23 test problems in
-# [test_nonlin](https://people.sc.fsu.edu/~jburkardt/m_src/test_nonlin/test_nonlin.html)
-
-# ------------------------------------- Problem 1 ------------------------------------------
-function p1_f!(out, x, p = nothing)
-    n = length(x)
-    out[1] = 1.0 - x[1]
-    out[2:n] .= 10.0 .* (x[2:n] .- x[1:(n - 1)] .* x[1:(n - 1)])
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = ones(n)
-x_start[1] = -1.2
-p1_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Generalized Rosenbrock function")
-
-# ------------------------------------- Problem 2 ------------------------------------------
-function p2_f!(out, x, p = nothing)
-    out[1] = x[1] + 10.0 * x[2]
-    out[2] = sqrt(5.0) * (x[3] - x[4])
-    out[3] = (x[2] - 2.0 * x[3])^2
-    out[4] = sqrt(10.0) * (x[1] - x[4]) * (x[1] - x[4])
-    nothing
-end
-
-n = 4
-x_sol = zeros(n)
-x_start = [3.0, -1.0, 0.0, 1.0]
-p2_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Powell singular function")
-
-# ------------------------------------- Problem 3 ------------------------------------------
-function p3_f!(out, x, p = nothing)
-    out[1] = 10000.0 * x[1] * x[2] - 1.0
-    out[2] = exp(-x[1]) + exp(-x[2]) - 1.0001
-    nothing
-end
-
-n = 2
-x_sol = [1.098159e-05, 9.106146]
-x_start = [0.0, 1.0]
-p3_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Powell badly scaled function")
-
-# ------------------------------------- Problem 4 ------------------------------------------
-function p4_f!(out, x, p = nothing)
-    temp1 = x[2] - x[1] * x[1]
-    temp2 = x[4] - x[3] * x[3]
-
-    out[1] = -200.0 * x[1] * temp1 - (1.0 - x[1])
-    out[2] = 200.0 * temp1 + 20.2 * (x[2] - 1.0) + 19.8 * (x[4] - 1.0)
-    out[3] = -180.0 * x[3] * temp2 - (1.0 - x[3])
-    out[4] = 180.0 * temp2 + 20.2 * (x[4] - 1.0) + 19.8 * (x[2] - 1.0)
-    nothing
-end
-
-n = 4
-x_sol = ones(n)
-x_start = [-3.0, -1.0, -3.0, -1.0]
-p4_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Wood function")
-
-# ------------------------------------- Problem 5 ------------------------------------------
-function p5_f!(out, x, p = nothing)
-    if 0.0 < x[1]
-        temp = atan(x[2] / x[1]) / (2.0 * pi)
-    elseif x[1] < 0.0
-        temp = atan(x[2] / x[1]) / (2.0 * pi) + 0.5
-    else
-        temp = 0.25 * sign(x[2])
-    end
-
-    out[1] = 10.0 * (x[3] - 10.0 * temp)
-    out[2] = 10.0 * (sqrt(x[1] * x[1] + x[2] * x[2]) - 1.0)
-    out[3] = x[3]
-    nothing
-end
-
-n = 3
-x_sol = [1.0, 0.0, 0.0]
-x_start = [-1.0, 0.0, 0.0]
-p5_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Helical valley function")
-
-# ------------------------------------- Problem 6 ------------------------------------------
-function p6_f!(out, x, p = nothing)
-    n = length(x)
-    for i in 1:29
-        ti = i / 29.0
-        sum1 = 0.0
-        temp = 1.0
-        for j in 2:n
-            sum1 = sum1 + j * temp * x[j]
-            temp = ti * temp
-        end
-
-        sum2 = 0.0
-        temp = 1.0
-        for j in 1:n
-            sum2 = sum2 + temp * x[j]
-            temp = ti * temp
-        end
-        temp = 1.0 / ti
-
-        for k in 1:n
-            out[k] = out[k] + temp * (sum1 - sum2 * sum2 - 1.0) * (k - 2.0 * ti * sum2)
-            temp = ti * temp
-        end
-    end
-
-    out[1] = out[1] + 3.0 * x[1] - 2.0 * x[1] * x[1] + 2.0 * x[1]^3
-    out[2] = out[2] + x[2] - x[2]^2 - 1.0
-    nothing
-end
-
-n = 2
-x_sol = []
-x_start = zeros(n)
-p6_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Watson function")
-
-# ------------------------------------- Problem 7 ------------------------------------------
-function p7_f!(out, x, p = nothing)
-    n = length(x)
-    out .= 0.0
-    for j in 1:n
-        t1 = 1.0
-        t2 = x[j]
-        for i in 1:n
-            out[i] += t2
-            t3 = 2.0 * x[j] * t2 - t1
-            t1 = t2
-            t2 = t3
-        end
-    end
-    out ./= n
-
-    for i in 1:n
-        ip1 = i
-        if ip1 % 2 == 0
-            out[i] = out[i] + 1.0 / (ip1 * ip1 - 1)
-        end
-    end
-    nothing
-end
-
-n = 2
-x_sol = [0.2113248654051871, 0.7886751345948129]
-x_sol .= 2.0 .* x_sol .- 1.0
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = (2 * i - n) / (n + 1)
-end
-p7_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Chebyquad function")
-
-# ------------------------------------- Problem 8 ------------------------------------------
-function p8_f!(out, x, p = nothing)
-    n = length(x)
-    out[1:(n - 1)] .= x[1:(n - 1)] .+ sum(x) .- (n + 1)
-    out[n] = prod(x) - 1.0
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = ones(n) ./ 2
-p8_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Brown almost linear function")
-
-# ------------------------------------- Problem 9 ------------------------------------------
-function p9_f!(out, x, p = nothing)
-    n = length(x)
-    h = 1.0 / (n + 1)
-    for k in 1:n
-        out[k] = 2.0 * x[k] + 0.5 * h^2 * (x[k] + k * h + 1.0)^3
-        if 1 < k
-            out[k] = out[k] - x[k - 1]
-        end
-        if k < n
-            out[k] = out[k] - x[k + 1]
-        end
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n)
-for i in 1:n
-    x_start[i] = (i * (i - n - 1)) / (n + 1)^2
-end
-p9_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Discrete boundary value function")
-
-# ------------------------------------- Problem 10 -----------------------------------------
-function p10_f!(out, x, p = nothing)
-    n = length(x)
-    h = 1.0 / (n + 1)
-    for k in 1:n
-        tk = k / (n + 1)
-        sum1 = 0.0
-        for j in 1:k
-            tj = j * h
-            sum1 = sum1 + tj * (x[j] + tj + 1.0)^3
-        end
-        sum2 = 0.0
-        for j in k:n
-            tj = j * h
-            sum2 = sum2 + (1.0 - tj) * (x[j] + tj + 1.0)^3
-        end
-
-        out[k] = x[k] + h * ((1.0 - tk) * sum1 + tk * sum2) / 2.0
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = (i * (i - n - 1)) / (n + 1)^2
-end
-p10_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Discrete integral equation function")
-
-# ------------------------------------- Problem 11 -----------------------------------------
-function p11_f!(out, x, p = nothing)
-    n = length(x)
-    c_sum = sum(cos.(x))
-    for k in 1:n
-        out[k] = n - c_sum + k * (1.0 - cos(x[k])) - sin(x[k])
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) / n
-p11_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Trigonometric function")
-
-# ------------------------------------- Problem 12 -----------------------------------------
-function p12_f!(out, x, p = nothing)
-    n = length(x)
-    sum1 = 0.0
-    for j in 1:n
-        sum1 += j * (x[j] - 1.0)
-    end
-    for k in 1:n
-        out[k] = x[k] - 1.0 + k * sum1 * (1.0 + 2.0 * sum1 * sum1)
-    end
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = 1.0 - i / n
-end
-p12_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Variably dimensioned function")
-
-# ------------------------------------- Problem 13 -----------------------------------------
-function p13_f!(out, x, p = nothing)
-    n = length(x)
-    for k in 1:n
-        out[k] = (3.0 - 2.0 * x[k]) * x[k] + 1.0
-        if 1 < k
-            out[k] -= x[k - 1]
-        end
-        if k < n
-            out[k] -= 2.0 * x[k + 1]
-        end
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) .* (-1.0)
-p13_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Broyden tridiagonal function")
-
-# ------------------------------------- Problem 14 -----------------------------------------
-function p14_f!(out, x, p = nothing)
-    n = length(x)
-    ml = 5
-    mu = 1
-    for k in 1:n
-        k1 = max(1, k - ml)
-        k2 = min(n, k + mu)
-
-        temp = 0.0
-        for j in k1:k2
-            if j != k
-                temp += x[j] * (1.0 + x[j])
-            end
-        end
-        out[k] = x[k] * (2.0 + 5.0 * x[k] * x[k]) + 1.0 - temp
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) .* (-1.0)
-p14_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Broyden banded function")
-
-# ------------------------------------- Problem 15 -----------------------------------------
-function p15_f!(out, x, p = nothing)
-    out[1] = (x[1] * x[1] + x[2] * x[3]) - 0.0001
-    out[2] = (x[1] * x[2] + x[2] * x[4]) - 1.0
-    out[3] = (x[3] * x[1] + x[4] * x[3]) - 0.0
-    out[4] = (x[3] * x[2] + x[4] * x[4]) - 0.0001
-    nothing
-end
-
-n = 4
-x_sol = [0.01, 50.0, 0.0, 0.01]
-x_start = [1.0, 0.0, 0.0, 1.0]
-p15_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Hammarling 2 by 2 matrix square root problem")
-
-# ------------------------------------- Problem 16 -----------------------------------------
-function p16_f!(out, x, p = nothing)
-    out[1] = (x[1] * x[1] + x[2] * x[4] + x[3] * x[7]) - 0.0001
-    out[2] = (x[1] * x[2] + x[2] * x[5] + x[3] * x[8]) - 1.0
-    out[3] = x[1] * x[3] + x[2] * x[6] + x[3] * x[9]
-    out[4] = x[4] * x[1] + x[5] * x[4] + x[6] * x[7]
-    out[5] = (x[4] * x[2] + x[5] * x[5] + x[6] * x[8]) - 0.0001
-    out[6] = x[4] * x[3] + x[5] * x[6] + x[6] * x[9]
-    out[7] = x[7] * x[1] + x[8] * x[4] + x[9] * x[7]
-    out[8] = x[7] * x[2] + x[8] * x[5] + x[9] * x[8]
-    out[9] = (x[7] * x[3] + x[8] * x[6] + x[9] * x[9]) - 0.0001
-    nothing
-end
-
-n = 9
-x_sol = [0.01, 50.0, 0.0, 0.0, 0.01, 0.0, 0.0, 0.0, 0.01]
-x_start = [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]
-p16_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Hammarling 3 by 3 matrix square root problem")
-
-# ------------------------------------- Problem 17 -----------------------------------------
-function p17_f!(out, x, p = nothing)
-    out[1] = x[1] + x[2] - 3.0
-    out[2] = x[1]^2 + x[2]^2 - 9.0
-    nothing
-end
-
-n = 2
-x_sol = [0.0, 3.0]
-x_start = [1.0, 5.0]
-p17_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Dennis and Schnabel 2 by 2 example")
-
-# ------------------------------------- Problem 18 -----------------------------------------
-function p18_f!(out, x, p = nothing)
-    if x[1] != 0.0
-        out[1] = x[2]^2 * (1.0 - exp(-x[1] * x[1])) / x[1]
-    else
-        out[1] = 0.0
-    end
-    if x[2] != 0.0
-        out[2] = x[1] * (1.0 - exp(-x[2] * x[2])) / x[2]
-    else
-        out[2] = 0.0
-    end
-    nothing
-end
-
-n = 2
-x_sol = zeros(n)
-x_start = [2.0, 2.0]
-p18_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Sample problem 18")
-
-# ------------------------------------- Problem 19 -----------------------------------------
-function p19_f!(out, x, p = nothing)
-    out[1] = x[1] * (x[1]^2 + x[2]^2)
-    out[2] = x[2] * (x[1]^2 + x[2]^2)
-    nothing
-end
-
-n = 2
-x_sol = zeros(n)
-x_start = [3.0, 3.0]
-p19_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Sample problem 19")
-
-# ------------------------------------- Problem 20 -----------------------------------------
-function p20_f!(out, x, p = nothing)
-    out[1] = x[1] * (x[1] - 5.0)^2
-    nothing
-end
-
-n = 1
-x_sol = [5.0] # OR [0.0]...
-x_start = [1.0]
-p20_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Scalar problem f(x) = x(x - 5)^2")
-
-# ------------------------------------- Problem 21 -----------------------------------------
-function p21_f!(out, x, p = nothing)
-    out[1] = x[1] - x[2]^3 + 5.0 * x[2]^2 - 2.0 * x[2] - 13.0
-    out[2] = x[1] + x[2]^3 + x[2]^2 - 14.0 * x[2] - 29.0
-    nothing
-end
-
-n = 2
-x_sol = [5.0, 4.0]
-x_start = [0.5, -2.0]
-p21_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Freudenstein-Roth function")
-
-# ------------------------------------- Problem 22 -----------------------------------------
-function p22_f!(out, x, p = nothing)
-    out[1] = x[1] * x[1] - x[2] + 1.0
-    out[2] = x[1] - cos(0.5 * pi * x[2])
-    nothing
-end
-
-n = 2
-x_sol = [0.0, 1.0]
-x_start = [1.0, 0.0]
-p22_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Boggs function")
-
-# ------------------------------------- Problem 23 -----------------------------------------
-function p23_f!(out, x, p = nothing)
-    c = 0.9
-    out[1:n] = x[1:n]
-    μ = zeros(n)
-    for i in 1:n
-        μ[i] = (2 * i) / (2 * n)
-    end
-    for i in 1:n
-        s = 0.0
-        for j in 1:n
-            s = s + (μ[i] * x[j]) / (μ[i] + μ[j])
-        end
-        term = 1.0 - c * s / (2 * n)
-        out[i] -= 1.0 / term
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n)
-p23_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Chandrasekhar function")
-
-# ----------------------------------- Solve problems ---------------------------------------
-problems = (p1_f!, p2_f!, p3_f!, p4_f!, p5_f!, p6_f!, p7_f!, p8_f!, p9_f!, p10_f!, p11_f!,
-    p12_f!, p13_f!, p14_f!, p15_f!, p16_f!, p17_f!, p18_f!, p19_f!, p20_f!, p21_f!,
-    p22_f!, p23_f!)
-dicts = (p1_dict, p2_dict, p3_dict, p4_dict, p5_dict, p6_dict, p7_dict, p8_dict, p9_dict,
-    p10_dict, p11_dict, p12_dict, p13_dict, p14_dict, p15_dict, p16_dict, p17_dict,
-    p18_dict, p19_dict, p20_dict, p21_dict, p22_dict, p23_dict)
-algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
-names = ("NewtonRaphson", "TrustRegion", "LevenbergMarquardt")
-
-for (problem, dict) in zip(problems, dicts)
-    for (alg, name) in zip(algs, names)
-        local x = dict["start"]
-        local nlprob = NonlinearProblem(problem, x)
-        local out = similar(x)
-        try
-            problem(out,
-                solve(nlprob, alg, abstol = 1e-15, reltol = 1e-15).u, nothing)
-            dict["error_" * name] = ""
-        catch
-            # println("Error in $name")
-            dict["error_" * name] = "(Singular error)"
-        end
-        dict["out_" * name] = out
-    end
-    local x = dict["start"]
-    local nlprob = NonlinearProblem(problem, x)
-    sol = nlsolve(problem, x, xtol = 1e-15, ftol = 1e-15)
-    dict["norm_nlsolve"] = sol.residual_norm
-end
-
-# ----------------------------------- Print results ----------------------------------------
-i_str = i_str = rpad("nr", 3, " ")
-title_str = rpad("Problem", 50, " ")
-n_str = rpad("n", 5, " ")
-norm_str = rpad(names[1], 20, " ") * rpad(names[2], 20, " ") * rpad(names[3], 20, " ") *
-           rpad("nlsolve", 20, " ")
-println("$i_str $title_str $n_str $norm_str")
-
-for (i, dict) in enumerate(dicts)
-    local i_str = rpad(string(i), 3, " ")
-    local title_str = rpad(dict["title"], 50, " ")
-    local n_str = rpad(string(dict["n"]), 5, " ")
-    local norm_str = ""
-    for (alg, name) in zip(algs, names)
-        norm_str *= rpad(string(trunc(norm(dict["out_" * name]); sigdigits = 5)), 20, " ")
-    end
-    norm_str *= rpad(string(round(dict["norm_nlsolve"]; sigdigits = 5)), 20, " ")
-    println("$i_str $title_str $n_str $norm_str")
-end
diff --git a/test/basictests.jl b/test/basictests.jl
index 05a0152fa..54e63e93d 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -1,774 +1,404 @@
-using NonlinearSolve
-using StaticArrays
-using BenchmarkTools
-using LinearSolve
-using Random
-using LinearAlgebra
-using Test
+using BenchmarkTools, LinearSolve, NonlinearSolve, StaticArrays, Random, LinearAlgebra,
+    Test, ForwardDiff, Zygote, Enzyme, SparseDiffTools
 
-# --- NewtonRaphson tests ---
-
-function benchmark_immutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-    sol = solve!(solver)
-end
+_nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
 
-function benchmark_mutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, NewtonRaphson(), abstol = 1e-9))
-end
+quadratic_f(u, p) = u .* u .- p
+quadratic_f!(du, u, p) = (du .= u .* u .- p)
+quadratic_f2(u, p) = @. p[1] * u * u - p[2]
 
-function ff(u, p)
-    u .* u .- 2
+function newton_fails(u, p)
+    return 0.010000000000000002 .+
+           10.000000000000002 ./ (1 .+
+            (0.21640425613334457 .+
+             216.40425613334457 ./ (1 .+
+              (0.21640425613334457 .+
+               216.40425613334457 ./
+               (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+           0.0011552453009332421u .- p
 end
-const cu0 = @SVector[1.0, 1.0]
-function sf(u, p)
-    u * u - 2
-end
-const csu0 = 1.0
-u0 = [1.0, 1.0]
-
-sol = benchmark_immutable(ff, cu0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_mutable(ff, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_scalar(sf, csu0)
-@test sol.retcode === ReturnCode.Success
-@test abs(sol.u * sol.u - 2) < 1e-9
-
-# @test (@ballocated benchmark_immutable(ff, cu0)) < 200
-# @test (@ballocated benchmark_mutable(ff, cu0)) < 200
-# @test (@ballocated benchmark_scalar(sf, csu0)) < 400
-
-function benchmark_inplace(f, u0, linsolve, precs)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-precs = [
-    NonlinearSolve.DEFAULT_PRECS,
-    (args...) -> (Diagonal(rand!(similar(u0))), nothing)
-]
 
-for prec in precs, linsolve in (nothing, KrylovJL_GMRES())
-    sol = benchmark_inplace(ffiip, u0, linsolve, prec)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-end
-
-u0 = [1.0, 1.0]
-probN = NonlinearProblem{true}(ffiip, u0)
-solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-@test (@ballocated solve!(solver)) <= 64
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+# --- NewtonRaphson tests ---
 
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, NewtonRaphson(), abstol = 1e-9)
-    return sol.u[end]
-end
+@testset "NewtonRaphson" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0; linesearch = LineSearch())
+        prob = NonlinearProblem{false}(f, u0, p)
+        return solve(prob, NewtonRaphson(; linesearch), abstol = 1e-9)
+    end
 
-for p in 1.0:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; linsolve, precs,
+        linesearch = LineSearch())
+        prob = NonlinearProblem{true}(f, u0, p)
+        return solve(prob, NewtonRaphson(; linsolve, precs, linesearch), abstol = 1e-9)
+    end
 
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
+    @testset "LineSearch: $(_nameof(lsmethod)) LineSearch AD: $(_nameof(ad))" for lsmethod in (Static(),
+            StrongWolfe(), BackTracking(), HagerZhang(), MoreThuente()),
+        ad in (AutoFiniteDiff(), AutoZygote())
+
+        linesearch = LineSearch(; method = lsmethod, autodiff = ad)
+        u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
+
+        @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s
+            sol = benchmark_nlsolve_oop(quadratic_f, u0; linesearch)
+            @test SciMLBase.successful_retcode(sol)
+            @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+            cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), NewtonRaphson(),
+                abstol = 1e-9)
+            @test (@ballocated solve!($cache)) < 200
+        end
+
+        precs = [NonlinearSolve.DEFAULT_PRECS, :Random]
+
+        @testset "[IIP] u0: $(typeof(u0)) precs: $(_nameof(prec)) linsolve: $(_nameof(linsolve))" for u0 in ([
+                1.0, 1.0],), prec in precs, linsolve in (nothing, KrylovJL_GMRES())
+            ad isa AutoZygote && continue
+            if prec === :Random
+                prec = (args...) -> (Diagonal(randn!(similar(u0))), nothing)
+            end
+            sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec,
+                linesearch)
+            @test SciMLBase.successful_retcode(sol)
+            @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+            cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
+                NewtonRaphson(; linsolve, precs = prec), abstol = 1e-9)
+            @test (@ballocated solve!($cache)) ≤ 64
+        end
+    end
 
-# NewtonRaphson
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, NewtonRaphson(), abstol = 1e-10)
-    return sol.u
-end
+    if VERSION ≥ v"1.9"
+        @testset "[OOP] [Immutable AD]" begin
+            for p in 1.0:0.1:100.0
+                @test begin
+                    res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+                    res_true = sqrt(p)
+                    all(res.u .≈ res_true)
+                end
+                @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
+            end
+        end
+    end
 
-@test ForwardDiff.derivative(g, 1.0) ≈ 0.5
+    @testset "[OOP] [Scalar AD]" begin
+        for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+                res_true = sqrt(p)
+                res.u ≈ res_true
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u,
+                p) ≈
+                  1 / (2 * sqrt(p))
+        end
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    if VERSION ≥ v"1.9"
+        t = (p) -> [sqrt(p[2] / p[1])]
+        p = [0.9, 50.0]
+        @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+        @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u],
+            p) ≈
+              ForwardDiff.jacobian(t, p)
+    end
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, NewtonRaphson())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Iterator interface
-f = (u, p) -> u * u - p
-g = function (p_range)
-    probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-    cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, cache.u; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u
+    # Iterator interface
+    function nlprob_iterator_interface(f, p_range, ::Val{iip}) where {iip}
+        probN = NonlinearProblem{iip}(f, iip ? [0.5] : 0.5, p_range[begin])
+        cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
+        sols = zeros(length(p_range))
+        for (i, p) in enumerate(p_range)
+            reinit!(cache, iip ? [cache.u[1]] : cache.u; p = p)
+            sol = solve!(cache)
+            sols[i] = iip ? sol.u[1] : sol.u
+        end
+        return sols
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-g = function (p_range)
-    probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-    cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, [cache.u[1]]; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u[1]
+    p = range(0.01, 2, length = 200)
+    @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
+    @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
+
+    @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (false, true,
+            AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
+            AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0])
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-# Error Checks
-
-f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
-
-@test solve(probN, NewtonRaphson()).u[end] ≈ sqrt(2.0)
-@test solve(probN, NewtonRaphson(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, NewtonRaphson()).u ≈ sol
-    @test solve(probN, NewtonRaphson()).u ≈ sol
-    @test solve(probN, NewtonRaphson(; autodiff = false)).u ≈ sol
 end
 
 # --- TrustRegion tests ---
 
-function benchmark_immutable(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9))
-end
-
-function ff(u, p = nothing)
-    u .* u .- 2
-end
-
-function sf(u, p = nothing)
-    u * u - 2
-end
-
-u0 = [1.0, 1.0]
-radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
-    RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
-
-for radius_update_scheme in radius_update_schemes
-    sol = benchmark_immutable(ff, cu0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-    sol = benchmark_mutable(ff, u0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-    sol = benchmark_scalar(sf, csu0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test abs(sol.u * sol.u - 2) < 1e-9
-end
-
-function benchmark_inplace(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, TrustRegion(; radius_update_scheme), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p = nothing)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-for radius_update_scheme in radius_update_schemes
-    sol = benchmark_inplace(ffiip, u0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-end
-
-for radius_update_scheme in radius_update_schemes
-    probN = NonlinearProblem{true}(ffiip, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    @test (@ballocated solve!(solver)) < 200
-end
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-10)
-    return sol.u
-end
+@testset "TrustRegion" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0; radius_update_scheme, kwargs...)
+        prob = NonlinearProblem{false}(f, u0, p)
+        return solve(prob, TrustRegion(; radius_update_scheme); abstol = 1e-9, kwargs...)
+    end
 
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; radius_update_scheme, kwargs...)
+        prob = NonlinearProblem{true}(f, u0, p)
+        return solve(prob, TrustRegion(; radius_update_scheme); abstol = 1e-9, kwargs...)
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
+        RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
+    u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in u0s,
+        radius_update_scheme in radius_update_schemes
 
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+        sol = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0),
+            TrustRegion(; radius_update_scheme); abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
 
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+    @testset "[IIP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([
+            1.0, 1.0],), radius_update_scheme in radius_update_schemes
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Iterator interface
-f = (u, p) -> u * u - p
-g = function (p_range)
-    probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-    cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, cache.u; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u
-    end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-g = function (p_range)
-    probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-    cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, [cache.u[1]]; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u[1]
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
+            TrustRegion(; radius_update_scheme); abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-# Error Checks
-f, u0 = (u, p) -> u .* u .- 2, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
-
-@test solve(probN, TrustRegion()).u[end] ≈ sqrt(2.0)
-@test solve(probN, TrustRegion(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, TrustRegion()).u ≈ sol
-    @test solve(probN, TrustRegion()).u ≈ sol
-    @test solve(probN, TrustRegion(; autodiff = false)).u ≈ sol
-end
-
-# Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
-u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-global g, f
-f = (u, p) -> 0.010000000000000002 .+
-              10.000000000000002 ./ (1 .+
-               (0.21640425613334457 .+
-                216.40425613334457 ./ (1 .+
-                 (0.21640425613334457 .+
-                  216.40425613334457 ./
-                  (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-              0.0011552453009332421u .- p
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-# Test kwars in `TrustRegion`
-max_trust_radius = [10.0, 100.0, 1000.0]
-initial_trust_radius = [10.0, 1.0, 0.1]
-step_threshold = [0.0, 0.01, 0.25]
-shrink_threshold = [0.25, 0.3, 0.5]
-expand_threshold = [0.5, 0.8, 0.9]
-shrink_factor = [0.1, 0.3, 0.5]
-expand_factor = [1.5, 2.0, 3.0]
-max_shrink_times = [10, 20, 30]
-
-list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
-    shrink_threshold, expand_threshold, shrink_factor,
-    expand_factor, max_shrink_times)
-for options in list_of_options
-    local probN, sol, alg
-    alg = TrustRegion(max_trust_radius = options[1],
-        initial_trust_radius = options[2],
-        step_threshold = options[3],
-        shrink_threshold = options[4],
-        expand_threshold = options[5],
-        shrink_factor = options[6],
-        expand_factor = options[7],
-        max_shrink_times = options[8])
-
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, alg, abstol = 1e-10)
-    @test all(abs.(f(u, p)) .< 1e-10)
-end
 
-# Testing consistency of iip vs oop iterations
+    if VERSION ≥ v"1.9"
+        @testset "[OOP] [Immutable AD] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+            for p in 1.0:0.1:100.0
+                @test begin
+                    res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p;
+                        radius_update_scheme)
+                    res_true = sqrt(p)
+                    all(res.u .≈ res_true)
+                end
+                @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    @SVector[1.0, 1.0], p; radius_update_scheme).u[end], p) ≈ 1 / (2 * sqrt(p))
+            end
+        end
+    end
 
-maxiterations = [2, 3, 4, 5]
-u0 = [1.0, 1.0]
-function iip_oop(f, fip, u0, radius_update_scheme, maxiters)
-    prob_iip = NonlinearProblem{true}(fip, u0)
-    solver = init(prob_iip, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9, maxiters = maxiters)
-    sol_iip = solve!(solver)
+    @testset "[OOP] [Scalar AD] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+        for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, oftype(p, 1.0), p;
+                    radius_update_scheme)
+                res_true = sqrt(p)
+                res.u ≈ res_true
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    oftype(p, 1.0),
+                    p; radius_update_scheme).u, p) ≈ 1 / (2 * sqrt(p))
+        end
+    end
 
-    prob_oop = NonlinearProblem{false}(f, u0)
-    solver = init(prob_oop, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9, maxiters = maxiters)
-    sol_oop = solve!(solver)
+    if VERSION ≥ v"1.9"
+        t = (p) -> [sqrt(p[2] / p[1])]
+        p = [0.9, 50.0]
+        @testset "[OOP] [Jacobian] radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in radius_update_schemes
+            @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p; radius_update_scheme).u ≈
+                  sqrt(p[2] / p[1])
+            @test ForwardDiff.jacobian(p -> [
+                    benchmark_nlsolve_oop(quadratic_f2, 0.5, p;
+                        radius_update_scheme).u,
+                ], p) ≈ ForwardDiff.jacobian(t, p)
+        end
+    end
 
-    return sol_iip.u[end], sol_oop.u[end]
-end
+    # Iterator interface
+    function nlprob_iterator_interface(f, p_range, ::Val{iip}) where {iip}
+        probN = NonlinearProblem{iip}(f, iip ? [0.5] : 0.5, p_range[begin])
+        cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
+        sols = zeros(length(p_range))
+        for (i, p) in enumerate(p_range)
+            reinit!(cache, iip ? [cache.u[1]] : cache.u; p = p)
+            sol = solve!(cache)
+            sols[i] = iip ? sol.u[1] : sol.u
+        end
+        return sols
+    end
+    p = range(0.01, 2, length = 200)
+    @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
+    @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Simple, maxiters)
-    @test iip == oop
-end
+    @testset "ADType: $(autodiff) u0: $(_nameof(u0)) radius_update_scheme: $(radius_update_scheme)" for autodiff in (false,
+            true, AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
+            AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0]),
+        radius_update_scheme in radius_update_schemes
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Hei, maxiters)
-    @test iip == oop
-end
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
+    end
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Yuan, maxiters)
-    @test iip == oop
-end
+    # Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
+    @testset "Newton Raphson Fails: radius_update_scheme: $(radius_update_scheme)" for radius_update_scheme in [
+        RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
+        u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+        p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        sol = benchmark_nlsolve_oop(newton_fails, u0, p; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(newton_fails(sol.u, p)) .< 1e-9)
+    end
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Fan, maxiters)
-    @test iip == oop
-end
+    # Test kwargs in `TrustRegion`
+    @testset "Keyword Arguments" begin
+        max_trust_radius = [10.0, 100.0, 1000.0]
+        initial_trust_radius = [10.0, 1.0, 0.1]
+        step_threshold = [0.0, 0.01, 0.25]
+        shrink_threshold = [0.25, 0.3, 0.5]
+        expand_threshold = [0.5, 0.8, 0.9]
+        shrink_factor = [0.1, 0.3, 0.5]
+        expand_factor = [1.5, 2.0, 3.0]
+        max_shrink_times = [10, 20, 30]
+
+        list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
+            shrink_threshold, expand_threshold, shrink_factor,
+            expand_factor, max_shrink_times)
+        for options in list_of_options
+            local probN, sol, alg
+            alg = TrustRegion(max_trust_radius = options[1],
+                initial_trust_radius = options[2], step_threshold = options[3],
+                shrink_threshold = options[4], expand_threshold = options[5],
+                shrink_factor = options[6], expand_factor = options[7],
+                max_shrink_times = options[8])
+
+            probN = NonlinearProblem{false}(quadratic_f, [1.0, 1.0], 2.0)
+            sol = solve(probN, alg, abstol = 1e-10)
+            @test all(abs.(quadratic_f(sol.u, 2.0)) .< 1e-10)
+        end
+    end
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Bastin, maxiters)
-    @test iip == oop
+    # Testing consistency of iip vs oop iterations
+    @testset "OOP / IIP Consistency" begin
+        maxiterations = [2, 3, 4, 5]
+        u0 = [1.0, 1.0]
+        @testset "radius_update_scheme: $(radius_update_scheme) maxiters: $(maxiters)" for radius_update_scheme in radius_update_schemes,
+            maxiters in maxiterations
+
+            sol_iip = benchmark_nlsolve_iip(quadratic_f!, u0; radius_update_scheme,
+                maxiters)
+            sol_oop = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme,
+                maxiters)
+            @test sol_iip.u ≈ sol_iip.u
+        end
+    end
 end
 
 # --- LevenbergMarquardt tests ---
 
-function benchmark_immutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, LevenbergMarquardt(), abstol = 1e-9))
-end
-
-function ff(u, p)
-    u .* u .- 2
-end
-
-function sf(u, p)
-    u * u - 2
-end
-u0 = [1.0, 1.0]
-
-sol = benchmark_immutable(ff, cu0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_mutable(ff, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_scalar(sf, csu0)
-@test sol.retcode === ReturnCode.Success
-@test abs(sol.u * sol.u - 2) < 1e-9
-
-function benchmark_inplace(f, u0)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-sol = benchmark_inplace(ffiip, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-
-u0 = [1.0, 1.0]
-probN = NonlinearProblem{true}(ffiip, u0)
-solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-@test (@ballocated solve!(solver)) < 120
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-9)
-    return sol.u[end]
-end
+@testset "LevenbergMarquardt" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0)
+        prob = NonlinearProblem{false}(f, u0, p)
+        return solve(prob, LevenbergMarquardt(), abstol = 1e-9)
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    function benchmark_nlsolve_iip(f, u0, p = 2.0)
+        prob = NonlinearProblem{true}(f, u0, p)
+        return solve(prob, LevenbergMarquardt(), abstol = 1e-9)
+    end
 
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
+    u0s = VERSION ≥ v"1.9" ? ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) : ([1.0, 1.0], 1.0)
+    @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s
+        sol = benchmark_nlsolve_oop(quadratic_f, u0)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-    return sol.u
-end
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), LevenbergMarquardt(),
+            abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
 
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+    @testset "[IIP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0],)
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0), LevenbergMarquardt(),
+            abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64
+    end
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, LevenbergMarquardt())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+    if VERSION ≥ v"1.9"
+        @testset "[OOP] [Immutable AD]" begin
+            for p in 1.0:0.1:100.0
+                @test begin
+                    res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+                    res_true = sqrt(p)
+                    all(res.u .≈ res_true)
+                end
+                @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+                    @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
+            end
+        end
+    end
 
-# Error Checks
-f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
+    @testset "[OOP] [Scalar AD]" begin
+        for p in 1.0:0.1:100.0
+            @test begin
+                res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+                res_true = sqrt(p)
+                res.u ≈ res_true
+            end
+            @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u,
+                p) ≈
+                  1 / (2 * sqrt(p))
+        end
+    end
 
-@test solve(probN, LevenbergMarquardt()).u[end] ≈ sqrt(2.0)
-@test solve(probN, LevenbergMarquardt(; autodiff = false)).u[end] ≈ sqrt(2.0)
+    if VERSION ≥ v"1.9"
+        t = (p) -> [sqrt(p[2] / p[1])]
+        p = [0.9, 50.0]
+        @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+        @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u],
+            p) ≈
+              ForwardDiff.jacobian(t, p)
+    end
 
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
+    @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (false, true,
+            AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(),
+            AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0])
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, LevenbergMarquardt(; autodiff)).u .≈ sqrt(2.0))
+    end
 
-    @test solve(probN, LevenbergMarquardt()).u ≈ sol
-    @test solve(probN, LevenbergMarquardt()).u ≈ sol
-    @test solve(probN, LevenbergMarquardt(; autodiff = false)).u ≈ sol
-end
+    # Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
+    @testset "Newton Raphson Fails" begin
+        u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+        p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+        sol = benchmark_nlsolve_oop(newton_fails, u0, p)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(newton_fails(sol.u, p)) .< 1e-9)
+    end
 
-# Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
-u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-global g, f
-f = (u, p) -> 0.010000000000000002 .+
-              10.000000000000002 ./ (1 .+
-               (0.21640425613334457 .+
-                216.40425613334457 ./ (1 .+
-                 (0.21640425613334457 .+
-                  216.40425613334457 ./
-                  (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-              0.0011552453009332421u .- p
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-# # Test kwars in `LevenbergMarquardt`
-damping_initial = [0.5, 2.0, 5.0]
-damping_increase_factor = [1.5, 3.0, 10.0]
-damping_decrease_factor = [2, 5, 10]
-finite_diff_step_geodesic = [0.02, 0.2, 0.3]
-α_geodesic = [0.6, 0.8, 0.9]
-b_uphill = [0, 1, 2]
-min_damping_D = [1e-12, 1e-9, 1e-4]
-
-list_of_options = zip(damping_initial, damping_increase_factor, damping_decrease_factor,
-    finite_diff_step_geodesic, α_geodesic, b_uphill,
-    min_damping_D)
-for options in list_of_options
-    local probN, sol, alg
-    alg = LevenbergMarquardt(damping_initial = options[1],
-        damping_increase_factor = options[2],
-        damping_decrease_factor = options[3],
-        finite_diff_step_geodesic = options[4],
-        α_geodesic = options[5],
-        b_uphill = options[6],
-        min_damping_D = options[7])
-
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, alg, abstol = 1e-10)
-    @test all(abs.(f(u, p)) .< 1e-10)
+    # Test kwargs in `LevenbergMarquardt`
+    @testset "Keyword Arguments" begin
+        damping_initial = [0.5, 2.0, 5.0]
+        damping_increase_factor = [1.5, 3.0, 10.0]
+        damping_decrease_factor = Float64[2, 5, 10]
+        finite_diff_step_geodesic = [0.02, 0.2, 0.3]
+        α_geodesic = [0.6, 0.8, 0.9]
+        b_uphill = Float64[0, 1, 2]
+        min_damping_D = [1e-12, 1e-9, 1e-4]
+
+        list_of_options = zip(damping_initial, damping_increase_factor,
+            damping_decrease_factor, finite_diff_step_geodesic, α_geodesic, b_uphill,
+            min_damping_D)
+        for options in list_of_options
+            local probN, sol, alg
+            alg = LevenbergMarquardt(damping_initial = options[1],
+                damping_increase_factor = options[2],
+                damping_decrease_factor = options[3],
+                finite_diff_step_geodesic = options[4], α_geodesic = options[5],
+                b_uphill = options[6], min_damping_D = options[7])
+
+            probN = NonlinearProblem{false}(quadratic_f, [1.0, 1.0], 2.0)
+            sol = solve(probN, alg, abstol = 1e-10)
+            @test all(abs.(quadratic_f(sol.u, 2.0)) .< 1e-10)
+        end
+    end
 end
diff --git a/test/convergencetests.jl b/test/convergencetests.jl
deleted file mode 100644
index 751948522..000000000
--- a/test/convergencetests.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-using NonlinearSolve
-using StaticArrays
-using BenchmarkTools
-using Test
-
-using SciMLNLSolve
-
-###-----Trust Region tests-----###
-
-# some simple functions #
-function f_oop(u, p)
-    u .* u .- p
-end
-
-function f_iip(du, u, p)
-    du .= u .* u .- p
-end
-
-function f_scalar(u, p)
-    u * u - p
-end
-
-u0 = [1.0, 1.0]
-csu0 = 1.0
-p = [2.0, 2.0]
-radius_update_scheme = RadiusUpdateSchemes.Simple
-tol = 1e-9
-
-function convergence_test_oop(f, u0, p, radius_update_scheme)
-    prob = NonlinearProblem{false}(f, oftype(p, u0), p)
-    cache = init(prob,
-        TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(cache)
-    return cache.internalnorm(cache.u_prev - cache.u), cache.iter, sol.retcode
-end
-
-residual, iterations, return_code = convergence_test_oop(f_oop, u0, p, radius_update_scheme)
-@test return_code === ReturnCode.Success
-@test residual ≈ tol
diff --git a/test/sparse.jl b/test/sparse.jl
index 1f4d07161..256ca7530 100644
--- a/test/sparse.jl
+++ b/test/sparse.jl
@@ -2,8 +2,10 @@ using NonlinearSolve, LinearAlgebra, SparseArrays, Symbolics
 
 const N = 32
 const xyd_brusselator = range(0, stop = 1, length = N)
+
 brusselator_f(x, y) = (((x - 0.3)^2 + (y - 0.6)^2) <= 0.1^2) * 5.0
 limit(a, N) = a == N + 1 ? 1 : a == 0 ? N : a
+
 function brusselator_2d_loop(du, u, p)
     A, B, alpha, dx = p
     alpha = alpha / dx^2
@@ -21,6 +23,7 @@ function brusselator_2d_loop(du, u, p)
                       A * u[i, j, 1] - u[i, j, 1]^2 * u[i, j, 2]
     end
 end
+
 p = (3.4, 1.0, 10.0, step(xyd_brusselator))
 
 function init_brusselator_2d(xyd)
@@ -32,8 +35,9 @@ function init_brusselator_2d(xyd)
         u[I, 1] = 22 * (y * (1 - y))^(3 / 2)
         u[I, 2] = 27 * (x * (1 - x))^(3 / 2)
     end
-    u
+    return u
 end
+
 u0 = init_brusselator_2d(xyd_brusselator)
 prob_brusselator_2d = NonlinearProblem(brusselator_2d_loop, u0, p)
 sol = solve(prob_brusselator_2d, NewtonRaphson())
@@ -47,12 +51,14 @@ fill!(jac_prototype, 0)
 
 ff = NonlinearFunction(brusselator_2d_loop; jac_prototype)
 prob_brusselator_2d = NonlinearProblem(ff, u0, p)
+
+# for autodiff in [false, ]
 sol = solve(prob_brusselator_2d, NewtonRaphson())
 @test norm(sol.resid) < 1e-8
 @test !all(iszero, jac_prototype)
 
-sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = false))
+sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseFiniteDiff()))
 @test norm(sol.resid) < 1e-6
 
-cache = init(prob_brusselator_2d, NewtonRaphson())
-@test maximum(cache.jac_config.colorvec) == 12
+cache = init(prob_brusselator_2d, NewtonRaphson(; autodiff = AutoSparseForwardDiff()));
+@test maximum(cache.jac_cache.coloring.colorvec) == 12