diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 3993fff35..838aba50f 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -13,7 +13,7 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} jobs: test: - runs-on: ubuntu-latest + runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: @@ -24,8 +24,11 @@ jobs: - Wrappers - Miscellaneous version: - - '1' - - '~1.10.0-0' + - '1.10' + os: + - ubuntu-latest + - macos-latest + - windows-latest steps: - uses: actions/checkout@v4 - uses: julia-actions/setup-julia@v1 diff --git a/.github/workflows/Documentation.yml b/.github/workflows/Documentation.yml index 6a08fca1a..73a1826ca 100644 --- a/.github/workflows/Documentation.yml +++ b/.github/workflows/Documentation.yml @@ -20,6 +20,7 @@ jobs: run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()' - name: Build and deploy env: + JULIA_DEBUG: "Documenter" GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # For authentication with GitHub Actions token DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} # For authentication with SSH deploy key run: julia --project=docs/ --code-coverage=user docs/make.jl diff --git a/Project.toml b/Project.toml index 865fc43ff..75f45bce7 100644 --- a/Project.toml +++ b/Project.toml @@ -1,14 +1,13 @@ name = "NonlinearSolve" uuid = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" authors = ["SciML"] -version = "3.4.0" +version = "3.5.0" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471" DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" -EnumX = "4e289a0a-7415-4d19-859d-a7e5c4648b56" FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898" FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a" FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41" @@ -19,16 +18,16 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LinearSolve = 
"7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" MaybeInplace = "bb5d69b7-63fc-4a16-80bd-7e42200c7bdb" PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a" +Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" -SciMLOperators = "c0aeaf25-5076-4817-a8d5-81caf7dfa961" SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804" -StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" +StaticArraysCore = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" +TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [weakdeps] BandedMatrices = "aae01518-5342-5314-be14-df237901396f" @@ -55,14 +54,13 @@ NonlinearSolveSymbolicsExt = "Symbolics" NonlinearSolveZygoteExt = "Zygote" [compat] -ADTypes = "0.2.5" +ADTypes = "0.2.6" Aqua = "0.8" ArrayInterface = "7.7" BandedMatrices = "1.4" BenchmarkTools = "1.4" -ConcreteStructs = "0.2" -DiffEqBase = "6.144" -EnumX = "1" +ConcreteStructs = "0.2.3" +DiffEqBase = "6.146.0" Enzyme = "0.11.11" FastBroadcast = "0.2.8" FastClosures = "0.3" @@ -73,7 +71,7 @@ ForwardDiff = "0.10.36" LazyArrays = "1.8.2" LeastSquaresOptim = "0.8.5" LineSearches = "7.2" -LinearAlgebra = "<0.0.1, 1" +LinearAlgebra = "1.10" LinearSolve = "2.21" MINPACK = "1.2" MaybeInplace = "0.1.1" @@ -81,28 +79,29 @@ NLsolve = "4.5" NaNMath = "1" NonlinearProblemLibrary = "0.1.2" OrdinaryDiffEq = "6.63" -Pkg = "1" +Pkg = "1.10" PrecompileTools = "1.2" +Preferences = "1.4" Printf = "1.10" Random = "1.91" -RecursiveArrayTools = "3.2" +RecursiveArrayTools = "3.4" Reexport = "1.2" SIAMFANLEquations = "1.0.1" SafeTestsets = "0.1" -SciMLBase = "2.11" -SciMLOperators = "0.3.7" -SimpleNonlinearSolve = "1.0.2" +SciMLBase = "2.19.0" 
+SimpleNonlinearSolve = "1.2" SparseArrays = "1.10" SparseDiffTools = "2.14" SpeedMapping = "0.3" StableRNGs = "1" StaticArrays = "1.7" +StaticArraysCore = "1.4" Sundials = "4.23.1" Symbolics = "5.13" -Test = "1" -UnPack = "1.0" +Test = "1.10" +TimerOutputs = "0.5.23" Zygote = "0.6.67" -julia = "1.9" +julia = "1.10" [extras] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" diff --git a/docs/LocalPreferences.toml b/docs/LocalPreferences.toml new file mode 100644 index 000000000..feb3e965a --- /dev/null +++ b/docs/LocalPreferences.toml @@ -0,0 +1,2 @@ +[NonlinearSolve] +enable_timer_outputs = true diff --git a/docs/Project.toml b/docs/Project.toml index 9ba131dc9..1a82e485c 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -4,7 +4,9 @@ ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +DocumenterCitations = "daee34ce-89f3-4625-b898-19384cb65244" IncompleteLU = "40713840-3770-5561-ab4c-a76e7d0d7895" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" ModelingToolkit = "961ee093-0014-501f-94e3-6117800e7a78" NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" @@ -25,6 +27,7 @@ ArrayInterface = "6, 7" BenchmarkTools = "1" DiffEqBase = "6.136" Documenter = "1" +DocumenterCitations = "1" IncompleteLU = "0.2" LinearSolve = "2" ModelingToolkit = "8" diff --git a/docs/make.jl b/docs/make.jl index f494f711c..0826acd60 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,5 +1,6 @@ -using Documenter, - NonlinearSolve, SimpleNonlinearSolve, Sundials, SteadyStateDiffEq, SciMLBase, DiffEqBase +using Documenter, DocumenterCitations +using NonlinearSolve, + SimpleNonlinearSolve, Sundials, SteadyStateDiffEq, SciMLBase, DiffEqBase cp(joinpath(@__DIR__, "Manifest.toml"), joinpath(@__DIR__, "src/assets/Manifest.toml"), force = true) @@ -8,14 +9,16 @@ 
cp(joinpath(@__DIR__, "Project.toml"), joinpath(@__DIR__, "src/assets/Project.to include("pages.jl") +bib = CitationBibliography(joinpath(@__DIR__, "src", "refs.bib")) + makedocs(; sitename = "NonlinearSolve.jl", authors = "Chris Rackauckas", modules = [NonlinearSolve, SimpleNonlinearSolve, SteadyStateDiffEq, Sundials, DiffEqBase, SciMLBase], clean = true, doctest = false, linkcheck = true, linkcheck_ignore = ["https://twitter.com/ChrisRackauckas/status/1544743542094020615"], - checkdocs = :export, - format = Documenter.HTML(assets = ["assets/favicon.ico"], + checkdocs = :exports, warnonly = [:missing_docs], plugins = [bib], + format = Documenter.HTML(assets = ["assets/favicon.ico", "assets/citations.css"], canonical = "https://docs.sciml.ai/NonlinearSolve/stable/"), pages) diff --git a/docs/pages.jl b/docs/pages.jl index 9c148bcb4..9b15d694a 100644 --- a/docs/pages.jl +++ b/docs/pages.jl @@ -1,6 +1,7 @@ # Put in a separate page so it can be used by SciMLDocs.jl -pages = ["index.md", +pages = [ + "index.md", "Getting Started with Nonlinear Rootfinding in Julia" => "tutorials/getting_started.md", "Tutorials" => Any["tutorials/code_optimization.md", "tutorials/large_systems.md", @@ -8,30 +9,38 @@ pages = ["index.md", "tutorials/small_compile.md", "tutorials/iterator_interface.md", "tutorials/optimizing_parameterized_ode.md"], - "Basics" => Any["basics/NonlinearProblem.md", - "basics/NonlinearFunctions.md", + "Basics" => Any["basics/nonlinear_problem.md", + "basics/nonlinear_functions.md", "basics/solve.md", - "basics/NonlinearSolution.md", - "basics/TerminationCondition.md", - "basics/Logging.md", - "basics/SparsityDetection.md", - "basics/FAQ.md"], - "Solver Summaries and Recommendations" => Any["solvers/NonlinearSystemSolvers.md", - "solvers/BracketingSolvers.md", - "solvers/SteadyStateSolvers.md", - "solvers/NonlinearLeastSquaresSolvers.md", - "solvers/FixedPointSolvers.md", - "solvers/LineSearch.md"], - "Detailed Solver APIs" => Any["api/nonlinearsolve.md", - 
"api/simplenonlinearsolve.md", + "basics/nonlinear_solution.md", + "basics/termination_condition.md", + "basics/diagnostics_api.md", + "basics/sparsity_detection.md", + "basics/faq.md"], + "Solver Summaries and Recommendations" => Any["solvers/nonlinear_system_solvers.md", + "solvers/bracketing_solvers.md", + "solvers/steady_state_solvers.md", + "solvers/nonlinear_least_squares_solvers.md", + "solvers/fixed_point_solvers.md"], + "Native Functionalities" => Any["native/solvers.md", + "native/simplenonlinearsolve.md", + "native/steadystatediffeq.md", + "native/descent.md", + "native/globalization.md", + "native/diagnostics.md"], + "Wrapped Solver APIs" => Any["api/fastlevenbergmarquardt.md", + "api/fixedpointacceleration.md", + "api/leastsquaresoptim.md", "api/minpack.md", "api/nlsolve.md", - "api/sundials.md", - "api/steadystatediffeq.md", - "api/leastsquaresoptim.md", - "api/fastlevenbergmarquardt.md", + "api/siamfanlequations.md", "api/speedmapping.md", - "api/fixedpointacceleration.md", - "api/siamfanlequations.md"], + "api/sundials.md"], + "Development Documentation" => ["devdocs/internal_interfaces.md", + "devdocs/linear_solve.md", + "devdocs/jacobian.md", + "devdocs/operators.md", + "devdocs/algorithm_helpers.md"], "Release Notes" => "release_notes.md", + "References" => "references.md", ] diff --git a/docs/src/api/nonlinearsolve.md b/docs/src/api/nonlinearsolve.md deleted file mode 100644 index cefda9ad7..000000000 --- a/docs/src/api/nonlinearsolve.md +++ /dev/null @@ -1,56 +0,0 @@ -# NonlinearSolve.jl Native Solvers - -These are the native solvers of NonlinearSolve.jl. - -## Nonlinear Solvers - -```@docs -NewtonRaphson -PseudoTransient -DFSane -Broyden -Klement -LimitedMemoryBroyden -``` - -## Nonlinear Least Squares Solvers - -```@docs -GaussNewton -``` - -## Both Nonlinear & Nonlinear Least Squares Solvers - -These solvers can be used for both nonlinear and nonlinear least squares problems. 
- -```@docs -TrustRegion -LevenbergMarquardt -``` - -## Polyalgorithms - -```@docs -NonlinearSolvePolyAlgorithm -FastShortcutNonlinearPolyalg -FastShortcutNLLSPolyalg -RobustMultiNewton -``` - -## Radius Update Schemes for Trust Region (RadiusUpdateSchemes) - -```@docs -RadiusUpdateSchemes -``` - -### Available Radius Update Schemes - -```@docs -RadiusUpdateSchemes.Simple -RadiusUpdateSchemes.Hei -RadiusUpdateSchemes.Yuan -RadiusUpdateSchemes.Bastin -RadiusUpdateSchemes.Fan -RadiusUpdateSchemes.NLsolve -RadiusUpdateSchemes.NocedalWright -``` diff --git a/docs/src/api/simplenonlinearsolve.md b/docs/src/api/simplenonlinearsolve.md deleted file mode 100644 index f10fb78d6..000000000 --- a/docs/src/api/simplenonlinearsolve.md +++ /dev/null @@ -1,34 +0,0 @@ -# SimpleNonlinearSolve.jl - -These methods can be used independently of the rest of NonlinearSolve.jl - -## Solver API - -### Interval Methods - -These methods are suited for interval (scalar) root-finding problems, -i.e. `IntervalNonlinearProblem`. - -```@docs -ITP -Alefeld -Bisection -Falsi -Ridder -Brent -``` - -### General Methods - -These methods are suited for any general nonlinear root-finding problem, i.e. -`NonlinearProblem`. 
- -```@docs -SimpleNewtonRaphson -SimpleBroyden -SimpleHalley -SimpleKlement -SimpleTrustRegion -SimpleDFSane -SimpleLimitedMemoryBroyden -``` diff --git a/docs/src/assets/citations.css b/docs/src/assets/citations.css new file mode 100644 index 000000000..20e89810b --- /dev/null +++ b/docs/src/assets/citations.css @@ -0,0 +1,23 @@ +.citation dl { + display: grid; + grid-template-columns: max-content auto; +} +.citation dt { + grid-column-start: 1; +} +.citation dd { + grid-column-start: 2; + margin-bottom: 0.75em; +} +.citation ul { + padding: 0 0 2.25em 0; + margin: 0; + list-style: none; +} +.citation ul li { + text-indent: -2.25em; + margin: 0.33em 0.5em 0.5em 2.25em; +} +.citation ol li { + padding-left: 0.75em; +} diff --git a/docs/src/basics/NonlinearSolution.md b/docs/src/basics/NonlinearSolution.md deleted file mode 100644 index a8762a015..000000000 --- a/docs/src/basics/NonlinearSolution.md +++ /dev/null @@ -1,18 +0,0 @@ -# [Nonlinear Solutions](@id solution) - -```@docs -SciMLBase.NonlinearSolution -``` - -## Return Code - - - `ReturnCode.Success` - The nonlinear solve succeeded. - - `ReturnCode.ConvergenceFailure` - The nonlinear solve failed to converge due to stalling - or some limit of the solver was exceeded. For example, too many shrinks for trust - region methods, number of resets for Broyden, etc. - - `ReturnCode.Unstable` - This corresponds to - `NonlinearSafeTerminationReturnCode.ProtectiveTermination` and is caused if the step-size - of the solver was too large or the objective value became non-finite. - - `ReturnCode.MaxIters` - The maximum number of iterations was reached. - - `ReturnCode.Failure` - The nonlinear solve failed for some reason. This is used - sparingly and mostly for wrapped solvers for which we don't have a better error code. 
diff --git a/docs/src/basics/Logging.md b/docs/src/basics/diagnostics_api.md similarity index 57% rename from docs/src/basics/Logging.md rename to docs/src/basics/diagnostics_api.md index edd191d76..993432a00 100644 --- a/docs/src/basics/Logging.md +++ b/docs/src/basics/diagnostics_api.md @@ -1,4 +1,9 @@ -# Logging the Solve Process +# [Diagnostics API](@id diagnostics_api) + +Detailed API Documentation is provided at +[Diagnostics API Reference](@ref diagnostics_api_reference). + +## Logging the Solve Process All NonlinearSolve.jl native solvers allow storing and displaying the trace of the nonlinear solve process. This is controlled by 3 keyword arguments to `solve`: @@ -11,9 +16,17 @@ solve process. This is controlled by 3 keyword arguments to `solve`: 3. `store_trace`: Must be `Val(true)` or `Val(false)`. This controls whether the trace is stored in the solution object. (Defaults to `Val(false)`) +## Detailed Internal Timings + +All the native NonlinearSolve.jl algorithms come with in-built +[TimerOutputs.jl](https://github.com/KristofferC/TimerOutputs.jl) support. However, this +is disabled by default and can be enabled via [`NonlinearSolve.enable_timer_outputs`](@ref). + +Note that you will have to restart Julia to disable the timer outputs once enabled. + ## Example Usage -```@example tracing +```@example diagnostics_example using ModelingToolkit, NonlinearSolve @variables x y z @@ -37,28 +50,38 @@ solve(prob) This produced the output, but it is hard to diagnose what is going on. We can turn on the trace to see what is happening: -```@example tracing +```@example diagnostics_example solve(prob; show_trace = Val(true), trace_level = TraceAll(10)) nothing; # hide ``` You can also store the trace in the solution object: -```@example tracing +```@example diagnostics_example sol = solve(prob; trace_level = TraceAll(), store_trace = Val(true)); sol.trace ``` +Now, let's try to investigate the time it took for individual internal steps. 
We will have +to use the `init` and `solve!` API for this. The `TimerOutput` will be present in +`cache.timer`. However, note that for poly-algorithms this is currently not implemented. + +```@example diagnostics_example +cache = init(prob, NewtonRaphson(); show_trace = Val(true)); +solve!(cache) +cache.timer +``` + +Let's try for some other solver: + +```@example diagnostics_example +cache = init(prob, DFSane(); show_trace = Val(true), trace_level = TraceMinimal(50)); +solve!(cache) +cache.timer +``` + !!! note For `iteration == 0` only the `norm(fu, Inf)` is guaranteed to be meaningful. The other values being meaningful are solver dependent. - -## API - -```@docs -TraceMinimal -TraceWithJacobianConditionNumber -TraceAll -``` diff --git a/docs/src/basics/FAQ.md b/docs/src/basics/faq.md similarity index 58% rename from docs/src/basics/FAQ.md rename to docs/src/basics/faq.md index 62add8e83..b3eff3d75 100644 --- a/docs/src/basics/FAQ.md +++ b/docs/src/basics/faq.md @@ -36,7 +36,7 @@ speedup. For more information on performance of SciML, see the [SciMLBenchmarks](https://docs.sciml.ai/SciMLBenchmarksOutput/stable/). -## The solver tried to set a Dual Number in my Vector of Floats.How do I fix that? +## The solver tried to set a Dual Number in my Vector of Floats. How do I fix that? This is a common problem that occurs if the code was not written to be generic based on the input types. For example, consider this example taken from @@ -76,8 +76,9 @@ sol = solve(prob_oop, LevenbergMarquardt(; autodiff = AutoFiniteDiff()); maxiter ``` This worked but, Finite Differencing is not the recommended approach in any scenario. -Instead, rewrite the function to use -[PreallocationTools.jl](https://github.com/SciML/PreallocationTools.jl) or write it as + + 2. 
Rewrite the function to use + [PreallocationTools.jl](https://github.com/SciML/PreallocationTools.jl) or write it as ```@example dual_error_faq function fff_correct(var, p) @@ -90,3 +91,64 @@ end prob_oop = NonlinearLeastSquaresProblem{false}(fff_correct, v_init) sol = solve(prob_oop, LevenbergMarquardt(); maxiters = 10000, abstol = 1e-8) ``` + +## I thought NonlinearSolve.jl was type-stable and fast. But it isn't, why? + +It is hard to say why your code is not fast. Take a look at the +[Diagnostics API](@ref diagnostics_api) to pin-point the problem. One common issue is that +there is type instability. + +If you are using the defaults for the autodiff and your problem is not a scalar or using +static arrays, ForwardDiff will create type unstable code. See this simple example: + +```@example type_unstable +using NonlinearSolve, InteractiveUtils + +f(u, p) = @. u^2 - p + +prob = NonlinearProblem{false}(f, 1.0, 2.0) + +@code_warntype solve(prob, NewtonRaphson()) +nothing # hide +``` + +Notice that this was type-stable, since it is a scalar problem. Now what happens for static +arrays + +```@example type_unstable +using StaticArrays + +prob = NonlinearProblem{false}(f, @SVector([1.0, 2.0]), 2.0) + +@code_warntype solve(prob, NewtonRaphson()) +nothing # hide +``` + +Again Type-Stable! Now let's try using a regular array: + +```@example type_unstable +prob = NonlinearProblem(f, [1.0, 2.0], 2.0) + +@code_warntype solve(prob, NewtonRaphson()) +nothing # hide +``` + +Oh no! This is type unstable. This is because ForwardDiff.jl will chunk the jacobian +computation and the type of this chunksize can't be statically inferred. To fix this, we +directly specify the chunksize: + +```@example type_unstable +@code_warntype solve(prob, + NewtonRaphson(; + autodiff = AutoForwardDiff(; chunksize = NonlinearSolve.pickchunksize(prob.u0)))) +nothing # hide +``` + +And boom! Type stable again. 
We always recommend picking the chunksize via +[`NonlinearSolve.pickchunksize`](@ref), however, if you manually specify the chunksize, it +must be `≤ length of input`. However, a very large chunksize can lead to excessive +compilation times and slowdown. + +```@docs +NonlinearSolve.pickchunksize +``` diff --git a/docs/src/basics/NonlinearFunctions.md b/docs/src/basics/nonlinear_functions.md similarity index 88% rename from docs/src/basics/NonlinearFunctions.md rename to docs/src/basics/nonlinear_functions.md index f3e142ac5..151010ba2 100644 --- a/docs/src/basics/NonlinearFunctions.md +++ b/docs/src/basics/nonlinear_functions.md @@ -1,4 +1,4 @@ -# [NonlinearFunctions and Jacobian Types](@id nonlinearfunctions) +# [Nonlinear Functions and Jacobian Types](@id nonlinearfunctions) The SciML ecosystem provides an extensive interface for declaring extra functions associated with the differential equation's data. In traditional libraries, there is usually diff --git a/docs/src/basics/NonlinearProblem.md b/docs/src/basics/nonlinear_problem.md similarity index 91% rename from docs/src/basics/NonlinearProblem.md rename to docs/src/basics/nonlinear_problem.md index 23acf78b5..4da69cde8 100644 --- a/docs/src/basics/NonlinearProblem.md +++ b/docs/src/basics/nonlinear_problem.md @@ -7,8 +7,8 @@ NonlinearSolve.jl tackles four related types of nonlinear systems: 1. Interval rootfinding problems. I.e., find the ``t \in [t_0, t_f]`` such that ``f(t) = 0``. 2. Systems of nonlinear equations, i.e., find the ``u`` such that ``f(u) = 0``. - 3. Steady state problems, i.e., find the ``u`` such that ``u' = f(u,t)`` has reached steady state, - i.e., ``0 = f(u, ∞)``. + 3. Steady state problems, i.e., find the ``u`` such that ``u' = f(u,t)`` has reached steady + state, i.e., ``0 = f(u, ∞)``. 4. The nonlinear least squares problem, which is an under/over-constrained nonlinear system which might not be satisfiable, i.e. 
there may be no `u` such that `f(u) = 0`, and thus we find the `u` which minimizes `||f(u)||` in the least squares sense. @@ -35,7 +35,7 @@ that `f(u) = 0`, the `NonlinearProblem` does not have a preferred solution, whil `SteadyStateProblem` the preferred solution is the `u(∞)` that would arise from solving the ODE `u' = f(u,t)`. -!!! warn +!!! warning Most solvers for `SteadyStateProblem` do not guarantee the preferred solution and instead will solve for some `u` in the set of solutions. The documentation of the @@ -44,8 +44,8 @@ ODE `u' = f(u,t)`. ## Problem Construction Details ```@docs -SciMLBase.IntervalNonlinearProblem -SciMLBase.NonlinearProblem -SciMLBase.SteadyStateProblem -SciMLBase.NonlinearLeastSquaresProblem +IntervalNonlinearProblem +NonlinearProblem +SteadyStateProblem +NonlinearLeastSquaresProblem ``` diff --git a/docs/src/basics/nonlinear_solution.md b/docs/src/basics/nonlinear_solution.md new file mode 100644 index 000000000..ce1abcc4c --- /dev/null +++ b/docs/src/basics/nonlinear_solution.md @@ -0,0 +1,26 @@ +# [Nonlinear Solutions](@id solution) + +```@docs +SciMLBase.AbstractNonlinearSolution +SciMLBase.NonlinearSolution +``` + +## Statistics + +```@docs +SciMLBase.NLStats +NonlinearSolve.ImmutableNLStats +``` + +## Return Code + +```@docs +ReturnCode.Success +ReturnCode.ConvergenceFailure +ReturnCode.Unstable +ReturnCode.MaxIters +ReturnCode.Failure +ReturnCode.InternalLineSearchFailed +ReturnCode.Stalled +ReturnCode.ShrinkThresholdExceeded +``` diff --git a/docs/src/basics/solve.md b/docs/src/basics/solve.md index cf78e1212..8ceeaa5de 100644 --- a/docs/src/basics/solve.md +++ b/docs/src/basics/solve.md @@ -8,14 +8,19 @@ solve(prob::SciMLBase.NonlinearProblem, args...; kwargs...) - `alias_u0::Bool`: Whether to alias the initial condition or use a copy. Defaults to `false`. - - `internal_norm::Function`: The norm used by the solver. Default depends on algorithm + - `internalnorm::Function`: The norm used by the solver. 
Default depends on algorithm choice. ## Iteration Controls - `maxiters::Int`: The maximum number of iterations to perform. Defaults to `1000`. - - `abstol::Number`: The absolute tolerance. Defaults to `real(oneunit(T)) * (eps(real(one(T))))^(4 // 5)`. - - `reltol::Number`: The relative tolerance. Defaults to `real(oneunit(T)) * (eps(real(one(T))))^(4 // 5)`. + - `maxtime`: The maximum time for solving the nonlinear system of equations. Defaults to + `nothing` which means no time limit. Note that setting a time limit does have a small + overhead. + - `abstol::Number`: The absolute tolerance. Defaults to + `real(oneunit(T)) * (eps(real(one(T))))^(4 // 5)`. + - `reltol::Number`: The relative tolerance. Defaults to + `real(oneunit(T)) * (eps(real(one(T))))^(4 // 5)`. - `termination_condition`: Termination Condition from DiffEqBase. Defaults to `AbsSafeBestTerminationMode()` for `NonlinearSolve.jl` and `AbsTerminateMode()` for `SimpleNonlinearSolve.jl`. diff --git a/docs/src/basics/SparsityDetection.md b/docs/src/basics/sparsity_detection.md similarity index 100% rename from docs/src/basics/SparsityDetection.md rename to docs/src/basics/sparsity_detection.md diff --git a/docs/src/basics/TerminationCondition.md b/docs/src/basics/termination_condition.md similarity index 76% rename from docs/src/basics/TerminationCondition.md rename to docs/src/basics/termination_condition.md index 5351198ca..a87f157aa 100644 --- a/docs/src/basics/TerminationCondition.md +++ b/docs/src/basics/termination_condition.md @@ -8,7 +8,7 @@ Provides a API to specify termination conditions for [`NonlinearProblem`](@ref) The termination condition is constructed as: ```julia -cache = init(du, u, AbsNormTerminationMode(); abstol = 1e-9, reltol = 1e-9) +cache = init(du, u, AbsSafeBestTerminationMode(); abstol = 1e-9, reltol = 1e-9) ``` If `abstol` and `reltol` are not supplied, then we choose a default based on the element @@ -23,10 +23,6 @@ To test for termination simply call the `cache`: terminated 
= cache(du, u, uprev) ``` -!!! note - - The default for NonlinearSolve.jl is `AbsSafeBestTerminationMode`! - ### Absolute Tolerance ```@docs @@ -50,10 +46,20 @@ RelSafeBestTerminationMode ```@docs NormTerminationMode SteadyStateDiffEqTerminationMode +``` + +The following was named to match an older version of SimpleNonlinearSolve. It is currently +not used as a default anywhere. + +```@docs SimpleNonlinearSolveTerminationMode ``` -### Return Codes +### Return Codes (Deprecated) + +These are deprecated and will be removed in a future release. Use the +`use_deprecated_retcodes = Val(false)` option to `SciMLBase.init` to use the new return +`ReturnCode` versions. ```@docs DiffEqBase.NonlinearSafeTerminationReturnCode diff --git a/docs/src/devdocs/algorithm_helpers.md b/docs/src/devdocs/algorithm_helpers.md new file mode 100644 index 000000000..7b0f91a9f --- /dev/null +++ b/docs/src/devdocs/algorithm_helpers.md @@ -0,0 +1,68 @@ +# Internal Algorithm Helpers + +## Pseudo Transient Method + +```@docs +NonlinearSolve.SwitchedEvolutionRelaxation +NonlinearSolve.SwitchedEvolutionRelaxationCache +``` + +## Approximate Jacobian Methods + +### Initialization + +```@docs +NonlinearSolve.IdentityInitialization +NonlinearSolve.TrueJacobianInitialization +NonlinearSolve.BroydenLowRankInitialization +``` + +### Jacobian Structure + +```@docs +NonlinearSolve.FullStructure +NonlinearSolve.DiagonalStructure +``` + +### Jacobian Caches + +```@docs +NonlinearSolve.InitializedApproximateJacobianCache +``` + +### Reset Methods + +```@docs +NonlinearSolve.NoChangeInStateReset +NonlinearSolve.IllConditionedJacobianReset +``` + +### Update Rules + +```@docs +NonlinearSolve.GoodBroydenUpdateRule +NonlinearSolve.BadBroydenUpdateRule +NonlinearSolve.KlementUpdateRule +``` + +## Levenberg Marquardt Method + +```@docs +NonlinearSolve.LevenbergMarquardtTrustRegion +``` + +## Trust Region Method + +```@docs +NonlinearSolve.GenericTrustRegionScheme +``` + +## Miscellaneous + +```@docs 
+SimpleNonlinearSolve.__nextfloat_tdir +SimpleNonlinearSolve.__prevfloat_tdir +SimpleNonlinearSolve.__max_tdir +NonlinearSolve.callback_into_cache! +NonlinearSolve.concrete_jac +``` diff --git a/docs/src/devdocs/internal_interfaces.md b/docs/src/devdocs/internal_interfaces.md new file mode 100644 index 000000000..843054cc8 --- /dev/null +++ b/docs/src/devdocs/internal_interfaces.md @@ -0,0 +1,53 @@ +# Internal Abstract Types + +## Solvers + +```@docs +NonlinearSolve.AbstractNonlinearSolveAlgorithm +NonlinearSolve.AbstractNonlinearSolveExtensionAlgorithm +NonlinearSolve.AbstractNonlinearSolveCache +``` + +## Descent Algorithms + +```@docs +NonlinearSolve.AbstractDescentAlgorithm +NonlinearSolve.AbstractDescentCache +``` + +## Approximate Jacobian + +```@docs +NonlinearSolve.AbstractApproximateJacobianStructure +NonlinearSolve.AbstractJacobianInitialization +NonlinearSolve.AbstractApproximateJacobianUpdateRule +NonlinearSolve.AbstractApproximateJacobianUpdateRuleCache +NonlinearSolve.AbstractResetCondition +``` + +## Damping Algorithms + +```@docs +NonlinearSolve.AbstractDampingFunction +NonlinearSolve.AbstractDampingFunctionCache +``` + +## Line Search + +```@docs +NonlinearSolve.AbstractNonlinearSolveLineSearchAlgorithm +NonlinearSolve.AbstractNonlinearSolveLineSearchCache +``` + +## Trust Region + +```@docs +NonlinearSolve.AbstractTrustRegionMethod +NonlinearSolve.AbstractTrustRegionMethodCache +``` + +## Tracing + +```@docs +NonlinearSolve.AbstractNonlinearSolveTraceLevel +``` diff --git a/docs/src/devdocs/jacobian.md b/docs/src/devdocs/jacobian.md new file mode 100644 index 000000000..2a7dbd00d --- /dev/null +++ b/docs/src/devdocs/jacobian.md @@ -0,0 +1,13 @@ +# Jacobian Wrappers + +```@docs +NonlinearSolve.AbstractNonlinearSolveJacobianCache +NonlinearSolve.JacobianCache +``` + +## SimpleNonlinearSolve functions + +```@docs +SimpleNonlinearSolve.jacobian_cache +SimpleNonlinearSolve.value_and_jacobian +``` diff --git a/docs/src/devdocs/linear_solve.md 
b/docs/src/devdocs/linear_solve.md new file mode 100644 index 000000000..88fa87440 --- /dev/null +++ b/docs/src/devdocs/linear_solve.md @@ -0,0 +1,6 @@ +# Linear Solve + +```@docs +NonlinearSolve.AbstractLinearSolverCache +NonlinearSolve.LinearSolverCache +``` diff --git a/docs/src/devdocs/operators.md b/docs/src/devdocs/operators.md new file mode 100644 index 000000000..b96a63f8c --- /dev/null +++ b/docs/src/devdocs/operators.md @@ -0,0 +1,28 @@ +# Custom SciML Operators + +## Abstract Operators + +```@docs +NonlinearSolve.AbstractNonlinearSolveOperator +``` + +## Jacobian Operators + +```@docs +NonlinearSolve.JacobianOperator +NonlinearSolve.VecJacOperator +NonlinearSolve.JacVecOperator +``` + +### Stateful Jacobian Operators + +```@docs +NonlinearSolve.StatefulJacobianOperator +NonlinearSolve.StatefulJacobianNormalFormOperator +``` + +## Low-Rank Jacobian Operators + +```@docs +NonlinearSolve.BroydenLowRankJacobian +``` diff --git a/docs/src/native/descent.md b/docs/src/native/descent.md new file mode 100644 index 000000000..162f8d636 --- /dev/null +++ b/docs/src/native/descent.md @@ -0,0 +1,27 @@ +# Descent Subroutines + +The following subroutines are available for computing the descent direction. + +```@index +Pages = ["descent.md"] +``` + +## Core Subroutines + +```@docs +NewtonDescent +SteepestDescent +DampedNewtonDescent +``` + +## Special Trust Region Descent Subroutines + +```@docs +Dogleg +``` + +## Special Levenberg Marquardt Descent Subroutines + +```@docs +GeodesicAcceleration +``` diff --git a/docs/src/native/diagnostics.md b/docs/src/native/diagnostics.md new file mode 100644 index 000000000..35f11552f --- /dev/null +++ b/docs/src/native/diagnostics.md @@ -0,0 +1,22 @@ +# [Diagnostics API](@id diagnostics_api_reference) + +## Timer Outputs + +These functions are not exported since the names have a potential for conflict. 
+ +```@docs +NonlinearSolve.enable_timer_outputs +NonlinearSolve.disable_timer_outputs +NonlinearSolve.@static_timeit +``` + +## Tracing API + +```@docs +TraceAll +TraceWithJacobianConditionNumber +TraceMinimal +``` + +For details about the arguments refer to the documentation of +[`NonlinearSolve.AbstractNonlinearSolveTraceLevel`](@ref). diff --git a/docs/src/native/globalization.md b/docs/src/native/globalization.md new file mode 100644 index 000000000..d7ff7d684 --- /dev/null +++ b/docs/src/native/globalization.md @@ -0,0 +1,34 @@ +# Globalization Subroutines + +The following globalization subroutines are available. + +```@index +Pages = ["globalization.md"] +``` + +## [Line Search Algorithms](@id line-search) + +```@docs +LiFukushimaLineSearch +LineSearchesJL +RobustNonMonotoneLineSearch +NoLineSearch +``` + +## Radius Update Schemes for Trust Region + +```@docs +RadiusUpdateSchemes +``` + +### Available Radius Update Schemes + +```@docs +RadiusUpdateSchemes.Simple +RadiusUpdateSchemes.Hei +RadiusUpdateSchemes.Yuan +RadiusUpdateSchemes.Bastin +RadiusUpdateSchemes.Fan +RadiusUpdateSchemes.NLsolve +RadiusUpdateSchemes.NocedalWright +``` diff --git a/docs/src/native/simplenonlinearsolve.md b/docs/src/native/simplenonlinearsolve.md new file mode 100644 index 000000000..0ff386898 --- /dev/null +++ b/docs/src/native/simplenonlinearsolve.md @@ -0,0 +1,59 @@ +# SimpleNonlinearSolve.jl + +These methods can be used independently of the rest of NonlinearSolve.jl + +```@index +Pages = ["simplenonlinearsolve.md"] +``` + +## Interval Methods + +These methods are suited for interval (scalar) root-finding problems, +i.e. `IntervalNonlinearProblem`. + +```@docs +ITP +Alefeld +Bisection +Falsi +Ridder +Brent +``` + +## General Methods + +These methods are suited for any general nonlinear root-finding problem, i.e. +`NonlinearProblem`. 
+ +| Solver | In-place | Out of Place | Non-Allocating (Scalars) | Non-Allocating (`SArray`) | +|:------------------------------------ |:-------- |:------------ |:------------------------ |:------------------------- | +| [`SimpleNewtonRaphson`](@ref) | ✔️ | ✔️ | ✔️ | ✔️ | +| [`SimpleBroyden`](@ref) | ✔️ | ✔️ | ✔️ | ✔️ | +| [`SimpleHalley`](@ref) | ❌ | ✔️ | ✔️ | ❌ | +| [`SimpleKlement`](@ref) | ✔️ | ✔️ | ✔️ | ✔️ | +| [`SimpleTrustRegion`](@ref) | ✔️ | ✔️ | ✔️ | ✔️ | +| [`SimpleDFSane`](@ref) | ✔️ | ✔️ | ✔️[^1] | ✔️ | +| [`SimpleLimitedMemoryBroyden`](@ref) | ✔️ | ✔️ | ✔️ | ✔️[^2] | + +The algorithms which are non-allocating can be used directly inside GPU Kernels[^3]. +See [PSOGPU.jl](https://github.com/SciML/PSOGPU.jl) for more details. + +```@docs +SimpleNewtonRaphson +SimpleBroyden +SimpleHalley +SimpleKlement +SimpleTrustRegion +SimpleDFSane +SimpleLimitedMemoryBroyden +``` + +`SimpleGaussNewton` is aliased to [`SimpleNewtonRaphson`](@ref) for solving Nonlinear Least +Squares problems. + +[^1]: Needs [`StaticArrays.jl`](https://github.com/JuliaArrays/StaticArrays.jl) to be + installed and loaded for the non-allocating version. +[^2]: This method is non-allocating if the termination condition is set to either `nothing` + (default) or [`AbsNormTerminationMode`](@ref). +[^3]: Only the defaults are guaranteed to work inside kernels. We try to provide warnings + if the used version is not non-allocating. diff --git a/docs/src/native/solvers.md b/docs/src/native/solvers.md new file mode 100644 index 000000000..d2c0fc6e5 --- /dev/null +++ b/docs/src/native/solvers.md @@ -0,0 +1,89 @@ +# NonlinearSolve.jl Solvers + +These are the native solvers of NonlinearSolve.jl. + +```@index +Pages = ["solvers.md"] +``` + +## General Keyword Arguments + +Several Algorithms share the same specification for common keyword arguments. Those are +documented in this section to avoid repetition. 
Certain algorithms might have additional +considerations for these keyword arguments, which are documented in the algorithm's +documentation. + + - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) solvers used + for the linear solves within the Newton method. Defaults to `nothing`, which means it + uses the LinearSolve.jl default algorithm choice. For more information on available + algorithm choices, see the + [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). + - `precs`: the choice of preconditioners for the linear solver. Defaults to using no + preconditioners. For more information on specifying preconditioners for LinearSolve + algorithms, consult the + [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). + - `linesearch`: the line search algorithm to use. Defaults to [`NoLineSearch()`](@ref), + which means that no line search is performed. Algorithms from + [`LineSearches.jl`](https://github.com/JuliaNLSolvers/LineSearches.jl/) must be + wrapped in [`LineSearchesJL`](@ref) before being supplied. For a detailed documentation + refer to [Line Search Algorithms](@ref line-search). + - `autodiff`/`jacobian_ad`: determines the backend used for the Jacobian. Note that this + argument is ignored if an analytical Jacobian is passed, as that will be used instead. + Defaults to `nothing` which means that a default is selected according to the problem + specification! Valid choices are types from ADTypes.jl. + - `forward_ad`/`jvp_autodiff`: similar to `autodiff`, but is used to compute Jacobian + Vector Products. Ignored if the NonlinearFunction contains the `jvp` function. + - `reverse_ad`/`vjp_autodiff`: similar to `autodiff`, but is used to compute Vector + Jacobian Products. Ignored if the NonlinearFunction contains the `vjp` function. + - `concrete_jac`: whether to build a concrete Jacobian.
If a Krylov-subspace method is + used, then the Jacobian will not be constructed and instead direct Jacobian-Vector + products `J*v` are computed using forward-mode automatic differentiation or finite + differencing tricks (without ever constructing the Jacobian). However, if the Jacobian + is still needed, for example for a preconditioner, `concrete_jac = true` can be passed + in order to force the construction of the Jacobian. + +## Nonlinear Solvers + +```@docs +NewtonRaphson +DFSane +Broyden +Klement +LimitedMemoryBroyden +``` + +## Nonlinear Least Squares Solvers + +```@docs +GaussNewton +``` + +## Both Nonlinear & Nonlinear Least Squares Solvers + +These solvers can be used for both nonlinear and nonlinear least squares problems. + +```@docs +TrustRegion +LevenbergMarquardt +PseudoTransient +``` + +## Polyalgorithms + +```@docs +NonlinearSolvePolyAlgorithm +FastShortcutNonlinearPolyalg +FastShortcutNLLSPolyalg +RobustMultiNewton +``` + +## Advanced Solvers + +All of the previously mentioned solvers are wrappers around the following solvers. These +are meant for advanced users and allow building custom solvers. 
+ +```@docs +ApproximateJacobianSolveAlgorithm +GeneralizedFirstOrderAlgorithm +GeneralizedDFSane +``` diff --git a/docs/src/api/steadystatediffeq.md b/docs/src/native/steadystatediffeq.md similarity index 91% rename from docs/src/api/steadystatediffeq.md rename to docs/src/native/steadystatediffeq.md index 3bfe61c1a..471fc3f01 100644 --- a/docs/src/api/steadystatediffeq.md +++ b/docs/src/native/steadystatediffeq.md @@ -13,6 +13,10 @@ using SteadyStateDiffEq These methods can be used independently of the rest of NonlinearSolve.jl +```@index +Pages = ["steadystatediffeq.md"] +``` + ## Solver API ```@docs diff --git a/docs/src/references.md b/docs/src/references.md new file mode 100644 index 000000000..78f29bc41 --- /dev/null +++ b/docs/src/references.md @@ -0,0 +1,4 @@ +# References + +```@bibliography +``` diff --git a/docs/src/refs.bib b/docs/src/refs.bib new file mode 100644 index 000000000..7d8f48f15 --- /dev/null +++ b/docs/src/refs.bib @@ -0,0 +1,148 @@ +@article{bastin2010retrospective, + title = {A retrospective trust-region method for unconstrained optimization}, + author = {Bastin, Fabian and Malmedy, Vincent and Mouffe, M{\'e}lodie and Toint, Philippe L and Tomanos, Dimitri}, + journal = {Mathematical programming}, + volume = {123}, + pages = {395--418}, + year = {2010}, + publisher = {Springer} +} + +@article{broyden1965class, + title = {A class of methods for solving nonlinear simultaneous equations}, + author = {Broyden, Charles G}, + journal = {Mathematics of computation}, + volume = {19}, + number = {92}, + pages = {577--593}, + year = {1965} +} + +@article{coffey2003pseudotransient, + title = {Pseudotransient continuation and differential-algebraic equations}, + author = {Coffey, Todd S and Kelley, Carl Tim and Keyes, David E}, + journal = {SIAM Journal on Scientific Computing}, + volume = {25}, + number = {2}, + pages = {553--569}, + year = {2003}, + publisher = {SIAM} +} + +@article{fan2006convergence, + title = {Convergence rate of the trust 
region method for nonlinear equations under local error bound condition}, + author = {Fan, Jinyan}, + journal = {Computational Optimization and Applications}, + volume = {34}, + number = {2}, + pages = {215--227}, + year = {2006}, + publisher = {Springer} +} + +@article{fan2016retrospective, + title = {A retrospective trust region algorithm with trust region converging to zero}, + author = {Fan, Jinyan and Pan, Jianyu and Song, Hongyan}, + journal = {Journal of Computational Mathematics}, + volume = {34}, + number = {4}, + pages = {421--436}, + year = {2016}, + publisher = {JSTOR} +} + +@article{hei2003self, + title = {A self-adaptive trust region algorithm}, + author = {Hei, Long}, + journal = {Journal of Computational Mathematics}, + pages = {229--236}, + year = {2003}, + publisher = {JSTOR} +} + +@article{kelley1998convergence, + title = {Convergence analysis of pseudo-transient continuation}, + author = {Kelley, Carl Timothy and Keyes, David E}, + journal = {SIAM Journal on Numerical Analysis}, + volume = {35}, + number = {2}, + pages = {508--523}, + year = {1998}, + publisher = {SIAM} +} + +@article{klement2014using, + title = {On using quasi-newton algorithms of the Broyden class for model-to-test correlation}, + author = {Klement, Jan}, + journal = {Journal of Aerospace Technology and Management}, + volume = {6}, + pages = {407--414}, + year = {2014}, + publisher = {SciELO Brasil} +} + +@article{la2006spectral, + title = {Spectral residual method without gradient information for solving large-scale nonlinear systems of equations}, + author = {La Cruz, William and Mart{\'\i}nez, Jos{\'e} and Raydan, Marcos}, + journal = {Mathematics of computation}, + volume = {75}, + number = {255}, + pages = {1429--1448}, + year = {2006} +} + +@article{lepage2021alternating, + title = {Alternating cyclic extrapolation methods for optimization algorithms}, + author = {Lepage-Saucier, Nicolas}, + journal = {arXiv preprint arXiv:2104.04974}, + year = {2021} +} + 
+@article{li2000derivative, + title = {A derivative-free line search and global convergence of Broyden-like method for nonlinear equations}, + author = {Li, Dong-Hui and Fukushima, Masao}, + journal = {Optimization methods and software}, + volume = {13}, + number = {3}, + pages = {181--201}, + year = {2000}, + publisher = {Taylor \& Francis} +} + +@article{transtrum2012improvements, + title = {Improvements to the Levenberg-Marquardt algorithm for nonlinear least-squares minimization}, + author = {Transtrum, Mark K and Sethna, James P}, + journal = {arXiv preprint arXiv:1201.5885}, + year = {2012} +} + +@article{yuan2015recent, + title = {Recent advances in trust region algorithms}, + author = {Yuan, Ya-xiang}, + journal = {Mathematical Programming}, + volume = {151}, + pages = {249--281}, + year = {2015}, + publisher = {Springer} +} + +@article{yuan2015recent, + title = {Recent advances in trust region algorithms}, + author = {Yuan, Ya-xiang}, + journal = {Mathematical Programming}, + volume = {151}, + pages = {249--281}, + year = {2015}, + publisher = {Springer} +} + +@article{ziani2008autoadaptative, + title = {An autoadaptative limited memory Broyden’s method to solve systems of nonlinear equations}, + author = {Ziani, Mohammed and Guyomarc’h, Fr{\'e}d{\'e}ric}, + journal = {Applied mathematics and computation}, + volume = {205}, + number = {1}, + pages = {202--211}, + year = {2008}, + publisher = {Elsevier} +} \ No newline at end of file diff --git a/docs/src/solvers/BracketingSolvers.md b/docs/src/solvers/BracketingSolvers.md deleted file mode 100644 index af322af74..000000000 --- a/docs/src/solvers/BracketingSolvers.md +++ /dev/null @@ -1,35 +0,0 @@ -# [Interval Rootfinding Methods (Bracketing Solvers)](@id bracketing) - -`solve(prob::IntervalNonlinearProblem, alg; kwargs...)` - -Solves for ``f(t) = 0`` in the problem defined by `prob` using the algorithm `alg`. If no -algorithm is given, a default algorithm will be chosen. 
- -## Recommended Methods - -`ITP()` is the recommended method for the scalar interval root-finding problems. It is -particularly well-suited for cases where the function is smooth and well-behaved; and -achieved superlinear convergence while retaining the optimal worst-case performance of the -Bisection method. For more details, consult the detailed solver API docs. - -`Ridder` is a hybrid method that uses the value of function at the midpoint of the interval -to perform an exponential interpolation to the root. This gives a fast convergence with a -guaranteed convergence of at most twice the number of iterations as the bisection method. - -`Brent` is a combination of the bisection method, the secant method and inverse quadratic -interpolation. At every iteration, Brent's method decides which method out of these three is -likely to do best, and proceeds by doing a step according to that method. This gives a -robust and fast method, which therefore enjoys considerable popularity. - -## Full List of Methods - -### SimpleNonlinearSolve.jl - -These methods are automatically included as part of NonlinearSolve.jl. Though, one can use -SimpleNonlinearSolve.jl directly to decrease the dependencies and improve load time. - - - `ITP`: A non-allocating ITP (Interpolate, Truncate & Project) method - - `Falsi`: A non-allocating regula falsi method - - `Bisection`: A common bisection method - - `Ridder`: A non-allocating Ridder method - - `Brent`: A non-allocating Brent method diff --git a/docs/src/solvers/LineSearch.md b/docs/src/solvers/LineSearch.md deleted file mode 100644 index 5d09301e2..000000000 --- a/docs/src/solvers/LineSearch.md +++ /dev/null @@ -1,14 +0,0 @@ -# [Line Search](@id linesearch) - -A convenience wrapper over `LineSearches.jl` and some native Line Search methods, powered -internally with fast automatic differentiation. 
- -```@docs -LineSearch -``` - -## Native Line Search Methods - -```@docs -LiFukushimaLineSearch -``` diff --git a/docs/src/solvers/NonlinearSystemSolvers.md b/docs/src/solvers/NonlinearSystemSolvers.md deleted file mode 100644 index c15948814..000000000 --- a/docs/src/solvers/NonlinearSystemSolvers.md +++ /dev/null @@ -1,152 +0,0 @@ -# [Nonlinear System Solvers](@id nonlinearsystemsolvers) - -`solve(prob::NonlinearProblem, alg; kwargs)` - -Solves for ``f(u)=0`` in the problem defined by `prob` using the algorithm -`alg`. If no algorithm is given, a default algorithm will be chosen. - -## Recommended Methods - -The default method `FastShortcutNonlinearPolyalg` is a good choice for most problems. It is -a polyalgorithm that attempts to use a fast algorithm (Klement, Broyden) and if that fails -it falls back to a more robust algorithm (`NewtonRaphson`) before falling back the most -robust variant of `TrustRegion`. For basic problems this will be very fast, for harder -problems it will make sure to work. - -If one is looking for more robustness then `RobustMultiNewton` is a good choice. It attempts -a set of the most robust methods in succession and only fails if all of the methods fail to -converge. Additionally, `DynamicSS` can be a good choice for high stability. - -As a balance, `NewtonRaphson` is a good choice for most problems that aren't too difficult -yet need high performance, and `TrustRegion` is a bit less performant but more stable. If -the problem is well-conditioned, `Klement` or `Broyden` may be faster, but highly dependent -on the eigenvalues of the Jacobian being sufficiently small. - -`NewtonRaphson` and `TrustRegion` are designed for for large systems. They can make use of -sparsity patterns for sparse automatic differentiation and sparse linear solving of very -large systems. Meanwhile, `SimpleNewtonRaphson` and `SimpleTrustRegion` are implementations -which are specialized for small equations. 
They are non-allocating on static arrays and thus -really well-optimized for small systems, thus usually outperforming the other methods when -such types are used for `u0`. - -## Full List of Methods - -!!! note - - For the full details on the capabilities and constructors of the different solvers, - see the Detailed Solver APIs section! - -### NonlinearSolve.jl - -These are the core solvers, which excel at large-scale problems that need advanced -linear solver, automatic differentiation, abstract array types, GPU, -sparse/structured matrix support, etc. These methods support the largest set of types and -features, but have a bit of overhead on very small problems. - - - `NewtonRaphson()`:A Newton-Raphson method with swappable nonlinear solvers and autodiff - methods for high performance on large and sparse systems. - - `TrustRegion()`: A Newton Trust Region dogleg method with swappable nonlinear solvers and - autodiff methods for high performance on large and sparse systems. - - `LevenbergMarquardt()`: An advanced Levenberg-Marquardt implementation with the - improvements suggested in the [paper](https://arxiv.org/abs/1201.5885) "Improvements to - the Levenberg-Marquardt algorithm for nonlinear least-squares minimization". Designed for - large-scale and numerically-difficult nonlinear systems. - - `PseudoTransient()`: A pseudo-transient method which mixes the stability of Euler-type - stepping with the convergence speed of a Newton method. Good for highly unstable - systems. - - `RobustMultiNewton()`: A polyalgorithm that mixes highly robust methods (line searches and - trust regions) in order to be as robust as possible for difficult problems. If this method - fails to converge, then one can be pretty certain that most (all?) other choices would - likely fail. - - `FastShortcutNonlinearPolyalg()`: The default method. 
A polyalgorithm that mixes fast methods - with fallbacks to robust methods to allow for solving easy problems quickly without sacrificing - robustness on the hard problems. - - `Broyden()`: Generalization of Broyden's Quasi-Newton Method with Line Search and - Automatic Jacobian Resetting. This is a fast method but unstable when the condition number of - the Jacobian matrix is sufficiently large. - - `Klement()`: Generalization of Klement's Quasi-Newton Method with Line Search and - Automatic Jacobian Resetting. This is a fast method but unstable when the condition number of - the Jacobian matrix is sufficiently large. - - `LimitedMemoryBroyden()`: An advanced version of `LBroyden` which uses a limited memory - Broyden method. This is a fast method but unstable when the condition number of - the Jacobian matrix is sufficiently large. It is recommended to use `Broyden` or - `Klement` instead unless the memory usage is a concern. - -### SimpleNonlinearSolve.jl - -These methods are included with NonlinearSolve.jl by default, though SimpleNonlinearSolve.jl -can be used directly to reduce dependencies and improve load times. SimpleNonlinearSolve.jl's -methods excel at small problems and problems defined with static arrays. - - - `SimpleNewtonRaphson()`: A simplified implementation of the Newton-Raphson method. - - `SimpleBroyden()`: The classic Broyden's quasi-Newton method. - - `SimpleLimitedMemoryBroyden()`: A low-memory Broyden implementation, similar to L-BFGS. This method is - common in machine learning contexts but is known to be unstable in comparison to many - other choices. - - `SimpleKlement()`: A quasi-Newton method due to Klement. It's supposed to be more efficient - than Broyden's method, and it seems to be in the cases that have been tried, but more - benchmarking is required. - - `SimpleTrustRegion()`: A dogleg trust-region Newton method. Improved globalizing stability - for more robust fitting over basic Newton methods, though potentially with a cost. 
- - `SimpleDFSane()`: A low-overhead implementation of the df-sane method for solving - large-scale nonlinear systems of equations. - - `SimpleHalley()`: A low-overhead implementation of the Halley method. This is a higher order - method and thus can converge faster to low tolerances than a Newton method. Requires higher - order derivatives, so best used when automatic differentiation is available. - -!!! note - - When used with certain types for the states `u` such as a `Number` or `StaticArray`, - these solvers are very efficient and non-allocating. These implementations are thus - well-suited for small systems of equations. - -### SteadyStateDiffEq.jl - -SteadyStateDiffEq.jl uses ODE solvers to iteratively approach the steady state. It is a -very stable method for solving nonlinear systems, though often more -computationally expensive than direct methods. - - - `DynamicSS()`: Uses an ODE solver to find the steady state. Automatically terminates when - close to the steady state. - - `SSRootfind()`: Uses a NonlinearSolve compatible solver to find the steady state. - -### NLsolve.jl - -This is a wrapper package for importing solvers from NLsolve.jl into the SciML interface. - - - `NLsolveJL()`: A wrapper for [NLsolve.jl](https://github.com/JuliaNLSolvers/NLsolve.jl) - -Submethod choices for this algorithm include: - - - `:anderson`: Anderson-accelerated fixed-point iteration - - `:newton`: Classical Newton method with an optional line search - - `:trust_region`: Trust region Newton method (the default choice) - -### MINPACK.jl - -MINPACK.jl methods are good for medium-sized nonlinear solves. It does not scale due to -the lack of sparse Jacobian support, though the methods are very robust and stable. - - - `CMINPACK()`: A wrapper for using the classic MINPACK method through [MINPACK.jl](https://github.com/sglyon/MINPACK.jl) - -Submethod choices for this algorithm include: - - - `:hybr`: Modified version of Powell's algorithm. - - `:lm`: Levenberg-Marquardt. 
- - `:lmdif`: Advanced Levenberg-Marquardt - - `:hybrd`: Advanced modified version of Powell's algorithm - -### Sundials.jl - -Sundials.jl are a classic set of C/Fortran methods which are known for good scaling of the -Newton-Krylov form. However, KINSOL is known to be less stable than some other -implementations, as it has no line search or globalizer (trust region). - - - `KINSOL()`: The KINSOL method of the SUNDIALS C library - -### SIAMFANLEquations.jl - -SIAMFANLEquations.jl is a wrapper for the methods in the SIAMFANLEquations.jl library. - - - `SIAMFANLEquationsJL()`: A wrapper for using the methods in - [SIAMFANLEquations.jl](https://github.com/ctkelley/SIAMFANLEquations.jl) diff --git a/docs/src/solvers/SteadyStateSolvers.md b/docs/src/solvers/SteadyStateSolvers.md deleted file mode 100644 index 91776a7d0..000000000 --- a/docs/src/solvers/SteadyStateSolvers.md +++ /dev/null @@ -1,70 +0,0 @@ -# [Steady State Solvers](@id ss_solvers) - -`solve(prob::SteadyStateProblem, alg; kwargs)` - -Solves for the steady states in the problem defined by `prob` using the algorithm -`alg`. If no algorithm is given, a default algorithm will be chosen. - -## Recommended Methods - -Conversion to a NonlinearProblem is generally the fastest method. However, this will not -guarantee the preferred root (the stable equilibrium), and thus if the preferred root is -required, then it's recommended that one uses `DynamicSS`. For `DynamicSS`, often an -adaptive stiff solver, like a Rosenbrock or BDF method (`Rodas5` or `QNDF`), is a good way -to allow for very large time steps as the steady state approaches. - -!!! note - - The SteadyStateDiffEq.jl methods on a `SteadyStateProblem` respect the time definition - in the nonlinear definition, i.e., `u' = f(u, t)` uses the correct values for `t` as the - solution evolves. 
A conversion of a `SteadyStateProblem` to a `NonlinearProblem` - replaces this with the nonlinear system `u' = f(u, ∞)`, and thus the direct - `SteadyStateProblem` approach can give different answers (i.e., the correct unique - fixed point) on ODEs with non-autonomous dynamics. - -!!! note - - If you have an unstable equilibrium and you want to solve for the unstable equilibrium, - then `DynamicSS` might converge to the equilibrium based on the initial condition. - However, Nonlinear Solvers don't suffer from this issue, and thus it's recommended to - use a nonlinear solver if you want to solve for the unstable equilibrium. - -## Full List of Methods - -### Conversion to NonlinearProblem - -Any `SteadyStateProblem` can be trivially converted to a `NonlinearProblem` via -`NonlinearProblem(prob::SteadyStateProblem)`. Using this approach, any of the solvers from -the [Nonlinear System Solvers page](@ref nonlinearsystemsolvers) can be used. As a -convenience, users can use: - - - `SSRootfind`: A wrapper around `NonlinearSolve.jl` compliant solvers which converts - the `SteadyStateProblem` to a `NonlinearProblem` and solves it. - -### SteadyStateDiffEq.jl - -SteadyStateDiffEq.jl uses ODE solvers to iteratively approach the steady state. It is a -very stable method for solving nonlinear systems, -though often computationally more expensive than direct methods. - - - `DynamicSS` : Uses an ODE solver to find the steady state. Automatically terminates - when close to the steady state. `DynamicSS(alg; tspan=Inf)` requires that an ODE - algorithm is given as the first argument. The absolute and relative tolerances specify - the termination conditions on the derivative's closeness to zero. This internally - uses the `TerminateSteadyState` callback from the Callback Library. The simulated time, - for which the ODE is solved, can be limited by `tspan`. If `tspan` is a number, it is - equivalent to passing `(zero(tspan), tspan)`. 
- -Example usage: - -```julia -using NonlinearSolve, SteadyStateDiffEq, OrdinaryDiffEq -sol = solve(prob, DynamicSS(Tsit5())) - -using Sundials -sol = solve(prob, DynamicSS(CVODE_BDF()), dt = 1.0) -``` - -!!! note - - If you use `CVODE_BDF` you may need to give a starting `dt` via `dt=....`. diff --git a/docs/src/solvers/bracketing_solvers.md b/docs/src/solvers/bracketing_solvers.md new file mode 100644 index 000000000..e51f7805a --- /dev/null +++ b/docs/src/solvers/bracketing_solvers.md @@ -0,0 +1,38 @@ +# [Interval Root-Finding Methods (Bracketing Solvers)](@id bracketing) + +```julia +solve(prob::IntervalNonlinearProblem, alg; kwargs...) +``` + +Solves for ``f(t) = 0`` in the problem defined by `prob` using the algorithm `alg`. If no +algorithm is given, a default algorithm will be chosen. + +## Recommended Methods + +[`ITP`](@ref) is the recommended method for the scalar interval root-finding problems. It is +particularly well-suited for cases where the function is smooth and well-behaved; and +achieved superlinear convergence while retaining the optimal worst-case performance of the +Bisection method. For more details, consult the detailed solver API docs. + +[`Ridder`](@ref) is a hybrid method that uses the value of function at the midpoint of the +interval to perform an exponential interpolation to the root. This gives a fast convergence +with a guaranteed convergence of at most twice the number of iterations as the bisection +method. + +[`Brent`](@ref) is a combination of the bisection method, the secant method and inverse +quadratic interpolation. At every iteration, Brent's method decides which method out of +these three is likely to do best, and proceeds by doing a step according to that method. +This gives a robust and fast method, which therefore enjoys considerable popularity. + +## Full List of Methods + +### SimpleNonlinearSolve.jl + +These methods are automatically included as part of NonlinearSolve.jl. 
Though, one can use +SimpleNonlinearSolve.jl directly to decrease the dependencies and improve load time. + + - [`ITP`](@ref): A non-allocating ITP (Interpolate, Truncate & Project) method + - [`Falsi`](@ref): A non-allocating regula falsi method + - [`Bisection`](@ref): A common bisection method + - [`Ridder`](@ref): A non-allocating Ridder method + - [`Brent`](@ref): A non-allocating Brent method diff --git a/docs/src/solvers/FixedPointSolvers.md b/docs/src/solvers/fixed_point_solvers.md similarity index 68% rename from docs/src/solvers/FixedPointSolvers.md rename to docs/src/solvers/fixed_point_solvers.md index 0d5a6f826..220a8a186 100644 --- a/docs/src/solvers/FixedPointSolvers.md +++ b/docs/src/solvers/fixed_point_solvers.md @@ -27,27 +27,29 @@ Using [native NonlinearSolve.jl methods](@ref nonlinearsystemsolvers) is the rec approach. For systems where constructing Jacobian Matrices are expensive, we recommend using a Krylov Method with one of those solvers. -## Full List of Methods +## [Full List of Methods](@id fixed_point_methods_full_list) We are only listing the methods that natively solve fixed point problems. ### SpeedMapping.jl - - `SpeedMappingJL()`: accelerates the convergence of a mapping to a fixed point by the - Alternating cyclic extrapolation algorithm (ACX). + - [`SpeedMappingJL()`](@ref): accelerates the convergence of a mapping to a fixed point by + the Alternating cyclic extrapolation algorithm (ACX). ### FixedPointAcceleration.jl - - `FixedPointAccelerationJL()`: accelerates the convergence of a mapping to a fixed point - by the Anderson acceleration algorithm and a few other methods. + - [`FixedPointAccelerationJL()`](@ref): accelerates the convergence of a mapping to a + fixed point by the Anderson acceleration algorithm and a few other methods. ### NLsolve.jl In our tests, we have found the anderson method implemented here to NOT be the most robust. - - `NLsolveJL(; method = :anderson)`: Anderson acceleration for fixed point problems. 
+ - [`NLsolveJL(; method = :anderson)`](@ref): Anderson acceleration for fixed point + problems. ### SIAMFANLEquations.jl - - `SIAMFANLEquationsJL(; method = :anderson)`: Anderson acceleration for fixed point problems. + - [`SIAMFANLEquationsJL(; method = :anderson)`](@ref): Anderson acceleration for fixed + point problems. diff --git a/docs/src/solvers/NonlinearLeastSquaresSolvers.md b/docs/src/solvers/nonlinear_least_squares_solvers.md similarity index 59% rename from docs/src/solvers/NonlinearLeastSquaresSolvers.md rename to docs/src/solvers/nonlinear_least_squares_solvers.md index 720cdb7f8..c037dd8f5 100644 --- a/docs/src/solvers/NonlinearLeastSquaresSolvers.md +++ b/docs/src/solvers/nonlinear_least_squares_solvers.md @@ -1,30 +1,30 @@ # Nonlinear Least Squares Solvers -`solve(prob::NonlinearLeastSquaresProblem, alg; kwargs...)` +```julia +solve(prob::NonlinearLeastSquaresProblem, alg; kwargs...) +``` Solves the nonlinear least squares problem defined by `prob` using the algorithm `alg`. If no algorithm is given, a default algorithm will be chosen. ## Recommended Methods -The default method `FastShortcutNLLSPolyalg` is a good choice for most problems. It is a -polyalgorithm that attempts to use a fast algorithm (`GaussNewton`) and if that fails it -falls back to a more robust algorithm (`LevenbergMarquardt`). +The default method [`FastShortcutNLLSPolyalg`](@ref) is a good choice for most problems. It +is a polyalgorithm that attempts to use a fast algorithm ([`GaussNewton`](@ref)) and if that +fails it falls back to a more robust algorithms ([`LevenbergMarquardt`](@ref), +[`TrustRegion`](@ref)). ## Full List of Methods ### NonlinearSolve.jl - - `LevenbergMarquardt()`: An advanced Levenberg-Marquardt implementation with the - improvements suggested in the [paper](https://arxiv.org/abs/1201.5885) "Improvements to - the Levenberg-Marquardt algorithm for nonlinear least-squares minimization". Designed - for large-scale and numerically-difficult nonlinear systems. 
- - `GaussNewton()`: An advanced GaussNewton implementation with support for efficient - handling of sparse matrices via colored automatic differentiation and preconditioned - linear solvers. Designed for large-scale and numerically-difficult nonlinear least - squares problems. - - `TrustRegion()`: A Newton Trust Region dogleg method with swappable nonlinear solvers and + - [`LevenbergMarquardt()`](@ref): An advanced Levenberg-Marquardt implementation with the + improvements suggested in the [transtrum2012improvements](@citet). Designed for + large-scale and numerically-difficult nonlinear systems. + - [`GaussNewton()`](@ref): A Gauss-Newton method with swappable nonlinear solvers and autodiff methods for high performance on large and sparse systems. + - [`TrustRegion()`](@ref): A Newton Trust Region dogleg method with swappable nonlinear + solvers and autodiff methods for high performance on large and sparse systems. ### SimpleNonlinearSolve.jl @@ -34,22 +34,23 @@ SimpleNonlinearSolve.jl's methods excel at small problems and problems defined w arrays. - `SimpleGaussNewton()`: Simple Gauss Newton implementation using QR factorizations for - numerical stability. + numerical stability (aliased to [`SimpleNewtonRaphson`](@ref)). -### FastLevenbergMarquardt.jl +### [FastLevenbergMarquardt.jl](@id fastlm_wrapper_summary) A wrapper over [FastLevenbergMarquardt.jl](https://github.com/kamesy/FastLevenbergMarquardt.jl). Note that it is called `FastLevenbergMarquardt` since the original package is called "Fast", though -benchmarks demonstrate `LevenbergMarquardt()` usually outperforms. +benchmarks demonstrate [`LevenbergMarquardt()`](@ref) usually outperforms. - - `FastLevenbergMarquardtJL(linsolve = :cholesky)`, can also choose `linsolve = :qr`. + - [`FastLevenbergMarquardtJL(linsolve = :cholesky)`](@ref), can also choose + `linsolve = :qr`. 
-### LeastSquaresOptim.jl +### [LeastSquaresOptim.jl](@id lso_wrapper_summary) A wrapper over [LeastSquaresOptim.jl](https://github.com/matthieugomez/LeastSquaresOptim.jl). Has a core -algorithm `LeastSquaresOptimJL(alg; linsolve)` where the choices for `alg` are: +algorithm [`LeastSquaresOptimJL(alg; linsolve)`](@ref) where the choices for `alg` are: - `:lm` a Levenberg-Marquardt implementation - `:dogleg` a trust-region dogleg Gauss-Newton @@ -68,7 +69,8 @@ demonstrate that these methods are not robust or stable. In addition, they are s than the standard methods and do not scale due to lack of sparse Jacobian support. Thus they are only recommended for benchmarking and testing code conversions. - - `CMINPACK()`: A wrapper for using the classic MINPACK method through [MINPACK.jl](https://github.com/sglyon/MINPACK.jl) + - [`CMINPACK()`](@ref): A wrapper for using the classic MINPACK method through + [MINPACK.jl](https://github.com/sglyon/MINPACK.jl) Submethod choices for this algorithm include: diff --git a/docs/src/solvers/nonlinear_system_solvers.md b/docs/src/solvers/nonlinear_system_solvers.md new file mode 100644 index 000000000..c0f4164c9 --- /dev/null +++ b/docs/src/solvers/nonlinear_system_solvers.md @@ -0,0 +1,170 @@ +# [Nonlinear System Solvers](@id nonlinearsystemsolvers) + +```julia +solve(prob::NonlinearProblem, alg; kwargs...) +``` + +Solves for ``f(u) = 0`` in the problem defined by `prob` using the algorithm `alg`. If no +algorithm is given, a default algorithm will be chosen. + +## Recommended Methods + +The default method [`FastShortcutNonlinearPolyalg`](@ref) is a good choice for most +problems. It is a polyalgorithm that attempts to use a fast algorithm ([`Klement`](@ref), +[`Broyden`](@ref)) and if that fails it falls back to a more robust algorithm +([`NewtonRaphson`](@ref)) before falling back to the most robust variant of +[`TrustRegion`](@ref). For basic problems this will be very fast, for harder problems it +will make sure to work. 
+ +If one is looking for more robustness then [`RobustMultiNewton`](@ref) is a good choice. It +attempts a set of the most robust methods in succession and only fails if all of the methods +fail to converge. Additionally, [`DynamicSS`](@ref) can be a good choice for high stability +if the root corresponds to a stable equilibrium. + +As a balance, [`NewtonRaphson`](@ref) is a good choice for most problems that aren't too +difficult yet need high performance, and [`TrustRegion`](@ref) is a bit less performant but +more stable. If the problem is well-conditioned, [`Klement`](@ref) or [`Broyden`](@ref) may +be faster, but highly dependent on the eigenvalues of the Jacobian being sufficiently small. + +[`NewtonRaphson`](@ref) and [`TrustRegion`](@ref) are designed for large systems. They +can make use of sparsity patterns for sparse automatic differentiation and sparse linear +solving of very large systems. Meanwhile, [`SimpleNewtonRaphson`](@ref) and +[`SimpleTrustRegion`](@ref) are implementations which are specialized for small equations. +They are non-allocating on static arrays and thus really well-optimized for small systems, +thus usually outperforming the other methods when such types are used for `u0`. +Additionally, these solvers can be used inside GPU kernels. See +[PSOGPU.jl](https://github.com/SciML/PSOGPU.jl) for an example of this. + +## Full List of Methods + +!!! note + + For the full details on the capabilities and constructors of the different solvers, + see the Detailed Solver APIs section! + +### NonlinearSolve.jl + +These are the core solvers, which excel at large-scale problems that need advanced +linear solver, automatic differentiation, abstract array types, GPU, +sparse/structured matrix support, etc. These methods support the largest set of types and +features, but have a bit of overhead on very small problems. 
+ + - [`NewtonRaphson()`](@ref): A Newton-Raphson method with swappable nonlinear solvers and + autodiff methods for high performance on large and sparse systems. + - [`TrustRegion()`](@ref): A Newton Trust Region dogleg method with swappable nonlinear + solvers and autodiff methods for high performance on large and sparse systems. + - [`LevenbergMarquardt()`](@ref): An advanced Levenberg-Marquardt implementation with the + improvements suggested in the [transtrum2012improvements](@citet). Designed for + large-scale and numerically-difficult nonlinear systems. + - [`PseudoTransient()`](@ref): A pseudo-transient method which mixes the stability of + Euler-type stepping with the convergence speed of a Newton method. Good for highly + unstable systems. + - [`RobustMultiNewton()`](@ref): A polyalgorithm that mixes highly robust methods (line + searches and trust regions) in order to be as robust as possible for difficult problems. + If this method fails to converge, then one can be pretty certain that most (all?) other + choices would likely fail. + - [`FastShortcutNonlinearPolyalg()`](@ref): The default method. A polyalgorithm that mixes + fast methods with fallbacks to robust methods to allow for solving easy problems quickly + without sacrificing robustness on the hard problems. + - [`Broyden()`](@ref): Generalization of Broyden's Quasi-Newton Method with Line Search + and Automatic Jacobian Resetting. This is a fast method but unstable when the condition + number of the Jacobian matrix is sufficiently large. + - [`Klement()`](@ref): Generalization of Klement's Quasi-Newton Method with Line Search + and Automatic Jacobian Resetting. This is a fast method but unstable when the condition + number of the Jacobian matrix is sufficiently large. + - [`LimitedMemoryBroyden()`](@ref): An advanced version of + [`SimpleLimitedMemoryBroyden`](@ref) which uses a limited memory Broyden method. 
This is + a fast method but unstable when the condition number of the Jacobian matrix is + sufficiently large. It is recommended to use [`Broyden`](@ref) or [`Klement`](@ref) + instead unless the memory usage is a concern. + +### SimpleNonlinearSolve.jl + +These methods are included with NonlinearSolve.jl by default, though SimpleNonlinearSolve.jl +can be used directly to reduce dependencies and improve load times. +SimpleNonlinearSolve.jl's methods excel at small problems and problems defined with static +arrays. + + - [`SimpleNewtonRaphson()`](@ref): A simplified implementation of the Newton-Raphson + method. + - [`SimpleBroyden()`](@ref): The classic Broyden's quasi-Newton method. + - [`SimpleLimitedMemoryBroyden()`](@ref): A low-memory Broyden implementation, similar to + L-BFGS. This method is common in machine learning contexts but is known to be unstable + in comparison to many other choices. + - [`SimpleKlement()`](@ref): A quasi-Newton method due to Klement. It's supposed to be + more efficient than Broyden's method, and it seems to be in the cases that have been + tried, but more benchmarking is required. + - [`SimpleTrustRegion()`](@ref): A dogleg trust-region Newton method. Improved globalizing + stability for more robust fitting over basic Newton methods, though potentially with a + cost. + - [`SimpleDFSane()`](@ref): A low-overhead implementation of the df-sane method for + solving large-scale nonlinear systems of equations. + - [`SimpleHalley()`](@ref): A low-overhead implementation of the Halley method. This is a + higher order method and thus can converge faster to low tolerances than a Newton method. + Requires higher order derivatives, so best used when automatic differentiation is + available. + +!!! note + + When used with certain types for the states `u` such as a `Number` or `StaticArray`, + these solvers are very efficient and non-allocating. These implementations are thus + well-suited for small systems of equations. 
+ +### SteadyStateDiffEq.jl + +SteadyStateDiffEq.jl uses ODE solvers to iteratively approach the steady state. It is a +very stable method for solving nonlinear systems with stable equilibrium points, though +often more computationally expensive than direct methods. + + - [`DynamicSS()`](@ref): Uses an ODE solver to find the steady state. Automatically + terminates when close to the steady state. + - [`SSRootfind()`](@ref): Uses a NonlinearSolve compatible solver to find the steady + state. + +### NLsolve.jl + +This is a wrapper package for importing solvers from NLsolve.jl into the SciML interface. + + - [`NLsolveJL()`](@ref): A wrapper for + [NLsolve.jl](https://github.com/JuliaNLSolvers/NLsolve.jl) + +Submethod choices for this algorithm include: + + - `:anderson`: Anderson-accelerated fixed-point iteration + - `:newton`: Classical Newton method with an optional line search + - `:trust_region`: Trust region Newton method (the default choice) + +### MINPACK.jl + +MINPACK.jl is a wrapper package for bringing the Fortran solvers from MINPACK. However, our +benchmarks reveal that these methods are rarely competitive with our native solvers. Thus, +our recommendation is to use these only for benchmarking and debugging purposes. + + - [`CMINPACK()`](@ref): A wrapper for using the classic MINPACK method through + [MINPACK.jl](https://github.com/sglyon/MINPACK.jl) + +Submethod choices for this algorithm include: + + - `:hybr`: Modified version of Powell's algorithm. + - `:lm`: Levenberg-Marquardt. + - `:lmdif`: Advanced Levenberg-Marquardt + - `:hybrd`: Advanced modified version of Powell's algorithm + +### Sundials.jl + +Sundials.jl are a classic set of C/Fortran methods which are known for good scaling of the +Newton-Krylov form. However, KINSOL is known to be less stable than some other +implementations. 
+ + - [`KINSOL()`](@ref): The KINSOL method of the SUNDIALS C library + +### SIAMFANLEquations.jl + +SIAMFANLEquations.jl is a wrapper for the methods in the SIAMFANLEquations.jl library. + + - [`SIAMFANLEquationsJL()`](@ref): A wrapper for using the methods in + [SIAMFANLEquations.jl](https://github.com/ctkelley/SIAMFANLEquations.jl) + +Other solvers listed in [Fixed Point Solvers](@ref fixed_point_methods_full_list), +[FastLevenbergMarquardt.jl](@ref fastlm_wrapper_summary) and +[LeastSquaresOptim.jl](@ref lso_wrapper_summary) can also solve nonlinear systems. diff --git a/docs/src/solvers/steady_state_solvers.md b/docs/src/solvers/steady_state_solvers.md new file mode 100644 index 000000000..91530c448 --- /dev/null +++ b/docs/src/solvers/steady_state_solvers.md @@ -0,0 +1,68 @@ +# [Steady State Solvers](@id ss_solvers) + +```julia +solve(prob::SteadyStateProblem, alg; kwargs) +``` + +Solves for the steady states in the problem defined by `prob` using the algorithm `alg`. If +no algorithm is given, a default algorithm will be chosen. + +## Recommended Methods + +Conversion to a NonlinearProblem is generally the fastest method. However, this will not +guarantee the preferred root (the stable equilibrium), and thus if the preferred root is +required, then it's recommended that one uses [`DynamicSS`](@ref). For [`DynamicSS`](@ref), +often an adaptive stiff solver, like a Rosenbrock or BDF method (`Rodas5` or `QNDF`), is a +good way to allow for very large time steps as the steady state approaches. + +The SteadyStateDiffEq.jl methods on a [`SteadyStateProblem`](@ref) respect the time +definition in the nonlinear definition, i.e., `u' = f(u, t)` uses the correct values for +`t` as the solution evolves. 
A conversion of a [`SteadyStateProblem`](@ref) to a +[`NonlinearProblem`](@ref) replaces this with the nonlinear system `u' = f(u, ∞)`, and thus +the direct [`SteadyStateProblem`](@ref) approach can give different answers (i.e., the +correct unique fixed point) on ODEs with non-autonomous dynamics. + +If you have an unstable equilibrium and you want to solve for the unstable equilibrium, +then [`DynamicSS`](@ref) will not converge to that equilibrium for any initial condition. +However, Nonlinear Solvers don't suffer from this issue, and thus it's recommended to +use a nonlinear solver if you want to solve for the unstable equilibrium. + +## Full List of Methods + +### Conversion to NonlinearProblem + +Any [`SteadyStateProblem`](@ref) can be trivially converted to a [`NonlinearProblem`](@ref) +via `NonlinearProblem(prob::SteadyStateProblem)`. Using this approach, any of the solvers +from the [Nonlinear System Solvers page](@ref nonlinearsystemsolvers) can be used. As a +convenience, users can use: + + - [`SSRootfind`](@ref): A wrapper around `NonlinearSolve.jl` compliant solvers which + converts the [`SteadyStateProblem`](@ref) to a [`NonlinearProblem`](@ref) and solves it. + +### SteadyStateDiffEq.jl + +SteadyStateDiffEq.jl uses ODE solvers to iteratively approach the steady state. It is a +very stable method for solving nonlinear systems, +though often computationally more expensive than direct methods. + + - [`DynamicSS`](@ref) : Uses an ODE solver to find the steady state. Automatically + terminates when close to the steady state. `DynamicSS(alg; tspan = Inf)` requires that + an ODE algorithm is given as the first argument. The absolute and relative tolerances + specify the termination conditions on the derivative's closeness to zero. This + internally uses the `TerminateSteadyState` callback from the Callback Library. The + simulated time, for which the ODE is solved, can be limited by `tspan`. 
If `tspan` is a + number, it is equivalent to passing `(zero(tspan), tspan)`. + +Example usage: + +```julia +using NonlinearSolve, SteadyStateDiffEq, OrdinaryDiffEq +sol = solve(prob, DynamicSS(Tsit5())) + +using Sundials +sol = solve(prob, DynamicSS(CVODE_BDF()), dt = 1.0) +``` + +!!! note + + If you use `CVODE_BDF` you may need to give a starting `dt` via `dt=....`. diff --git a/docs/src/tutorials/code_optimization.md b/docs/src/tutorials/code_optimization.md index 1bfc1c302..fa0f61657 100644 --- a/docs/src/tutorials/code_optimization.md +++ b/docs/src/tutorials/code_optimization.md @@ -115,7 +115,7 @@ to normal array expressions, for example: ```@example small_opt using StaticArrays A = SA[2.0, 3.0, 5.0] -typeof(A) # SVector{3, Float64} (alias for SArray{Tuple{3}, Float64, 1, 3}) +typeof(A) ``` Notice that the `3` after `SVector` gives the size of the `SVector`. It cannot be changed. diff --git a/docs/src/tutorials/getting_started.md b/docs/src/tutorials/getting_started.md index 0078aaa16..26bf9faa9 100644 --- a/docs/src/tutorials/getting_started.md +++ b/docs/src/tutorials/getting_started.md @@ -194,8 +194,8 @@ solve(prob, GaussNewton(), reltol = 1e-12, abstol = 1e-12) ## Going Beyond the Basics: How to use the Documentation -Congrats, you now know how to use the basics of NonlinearSolve.jl! However, there is so much more to -see. Next check out: +Congrats, you now know how to use the basics of NonlinearSolve.jl! However, there is so much +more to see. Next check out: - [Some code optimization tricks to know about with NonlinearSolve.jl](@ref code_optimization) - [An iterator interface which lets you step through the solving process step by step](@ref iterator) diff --git a/docs/src/tutorials/iterator_interface.md b/docs/src/tutorials/iterator_interface.md index c0fb914f4..1b6aee101 100644 --- a/docs/src/tutorials/iterator_interface.md +++ b/docs/src/tutorials/iterator_interface.md @@ -1,16 +1,35 @@ # [Nonlinear Solver Iterator Interface](@id iterator) -!!! 
warn - - This iterator interface will be expanded with a `step!` function soon! +There is an iterator form of the nonlinear solver which somewhat mirrors the DiffEq +integrator interface: -There is an iterator form of the nonlinear solver which mirrors the DiffEq integrator interface: - -```@example +```@example iterator_interface using NonlinearSolve + f(u, p) = u .* u .- 2.0 u0 = 1.5 probB = NonlinearProblem(f, u0) -cache = init(probB, NewtonRaphson()) # Can iterate the solver object -solver = solve!(cache) + +nlcache = init(probB, NewtonRaphson()) ``` + +`init` takes the same keyword arguments as [`solve`](@ref solver_options), but it returns a +cache object that satisfies `typeof(nlcache) <: AbstractNonlinearSolveCache` and can be used +to iterate the solver. + +The iterator interface supports: + +```@docs +step!(nlcache::NonlinearSolve.AbstractNonlinearSolveCache, args...; kwargs...) +``` + +We can perform 10 steps of the Newton-Raphson solver with the following: + +```@example iterator_interface +for i in 1:10 + step!(nlcache) +end +``` + +We currently don't implement a `Base.iterate` interface but that will be added in the +future. diff --git a/docs/src/tutorials/large_systems.md b/docs/src/tutorials/large_systems.md index 1eab0d88b..38242c19f 100644 --- a/docs/src/tutorials/large_systems.md +++ b/docs/src/tutorials/large_systems.md @@ -137,11 +137,14 @@ Symbolic Sparsity Detection. 
See the manual entry on using BenchmarkTools # for @btime @btime solve(prob_brusselator_2d, NewtonRaphson()); -@btime solve(prob_brusselator_2d, NewtonRaphson(; autodiff = AutoSparseForwardDiff())); @btime solve(prob_brusselator_2d, - NewtonRaphson(; autodiff = AutoSparseForwardDiff(), linsolve = KLUFactorization())); + NewtonRaphson(; autodiff = AutoSparseForwardDiff(; chunksize = 32))); @btime solve(prob_brusselator_2d, - NewtonRaphson(; autodiff = AutoSparseForwardDiff(), linsolve = KrylovJL_GMRES())); + NewtonRaphson(; autodiff = AutoSparseForwardDiff(; chunksize = 32), + linsolve = KLUFactorization())); +@btime solve(prob_brusselator_2d, + NewtonRaphson(; autodiff = AutoSparseForwardDiff(; chunksize = 32), + linsolve = KrylovJL_GMRES())); nothing # hide ``` @@ -175,7 +178,7 @@ ff = NonlinearFunction(brusselator_2d_loop; sparsity = jac_sparsity) Build the `NonlinearProblem`: ```@example ill_conditioned_nlprob -prob_brusselator_2d_sparse = NonlinearProblem(ff, u0, p) +prob_brusselator_2d_sparse = NonlinearProblem(ff, u0, p; abstol = 1e-10, reltol = 1e-10) ``` Now let's see how the version with sparsity compares to the version without: diff --git a/docs/src/tutorials/optimizing_parameterized_ode.md b/docs/src/tutorials/optimizing_parameterized_ode.md index 6a0740939..d3b409eca 100644 --- a/docs/src/tutorials/optimizing_parameterized_ode.md +++ b/docs/src/tutorials/optimizing_parameterized_ode.md @@ -34,7 +34,6 @@ sol = solve(prob, Tsit5(); saveat = tsteps) # Plot the solution using Plots plot(sol; linewidth = 3) -savefig("LV_ode.png") ``` Let us now formulate the parameter estimation as a Nonlinear Least Squares Problem. @@ -54,7 +53,7 @@ Now, we can use any NLLS solver to solve this problem. ```@example parameterized_ode res = solve(nlls_prob, LevenbergMarquardt(); maxiters = 1000, show_trace = Val(true), - trace_level = TraceAll()) + trace_level = TraceWithJacobianConditionNumber(25)) nothing # hide ``` @@ -66,7 +65,7 @@ We can also use Trust Region methods. 
```@example parameterized_ode res = solve(nlls_prob, TrustRegion(); maxiters = 1000, show_trace = Val(true), - trace_level = TraceAll()) + trace_level = TraceWithJacobianConditionNumber(25)) nothing # hide ``` diff --git a/ext/NonlinearSolveFastLevenbergMarquardtExt.jl b/ext/NonlinearSolveFastLevenbergMarquardtExt.jl index fcda6e34d..2cfb98020 100644 --- a/ext/NonlinearSolveFastLevenbergMarquardtExt.jl +++ b/ext/NonlinearSolveFastLevenbergMarquardtExt.jl @@ -2,8 +2,9 @@ module NonlinearSolveFastLevenbergMarquardtExt using ArrayInterface, NonlinearSolve, SciMLBase import ConcreteStructs: @concrete +import FastClosures: @closure import FastLevenbergMarquardt as FastLM -import FiniteDiff, ForwardDiff +import StaticArraysCore: SArray @inline function _fast_lm_solver(::FastLevenbergMarquardtJL{linsolve}, x) where {linsolve} if linsolve === :cholesky @@ -14,59 +15,54 @@ import FiniteDiff, ForwardDiff throw(ArgumentError("Unknown FastLevenbergMarquardt Linear Solver: $linsolve")) end end +@inline _fast_lm_solver(::FastLevenbergMarquardtJL{linsolve}, ::SArray) where {linsolve} = linsolve -# TODO: Implement reinit -@concrete struct FastLevenbergMarquardtJLCache - f! - J! - prob - alg - lmworkspace - solver - kwargs -end - -function SciMLBase.__init(prob::NonlinearLeastSquaresProblem, +function SciMLBase.__solve(prob::Union{NonlinearLeastSquaresProblem, NonlinearProblem}, alg::FastLevenbergMarquardtJL, args...; alias_u0 = false, abstol = nothing, - reltol = nothing, maxiters = 1000, kwargs...) - # FIXME: Support scalar u0 - prob.u0 isa Number && - throw(ArgumentError("FastLevenbergMarquardtJL does not support scalar `u0`")) - iip = SciMLBase.isinplace(prob) - u = NonlinearSolve.__maybe_unaliased(prob.u0, alias_u0) - fu = NonlinearSolve.evaluate_f(prob, u) - - f! = NonlinearSolve.__make_inplace{iip}(prob.f, nothing) + reltol = nothing, maxiters = 1000, termination_condition = nothing, kwargs...) 
+ NonlinearSolve.__test_termination_condition(termination_condition, + :FastLevenbergMarquardt) + fn, u, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0, + can_handle_oop = Val(prob.u0 isa SArray)) + f = if prob.u0 isa SArray + @closure (u, p) -> fn(u) + else + @closure (du, u, p) -> fn(du, u) + end abstol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u)) reltol = NonlinearSolve.DEFAULT_TOLERANCE(reltol, eltype(u)) - if prob.f.jac === nothing - alg = NonlinearSolve.get_concrete_algorithm(alg, prob) - J! = NonlinearSolve.__construct_jac(prob, alg, u; - can_handle_arbitrary_dims = Val(true)) + _jac_fn = NonlinearSolve.__construct_extension_jac(prob, alg, u, resid; alg.autodiff, + can_handle_oop = Val(prob.u0 isa SArray)) + jac_fn = if prob.u0 isa SArray + @closure (u, p) -> _jac_fn(u) else - J! = NonlinearSolve.__make_inplace{iip}(prob.f.jac, nothing) + @closure (J, u, p) -> _jac_fn(J, u) end - J = similar(u, length(fu), length(u)) + solver_kwargs = (; xtol = reltol, ftol = reltol, gtol = abstol, maxit = maxiters, + alg.factor, alg.factoraccept, alg.factorreject, alg.minscale, alg.maxscale, + alg.factorupdate, alg.minfactor, alg.maxfactor) - solver = _fast_lm_solver(alg, u) - LM = FastLM.LMWorkspace(u, fu, J) + if prob.u0 isa SArray + res, fx, info, iter, nfev, njev = FastLM.lmsolve(f, jac_fn, prob.u0; + solver_kwargs...) + LM, solver = nothing, nothing + else + J = prob.f.jac_prototype === nothing ? similar(u, length(resid), length(u)) : + zero(prob.f.jac_prototype) + solver = _fast_lm_solver(alg, u) + LM = FastLM.LMWorkspace(u, resid, J) - return FastLevenbergMarquardtJLCache(f!, J!, prob, alg, LM, solver, - (; xtol = reltol, ftol = reltol, gtol = abstol, maxit = maxiters, alg.factor, - alg.factoraccept, alg.factorreject, alg.minscale, alg.maxscale, - alg.factorupdate, alg.minfactor, alg.maxfactor)) -end + res, fx, info, iter, nfev, njev, LM, solver = FastLM.lmsolve!(f, jac_fn, LM; + solver, solver_kwargs...) 
+ end -function SciMLBase.solve!(cache::FastLevenbergMarquardtJLCache) - res, fx, info, iter, nfev, njev, LM, solver = FastLM.lmsolve!(cache.f!, cache.J!, - cache.lmworkspace, cache.prob.p; cache.solver, cache.kwargs...) stats = SciMLBase.NLStats(nfev, njev, -1, -1, iter) retcode = info == -1 ? ReturnCode.MaxIters : ReturnCode.Success - return SciMLBase.build_solution(cache.prob, cache.alg, res, fx; - retcode, original = (res, fx, info, iter, nfev, njev, LM, solver), stats) + return SciMLBase.build_solution(prob, alg, res, fx; retcode, + original = (res, fx, info, iter, nfev, njev, LM, solver), stats) end end diff --git a/ext/NonlinearSolveFixedPointAccelerationExt.jl b/ext/NonlinearSolveFixedPointAccelerationExt.jl index 2c7ed376e..0c8ff8371 100644 --- a/ext/NonlinearSolveFixedPointAccelerationExt.jl +++ b/ext/NonlinearSolveFixedPointAccelerationExt.jl @@ -1,23 +1,21 @@ module NonlinearSolveFixedPointAccelerationExt -using NonlinearSolve, FixedPointAcceleration, DiffEqBase, SciMLBase +using NonlinearSolve, FixedPointAcceleration, SciMLBase function SciMLBase.__solve(prob::NonlinearProblem, alg::FixedPointAccelerationJL, args...; abstol = nothing, maxiters = 1000, alias_u0::Bool = false, show_trace::Val{PrintReports} = Val(false), termination_condition = nothing, kwargs...) where {PrintReports} - @assert (termination_condition === - nothing)||(termination_condition isa AbsNormTerminationMode) "FixedPointAccelerationJL does not support termination conditions!" 
- - f, u0 = NonlinearSolve.__construct_f(prob; alias_u0, make_fixed_point = Val(true), - force_oop = Val(true)) + NonlinearSolve.__test_termination_condition(termination_condition, + :FixedPointAccelerationJL) + f, u0, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0, + make_fixed_point = Val(true), force_oop = Val(true)) tol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u0)) - sol = fixed_point(f, u0; Algorithm = alg.algorithm, - ConvergenceMetricThreshold = tol, MaxIter = maxiters, MaxM = alg.m, - ExtrapolationPeriod = alg.extrapolation_period, Dampening = alg.dampening, - PrintReports, ReplaceInvalids = alg.replace_invalids, + sol = fixed_point(f, u0; Algorithm = alg.algorithm, MaxIter = maxiters, MaxM = alg.m, + ConvergenceMetricThreshold = tol, ExtrapolationPeriod = alg.extrapolation_period, + Dampening = alg.dampening, PrintReports, ReplaceInvalids = alg.replace_invalids, ConditionNumberThreshold = alg.condition_number_threshold, quiet_errors = true) if sol.FixedPoint_ === missing @@ -31,10 +29,10 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::FixedPointAccelerationJL resid = NonlinearSolve.evaluate_f(prob, res) converged = maximum(abs, resid) ≤ tol end - return SciMLBase.build_solution(prob, alg, res, resid; + + return SciMLBase.build_solution(prob, alg, res, resid; original = sol, retcode = converged ? 
ReturnCode.Success : ReturnCode.Failure, - stats = SciMLBase.NLStats(sol.Iterations_, 0, 0, 0, sol.Iterations_), - original = sol) + stats = SciMLBase.NLStats(sol.Iterations_, 0, 0, 0, sol.Iterations_)) end end diff --git a/ext/NonlinearSolveLeastSquaresOptimExt.jl b/ext/NonlinearSolveLeastSquaresOptimExt.jl index e50469cec..6ce6eabd4 100644 --- a/ext/NonlinearSolveLeastSquaresOptimExt.jl +++ b/ext/NonlinearSolveLeastSquaresOptimExt.jl @@ -4,20 +4,18 @@ using NonlinearSolve, SciMLBase import ConcreteStructs: @concrete import LeastSquaresOptim as LSO -@inline function _lso_solver(::LeastSquaresOptimJL{alg, linsolve}) where {alg, linsolve} - ls = linsolve === :qr ? LSO.QR() : - (linsolve === :cholesky ? LSO.Cholesky() : - (linsolve === :lsmr ? LSO.LSMR() : nothing)) +@inline function _lso_solver(::LeastSquaresOptimJL{alg, ls}) where {alg, ls} + linsolve = ls === :qr ? LSO.QR() : + (ls === :cholesky ? LSO.Cholesky() : (ls === :lsmr ? LSO.LSMR() : nothing)) if alg === :lm - return LSO.LevenbergMarquardt(ls) + return LSO.LevenbergMarquardt(linsolve) elseif alg === :dogleg - return LSO.Dogleg(ls) + return LSO.Dogleg(linsolve) else throw(ArgumentError("Unknown LeastSquaresOptim Algorithm: $alg")) end end -# TODO: Implement reinit @concrete struct LeastSquaresOptimJLCache prob alg @@ -25,24 +23,30 @@ end kwargs end -function SciMLBase.__init(prob::NonlinearLeastSquaresProblem, alg::LeastSquaresOptimJL, - args...; alias_u0 = false, abstol = nothing, show_trace::Val{ShT} = Val(false), - trace_level = TraceMinimal(), store_trace::Val{StT} = Val(false), maxiters = 1000, - reltol = nothing, kwargs...) where {ShT, StT} - iip = SciMLBase.isinplace(prob) - u = NonlinearSolve.__maybe_unaliased(prob.u0, alias_u0) +function SciMLBase.reinit!(cache::LeastSquaresOptimJLCache, args...; kwargs...) 
+ error("Reinitialization not supported for LeastSquaresOptimJL.") +end + +function SciMLBase.__init(prob::Union{NonlinearLeastSquaresProblem, NonlinearProblem}, + alg::LeastSquaresOptimJL, args...; alias_u0 = false, abstol = nothing, + show_trace::Val{ShT} = Val(false), trace_level = TraceMinimal(), reltol = nothing, + store_trace::Val{StT} = Val(false), maxiters = 1000, + termination_condition = nothing, kwargs...) where {ShT, StT} + NonlinearSolve.__test_termination_condition(termination_condition, :LeastSquaresOptim) + f!, u, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0) abstol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u)) reltol = NonlinearSolve.DEFAULT_TOLERANCE(reltol, eltype(u)) - f! = NonlinearSolve.__make_inplace{iip}(prob.f, prob.p) - g! = NonlinearSolve.__make_inplace{iip}(prob.f.jac, prob.p) - - resid_prototype = prob.f.resid_prototype === nothing ? - (!iip ? prob.f(u, prob.p) : zeros(u)) : prob.f.resid_prototype + if prob.f.jac === nothing && alg.autodiff isa Symbol + lsoprob = LSO.LeastSquaresProblem(; x = u, f!, y = resid, alg.autodiff, + J = prob.f.jac_prototype, output_length = length(resid)) + else + g! 
= NonlinearSolve.__construct_extension_jac(prob, alg, u, resid; alg.autodiff) + lsoprob = LSO.LeastSquaresProblem(; x = u, f!, y = resid, g!, + J = prob.f.jac_prototype, output_length = length(resid)) + end - lsoprob = LSO.LeastSquaresProblem(; x = u, f!, y = resid_prototype, g!, - J = prob.f.jac_prototype, alg.autodiff, output_length = length(resid_prototype)) allocated_prob = LSO.LeastSquaresProblemAllocated(lsoprob, _lso_solver(alg)) return LeastSquaresOptimJLCache(prob, alg, allocated_prob, diff --git a/ext/NonlinearSolveMINPACKExt.jl b/ext/NonlinearSolveMINPACKExt.jl index 0d3b8fc42..a15e8d968 100644 --- a/ext/NonlinearSolveMINPACKExt.jl +++ b/ext/NonlinearSolveMINPACKExt.jl @@ -1,21 +1,17 @@ module NonlinearSolveMINPACKExt -using NonlinearSolve, DiffEqBase, SciMLBase -using MINPACK +using MINPACK, NonlinearSolve, SciMLBase import FastClosures: @closure -function SciMLBase.__solve(prob::Union{NonlinearProblem{uType, iip}, - NonlinearLeastSquaresProblem{uType, iip}}, alg::CMINPACK, args...; - abstol = nothing, maxiters = 1000, alias_u0::Bool = false, - show_trace::Val{ShT} = Val(false), store_trace::Val{StT} = Val(false), - termination_condition = nothing, kwargs...) where {uType, iip, ShT, StT} - @assert (termination_condition === - nothing)||(termination_condition isa AbsNormTerminationMode) "CMINPACK does not support termination conditions!" +function SciMLBase.__solve(prob::Union{NonlinearLeastSquaresProblem, + NonlinearProblem}, alg::CMINPACK, args...; abstol = nothing, maxiters = 1000, + alias_u0::Bool = false, show_trace::Val{ShT} = Val(false), + store_trace::Val{StT} = Val(false), termination_condition = nothing, + kwargs...) where {ShT, StT} + NonlinearSolve.__test_termination_condition(termination_condition, :CMINPACK) - f!_, u0 = NonlinearSolve.__construct_f(prob; alias_u0) - f! 
= @closure (du, u) -> (f!_(du, u); Cint(0)) - - resid = NonlinearSolve.evaluate_f(prob, prob.u0) + _f!, u0, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0) + f! = @closure (du, u) -> (_f!(du, u); Cint(0)) m = length(resid) method = ifelse(alg.method === :auto, @@ -25,13 +21,12 @@ function SciMLBase.__solve(prob::Union{NonlinearProblem{uType, iip}, tracing = alg.tracing || StT tol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u0)) - jac!_ = NonlinearSolve.__construct_jac(prob, alg, u0) - - if jac!_ === nothing + if alg.autodiff === missing && prob.f.jac === nothing original = MINPACK.fsolve(f!, u0, m; tol, show_trace, tracing, method, iterations = maxiters) else - jac! = @closure((J, u)->(jac!_(J, u); Cint(0))) + _jac! = NonlinearSolve.__construct_extension_jac(prob, alg, u0, resid; alg.autodiff) + jac! = @closure (J, u) -> (_jac!(J, u); Cint(0)) original = MINPACK.fsolve(f!, jac!, u0, m; tol, show_trace, tracing, method, iterations = maxiters) end diff --git a/ext/NonlinearSolveNLsolveExt.jl b/ext/NonlinearSolveNLsolveExt.jl index 7d1eff02d..64886c021 100644 --- a/ext/NonlinearSolveNLsolveExt.jl +++ b/ext/NonlinearSolveNLsolveExt.jl @@ -1,43 +1,37 @@ module NonlinearSolveNLsolveExt -using NonlinearSolve, NLsolve, DiffEqBase, SciMLBase +using NonlinearSolve, NLsolve, SciMLBase function SciMLBase.__solve(prob::NonlinearProblem, alg::NLsolveJL, args...; abstol = nothing, maxiters = 1000, alias_u0::Bool = false, - termination_condition = nothing, kwargs...) - @assert (termination_condition === - nothing)||(termination_condition isa AbsNormTerminationMode) "NLsolveJL does not support termination conditions!" + termination_condition = nothing, store_trace::Val{StT} = Val(false), + show_trace::Val{ShT} = Val(false), trace_level = TraceMinimal(), + kwargs...) 
where {StT, ShT} + NonlinearSolve.__test_termination_condition(termination_condition, :NLsolveJL) - f!, u0 = NonlinearSolve.__construct_f(prob; alias_u0) + f!, u0, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0) - # unwrapping alg params - (; method, autodiff, store_trace, extended_trace, linesearch, linsolve, factor, - autoscale, m, beta, show_trace) = alg - - if prob.u0 isa Number - resid = [NonlinearSolve.evaluate_f(prob, first(u0))] + if prob.f.jac === nothing && alg.autodiff isa Symbol + df = OnceDifferentiable(f!, u0, resid; alg.autodiff) else - resid = NonlinearSolve.evaluate_f(prob, prob.u0) - end - - jac! = NonlinearSolve.__construct_jac(prob, alg, u0) - - if jac! === nothing - df = OnceDifferentiable(f!, vec(u0), vec(resid); autodiff) - else - if prob.f.jac_prototype !== nothing - J = zero(prob.f.jac_prototype) - df = OnceDifferentiable(f!, jac!, vec(u0), vec(resid), J) + jac! = NonlinearSolve.__construct_extension_jac(prob, alg, u0, resid; alg.autodiff) + if prob.f.jac_prototype === nothing + J = similar(u0, promote_type(eltype(u0), eltype(resid)), length(u0), + length(resid)) else - df = OnceDifferentiable(f!, jac!, vec(u0), vec(resid)) + J = zero(prob.f.jac_prototype) end + df = OnceDifferentiable(f!, jac!, vec(u0), vec(resid), J) end abstol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u0)) + show_trace = ShT || alg.show_trace + store_trace = StT || alg.store_trace + extended_trace = !(trace_level isa TraceMinimal) || alg.extended_trace - original = nlsolve(df, vec(u0); ftol = abstol, iterations = maxiters, method, - store_trace, extended_trace, linesearch, linsolve, factor, autoscale, m, beta, - show_trace) + original = nlsolve(df, vec(u0); ftol = abstol, iterations = maxiters, alg.method, + store_trace, extended_trace, alg.linesearch, alg.linsolve, alg.factor, + alg.autoscale, alg.m, alg.beta, show_trace) f!(vec(resid), original.zero) u = prob.u0 isa Number ? 
original.zero[1] : reshape(original.zero, size(prob.u0)) diff --git a/ext/NonlinearSolveSIAMFANLEquationsExt.jl b/ext/NonlinearSolveSIAMFANLEquationsExt.jl index 27da9dd81..c313477df 100644 --- a/ext/NonlinearSolveSIAMFANLEquationsExt.jl +++ b/ext/NonlinearSolveSIAMFANLEquationsExt.jl @@ -1,7 +1,7 @@ module NonlinearSolveSIAMFANLEquationsExt -using NonlinearSolve, SciMLBase -using SIAMFANLEquations +using NonlinearSolve, SIAMFANLEquations, SciMLBase +import FastClosures: @closure @inline function __siam_fanl_equations_retcode_mapping(sol) if sol.errcode == 0 @@ -33,19 +33,15 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SIAMFANLEquationsJL, arg abstol = nothing, reltol = nothing, alias_u0::Bool = false, maxiters = 1000, termination_condition = nothing, show_trace::Val{ShT} = Val(false), kwargs...) where {ShT} - @assert (termination_condition === - nothing)||(termination_condition isa AbsNormTerminationMode) "SIAMFANLEquationsJL does not support termination conditions!" + NonlinearSolve.__test_termination_condition(termination_condition, :SIAMFANLEquationsJL) (; method, delta, linsolve, m, beta) = alg - T = eltype(prob.u0) atol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, T) rtol = NonlinearSolve.DEFAULT_TOLERANCE(reltol, T) if prob.u0 isa Number - f = method == :anderson ? 
(du, u) -> (du = prob.f(u, prob.p)) : - ((u) -> prob.f(u, prob.p)) - + f = @closure u -> prob.f(u, prob.p) if method == :newton sol = nsolsc(f, prob.u0; maxit = maxiters, atol, rtol, printerr = ShT) elseif method == :pseudotransient @@ -54,82 +50,64 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SIAMFANLEquationsJL, arg elseif method == :secant sol = secant(f, prob.u0; maxit = maxiters, atol, rtol, printerr = ShT) elseif method == :anderson - f, u = NonlinearSolve.__construct_f(prob; alias_u0, - make_fixed_point = Val(true), can_handle_arbitrary_dims = Val(true)) - sol = aasol(f, [prob.u0], m, __zeros_like(u, 1, 2 * m + 4); maxit = maxiters, - atol, rtol, beta = beta) - end - - retcode = __siam_fanl_equations_retcode_mapping(sol) - stats = __siam_fanl_equations_stats_mapping(method, sol) - resid = NonlinearSolve.evaluate_f(prob, sol.solution[1]) - return SciMLBase.build_solution(prob, alg, sol.solution, resid; retcode, - stats, original = sol) - end - - f!, u = NonlinearSolve.__construct_f(prob; alias_u0, - can_handle_arbitrary_dims = Val(true)) - - # Allocate ahead for function - N = length(u) - FS = __zeros_like(u, N) - - # Jacobian free Newton Krylov - if linsolve !== nothing - # Allocate ahead for Krylov basis - JVS = linsolve == :gmres ? 
__zeros_like(u, N, 3) : __zeros_like(u, N) - # `linsolve` as a Symbol to keep unified interface with other EXTs, - # SIAMFANLEquations directly use String to choose between different linear solvers - linsolve_alg = String(linsolve) - - if method == :newton - sol = nsoli(f!, u, FS, JVS; lsolver = linsolve_alg, maxit = maxiters, atol, - rtol, printerr = ShT) - elseif method == :pseudotransient - sol = ptcsoli(f!, u, FS, JVS; lsolver = linsolve_alg, maxit = maxiters, atol, - rtol, printerr = ShT) - end - - retcode = __siam_fanl_equations_retcode_mapping(sol) - stats = __siam_fanl_equations_stats_mapping(method, sol) - resid = NonlinearSolve.evaluate_f(prob, sol.solution) - return SciMLBase.build_solution(prob, alg, sol.solution, resid; retcode, - stats, original = sol) - end - - # Allocate ahead for Jacobian - FPS = __zeros_like(u, N, N) - - if prob.f.jac === nothing - # Use the built-in Jacobian machinery - if method == :newton - sol = nsol(f!, u, FS, FPS; sham = 1, atol, rtol, maxit = maxiters, - printerr = ShT) - elseif method == :pseudotransient - sol = ptcsol(f!, u, FS, FPS; atol, rtol, maxit = maxiters, - delta0 = delta, printerr = ShT) - elseif method == :anderson - f!, u = NonlinearSolve.__construct_f(prob; alias_u0, - can_handle_arbitrary_dims = Val(true), make_fixed_point = Val(true)) - sol = aasol(f!, u, m, zeros(T, N, 2 * m + 4), atol = atol, rtol = rtol, - maxit = maxiters, beta = beta) + f_aa, u, _ = NonlinearSolve.__construct_extension_f(prob; alias_u0, + make_fixed_point = Val(true)) + sol = aasol(f_aa, u, m, __zeros_like(u, 1, 2 * m + 4); maxit = maxiters, + atol, rtol, beta) end else - AJ!(J, u, x) = prob.f.jac(J, x, prob.p) - if method == :newton - sol = nsol(f!, u, FS, FPS, AJ!; sham = 1, atol, rtol, maxit = maxiters, - printerr = ShT) - elseif method == :pseudotransient - sol = ptcsol(f!, u, FS, FPS, AJ!; atol, rtol, maxit = maxiters, - delta0 = delta, printerr = ShT) + f, u, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0, + 
make_fixed_point = Val(method == :anderson)) + N = length(u) + FS = __zeros_like(u, N) + + # Jacobian Free Newton Krylov + if linsolve !== nothing + # Allocate ahead for Krylov basis + JVS = linsolve == :gmres ? __zeros_like(u, N, 3) : __zeros_like(u, N) + linsolve_alg = String(linsolve) + if method == :newton + sol = nsoli(f, u, FS, JVS; lsolver = linsolve_alg, maxit = maxiters, atol, + rtol, printerr = ShT) + elseif method == :pseudotransient + sol = ptcsoli(f, u, FS, JVS; lsolver = linsolve_alg, maxit = maxiters, + atol, rtol, printerr = ShT) + end + else + if prob.f.jac === nothing && alg.autodiff === missing + FPS = __zeros_like(u, N, N) + if method == :newton + sol = nsol(f, u, FS, FPS; sham = 1, atol, rtol, maxit = maxiters, + printerr = ShT) + elseif method == :pseudotransient + sol = ptcsol(f, u, FS, FPS; atol, rtol, maxit = maxiters, + delta0 = delta, printerr = ShT) + elseif method == :anderson + sol = aasol(f, u, m, zeros(T, N, 2 * m + 4); atol, rtol, + maxit = maxiters, beta) + end + else + FPS = prob.f.jac_prototype !== nothing ? zero(prob.f.jac_prototype) : + __zeros_like(u, N, N) + jac = NonlinearSolve.__construct_extension_jac(prob, alg, u, resid; + alg.autodiff) + AJ! = @closure (J, u, x) -> jac(J, x) + if method == :newton + sol = nsol(f, u, FS, FPS, AJ!; sham = 1, atol, rtol, maxit = maxiters, + printerr = ShT) + elseif method == :pseudotransient + sol = ptcsol(f, u, FS, FPS, AJ!; atol, rtol, maxit = maxiters, + delta0 = delta, printerr = ShT) + end + end end end retcode = __siam_fanl_equations_retcode_mapping(sol) stats = __siam_fanl_equations_stats_mapping(method, sol) - resid = NonlinearSolve.evaluate_f(prob, sol.solution) - return SciMLBase.build_solution(prob, alg, sol.solution, resid; retcode, stats, - original = sol) + res = prob.u0 isa Number && method === :anderson ? 
sol.solution[1] : sol.solution + resid = NonlinearSolve.evaluate_f(prob, res) + return SciMLBase.build_solution(prob, alg, res, resid; retcode, stats, original = sol) end end diff --git a/ext/NonlinearSolveSpeedMappingExt.jl b/ext/NonlinearSolveSpeedMappingExt.jl index 9f15ab97b..23f1cba98 100644 --- a/ext/NonlinearSolveSpeedMappingExt.jl +++ b/ext/NonlinearSolveSpeedMappingExt.jl @@ -1,27 +1,27 @@ module NonlinearSolveSpeedMappingExt -using NonlinearSolve, SpeedMapping, DiffEqBase, SciMLBase +using NonlinearSolve, SciMLBase, SpeedMapping function SciMLBase.__solve(prob::NonlinearProblem, alg::SpeedMappingJL, args...; - abstol = nothing, maxiters = 1000, alias_u0::Bool = false, + abstol = nothing, maxiters = 1000, alias_u0::Bool = false, maxtime = nothing, store_trace::Val{store_info} = Val(false), termination_condition = nothing, kwargs...) where {store_info} - @assert (termination_condition === - nothing)||(termination_condition isa AbsNormTerminationMode) "SpeedMappingJL does not support termination conditions!" + NonlinearSolve.__test_termination_condition(termination_condition, :SpeedMappingJL) - m!, u0 = NonlinearSolve.__construct_f(prob; alias_u0, make_fixed_point = Val(true), - can_handle_arbitrary_dims = Val(true)) + m!, u, resid = NonlinearSolve.__construct_extension_f(prob; alias_u0, + make_fixed_point = Val(true)) + tol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u)) - tol = NonlinearSolve.DEFAULT_TOLERANCE(abstol, eltype(u0)) + time_limit = ifelse(maxtime === nothing, alg.time_limit, maxtime) - sol = speedmapping(u0; m!, tol, Lp = Inf, maps_limit = maxiters, alg.orders, - alg.check_obj, store_info, alg.σ_min, alg.stabilize) + sol = speedmapping(u; m!, tol, Lp = Inf, maps_limit = maxiters, alg.orders, + alg.check_obj, store_info, alg.σ_min, alg.stabilize, time_limit) res = prob.u0 isa Number ? 
first(sol.minimizer) : sol.minimizer resid = NonlinearSolve.evaluate_f(prob, res) - return SciMLBase.build_solution(prob, alg, res, resid; + return SciMLBase.build_solution(prob, alg, res, resid; original = sol, retcode = sol.converged ? ReturnCode.Success : ReturnCode.Failure, - stats = SciMLBase.NLStats(sol.maps, 0, 0, 0, sol.maps), original = sol) + stats = SciMLBase.NLStats(sol.maps, 0, 0, 0, sol.maps)) end end diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl index 9b8786380..2f3d0cf13 100644 --- a/src/NonlinearSolve.jl +++ b/src/NonlinearSolve.jl @@ -8,33 +8,27 @@ import Reexport: @reexport import PrecompileTools: @recompile_invalidations, @compile_workload, @setup_workload @recompile_invalidations begin - using ADTypes, DiffEqBase, LazyArrays, LineSearches, LinearAlgebra, LinearSolve, Printf, - SciMLBase, SimpleNonlinearSolve, SparseArrays, SparseDiffTools, StaticArrays - - import ADTypes: AbstractFiniteDifferencesMode - import ArrayInterface: undefmatrix, restructure, can_setindex, - matrix_colors, parameterless_type, ismutable, issingular, fast_scalar_indexing - import ConcreteStructs: @concrete - import EnumX: @enumx - import FastBroadcast: @.. 
- import FastClosures: @closure + using ADTypes, ConcreteStructs, DiffEqBase, FastBroadcast, FastClosures, LazyArrays, + LineSearches, LinearAlgebra, LinearSolve, MaybeInplace, Preferences, Printf, + SciMLBase, SimpleNonlinearSolve, SparseArrays, SparseDiffTools + + import ArrayInterface: undefmatrix, can_setindex, restructure, fast_scalar_indexing + import DiffEqBase: AbstractNonlinearTerminationMode, + AbstractSafeNonlinearTerminationMode, AbstractSafeBestNonlinearTerminationMode, + NonlinearSafeTerminationReturnCode, get_termination_mode import FiniteDiff import ForwardDiff import ForwardDiff: Dual import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A - import MaybeInplace: setindex_trait, @bb, CanSetindex, CannotSetindex - import RecursiveArrayTools: ArrayPartition, - AbstractVectorOfArray, recursivecopy!, recursivefill! - import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace - import SciMLOperators: FunctionOperator - import StaticArrays: StaticArray, SVector, SArray, MArray, Size, SMatrix, MMatrix - import UnPack: @unpack + import RecursiveArrayTools: recursivecopy!, recursivefill! 
+ + import SciMLBase: AbstractNonlinearAlgorithm, JacobianWrapper, AbstractNonlinearProblem, + AbstractSciMLOperator, NLStats, _unwrap_val, has_jac, isinplace + import SparseDiffTools: AbstractSparsityDetection + import StaticArraysCore: StaticArray, SVector, SArray, MArray, Size, SMatrix, MMatrix end @reexport using ADTypes, LineSearches, SciMLBase, SimpleNonlinearSolve -import DiffEqBase: AbstractNonlinearTerminationMode, - AbstractSafeNonlinearTerminationMode, AbstractSafeBestNonlinearTerminationMode, - NonlinearSafeTerminationReturnCode, get_termination_mode const AbstractSparseADType = Union{ADTypes.AbstractSparseFiniteDifferences, ADTypes.AbstractSparseForwardMode, ADTypes.AbstractSparseReverseMode} @@ -42,149 +36,47 @@ const AbstractSparseADType = Union{ADTypes.AbstractSparseFiniteDifferences, # Type-Inference Friendly Check for Extension Loading is_extension_loaded(::Val) = false -abstract type AbstractNonlinearSolveLineSearchAlgorithm end - -abstract type AbstractNonlinearSolveAlgorithm <: AbstractNonlinearAlgorithm end -abstract type AbstractNewtonAlgorithm{CJ, AD} <: AbstractNonlinearSolveAlgorithm end - -abstract type AbstractNonlinearSolveCache{iip} end - -isinplace(::AbstractNonlinearSolveCache{iip}) where {iip} = iip - -function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache{iip}, u0 = get_u(cache); - p = cache.p, abstol = cache.abstol, reltol = cache.reltol, - maxiters = cache.maxiters, alias_u0 = false, termination_condition = missing, - kwargs...) where {iip} - cache.p = p - if iip - recursivecopy!(get_u(cache), u0) - cache.f(get_fu(cache), get_u(cache), p) - else - cache.u = __maybe_unaliased(u0, alias_u0) - set_fu!(cache, cache.f(cache.u, p)) - end - - reset!(cache.trace) - - # Some algorithms store multiple termination caches - if hasfield(typeof(cache), :tc_cache) - # TODO: We need an efficient way to reset this upstream - tc = termination_condition === missing ? 
get_termination_mode(cache.tc_cache) : - termination_condition - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, get_fu(cache), - get_u(cache), tc) - cache.tc_cache = tc_cache - end - - if hasfield(typeof(cache), :ls_cache) - # TODO: A more efficient way to do this - cache.ls_cache = init_linesearch_cache(cache.alg.linesearch, cache.f, - get_u(cache), p, get_fu(cache), Val(iip)) - end - - hasfield(typeof(cache), :uf) && cache.uf !== nothing && (cache.uf.p = p) - - cache.abstol = abstol - cache.reltol = reltol - cache.maxiters = maxiters - cache.stats.nf = 1 - cache.stats.nsteps = 1 - cache.force_stop = false - cache.retcode = ReturnCode.Default - - __reinit_internal!(cache; u0, p, abstol, reltol, maxiters, alias_u0, - termination_condition, kwargs...) - - return cache -end - -__reinit_internal!(::AbstractNonlinearSolveCache; kwargs...) = nothing - -function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm) - str = "$(nameof(typeof(alg)))(" - modifiers = String[] - if __getproperty(alg, Val(:ad)) !== nothing - push!(modifiers, "ad = $(nameof(typeof(alg.ad)))()") - end - if __getproperty(alg, Val(:linsolve)) !== nothing - push!(modifiers, "linsolve = $(nameof(typeof(alg.linsolve)))()") - end - if __getproperty(alg, Val(:linesearch)) !== nothing - ls = alg.linesearch - if ls isa LineSearch - ls.method !== nothing && - push!(modifiers, "linesearch = $(nameof(typeof(ls.method)))()") - else - push!(modifiers, "linesearch = $(nameof(typeof(alg.linesearch)))()") - end - end - append!(modifiers, __alg_print_modifiers(alg)) - if __getproperty(alg, Val(:radius_update_scheme)) !== nothing - push!(modifiers, "radius_update_scheme = $(alg.radius_update_scheme)") - end - str = str * join(modifiers, ", ") - print(io, "$(str))") - return nothing -end - -__alg_print_modifiers(_) = String[] - -function SciMLBase.__solve(prob::Union{NonlinearProblem, NonlinearLeastSquaresProblem}, - alg::AbstractNonlinearSolveAlgorithm, args...; kwargs...) 
- cache = init(prob, alg, args...; kwargs...) - return solve!(cache) -end - -function not_terminated(cache::AbstractNonlinearSolveCache) - return !cache.force_stop && cache.stats.nsteps < cache.maxiters -end - -get_fu(cache::AbstractNonlinearSolveCache) = cache.fu -set_fu!(cache::AbstractNonlinearSolveCache, fu) = (cache.fu = fu) -get_u(cache::AbstractNonlinearSolveCache) = cache.u -SciMLBase.set_u!(cache::AbstractNonlinearSolveCache, u) = (cache.u = u) - -function SciMLBase.solve!(cache::AbstractNonlinearSolveCache) - while not_terminated(cache) - perform_step!(cache) - cache.stats.nsteps += 1 - end - - # The solver might have set a different `retcode` - if cache.retcode == ReturnCode.Default - if cache.stats.nsteps == cache.maxiters - cache.retcode = ReturnCode.MaxIters - else - cache.retcode = ReturnCode.Success - end - end - - trace = __getproperty(cache, Val{:trace}()) - if trace !== nothing - update_trace!(trace, cache.stats.nsteps, get_u(cache), get_fu(cache), nothing, - nothing, nothing; last = Val(true)) - end - - return SciMLBase.build_solution(cache.prob, cache.alg, get_u(cache), get_fu(cache); - cache.retcode, cache.stats, trace) -end +const True = Val(true) +const False = Val(false) + +include("abstract_types.jl") +include("timer_outputs.jl") +include("internal/helpers.jl") + +include("descent/newton.jl") +include("descent/steepest.jl") +include("descent/dogleg.jl") +include("descent/damped_newton.jl") +include("descent/geodesic_acceleration.jl") + +include("internal/operators.jl") +include("internal/jacobian.jl") +include("internal/forward_diff.jl") +include("internal/linear_solve.jl") +include("internal/termination.jl") +include("internal/tracing.jl") +include("internal/approximate_initialization.jl") + +include("globalization/line_search.jl") +include("globalization/trust_region.jl") + +include("core/generic.jl") +include("core/approximate_jacobian.jl") +include("core/generalized_first_order.jl") +include("core/spectral_methods.jl") + 
+include("algorithms/raphson.jl") +include("algorithms/pseudo_transient.jl") +include("algorithms/broyden.jl") +include("algorithms/klement.jl") +include("algorithms/lbroyden.jl") +include("algorithms/dfsane.jl") +include("algorithms/gauss_newton.jl") +include("algorithms/levenberg_marquardt.jl") +include("algorithms/trust_region.jl") +include("algorithms/extension_algs.jl") include("utils.jl") -include("function_wrappers.jl") -include("trace.jl") -include("extension_algs.jl") -include("linesearch.jl") -include("raphson.jl") -include("trustRegion.jl") -include("levenberg.jl") -include("gaussnewton.jl") -include("dfsane.jl") -include("pseudotransient.jl") -include("broyden.jl") -include("klement.jl") -include("lbroyden.jl") -include("jacobian.jl") -include("ad.jl") include("default.jl") @setup_workload begin @@ -220,31 +112,43 @@ include("default.jl") push!(probs_nlls, NonlinearLeastSquaresProblem(fn, u0, 2.0f0)) end - nlls_algs = (LevenbergMarquardt(), GaussNewton(), + nlls_algs = (LevenbergMarquardt(), GaussNewton(), TrustRegion(), LevenbergMarquardt(; linsolve = LUFactorization()), - GaussNewton(; linsolve = LUFactorization())) + GaussNewton(; linsolve = LUFactorization()), + TrustRegion(; linsolve = LUFactorization()), nothing) @compile_workload begin for prob in probs_nls, alg in nls_algs - solve(prob, alg, abstol = 1e-2) + solve(prob, alg; abstol = 1e-2) end for prob in probs_nlls, alg in nlls_algs - solve(prob, alg, abstol = 1e-2) + solve(prob, alg; abstol = 1e-2) end end end -export RadiusUpdateSchemes - -export NewtonRaphson, TrustRegion, LevenbergMarquardt, DFSane, GaussNewton, PseudoTransient, - Broyden, Klement, LimitedMemoryBroyden -export LeastSquaresOptimJL, - FastLevenbergMarquardtJL, CMINPACK, NLsolveJL, FixedPointAccelerationJL, SpeedMappingJL, - SIAMFANLEquationsJL +# Core Algorithms +export NewtonRaphson, PseudoTransient, Klement, Broyden, LimitedMemoryBroyden, DFSane +export GaussNewton, LevenbergMarquardt, TrustRegion export 
NonlinearSolvePolyAlgorithm, RobustMultiNewton, FastShortcutNonlinearPolyalg, FastShortcutNLLSPolyalg -export LineSearch, LiFukushimaLineSearch +# Extension Algorithms +export LeastSquaresOptimJL, FastLevenbergMarquardtJL, CMINPACK, NLsolveJL, + FixedPointAccelerationJL, SpeedMappingJL, SIAMFANLEquationsJL + +# Advanced Algorithms -- Without Bells and Whistles +export GeneralizedFirstOrderAlgorithm, ApproximateJacobianSolveAlgorithm, GeneralizedDFSane + +# Descent Algorithms +export NewtonDescent, SteepestDescent, Dogleg, DampedNewtonDescent, + GeodesicAcceleration + +# Globalization +## Line Search Algorithms +export LineSearchesJL, NoLineSearch, RobustNonMonotoneLineSearch, LiFukushimaLineSearch +## Trust Region Algorithms +export RadiusUpdateSchemes # Export the termination conditions from DiffEqBase export SteadyStateDiffEqTerminationMode, SimpleNonlinearSolveTerminationMode, diff --git a/src/abstract_types.jl b/src/abstract_types.jl new file mode 100644 index 000000000..f0324ed41 --- /dev/null +++ b/src/abstract_types.jl @@ -0,0 +1,474 @@ +function __internal_init end +function __internal_solve! end + +""" + AbstractDescentAlgorithm + +Given the Jacobian `J` and the residual `fu`, this type of algorithm computes the descent +direction `δu`. + +For non-square Jacobian problems, if we need to solve a linear solve problem, we use a least +squares solver by default, unless the provided `linsolve` can't handle non-square matrices, +in which case we use the normal form equations ``JᵀJ δu = Jᵀ fu``. Note that this +factorization is often the faster choice, but it is not as numerically stable as the least +squares solver. + +### `__internal_init` specification + +```julia +__internal_init(prob::NonlinearProblem{uType, iip}, alg::AbstractDescentAlgorithm, J, fu, u; + pre_inverted::Val{INV} = Val(false), linsolve_kwargs = (;), abstol = nothing, + reltol = nothing, alias_J::Bool = true, shared::Val{N} = Val(1), + kwargs...) 
where {INV, N, uType, iip} --> AbstractDescentCache + +__internal_init(prob::NonlinearLeastSquaresProblem{uType, iip}, + alg::AbstractDescentAlgorithm, J, fu, u; pre_inverted::Val{INV} = Val(false), + linsolve_kwargs = (;), abstol = nothing, reltol = nothing, alias_J::Bool = true, + shared::Val{N} = Val(1), kwargs...) where {INV, N, uType, iip} --> AbstractDescentCache +``` + + - `pre_inverted`: whether or not the Jacobian has been pre_inverted. Defaults to `False`. + Note that for most algorithms except `NewtonDescent` setting it to `Val(true)` is + generally a bad idea. + - `linsolve_kwargs`: keyword arguments to pass to the linear solver. Defaults to `(;)`. + - `abstol`: absolute tolerance for the linear solver. Defaults to `nothing`. + - `reltol`: relative tolerance for the linear solver. Defaults to `nothing`. + - `alias_J`: whether or not to alias the Jacobian. Defaults to `true`. + - `shared`: Store multiple descent directions in the cache. Allows efficient and correct + reuse of factorizations if needed, + +Some of the algorithms also allow additional keyword arguments. See the documentation for +the specific algorithm for more information. + +### Interface Functions + + - `supports_trust_region(alg)`: whether or not the algorithm supports trust region + methods. Defaults to `false`. + - `supports_line_search(alg)`: whether or not the algorithm supports line search + methods. Defaults to `false`. + +See also [`NewtonDescent`](@ref), [`Dogleg`](@ref), [`SteepestDescent`](@ref), +[`DampedNewtonDescent`](@ref). +""" +abstract type AbstractDescentAlgorithm end + +supports_trust_region(::AbstractDescentAlgorithm) = false +supports_line_search(::AbstractDescentAlgorithm) = false + +get_linear_solver(alg::AbstractDescentAlgorithm) = __getproperty(alg, Val(:linsolve)) + +""" + AbstractDescentCache + +Abstract Type for all Descent Caches. 
+ +### `__internal_solve!` specification + +```julia +δu, success, intermediates = __internal_solve!(cache::AbstractDescentCache, J, fu, u, + idx::Val; skip_solve::Bool = false, kwargs...) +``` + + - `J`: Jacobian or Inverse Jacobian (if `pre_inverted = Val(true)`). + - `fu`: residual. + - `u`: current state. + - `idx`: index of the descent problem to solve and return. Defaults to `Val(1)`. + - `skip_solve`: Skip the direction computation and return the previous direction. + Defaults to `false`. This is useful for Trust Region Methods where the previous + direction was rejected and we want to try with a modified trust region. + - `kwargs`: keyword arguments to pass to the linear solver if there is one. + +#### Returned values + + - `δu`: the descent direction. + - `success`: Certain Descent Algorithms can reject a descent direction for example + `GeodesicAcceleration`. + - `intermediates`: A named tuple containing intermediates computed during the solve. + For example, `GeodesicAcceleration` returns `NamedTuple{(:v, :a)}` containing the + "velocity" and "acceleration" terms. + +### Interface Functions + + - `get_du(cache)`: get the descent direction. + - `get_du(cache, ::Val{N})`: get the `N`th descent direction. + - `set_du!(cache, δu)`: set the descent direction. + - `set_du!(cache, δu, ::Val{N})`: set the `N`th descent direction. + - `last_step_accepted(cache)`: whether or not the last step was accepted. Checks if the + cache has a `last_step_accepted` field and returns it if it does, else returns `true`. 
+""" +abstract type AbstractDescentCache end + +SciMLBase.get_du(cache::AbstractDescentCache) = cache.δu +SciMLBase.get_du(cache::AbstractDescentCache, ::Val{1}) = get_du(cache) +SciMLBase.get_du(cache::AbstractDescentCache, ::Val{N}) where {N} = cache.δus[N - 1] +set_du!(cache::AbstractDescentCache, δu) = (cache.δu = δu) +set_du!(cache::AbstractDescentCache, δu, ::Val{1}) = set_du!(cache, δu) +set_du!(cache::AbstractDescentCache, δu, ::Val{N}) where {N} = (cache.δus[N - 1] = δu) + +function last_step_accepted(cache::AbstractDescentCache) + hasfield(typeof(cache), :last_step_accepted) && return cache.last_step_accepted + return true +end + +""" + AbstractNonlinearSolveLineSearchAlgorithm + +Abstract Type for all Line Search Algorithms used in NonlinearSolve.jl. + +### `__internal_init` specification + +```julia +__internal_init(prob::AbstractNonlinearProblem, + alg::AbstractNonlinearSolveLineSearchAlgorithm, f::F, fu, u, p, args...; + internalnorm::IN = DEFAULT_NORM, + kwargs...) where {F, IN} --> AbstractNonlinearSolveLineSearchCache +``` +""" +abstract type AbstractNonlinearSolveLineSearchAlgorithm end + +""" + AbstractNonlinearSolveLineSearchCache + +Abstract Type for all Line Search Caches used in NonlinearSolve.jl. + +### `__internal_solve!` specification + +```julia +__internal_solve!(cache::AbstractNonlinearSolveLineSearchCache, u, du; kwargs...) +``` + +Returns 2 values: + + - `unsuccessful`: If `true` it means that the Line Search Failed. + - `alpha`: The step size. +""" +abstract type AbstractNonlinearSolveLineSearchCache end + +function reinit_cache!(cache::AbstractNonlinearSolveLineSearchCache, args...; p = cache.p, + kwargs...) + cache.nf[] = 0 + cache.p = p +end + +""" + AbstractNonlinearSolveAlgorithm{name} <: AbstractNonlinearAlgorithm + +Abstract Type for all NonlinearSolve.jl Algorithms. `name` can be used to define custom +dispatches by wrapped solvers. 
+ +### Interface Functions + + - `concrete_jac(alg)`: whether or not the algorithm uses a concrete Jacobian. Defaults + to `nothing`. + - `get_name(alg)`: get the name of the algorithm. +""" +abstract type AbstractNonlinearSolveAlgorithm{name} <: AbstractNonlinearAlgorithm end + +""" + concrete_jac(alg::AbstractNonlinearSolveAlgorithm) + +Whether the algorithm uses a concrete Jacobian. Defaults to `nothing` if it is unknown or +not applicable. Else a boolean value is returned. +""" +concrete_jac(::AbstractNonlinearSolveAlgorithm) = nothing + +function Base.show(io::IO, alg::AbstractNonlinearSolveAlgorithm{name}) where {name} + __show_algorithm(io, alg, name, 0) +end + +get_name(::AbstractNonlinearSolveAlgorithm{name}) where {name} = name + +""" + AbstractNonlinearSolveExtensionAlgorithm <: AbstractNonlinearSolveAlgorithm{:Extension} + +Abstract Type for all NonlinearSolve.jl Extension Algorithms, i.e. wrappers over 3rd party +solvers. +""" +abstract type AbstractNonlinearSolveExtensionAlgorithm <: + AbstractNonlinearSolveAlgorithm{:Extension} end + +""" + AbstractNonlinearSolveCache{iip, timeit} + +Abstract Type for all NonlinearSolve.jl Caches. + +### Interface Functions + + - `get_fu(cache)`: get the residual. + - `get_u(cache)`: get the current state. + - `set_fu!(cache, fu)`: set the residual. + - `set_u!(cache, u)`: set the current state. + - `reinit!(cache, u0; kwargs...)`: reinitialize the cache with the initial state `u0` and + any additional keyword arguments. + - `step!(cache; kwargs...)`: See [`SciMLBase.step!`](@ref) for more details. + - `not_terminated(cache)`: whether or not the solver has terminated. + - `isinplace(cache)`: whether or not the solver is inplace. 
+""" +abstract type AbstractNonlinearSolveCache{iip, timeit} end + +SciMLBase.isinplace(::AbstractNonlinearSolveCache{iip}) where {iip} = iip + +get_fu(cache::AbstractNonlinearSolveCache) = cache.fu +get_u(cache::AbstractNonlinearSolveCache) = cache.u +set_fu!(cache::AbstractNonlinearSolveCache, fu) = (cache.fu = fu) +SciMLBase.set_u!(cache::AbstractNonlinearSolveCache, u) = (cache.u = u) + +function SciMLBase.reinit!(cache::AbstractNonlinearSolveCache, u0; kwargs...) + return reinit_cache!(cache; u0, kwargs...) +end + +""" + AbstractLinearSolverCache <: Function + +Abstract Type for all Linear Solvers used in NonlinearSolve.jl. +""" +abstract type AbstractLinearSolverCache <: Function end + +""" + AbstractDampingFunction + +Abstract Type for Damping Functions in DampedNewton. + +### `__internal_init` specification + +```julia +__internal_init(prob::AbstractNonlinearProblem, f::AbstractDampingFunction, initial_damping, + J, fu, u, args...; internal_norm = DEFAULT_NORM, + kwargs...) --> AbstractDampingFunctionCache +``` + +Returns a [`AbstractDampingFunctionCache`](@ref). +""" +abstract type AbstractDampingFunction end + +""" + AbstractDampingFunctionCache + +Abstract Type for the Caches created by AbstractDampingFunctions + +### Interface Functions + + - `requires_normal_form_jacobian(f)`: whether or not the Jacobian is needed in normal + form. No default. + - `requires_normal_form_rhs(f)`: whether or not the residual is needed in normal form. + No default. + - `returns_norm_form_damping(f)`: whether or not the damping function returns the + damping factor in normal form. Defaults to `requires_normal_form_jacobian(f) || requires_normal_form_rhs(f)`. + - `(cache::AbstractDampingFunctionCache)(::Nothing)`: returns the damping factor. The type + of the damping factor returned from `solve!` is guaranteed to be the same as this. + +### `__internal_solve!` specification + +```julia +__internal_solve!(cache::AbstractDampingFunctionCache, J, fu, args...; kwargs...) 
+``` + +Returns the damping factor. +""" +abstract type AbstractDampingFunctionCache end + +function requires_normal_form_jacobian end +function requires_normal_form_rhs end +function returns_norm_form_damping(f::F) where {F} + return requires_normal_form_jacobian(f) || requires_normal_form_rhs(f) +end + +""" + AbstractNonlinearSolveOperator <: SciMLBase.AbstractSciMLOperator + +NonlinearSolve.jl houses a few custom operators. These will eventually be moved out but till +then this serves as the abstract type for them. +""" +abstract type AbstractNonlinearSolveOperator{T} <: SciMLBase.AbstractSciMLOperator{T} end + +# Approximate Jacobian Algorithms +""" + AbstractApproximateJacobianStructure + +Abstract Type for all Approximate Jacobian Structures used in NonlinearSolve.jl. + +### Interface Functions + + - `stores_full_jacobian(alg)`: whether or not the algorithm stores the full Jacobian. + Defaults to `false`. + - `get_full_jacobian(cache, alg, J)`: get the full Jacobian. Defaults to throwing an + error if `stores_full_jacobian(alg)` is `false`. +""" +abstract type AbstractApproximateJacobianStructure end + +stores_full_jacobian(::AbstractApproximateJacobianStructure) = false +function get_full_jacobian(cache, alg::AbstractApproximateJacobianStructure, J) + stores_full_jacobian(alg) && return J + error("This algorithm does not store the full Jacobian. Define `get_full_jacobian` for \ + this algorithm.") +end + +""" + AbstractJacobianInitialization + +Abstract Type for all Jacobian Initialization Algorithms used in NonlinearSolve.jl. + +### Interface Functions + + - `jacobian_initialized_preinverted(alg)`: whether or not the Jacobian is initialized + preinverted. Defaults to `false`. + +### `__internal_init` specification + +```julia +__internal_init(prob::AbstractNonlinearProblem, alg::AbstractJacobianInitialization, + solver, f::F, fu, u, p; linsolve = missing, internalnorm::IN = DEFAULT_NORM, + kwargs...) 
+``` + +Returns a [`NonlinearSolve.InitializedApproximateJacobianCache`](@ref). + +All subtypes need to define +`(cache::InitializedApproximateJacobianCache)(alg::NewSubType, fu, u)` which reinitializes +the Jacobian in `cache.J`. +""" +abstract type AbstractJacobianInitialization end + +function Base.show(io::IO, alg::AbstractJacobianInitialization) + modifiers = String[] + hasfield(typeof(alg), :structure) && + push!(modifiers, "structure = $(nameof(typeof(alg.structure)))()") + print(io, "$(nameof(typeof(alg)))($(join(modifiers, ", ")))") + return nothing +end + +jacobian_initialized_preinverted(::AbstractJacobianInitialization) = false + +""" + AbstractApproximateJacobianUpdateRule{INV} + +Abstract Type for all Approximate Jacobian Update Rules used in NonlinearSolve.jl. + +### Interface Functions + + - `store_inverse_jacobian(alg)`: Return `INV` + +### `__internal_init` specification + +```julia +__internal_init(prob::AbstractNonlinearProblem, + alg::AbstractApproximateJacobianUpdateRule, J, fu, u, du, args...; + internalnorm::F = DEFAULT_NORM, + kwargs...) where {F} --> AbstractApproximateJacobianUpdateRuleCache{INV} +``` +""" +abstract type AbstractApproximateJacobianUpdateRule{INV} end + +store_inverse_jacobian(::AbstractApproximateJacobianUpdateRule{INV}) where {INV} = INV + +""" + AbstractApproximateJacobianUpdateRuleCache{INV} + +Abstract Type for all Approximate Jacobian Update Rule Caches used in NonlinearSolve.jl. + +### Interface Functions + + - `store_inverse_jacobian(alg)`: Return `INV` + +### `__internal_solve!` specification + +```julia +__internal_solve!(cache::AbstractApproximateJacobianUpdateRuleCache, J, fu, u, du; + kwargs...) --> J / J⁻¹ +``` +""" +abstract type AbstractApproximateJacobianUpdateRuleCache{INV} end + +store_inverse_jacobian(::AbstractApproximateJacobianUpdateRuleCache{INV}) where {INV} = INV + +""" + AbstractResetCondition + +Condition for resetting the Jacobian in Quasi-Newton's methods. 
+
+### `__internal_init` specification
+
+```julia
+__internal_init(alg::AbstractResetCondition, J, fu, u, du, args...;
+    kwargs...) --> ResetCache
+```
+
+### `__internal_solve!` specification
+
+```julia
+__internal_solve!(cache::ResetCache, J, fu, u, du) --> Bool
+```
+"""
+abstract type AbstractResetCondition end
+
+"""
+    AbstractTrustRegionMethod
+
+Abstract Type for all Trust Region Methods used in NonlinearSolve.jl.
+
+### `__internal_init` specification
+
+```julia
+__internal_init(prob::AbstractNonlinearProblem, alg::AbstractTrustRegionMethod,
+    f::F, fu, u, p, args...; internalnorm::IF = DEFAULT_NORM,
+    kwargs...) where {F, IF} --> AbstractTrustRegionMethodCache
+```
+"""
+abstract type AbstractTrustRegionMethod end
+
+"""
+    AbstractTrustRegionMethodCache
+
+Abstract Type for all Trust Region Method Caches used in NonlinearSolve.jl.
+
+### Interface Functions
+
+  - `last_step_accepted(cache)`: whether or not the last step was accepted. Defaults to
+    `cache.last_step_accepted`. Should be overloaded if the field is not present.
+
+### `__internal_solve!` specification
+
+```julia
+__internal_solve!(cache::AbstractTrustRegionMethodCache, J, fu, u, δu, descent_stats)
+```
+
+Returns `last_step_accepted`, updated `u_cache` and `fu_cache`. If the last step was
+accepted then these values should be copied into the toplevel cache.
+"""
+abstract type AbstractTrustRegionMethodCache end
+
+last_step_accepted(cache::AbstractTrustRegionMethodCache) = cache.last_step_accepted
+
+"""
+    AbstractNonlinearSolveJacobianCache{iip} <: Function
+
+Abstract Type for all Jacobian Caches used in NonlinearSolve.jl.
+"""
+abstract type AbstractNonlinearSolveJacobianCache{iip} <: Function end
+
+SciMLBase.isinplace(::AbstractNonlinearSolveJacobianCache{iip}) where {iip} = iip
+
+"""
+    AbstractNonlinearSolveTraceLevel
+
+### Common Arguments
+
+  - `freq`: Sets both `print_frequency` and `store_frequency` to `freq`.
+ +### Common Keyword Arguments + + - `print_frequency`: Print the trace every `print_frequency` iterations if + `show_trace == Val(true)`. + - `store_frequency`: Store the trace every `store_frequency` iterations if + `store_trace == Val(true)`. +""" +abstract type AbstractNonlinearSolveTraceLevel end + +# Default Printing +for aType in (AbstractTrustRegionMethod, AbstractNonlinearSolveLineSearchAlgorithm, + AbstractResetCondition, AbstractApproximateJacobianUpdateRule, AbstractDampingFunction, + AbstractNonlinearSolveExtensionAlgorithm) + @eval function Base.show(io::IO, alg::$(aType)) + print(io, "$(nameof(typeof(alg)))()") + end +end diff --git a/src/ad.jl b/src/ad.jl deleted file mode 100644 index b1ca26378..000000000 --- a/src/ad.jl +++ /dev/null @@ -1,138 +0,0 @@ -function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, <:AbstractArray}, - iip, <:Union{<:Dual{T, V, P}, <:AbstractArray{<:Dual{T, V, P}}}}, - alg::Union{Nothing, AbstractNonlinearAlgorithm}, args...; - kwargs...) where {T, V, P, iip} - sol, partials = __nlsolve_ad(prob, alg, args...; kwargs...) - dual_soln = __nlsolve_dual_soln(sol.u, partials, prob.p) - return SciMLBase.build_solution(prob, alg, dual_soln, sol.resid; sol.retcode, sol.stats, - sol.original) -end - -@concrete mutable struct NonlinearSolveForwardDiffCache - cache - prob - alg - p - values_p - partials_p -end - -@inline function __has_duals(::Union{<:Dual{T, V, P}, - <:AbstractArray{<:Dual{T, V, P}}}) where {T, V, P} - return true -end -@inline __has_duals(::Any) = false - -function SciMLBase.reinit!(cache::NonlinearSolveForwardDiffCache; p = cache.p, - u0 = get_u(cache.cache), kwargs...) - inner_cache = SciMLBase.reinit!(cache.cache; p = value(p), u0 = value(u0), kwargs...) 
- cache.cache = inner_cache - cache.p = p - cache.values_p = value(p) - cache.partials_p = ForwardDiff.partials(p) - return cache -end - -function SciMLBase.init(prob::NonlinearProblem{<:Union{Number, <:AbstractArray}, - iip, <:Union{<:Dual{T, V, P}, <:AbstractArray{<:Dual{T, V, P}}}}, - alg::Union{Nothing, AbstractNonlinearAlgorithm}, args...; - kwargs...) where {T, V, P, iip} - p = value(prob.p) - newprob = NonlinearProblem(prob.f, value(prob.u0), p; prob.kwargs...) - cache = init(newprob, alg, args...; kwargs...) - return NonlinearSolveForwardDiffCache(cache, newprob, alg, prob.p, p, - ForwardDiff.partials(prob.p)) -end - -function SciMLBase.solve!(cache::NonlinearSolveForwardDiffCache) - sol = solve!(cache.cache) - prob = cache.prob - - uu = sol.u - f_p = __nlsolve_∂f_∂p(prob, prob.f, uu, cache.values_p) - f_x = __nlsolve_∂f_∂u(prob, prob.f, uu, cache.values_p) - - z_arr = -f_x \ f_p - - sumfun = ((z, p),) -> map(zᵢ -> zᵢ * ForwardDiff.partials(p), z) - if cache.p isa Number - partials = sumfun((z_arr, cache.p)) - else - partials = sum(sumfun, zip(eachcol(z_arr), cache.p)) - end - - dual_soln = __nlsolve_dual_soln(sol.u, partials, cache.p) - return SciMLBase.build_solution(prob, cache.alg, dual_soln, sol.resid; sol.retcode, - sol.stats, sol.original) -end - -function __nlsolve_ad(prob::NonlinearProblem{uType, iip}, alg, args...; - kwargs...) where {uType, iip} - p = value(prob.p) - newprob = NonlinearProblem(prob.f, value(prob.u0), p; prob.kwargs...) - - sol = solve(newprob, alg, args...; kwargs...) 
- - uu = sol.u - f_p = __nlsolve_∂f_∂p(prob, prob.f, uu, p) - f_x = __nlsolve_∂f_∂u(prob, prob.f, uu, p) - - z_arr = -f_x \ f_p - - pp = prob.p - sumfun = ((z, p),) -> map(zᵢ -> zᵢ * ForwardDiff.partials(p), z) - if uu isa Number - partials = sum(sumfun, zip(z_arr, pp)) - elseif p isa Number - partials = sumfun((z_arr, pp)) - else - partials = sum(sumfun, zip(eachcol(z_arr), pp)) - end - - return sol, partials -end - -@inline function __nlsolve_∂f_∂p(prob, f::F, u, p) where {F} - if isinplace(prob) - __f = p -> begin - du = similar(u, promote_type(eltype(u), eltype(p))) - f(du, u, p) - return du - end - else - __f = Base.Fix1(f, u) - end - if p isa Number - return __reshape(ForwardDiff.derivative(__f, p), :, 1) - elseif u isa Number - return __reshape(ForwardDiff.gradient(__f, p), 1, :) - else - return ForwardDiff.jacobian(__f, p) - end -end - -@inline function __nlsolve_∂f_∂u(prob, f::F, u, p) where {F} - if isinplace(prob) - du = similar(u) - __f = (du, u) -> f(du, u, p) - ForwardDiff.jacobian(__f, du, u) - else - __f = Base.Fix2(f, p) - if u isa Number - return ForwardDiff.derivative(__f, u) - else - return ForwardDiff.jacobian(__f, u) - end - end -end - -@inline function __nlsolve_dual_soln(u::Number, partials, - ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P} - return Dual{T, V, P}(u, partials) -end - -@inline function __nlsolve_dual_soln(u::AbstractArray, partials, - ::Union{<:AbstractArray{<:Dual{T, V, P}}, Dual{T, V, P}}) where {T, V, P} - _partials = _restructure(u, partials) - return map(((uᵢ, pᵢ),) -> Dual{T, V, P}(uᵢ, pᵢ), zip(u, _partials)) -end diff --git a/src/algorithms/broyden.jl b/src/algorithms/broyden.jl new file mode 100644 index 000000000..1d063c6c0 --- /dev/null +++ b/src/algorithms/broyden.jl @@ -0,0 +1,225 @@ +""" + Broyden(; max_resets::Int = 100, linesearch = NoLineSearch(), reset_tolerance = nothing, + init_jacobian::Val = Val(:identity), autodiff = nothing, alpha = nothing) + +An implementation of `Broyden`'s 
Method [broyden1965class](@cite) with resetting and line +search. + +### Keyword Arguments + + - `max_resets`: the maximum number of resets to perform. Defaults to `100`. + + - `reset_tolerance`: the tolerance for the reset check. Defaults to + `sqrt(eps(real(eltype(u))))`. + - `alpha`: If `init_jacobian` is set to `Val(:identity)`, then the initial Jacobian + inverse is set to be `(αI)⁻¹`. Defaults to `nothing` which implies + `α = max(norm(u), 1) / (2 * norm(fu))`. + - `init_jacobian`: the method to use for initializing the jacobian. Defaults to + `Val(:identity)`. Choices include: + + + `Val(:identity)`: Identity Matrix. + + `Val(:true_jacobian)`: True Jacobian. This is a good choice for differentiable + problems. + - `update_rule`: Update Rule for the Jacobian. Choices are: + + + `Val(:good_broyden)`: Good Broyden's Update Rule + + `Val(:bad_broyden)`: Bad Broyden's Update Rule + + `Val(:diagonal)`: Only update the diagonal of the Jacobian. This algorithm may be + useful for specific problems, but whether it will work may depend strongly on the + problem +""" +function Broyden(; max_resets = 100, linesearch = NoLineSearch(), reset_tolerance = nothing, + init_jacobian::Val{IJ} = Val(:identity), autodiff = nothing, alpha = nothing, + update_rule::Val{UR} = Val(:good_broyden)) where {IJ, UR} + if IJ === :identity + if UR === :diagonal + initialization = IdentityInitialization(alpha, DiagonalStructure()) + else + initialization = IdentityInitialization(alpha, FullStructure()) + end + elseif IJ === :true_jacobian + initialization = TrueJacobianInitialization(FullStructure(), autodiff) + else + throw(ArgumentError("`init_jacobian` must be one of `:identity` or \ + `:true_jacobian`")) + end + + update_rule = if UR === :good_broyden + GoodBroydenUpdateRule() + elseif UR === :bad_broyden + BadBroydenUpdateRule() + elseif UR === :diagonal + GoodBroydenUpdateRule() + else + throw(ArgumentError("`update_rule` must be one of `:good_broyden`, `:bad_broyden`, \ + or 
`:diagonal`")) + end + + return ApproximateJacobianSolveAlgorithm{IJ === :true_jacobian, :Broyden}(; linesearch, + descent = NewtonDescent(), update_rule, max_resets, initialization, + reinit_rule = NoChangeInStateReset(; reset_tolerance)) +end + +# Checks for no significant change for `nsteps` +""" + NoChangeInStateReset(; nsteps::Int = 3, reset_tolerance = nothing, + check_du::Bool = true, check_dfu::Bool = true) + +Recommends a reset if the state or the function value has not changed significantly in +`nsteps` steps. This is used in [`Broyden`](@ref). + +### Keyword Arguments + + - `nsteps`: the number of steps to check for no change. Defaults to `3`. + - `reset_tolerance`: the tolerance for the reset check. Defaults to + `sqrt(eps(real(eltype(u))))`. + - `check_du`: whether to check the state. Defaults to `true`. + - `check_dfu`: whether to check the function value. Defaults to `true`. +""" +@kwdef @concrete struct NoChangeInStateReset <: AbstractResetCondition + nsteps::Int = 3 + reset_tolerance = nothing + check_du::Bool = true + check_dfu::Bool = true +end + +@concrete mutable struct NoChangeInStateResetCache + dfu + reset_tolerance + check_du + check_dfu + nsteps::Int + steps_since_change_du::Int + steps_since_change_dfu::Int +end + +function reinit_cache!(cache::NoChangeInStateResetCache, args...; kwargs...) + cache.steps_since_change_du = 0 + cache.steps_since_change_dfu = 0 +end + +function __internal_init(alg::NoChangeInStateReset, J, fu, u, du, args...; kwargs...) + if alg.check_dfu + @bb dfu = copy(fu) + else + dfu = fu + end + T = real(eltype(u)) + tol = alg.reset_tolerance === nothing ? 
eps(T)^(3 // 4) : T(alg.reset_tolerance) + return NoChangeInStateResetCache(dfu, tol, alg.check_du, alg.check_dfu, alg.nsteps, 0, + 0) +end + +function __internal_solve!(cache::NoChangeInStateResetCache, J, fu, u, du) + reset_tolerance = cache.reset_tolerance + if cache.check_du + if any(@closure(x->abs(x) ≤ reset_tolerance), du) + cache.steps_since_change_du += 1 + if cache.steps_since_change_du ≥ cache.nsteps + cache.steps_since_change_du = 0 + cache.steps_since_change_dfu = 0 + return true + end + else + cache.steps_since_change_du = 0 + cache.steps_since_change_dfu = 0 + end + end + if cache.check_dfu + @bb @. cache.dfu = fu - cache.dfu + if any(@closure(x->abs(x) ≤ reset_tolerance), cache.dfu) + cache.steps_since_change_dfu += 1 + if cache.steps_since_change_dfu ≥ cache.nsteps + cache.steps_since_change_dfu = 0 + cache.steps_since_change_du = 0 + @bb copyto!(cache.dfu, fu) + return true + end + else + cache.steps_since_change_dfu = 0 + cache.steps_since_change_du = 0 + end + @bb copyto!(cache.dfu, fu) + end + return false +end + +# Broyden Update Rules +""" + BadBroydenUpdateRule() + +Broyden Update Rule corresponding to "bad broyden's method" [broyden1965class](@cite). +""" +@concrete struct BadBroydenUpdateRule <: AbstractApproximateJacobianUpdateRule{true} end + +""" + GoodBroydenUpdateRule() + +Broyden Update Rule corresponding to "good broyden's method" [broyden1965class](@cite). +""" +@concrete struct GoodBroydenUpdateRule <: AbstractApproximateJacobianUpdateRule{true} end + +@concrete mutable struct BroydenUpdateRuleCache{mode} <: + AbstractApproximateJacobianUpdateRuleCache{true} + J⁻¹dfu + dfu + u_cache + du_cache + internalnorm +end + +function __internal_init(prob::AbstractNonlinearProblem, + alg::Union{GoodBroydenUpdateRule, BadBroydenUpdateRule}, J, fu, u, du, args...; + internalnorm::F = DEFAULT_NORM, kwargs...) 
where {F} + @bb J⁻¹dfu = similar(u) + @bb dfu = copy(fu) + if alg isa GoodBroydenUpdateRule || J isa Diagonal + @bb u_cache = similar(u) + else + u_cache = nothing + end + if J isa Diagonal + du_cache = nothing + else + @bb du_cache = similar(du) + end + mode = alg isa GoodBroydenUpdateRule ? :good : :bad + return BroydenUpdateRuleCache{mode}(J⁻¹dfu, dfu, u_cache, du_cache, internalnorm) +end + +function __internal_solve!(cache::BroydenUpdateRuleCache{mode}, J⁻¹, fu, u, du) where {mode} + T = eltype(u) + @bb @. cache.dfu = fu - cache.dfu + @bb cache.J⁻¹dfu = J⁻¹ × vec(cache.dfu) + if mode === :good + @bb cache.u_cache = transpose(J⁻¹) × vec(du) + denom = dot(du, cache.J⁻¹dfu) + rmul = transpose(_vec(cache.u_cache)) + else + denom = cache.internalnorm(cache.dfu)^2 + rmul = transpose(_vec(cache.dfu)) + end + @bb @. cache.du_cache = (du - cache.J⁻¹dfu) / ifelse(iszero(denom), T(1e-5), denom) + @bb J⁻¹ += vec(cache.du_cache) × rmul + @bb copyto!(cache.dfu, fu) + return J⁻¹ +end + +function __internal_solve!(cache::BroydenUpdateRuleCache{mode}, J⁻¹::Diagonal, fu, u, + du) where {mode} + T = eltype(u) + @bb @. cache.dfu = fu - cache.dfu + J⁻¹_diag = _restructure(cache.dfu, diag(J⁻¹)) + if mode === :good + @bb @. cache.J⁻¹dfu = J⁻¹_diag * cache.dfu * du + denom = sum(cache.J⁻¹dfu) + @bb @. J⁻¹_diag += (du - J⁻¹_diag * cache.dfu) * du * J⁻¹_diag / + ifelse(iszero(denom), T(1e-5), denom) + else + denom = cache.internalnorm(cache.dfu)^2 + @bb @. 
J⁻¹_diag += (du - J⁻¹_diag * cache.dfu) * cache.dfu / + ifelse(iszero(denom), T(1e-5), denom) + end + @bb copyto!(cache.dfu, fu) + return Diagonal(J⁻¹_diag) +end diff --git a/src/algorithms/dfsane.jl b/src/algorithms/dfsane.jl new file mode 100644 index 000000000..17bdcac55 --- /dev/null +++ b/src/algorithms/dfsane.jl @@ -0,0 +1,25 @@ +""" + DFSane(; σ_min = 1 // 10^10, σ_max = 1e10, σ_1 = 1, M::Int = 10, γ = 1 // 10^4, + τ_min = 1 // 10, τ_max = 1 // 2, n_exp::Int = 2, max_inner_iterations::Int = 100, + η_strategy = (fn_1, n, x_n, f_n) -> fn_1 / n^2) + +A low-overhead and allocation-free implementation of the df-sane method for solving +large-scale nonlinear systems of equations. For in depth information about all the +parameters and the algorithm, see [la2006spectral](@citet). + +### Keyword Arguments + + - `σ_min`: the minimum value of the spectral coefficient `σₙ` which is related to the step + size in the algorithm. Defaults to `1e-10`. + - `σ_max`: the maximum value of the spectral coefficient `σₙ` which is related to the step + size in the algorithm. Defaults to `1e10`. + +For other keyword arguments, see [`RobustNonMonotoneLineSearch`](@ref). 
+""" +function DFSane(; σ_min = 1 // 10^10, σ_max = 1e10, σ_1 = 1, M::Int = 10, γ = 1 // 10^4, + τ_min = 1 // 10, τ_max = 1 // 2, n_exp::Int = 2, max_inner_iterations::Int = 100, + η_strategy::ETA = (fn_1, n, x_n, f_n) -> fn_1 / n^2) where {ETA} + linesearch = RobustNonMonotoneLineSearch(; gamma = γ, sigma_1 = σ_1, M, tau_min = τ_min, + tau_max = τ_max, n_exp, η_strategy, maxiters = max_inner_iterations) + return GeneralizedDFSane{:DFSane}(linesearch, σ_min, σ_max, nothing) +end diff --git a/src/extension_algs.jl b/src/algorithms/extension_algs.jl similarity index 81% rename from src/extension_algs.jl rename to src/algorithms/extension_algs.jl index 8d7397ea0..57b24eab6 100644 --- a/src/extension_algs.jl +++ b/src/algorithms/extension_algs.jl @@ -1,4 +1,4 @@ -# This file only include the algorithm struct to be exported by LinearSolve.jl. The main +# This file only include the algorithm struct to be exported by NonlinearSolve.jl. The main # functionality is implemented as package extensions """ LeastSquaresOptimJL(alg = :lm; linsolve = nothing, autodiff::Symbol = :central) @@ -6,9 +6,12 @@ Wrapper over [LeastSquaresOptim.jl](https://github.com/matthieugomez/LeastSquaresOptim.jl) for solving `NonlinearLeastSquaresProblem`. -## Arguments: +### Arguments - `alg`: Algorithm to use. Can be `:lm` or `:dogleg`. + +### Keyword Arguments + - `linsolve`: Linear solver to use. Can be `:qr`, `:cholesky` or `:lsmr`. If `nothing`, then `LeastSquaresOptim.jl` will choose the best linear solver based on the Jacobian structure. @@ -19,14 +22,14 @@ for solving `NonlinearLeastSquaresProblem`. This algorithm is only available if `LeastSquaresOptim.jl` is installed. 
""" -struct LeastSquaresOptimJL{alg, linsolve} <: AbstractNonlinearSolveAlgorithm - autodiff::Symbol +struct LeastSquaresOptimJL{alg, linsolve} <: AbstractNonlinearSolveExtensionAlgorithm + autodiff end -function LeastSquaresOptimJL(alg = :lm; linsolve = nothing, autodiff::Symbol = :central) +function LeastSquaresOptimJL(alg = :lm; linsolve = nothing, autodiff = :central) @assert alg in (:lm, :dogleg) @assert linsolve === nothing || linsolve in (:qr, :cholesky, :lsmr) - @assert autodiff in (:central, :forward) + autodiff isa Symbol && @assert autodiff in (:central, :forward) if Base.get_extension(@__MODULE__, :NonlinearSolveLeastSquaresOptimExt) === nothing error("LeastSquaresOptimJL requires LeastSquaresOptim.jl to be loaded") @@ -36,30 +39,37 @@ function LeastSquaresOptimJL(alg = :lm; linsolve = nothing, autodiff::Symbol = : end """ - FastLevenbergMarquardtJL(linsolve = :cholesky; autodiff = nothing) + FastLevenbergMarquardtJL(linsolve::Symbol = :cholesky; factor = 1e-6, + factoraccept = 13.0, factorreject = 3.0, factorupdate = :marquardt, + minscale = 1e-12, maxscale = 1e16, minfactor = 1e-28, maxfactor = 1e32, + autodiff = nothing) Wrapper over [FastLevenbergMarquardt.jl](https://github.com/kamesy/FastLevenbergMarquardt.jl) -for solving `NonlinearLeastSquaresProblem`. +for solving `NonlinearLeastSquaresProblem`. For details about the other keyword arguments +see the documentation for `FastLevenbergMarquardt.jl`. !!! warning This is not really the fastest solver. It is called that since the original package is called "Fast". `LevenbergMarquardt()` is almost always a better choice. -## Arguments: +### Arguments - `linsolve`: Linear solver to use. Can be `:qr` or `:cholesky`. + +### Keyword Arguments + - `autodiff`: determines the backend used for the Jacobian. Note that this argument is ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to `nothing` which means that a default is selected according to the problem specification! 
- Valid choices are `nothing`, `AutoForwardDiff` or `AutoFiniteDiff`. !!! note This algorithm is only available if `FastLevenbergMarquardt.jl` is installed. """ -@concrete struct FastLevenbergMarquardtJL{linsolve} <: AbstractNonlinearSolveAlgorithm - ad +@concrete struct FastLevenbergMarquardtJL{linsolve} <: + AbstractNonlinearSolveExtensionAlgorithm + autodiff factor factoraccept factorreject @@ -70,20 +80,12 @@ for solving `NonlinearLeastSquaresProblem`. maxfactor end -function set_ad(alg::FastLevenbergMarquardtJL{linsolve}, ad) where {linsolve} - return FastLevenbergMarquardtJL{linsolve}(ad, alg.factor, alg.factoraccept, - alg.factorreject, alg.factorupdate, alg.minscale, alg.maxscale, alg.minfactor, - alg.maxfactor) -end - function FastLevenbergMarquardtJL(linsolve::Symbol = :cholesky; factor = 1e-6, factoraccept = 13.0, factorreject = 3.0, factorupdate = :marquardt, minscale = 1e-12, maxscale = 1e16, minfactor = 1e-28, maxfactor = 1e32, autodiff = nothing) @assert linsolve in (:qr, :cholesky) @assert factorupdate in (:marquardt, :nielson) - @assert autodiff === nothing || autodiff isa AutoFiniteDiff || - autodiff isa AutoForwardDiff if Base.get_extension(@__MODULE__, :NonlinearSolveFastLevenbergMarquardtExt) === nothing error("FastLevenbergMarquardtJL requires FastLevenbergMarquardt.jl to be loaded") @@ -94,13 +96,16 @@ function FastLevenbergMarquardtJL(linsolve::Symbol = :cholesky; factor = 1e-6, end """ - CMINPACK(; method::Symbol = :auto) + CMINPACK(; method::Symbol = :auto, autodiff = missing) ### Keyword Arguments - `method`: the choice of method for the solver. + - `autodiff`: Defaults to `missing`, which means we will default to letting `MINPACK` + construct the jacobian if `f.jac` is not provided. In other cases, we use it to generate + a jacobian similar to other NonlinearSolve solvers. -### Method Choices +### Submethod Choice The keyword argument `method` can take on different value depending on which method of `fsolve` you are calling. 
The standard choices of `method` are: @@ -125,20 +130,26 @@ then the following methods are allowed: [`hybrj`](https://github.com/devernay/cminpack/blob/d1f5f5a273862ca1bbcf58394e4ac060d9e22c76/hybrj.c) for more information - `:lm`: Advanced Levenberg-Marquardt with user supplied Jacobian. Additional arguments - are available via `;kwargs...`. See MINPACK routine + are available via `; kwargs...`. See MINPACK routine [`lmder`](https://github.com/devernay/cminpack/blob/d1f5f5a273862ca1bbcf58394e4ac060d9e22c76/lmder.c) for more information The default choice of `:auto` selects `:hybr` for NonlinearProblem and `:lm` for NonlinearLeastSquaresProblem. + +!!! note + + This algorithm is only available if `MINPACK.jl` is installed. """ -struct CMINPACK <: AbstractNonlinearSolveAlgorithm +@concrete struct CMINPACK <: AbstractNonlinearSolveExtensionAlgorithm show_trace::Bool tracing::Bool method::Symbol + autodiff end -function CMINPACK(; show_trace = missing, tracing = missing, method::Symbol = :auto) +function CMINPACK(; show_trace = missing, tracing = missing, method::Symbol = :auto, + autodiff = missing) if Base.get_extension(@__MODULE__, :NonlinearSolveMINPACKExt) === nothing error("CMINPACK requires MINPACK.jl to be loaded") end @@ -161,7 +172,7 @@ function CMINPACK(; show_trace = missing, tracing = missing, method::Symbol = :a tracing = false end - return CMINPACK(show_trace, tracing, method) + return CMINPACK(show_trace, tracing, method, autodiff) end """ @@ -173,7 +184,8 @@ end - `method`: the choice of method for solving the nonlinear system. - `autodiff`: the choice of method for generating the Jacobian. Defaults to `:central` or - central differencing via FiniteDiff.jl. The other choices are `:forward` + central differencing via FiniteDiff.jl. The other choices are `:forward` or `ADTypes` + similar to other solvers in NonlinearSolve. - `linesearch`: the line search method to be used within the solver method. 
The choices are line search types from [LineSearches.jl](https://github.com/JuliaNLSolvers/LineSearches.jl). @@ -185,8 +197,9 @@ end - `m`: the amount of history in the Anderson method. Naive "Picard"-style iteration can be achieved by setting m=0, but that isn't advisable for contractions whose Lipschitz constants are close to 1. If convergence fails, though, you may consider lowering it. - - `beta`: It is also known as DIIS or Pulay mixing, this method is based on the acceleration - of the fixed-point iteration xₙ₊₁ = xₙ + beta*f(xₙ), where by default beta = 1. + - `beta`: It is also known as DIIS or Pulay mixing, this method is based on the + acceleration of the fixed-point iteration xₙ₊₁ = xₙ + beta*f(xₙ), where by default + beta = 1. ### Submethod Choice @@ -195,13 +208,18 @@ Choices for methods in `NLsolveJL`: - `:anderson`: Anderson-accelerated fixed-point iteration - `:broyden`: Broyden's quasi-Newton method - `:newton`: Classical Newton method with an optional line search - - `:trust_region`: Trust region Newton method (the default choice) For more information on - these arguments, consult the - [NLsolve.jl documentation](https://github.com/JuliaNLSolvers/NLsolve.jl). + - `:trust_region`: Trust region Newton method (the default choice) + +For more information on these arguments, consult the +[NLsolve.jl documentation](https://github.com/JuliaNLSolvers/NLsolve.jl). + +!!! note + + This algorithm is only available if `NLsolve.jl` is installed. 
""" -@concrete struct NLsolveJL <: AbstractNonlinearSolveAlgorithm +@concrete struct NLsolveJL <: AbstractNonlinearSolveExtensionAlgorithm method::Symbol - autodiff::Symbol + autodiff store_trace::Bool extended_trace::Bool linesearch @@ -249,6 +267,10 @@ function NLsolveJL(; method = :trust_region, autodiff = :central, store_trace = extended_trace = false end + if autodiff isa Symbol && autodiff !== :central && autodiff !== :forward + error("`autodiff` must be `:central` or `:forward`.") + end + return NLsolveJL(method, autodiff, store_trace, extended_trace, linesearch, linsolve, factor, autoscale, m, beta, show_trace) end @@ -260,25 +282,25 @@ end Wrapper over [SpeedMapping.jl](https://nicolasl-s.github.io/SpeedMapping.jl) for solving Fixed Point Problems. We allow using this algorithm to solve root finding problems as well. -## Arguments: +### Keyword Arguments - - `σ_min`: Setting to `1` may avoid stalling (see paper). + - `σ_min`: Setting to `1` may avoid stalling (see [lepage2021alternating](@cite)). - `stabilize`: performs a stabilization mapping before extrapolating. Setting to `true` may improve the performance for applications like accelerating the EM or MM algorithms - (see paper). + (see [lepage2021alternating](@cite)). - `check_obj`: In case of NaN or Inf values, the algorithm restarts at the best past iterate. - `orders`: determines ACX's alternating order. Must be between `1` and `3` (where `1` means no extrapolation). The two recommended orders are `[3, 2]` and `[3, 3, 2]`, the - latter being potentially better for highly non-linear applications (see paper). + latter being potentially better for highly non-linear applications (see + [lepage2021alternating](@cite)). - `time_limit`: time limit for the algorithm. -## References: +!!! note - - N. Lepage-Saucier, Alternating cyclic extrapolation methods for optimization algorithms, - arXiv:2104.04974 (2021). https://arxiv.org/abs/2104.04974. 
+ This algorithm is only available if `SpeedMapping.jl` is installed. """ -@concrete struct SpeedMappingJL <: AbstractNonlinearSolveAlgorithm +@concrete struct SpeedMappingJL <: AbstractNonlinearSolveExtensionAlgorithm σ_min stabilize::Bool check_obj::Bool @@ -287,11 +309,19 @@ Fixed Point Problems. We allow using this algorithm to solve root finding proble end function SpeedMappingJL(; σ_min = 0.0, stabilize::Bool = false, check_obj::Bool = false, - orders::Vector{Int} = [3, 3, 2], time_limit::Real = 1000) + orders::Vector{Int} = [3, 3, 2], time_limit = missing) if Base.get_extension(@__MODULE__, :NonlinearSolveSpeedMappingExt) === nothing error("SpeedMappingJL requires SpeedMapping.jl to be loaded") end + if time_limit !== missing + Base.depwarn("`time_limit` keyword argument to `SpeedMappingJL` has been \ + deprecated and will be removed in v4. Pass `maxtime = ` to \ + `SciMLBase.solve`.", :SpeedMappingJL) + else + time_limit = 1000 + end + return SpeedMappingJL(σ_min, stabilize, check_obj, orders, time_limit) end @@ -304,7 +334,7 @@ Wrapper over [FixedPointAcceleration.jl](https://s-baumann.github.io/FixedPointA for solving Fixed Point Problems. We allow using this algorithm to solve root finding problems as well. -## Arguments: +### Keyword Arguments - `algorithm`: The algorithm to use. Can be `:Anderson`, `:MPE`, `:RRE`, `:VEA`, `:SEA`, `:Simple`, `:Aitken` or `:Newton`. @@ -317,8 +347,12 @@ problems as well. `:SEA` and `:VEA`. For `:SEA` and `:VEA`, this must be a multiple of `2`. - `replace_invalids`: The method to use for replacing invalid iterates. Can be `:ReplaceInvalids`, `:ReplaceVector` or `:NoAction`. + +!!! note + + This algorithm is only available if `FixedPointAcceleration.jl` is installed. 
""" -@concrete struct FixedPointAccelerationJL <: AbstractNonlinearSolveAlgorithm +@concrete struct FixedPointAccelerationJL <: AbstractNonlinearSolveExtensionAlgorithm algorithm::Symbol extrapolation_period::Int replace_invalids::Symbol @@ -370,7 +404,8 @@ function FixedPointAccelerationJL(; algorithm = :Anderson, m = missing, end """ - SIAMFANLEquationsJL(; method = :newton, delta = 1e-3, linsolve = nothing) + SIAMFANLEquationsJL(; method = :newton, delta = 1e-3, linsolve = nothing, + autodiff = missing) ### Keyword Arguments @@ -380,6 +415,9 @@ end - `m`: Depth for Anderson acceleration, default as 0 for Picard iteration. - `beta`: Anderson mixing parameter, change f(x) to (1-beta)x+beta*f(x), equivalent to accelerating damped Picard iteration. + - `autodiff`: Defaults to `missing`, which means we will default to letting + `SIAMFANLEquations` construct the jacobian if `f.jac` is not provided. In other cases, + we use it to generate a jacobian similar to other NonlinearSolve solvers. ### Submethod Choice @@ -387,20 +425,25 @@ end - `:pseudotransient`: Pseudo transient method. - `:secant`: Secant method for scalar equations. - `:anderson`: Anderson acceleration for fixed point iterations. + +!!! note + + This algorithm is only available if `SIAMFANLEquations.jl` is installed. 
""" @concrete struct SIAMFANLEquationsJL{L <: Union{Symbol, Nothing}} <: - AbstractNonlinearSolveAlgorithm + AbstractNonlinearSolveExtensionAlgorithm method::Symbol delta linsolve::L m::Int beta + autodiff end function SIAMFANLEquationsJL(; method = :newton, delta = 1e-3, linsolve = nothing, m = 0, - beta = 1.0) + beta = 1.0, autodiff = missing) if Base.get_extension(@__MODULE__, :NonlinearSolveSIAMFANLEquationsExt) === nothing error("SIAMFANLEquationsJL requires SIAMFANLEquations.jl to be loaded") end - return SIAMFANLEquationsJL(method, delta, linsolve, m, beta) + return SIAMFANLEquationsJL(method, delta, linsolve, m, beta, autodiff) end diff --git a/src/algorithms/gauss_newton.jl b/src/algorithms/gauss_newton.jl new file mode 100644 index 000000000..1e6384788 --- /dev/null +++ b/src/algorithms/gauss_newton.jl @@ -0,0 +1,14 @@ +""" + GaussNewton(; concrete_jac = nothing, linsolve = nothing, linesearch = NoLineSearch(), + precs = DEFAULT_PRECS, adkwargs...) + +An advanced GaussNewton implementation with support for efficient handling of sparse +matrices via colored automatic differentiation and preconditioned linear solvers. Designed +for large-scale and numerically-difficult nonlinear least squares problems. 
+"""
+function GaussNewton(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS,
+        linesearch = NoLineSearch(), vjp_autodiff = nothing, autodiff = nothing)
+    descent = NewtonDescent(; linsolve, precs)
+    return GeneralizedFirstOrderAlgorithm(; concrete_jac, name = :GaussNewton,
+        descent, jacobian_ad = autodiff, reverse_ad = vjp_autodiff)
+end
diff --git a/src/algorithms/klement.jl b/src/algorithms/klement.jl
new file mode 100644
index 000000000..b67ab4f58
--- /dev/null
+++ b/src/algorithms/klement.jl
@@ -0,0 +1,146 @@
+"""
+    Klement(; max_resets = 100, linsolve = nothing, linesearch = NoLineSearch(),
+        precs = DEFAULT_PRECS, alpha = nothing, init_jacobian::Val = Val(:identity),
+        autodiff = nothing)
+
+An implementation of `Klement` [klement2014using](@citep) with line search, preconditioning
+and customizable linear solves. It is recommended to use [`Broyden`](@ref) for most problems
+over this.
+
+### Keyword Arguments
+
+  - `max_resets`: the maximum number of resets to perform. Defaults to `100`.
+
+  - `alpha`: If `init_jacobian` is set to `Val(:identity)`, then the initial Jacobian
+    inverse is set to be `αI`. Defaults to `nothing` which implies
+    `α = max(norm(u), 1) / (2 * norm(fu))`.
+  - `init_jacobian`: the method to use for initializing the jacobian. Defaults to
+    `Val(:identity)`. Choices include:
+
+      + `Val(:identity)`: Identity Matrix.
+      + `Val(:true_jacobian)`: True Jacobian. Our tests suggest that this is not very
+        stable. Instead using `Broyden` with `Val(:true_jacobian)` gives faster and more
+        reliable convergence.
+      + `Val(:true_jacobian_diagonal)`: Diagonal of True Jacobian. This is a good choice for
+        differentiable problems.
+""" +function Klement(; max_resets::Int = 100, linsolve = nothing, alpha = nothing, + linesearch = NoLineSearch(), precs = DEFAULT_PRECS, autodiff = nothing, + init_jacobian::Val{IJ} = Val(:identity)) where {IJ} + if !(linesearch isa AbstractNonlinearSolveLineSearchAlgorithm) + Base.depwarn("Passing in a `LineSearches.jl` algorithm directly is deprecated. \ + Please use `LineSearchesJL` instead.", :Klement) + linesearch = LineSearchesJL(; method = linesearch) + end + + if IJ === :identity + initialization = IdentityInitialization(alpha, DiagonalStructure()) + elseif IJ === :true_jacobian + initialization = TrueJacobianInitialization(FullStructure(), autodiff) + elseif IJ === :true_jacobian_diagonal + initialization = TrueJacobianInitialization(DiagonalStructure(), autodiff) + else + throw(ArgumentError("`init_jacobian` must be one of `:identity`, `:true_jacobian`, \ + or `:true_jacobian_diagonal`")) + end + + CJ = IJ === :true_jacobian || IJ === :true_jacobian_diagonal + + return ApproximateJacobianSolveAlgorithm{CJ, :Klement}(; linesearch, + descent = NewtonDescent(; linsolve, precs), update_rule = KlementUpdateRule(), + reinit_rule = IllConditionedJacobianReset(), max_resets, initialization) +end + +# Essentially checks ill conditioned Jacobian +""" + IllConditionedJacobianReset() + +Recommend resetting the Jacobian if the current jacobian is ill-conditioned. This is used +in [`Klement`](@ref). +""" +struct IllConditionedJacobianReset <: AbstractResetCondition end + +@concrete struct IllConditionedJacobianResetCache + condition_number_threshold +end + +function __internal_init(alg::IllConditionedJacobianReset, J, fu, u, du, args...; kwargs...) 
+ condition_number_threshold = if J isa AbstractMatrix + inv(eps(real(eltype(J)))^(1 // 2)) + else + nothing + end + return IllConditionedJacobianResetCache(condition_number_threshold) +end + +function __internal_solve!(cache::IllConditionedJacobianResetCache, J, fu, u, du) + J isa Number && return iszero(J) + J isa Diagonal && return any(iszero, diag(J)) + J isa AbstractMatrix && return cond(J) ≥ cache.condition_number_threshold + J isa AbstractVector && return any(iszero, J) + return false +end + +# Update Rule +""" + KlementUpdateRule() + +Update rule for [`Klement`](@ref). +""" +@concrete struct KlementUpdateRule <: AbstractApproximateJacobianUpdateRule{false} end + +@concrete mutable struct KlementUpdateRuleCache <: + AbstractApproximateJacobianUpdateRuleCache{false} + Jdu + J_cache + J_cache_2 + Jdu_cache + fu_cache +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::KlementUpdateRule, J, fu, u, + du, args...; kwargs...) + @bb Jdu = similar(fu) + if J isa Diagonal || J isa Number + J_cache, J_cache_2, Jdu_cache = nothing, nothing, nothing + else + @bb J_cache = similar(J) + @bb J_cache_2 = similar(J) + @bb Jdu_cache = similar(Jdu) + end + @bb fu_cache = copy(fu) + return KlementUpdateRuleCache(Jdu, J_cache, J_cache_2, Jdu_cache, fu_cache) +end + +function __internal_solve!(cache::KlementUpdateRuleCache, J::Number, fu, u, du) + Jdu = J^2 * du^2 + J = J + ((fu - cache.fu_cache - J * du) / ifelse(iszero(Jdu), 1e-5, Jdu)) * du * J^2 + cache.fu_cache = fu + return J +end + +function __internal_solve!(cache::KlementUpdateRuleCache, J_::Diagonal, fu, u, du) + T = eltype(u) + J = _restructure(u, diag(J_)) + @bb @. cache.Jdu = (J^2) * (du^2) + @bb @. J += ((fu - cache.fu_cache - J * du) / + ifelse(iszero(cache.Jdu), T(1e-5), cache.Jdu)) * du * (J^2) + @bb copyto!(cache.fu_cache, fu) + return Diagonal(vec(J)) +end + +function __internal_solve!(cache::KlementUpdateRuleCache, J::AbstractMatrix, fu, u, du) + T = eltype(u) + @bb @. 
cache.J_cache = J'^2 + @bb @. cache.Jdu = du^2 + @bb cache.Jdu_cache = cache.J_cache × vec(cache.Jdu) + @bb cache.Jdu = J × vec(du) + @bb @. cache.fu_cache = (fu - cache.fu_cache - cache.Jdu) / + ifelse(iszero(cache.Jdu_cache), T(1e-5), cache.Jdu_cache) + @bb cache.J_cache = vec(cache.fu_cache) × transpose(_vec(du)) + @bb @. cache.J_cache *= J + @bb cache.J_cache_2 = cache.J_cache × J + @bb J .+= cache.J_cache_2 + @bb copyto!(cache.fu_cache, fu) + return J +end diff --git a/src/algorithms/lbroyden.jl b/src/algorithms/lbroyden.jl new file mode 100644 index 000000000..ab2b26c50 --- /dev/null +++ b/src/algorithms/lbroyden.jl @@ -0,0 +1,168 @@ +""" + LimitedMemoryBroyden(; max_resets::Int = 3, linesearch = NoLineSearch(), + threshold::Val = Val(10), reset_tolerance = nothing, alpha = nothing) + +An implementation of `LimitedMemoryBroyden` [ziani2008autoadaptative](@cite) with resetting +and line search. + +### Keyword Arguments + + - `max_resets`: the maximum number of resets to perform. Defaults to `3`. + - `reset_tolerance`: the tolerance for the reset check. Defaults to + `sqrt(eps(real(eltype(u))))`. + - `threshold`: the number of vectors to store in the low rank approximation. Defaults + to `Val(10)`. + - `alpha`: The initial Jacobian inverse is set to be `(αI)⁻¹`. Defaults to `nothing` + which implies `α = max(norm(u), 1) / (2 * norm(fu))`. 
+""" +function LimitedMemoryBroyden(; max_resets::Int = 3, linesearch = NoLineSearch(), + threshold::Union{Val, Int} = Val(10), reset_tolerance = nothing, alpha = nothing) + threshold isa Int && (threshold = Val(threshold)) + return ApproximateJacobianSolveAlgorithm{false, :LimitedMemoryBroyden}(; linesearch, + descent = NewtonDescent(), update_rule = GoodBroydenUpdateRule(), max_resets, + initialization = BroydenLowRankInitialization{_unwrap_val(threshold)}(alpha, + threshold), reinit_rule = NoChangeInStateReset(; reset_tolerance)) +end + +""" + BroydenLowRankInitialization{T}(alpha, threshold::Val{T}) + +An initialization for `LimitedMemoryBroyden` that uses a low rank approximation of the +Jacobian. The low rank updates to the Jacobian matrix corresponds to what SciPy calls +["simple"](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.broyden2.html#scipy-optimize-broyden2). +""" +@concrete struct BroydenLowRankInitialization{T} <: AbstractJacobianInitialization + alpha + threshold::Val{T} +end + +jacobian_initialized_preinverted(::BroydenLowRankInitialization) = true + +function __internal_init(prob::AbstractNonlinearProblem, + alg::BroydenLowRankInitialization{T}, solver, f::F, fu, u, p; maxiters = 1000, + internalnorm::IN = DEFAULT_NORM, kwargs...) where {T, F, IN} + if u isa Number # Use the standard broyden + return __internal_init(prob, IdentityInitialization(true, FullStructure()), solver, + f, fu, u, + p; maxiters, kwargs...) 
+ end + # Pay to cost of slightly more allocations to prevent type-instability for StaticArrays + α = inv(__initial_alpha(alg.alpha, u, fu, internalnorm)) + if u isa StaticArray + J = BroydenLowRankJacobian(fu, u; alg.threshold, alpha = α) + else + threshold = min(_unwrap_val(alg.threshold), maxiters) + J = BroydenLowRankJacobian(fu, u; threshold, alpha = α) + end + return InitializedApproximateJacobianCache(J, FullStructure(), alg, nothing, true, + internalnorm) +end + +function (cache::InitializedApproximateJacobianCache)(alg::BroydenLowRankInitialization, fu, + u) + α = __initial_alpha(alg.alpha, u, fu, cache.internalnorm) + cache.J.idx = 0 + cache.J.alpha = inv(α) + return +end + +""" + BroydenLowRankJacobian{T}(U, Vᵀ, idx, cache, alpha) + +Low Rank Approximation of the Jacobian Matrix. Currently only used for +[`LimitedMemoryBroyden`](@ref). This computes the Jacobian as ``U \\times V^T``. +""" +@concrete mutable struct BroydenLowRankJacobian{T} <: AbstractNonlinearSolveOperator{T} + U + Vᵀ + idx::Int + cache + alpha +end + +__safe_inv!!(workspace, op::BroydenLowRankJacobian) = op # Already Inverted form + +@inline function __get_components(op::BroydenLowRankJacobian) + op.idx ≥ size(op.U, 2) && return op.cache, op.U, transpose(op.Vᵀ) + _cache = op.cache === nothing ? op.cache : view(op.cache, 1:(op.idx)) + return (_cache, view(op.U, :, 1:(op.idx)), transpose(view(op.Vᵀ, :, 1:(op.idx)))) +end + +Base.size(op::BroydenLowRankJacobian) = size(op.U, 1), size(op.Vᵀ, 1) +function Base.size(op::BroydenLowRankJacobian, d::Integer) + return ifelse(d == 1, size(op.U, 1), size(op.Vᵀ, 1)) +end + +for op in (:adjoint, :transpose) + # FIXME: adjoint might be a problem here. 
Fix if a complex number issue shows up + @eval function Base.$(op)(operator::BroydenLowRankJacobian{T}) where {T} + return BroydenLowRankJacobian{T}(operator.Vᵀ, operator.U, + operator.idx, operator.cache, operator.alpha) + end +end + +# Storing the transpose to ensure contiguous memory on splicing +function BroydenLowRankJacobian(fu::StaticArray{S2, T2}, u::StaticArray{S1, T1}; + alpha = true, threshold::Val{Th} = Val(10)) where {S1, S2, T1, T2, Th} + T = promote_type(T1, T2) + fuSize, uSize = Size(fu), Size(u) + U = MArray{Tuple{prod(fuSize), Th}, T}(undef) + Vᵀ = MArray{Tuple{prod(uSize), Th}, T}(undef) + return BroydenLowRankJacobian{T}(U, Vᵀ, 0, nothing, T(alpha)) +end + +function BroydenLowRankJacobian(fu, u; threshold::Int = 10, alpha = true) + T = promote_type(eltype(u), eltype(fu)) + U = similar(fu, T, length(fu), threshold) + Vᵀ = similar(u, T, length(u), threshold) + cache = similar(u, T, threshold) + return BroydenLowRankJacobian{T}(U, Vᵀ, 0, cache, T(alpha)) +end + +function Base.:*(J::BroydenLowRankJacobian, x::AbstractVector) + J.idx == 0 && return -x + cache, U, Vᵀ = __get_components(J) + return U * (Vᵀ * x) .- J.alpha .* x +end + +function LinearAlgebra.mul!(y::AbstractVector, J::BroydenLowRankJacobian, x::AbstractVector) + if J.idx == 0 + @. y = -J.alpha * x + return y + end + cache, U, Vᵀ = __get_components(J) + @bb cache = Vᵀ × x + mul!(y, U, cache) + @bb @. y -= J.alpha * x + return y +end + +function Base.:*(x::AbstractVector, J::BroydenLowRankJacobian) + J.idx == 0 && return -x + cache, U, Vᵀ = __get_components(J) + return Vᵀ' * (U' * x) .- J.alpha .* x +end + +function LinearAlgebra.mul!(y::AbstractVector, x::AbstractVector, J::BroydenLowRankJacobian) + if J.idx == 0 + @. y = -J.alpha * x + return y + end + cache, U, Vᵀ = __get_components(J) + @bb cache = transpose(U) × x + mul!(y, transpose(Vᵀ), cache) + @bb @. 
y -= J.alpha * x + return y +end + +function LinearAlgebra.mul!(J::BroydenLowRankJacobian, u, + vᵀ::LinearAlgebra.AdjOrTransAbsVec, α::Bool, β::Bool) + @assert α & β + idx_update = mod1(J.idx + 1, size(J.U, 2)) + copyto!(@view(J.U[:, idx_update]), _vec(u)) + copyto!(@view(J.Vᵀ[:, idx_update]), _vec(vᵀ)) + J.idx += 1 + return J +end + +restructure(::BroydenLowRankJacobian, J::BroydenLowRankJacobian) = J diff --git a/src/algorithms/levenberg_marquardt.jl b/src/algorithms/levenberg_marquardt.jl new file mode 100644 index 000000000..72dd63957 --- /dev/null +++ b/src/algorithms/levenberg_marquardt.jl @@ -0,0 +1,181 @@ +""" + LevenbergMarquardt(; linsolve = nothing, + precs = DEFAULT_PRECS, damping_initial::Real = 1.0, α_geodesic::Real = 0.75, + damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0, + finite_diff_step_geodesic = 0.1, b_uphill::Real = 1.0, autodiff = nothing, + min_damping_D::Real = 1e-8, disable_geodesic = Val(false)) + +An advanced Levenberg-Marquardt implementation with the improvements suggested in +[transtrum2012improvements](@citet). Designed for large-scale and numerically-difficult +nonlinear systems. + +### Keyword Arguments + + - `damping_initial`: the starting value for the damping factor. The damping factor is + inversely proportional to the step size. The damping factor is adjusted during each + iteration. Defaults to `1.0`. See Section 2.1 of [transtrum2012improvements](@citet). + - `damping_increase_factor`: the factor by which the damping is increased if a step is + rejected. Defaults to `2.0`. See Section 2.1 of [transtrum2012improvements](@citet). + - `damping_decrease_factor`: the factor by which the damping is decreased if a step is + accepted. Defaults to `3.0`. See Section 2.1 of [transtrum2012improvements](@citet). 
+ - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix + `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered, + where `J` is the Jacobian. It is suggested by [transtrum2012improvements](@citet) to use + a minimum value of the elements in `DᵀD` to prevent the damping from being too small. + Defaults to `1e-8`. + - `disable_geodesic`: Disables Geodesic Acceleration if set to `Val(true)`. It provides + a way to trade-off robustness for speed, though in most situations Geodesic Acceleration + should not be disabled. + +For the remaining arguments, see [`GeodesicAcceleration`](@ref) and +[`NonlinearSolve.LevenbergMarquardtTrustRegion`](@ref) documentations. +""" +function LevenbergMarquardt(; concrete_jac = missing, linsolve = nothing, + precs = DEFAULT_PRECS, damping_initial::Real = 1.0, α_geodesic::Real = 0.75, + damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0, + finite_diff_step_geodesic = 0.1, b_uphill::Real = 1.0, autodiff = nothing, + min_damping_D::Real = 1e-8, disable_geodesic = False) + if concrete_jac !== missing + Base.depwarn("The `concrete_jac` keyword argument is deprecated and will be \ + removed in v0.4. 
This kwarg doesn't make sense (and is currently \ + ignored) for LM since it needs to materialize the Jacobian to \ + compute the Damping Term", :LevenbergMarquardt) + end + + descent = DampedNewtonDescent(; linsolve, precs, initial_damping = damping_initial, + damping_fn = LevenbergMarquardtDampingFunction(damping_increase_factor, + damping_decrease_factor, min_damping_D)) + if disable_geodesic === False + descent = GeodesicAcceleration(descent, finite_diff_step_geodesic, α_geodesic) + end + trustregion = LevenbergMarquardtTrustRegion(b_uphill) + return GeneralizedFirstOrderAlgorithm(; concrete_jac = true, name = :LevenbergMarquardt, + trustregion, descent, jacobian_ad = autodiff) +end + +@concrete struct LevenbergMarquardtDampingFunction <: AbstractDampingFunction + increase_factor + decrease_factor + min_damping +end + +@concrete mutable struct LevenbergMarquardtDampingCache <: AbstractDampingFunctionCache + increase_factor + decrease_factor + min_damping + λ_factor + λ + DᵀD + J_diag_cache + J_damped + damping_f +end + +function reinit_cache!(cache::LevenbergMarquardtDampingCache, args...; kwargs...) 
+ cache.λ = cache.damping_f.initial_damping + cache.λ_factor = cache.damping_f.increase_factor + if !(cache.DᵀD isa Number) + if can_setindex(cache.DᵀD.diag) + cache.DᵀD.diag .= cache.min_damping + else + cache.DᵀD = Diagonal(ones(typeof(cache.DᵀD.diag)) * cache.min_damping) + end + end + cache.J_damped = cache.λ .* cache.DᵀD +end + +function requires_normal_form_jacobian(::Union{LevenbergMarquardtDampingFunction, + LevenbergMarquardtDampingCache}) + return false +end +function requires_normal_form_rhs(::Union{LevenbergMarquardtDampingFunction, + LevenbergMarquardtDampingCache}) + return false +end +function returns_norm_form_damping(::Union{LevenbergMarquardtDampingFunction, + LevenbergMarquardtDampingCache}) + return true +end + +function __internal_init(prob::AbstractNonlinearProblem, + f::LevenbergMarquardtDampingFunction, initial_damping, J, fu, u, ::Val{NF}; + internalnorm::F = DEFAULT_NORM, kwargs...) where {F, NF} + T = promote_type(eltype(u), eltype(fu)) + DᵀD = __init_diagonal(u, T(f.min_damping)) + if NF + J_diag_cache = nothing + else + @bb J_diag_cache = similar(u) + end + J_damped = T(initial_damping) .* DᵀD + return LevenbergMarquardtDampingCache(T(f.increase_factor), T(f.decrease_factor), + T(f.min_damping), T(f.increase_factor), T(initial_damping), DᵀD, J_diag_cache, + J_damped, f) +end + +(damping::LevenbergMarquardtDampingCache)(::Nothing) = damping.J_damped + +function __internal_solve!(damping::LevenbergMarquardtDampingCache, J, fu, ::Val{false}; + kwargs...) + if __can_setindex(damping.J_diag_cache) + sum!(abs2, _vec(damping.J_diag_cache), J') + elseif damping.J_diag_cache isa Number + damping.J_diag_cache = abs2(J) + else + damping.J_diag_cache = dropdims(sum(abs2, J'; dims = 1); dims = 1) + end + damping.DᵀD = __update_LM_diagonal!!(damping.DᵀD, _vec(damping.J_diag_cache)) + @bb @. 
damping.J_damped = damping.λ * damping.DᵀD + return damping.J_damped +end + +function __internal_solve!(damping::LevenbergMarquardtDampingCache, JᵀJ, fu, ::Val{true}; + kwargs...) + damping.DᵀD = __update_LM_diagonal!!(damping.DᵀD, JᵀJ) + @bb @. damping.J_damped = damping.λ * damping.DᵀD + return damping.J_damped +end + +function callback_into_cache!(topcache, cache::LevenbergMarquardtDampingCache, args...) + if last_step_accepted(topcache.trustregion_cache) && + last_step_accepted(topcache.descent_cache) + cache.λ_factor = 1 / cache.decrease_factor + end + cache.λ *= cache.λ_factor + cache.λ_factor = cache.increase_factor +end + +@inline __update_LM_diagonal!!(y::Number, x::Number) = max(y, x) +@inline function __update_LM_diagonal!!(y::Diagonal, x::AbstractVector) + if __can_setindex(y.diag) + @. y.diag = max(y.diag, x) + return y + else + return Diagonal(max.(y.diag, x)) + end +end +@inline function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) + if __can_setindex(y.diag) + if fast_scalar_indexing(y.diag) + @inbounds for i in axes(x, 1) + y.diag[i] = max(y.diag[i], x[i, i]) + end + return y + else + idxs = diagind(x) + @.. broadcast=false y.diag=max(y.diag, @view(x[idxs])) + return y + end + else + idxs = diagind(x) + return Diagonal(@.. 
broadcast=false max(y.diag, @view(x[idxs]))) + end +end + +@inline __init_diagonal(u::Number, v) = oftype(u, v) +@inline __init_diagonal(u::SArray, v) = Diagonal(ones(typeof(vec(u))) * v) +@inline function __init_diagonal(u, v) + d = similar(vec(u)) + d .= v + return Diagonal(d) +end diff --git a/src/algorithms/pseudo_transient.jl b/src/algorithms/pseudo_transient.jl new file mode 100644 index 000000000..957cfc904 --- /dev/null +++ b/src/algorithms/pseudo_transient.jl @@ -0,0 +1,71 @@ +""" + PseudoTransient(; concrete_jac = nothing, linsolve = nothing, + linesearch::AbstractNonlinearSolveLineSearchAlgorithm = NoLineSearch(), + precs = DEFAULT_PRECS, autodiff = nothing) + +An implementation of PseudoTransient Method [coffey2003pseudotransient](@cite) that is used +to solve steady state problems in an accelerated manner. It uses an adaptive time-stepping +to integrate an initial value of nonlinear problem until sufficient accuracy in the desired +steady-state is achieved to switch over to Newton's method and gain a rapid convergence. +This implementation specifically uses "switched evolution relaxation" +[kelley1998convergence](@cite) SER method. + +### Keyword Arguments + + - `alpha_initial` : the initial pseudo time step. It defaults to `1e-3`. If it is small, + you are going to need more iterations to converge but it can be more stable. 
+""" +function PseudoTransient(; concrete_jac = nothing, linsolve = nothing, + linesearch::AbstractNonlinearSolveLineSearchAlgorithm = NoLineSearch(), + precs = DEFAULT_PRECS, autodiff = nothing, alpha_initial = 1e-3) + descent = DampedNewtonDescent(; linsolve, precs, initial_damping = alpha_initial, + damping_fn = SwitchedEvolutionRelaxation()) + return GeneralizedFirstOrderAlgorithm(; concrete_jac, + name = :PseudoTransient, linesearch, descent, jacobian_ad = autodiff) +end + +""" + SwitchedEvolutionRelaxation() + +Method for updating the damping parameter in the [`PseudoTransient`](@ref) method based on +"switched evolution relaxation" [kelley1998convergence](@cite) SER method. +""" +struct SwitchedEvolutionRelaxation <: AbstractDampingFunction end + +""" + SwitchedEvolutionRelaxationCache <: AbstractDampingFunctionCache + +Cache for the [`SwitchedEvolutionRelaxation`](@ref) method. +""" +@concrete mutable struct SwitchedEvolutionRelaxationCache <: AbstractDampingFunctionCache + res_norm + α⁻¹ + internalnorm +end + +function requires_normal_form_jacobian(cache::Union{SwitchedEvolutionRelaxation, + SwitchedEvolutionRelaxationCache}) + return false +end +function requires_normal_form_rhs(cache::Union{SwitchedEvolutionRelaxation, + SwitchedEvolutionRelaxationCache}) + return false +end + +function __internal_init(prob::AbstractNonlinearProblem, f::SwitchedEvolutionRelaxation, + initial_damping, J, fu, u, args...; internalnorm::F = DEFAULT_NORM, + kwargs...) where {F} + T = promote_type(eltype(u), eltype(fu)) + return SwitchedEvolutionRelaxationCache(internalnorm(fu), T(1 / initial_damping), + internalnorm) +end + +(damping::SwitchedEvolutionRelaxationCache)(::Nothing) = damping.α⁻¹ + +function __internal_solve!(damping::SwitchedEvolutionRelaxationCache, J, fu, args...; + kwargs...) 
+ res_norm = damping.internalnorm(fu) + damping.α⁻¹ *= res_norm / damping.res_norm + damping.res_norm = res_norm + return damping.α⁻¹ +end diff --git a/src/algorithms/raphson.jl b/src/algorithms/raphson.jl new file mode 100644 index 000000000..bc005f2b6 --- /dev/null +++ b/src/algorithms/raphson.jl @@ -0,0 +1,14 @@ +""" + NewtonRaphson(; concrete_jac = nothing, linsolve = nothing, linesearch = NoLineSearch(), + precs = DEFAULT_PRECS, autodiff = nothing) + +An advanced NewtonRaphson implementation with support for efficient handling of sparse +matrices via colored automatic differentiation and preconditioned linear solvers. Designed +for large-scale and numerically-difficult nonlinear systems. +""" +function NewtonRaphson(; concrete_jac = nothing, linsolve = nothing, + linesearch = NoLineSearch(), precs = DEFAULT_PRECS, autodiff = nothing) + descent = NewtonDescent(; linsolve, precs) + return GeneralizedFirstOrderAlgorithm(; concrete_jac, name = :NewtonRaphson, + linesearch, descent, jacobian_ad = autodiff) +end diff --git a/src/algorithms/trust_region.jl b/src/algorithms/trust_region.jl new file mode 100644 index 000000000..89c4d8f5d --- /dev/null +++ b/src/algorithms/trust_region.jl @@ -0,0 +1,36 @@ +""" + TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, + radius_update_scheme = RadiusUpdateSchemes.Simple, max_trust_radius::Real = 0 // 1, + initial_trust_radius::Real = 0 // 1, step_threshold::Real = 1 // 10000, + shrink_threshold::Real = 1 // 4, expand_threshold::Real = 3 // 4, + shrink_factor::Real = 1 // 4, expand_factor::Real = 2 // 1, + max_shrink_times::Int = 32, vjp_autodiff = nothing, autodiff = nothing) + +An advanced TrustRegion implementation with support for efficient handling of sparse +matrices via colored automatic differentiation and preconditioned linear solvers. Designed +for large-scale and numerically-difficult nonlinear systems. 
+ +### Keyword Arguments + + - `radius_update_scheme`: the scheme used to update the trust region radius. Defaults to + `RadiusUpdateSchemes.Simple`. See [`RadiusUpdateSchemes`](@ref) for more details. For a + review on trust region radius update schemes, see [yuan2015recent](@citet). + +For the remaining arguments, see [`NonlinearSolve.GenericTrustRegionScheme`](@ref) +documentation. +""" +function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, + radius_update_scheme = RadiusUpdateSchemes.Simple, max_trust_radius::Real = 0 // 1, + initial_trust_radius::Real = 0 // 1, step_threshold::Real = 1 // 10000, + shrink_threshold::Real = 1 // 4, expand_threshold::Real = 3 // 4, + shrink_factor::Real = 1 // 4, expand_factor::Real = 2 // 1, + max_shrink_times::Int = 32, vjp_autodiff = nothing, autodiff = nothing) + descent = Dogleg(; linsolve, precs) + forward_ad = autodiff isa ADTypes.AbstractForwardMode ? autodiff : nothing + trustregion = GenericTrustRegionScheme(; method = radius_update_scheme, step_threshold, + shrink_threshold, expand_threshold, shrink_factor, expand_factor, + reverse_ad = vjp_autodiff, forward_ad) + return GeneralizedFirstOrderAlgorithm(; concrete_jac, name = :TrustRegion, + trustregion, descent, jacobian_ad = autodiff, reverse_ad = vjp_autodiff, + max_shrink_times) +end diff --git a/src/broyden.jl b/src/broyden.jl deleted file mode 100644 index 7c90d6f92..000000000 --- a/src/broyden.jl +++ /dev/null @@ -1,249 +0,0 @@ -# Sadly `Broyden` is taken up by SimpleNonlinearSolve.jl -""" - Broyden(; max_resets = 100, linesearch = nothing, reset_tolerance = nothing, - init_jacobian::Val = Val(:identity), autodiff = nothing, alpha = nothing) - -An implementation of `Broyden` with resetting and line search. - -## Arguments - - - `max_resets`: the maximum number of resets to perform. Defaults to `100`. - - - `reset_tolerance`: the tolerance for the reset check. Defaults to - `sqrt(eps(real(eltype(u))))`. 
- - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref), - which means that no line search is performed. Algorithms from `LineSearches.jl` can be - used here directly, and they will be converted to the correct `LineSearch`. It is - recommended to use [`LiFukushimaLineSearch`](@ref) -- a derivative free linesearch - specifically designed for Broyden's method. - - `alpha`: If `init_jacobian` is set to `Val(:identity)`, then the initial Jacobian - inverse is set to be `(αI)⁻¹`. Defaults to `nothing` which implies - `α = max(norm(u), 1) / (2 * norm(fu))`. - - `init_jacobian`: the method to use for initializing the jacobian. Defaults to - `Val(:identity)`. Choices include: - - + `Val(:identity)`: Identity Matrix. - + `Val(:true_jacobian)`: True Jacobian. This is a good choice for differentiable - problems. - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing` which means that a default is selected according to the problem specification! - Valid choices are types from ADTypes.jl. (Used if `init_jacobian = Val(:true_jacobian)`) - - `update_rule`: Update Rule for the Jacobian. Choices are: - - + `Val(:good_broyden)`: Good Broyden's Update Rule - + `Val(:bad_broyden)`: Bad Broyden's Update Rule - + `Val(:diagonal)`: Only update the diagonal of the Jacobian. This algorithm may be - useful for specific problems, but whether it will work may depend strongly on the - problem. 
-""" -@concrete struct Broyden{IJ, UR, CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - max_resets::Int - reset_tolerance - linesearch - alpha -end - -function __alg_print_modifiers(alg::Broyden{IJ, UR}) where {IJ, UR} - modifiers = String[] - IJ !== :identity && push!(modifiers, "init_jacobian = Val(:$(IJ))") - UR !== :good_broyden && push!(modifiers, "update_rule = Val(:$(UR))") - alg.alpha !== nothing && push!(modifiers, "alpha = $(alg.alpha)") - return modifiers -end - -function set_ad(alg::Broyden{IJ, UR, CJ}, ad) where {IJ, UR, CJ} - return Broyden{IJ, UR, CJ}(ad, alg.max_resets, alg.reset_tolerance, - alg.linesearch, alg.alpha) -end - -function Broyden(; max_resets = 100, linesearch = nothing, reset_tolerance = nothing, - init_jacobian::Val = Val(:identity), autodiff = nothing, alpha = nothing, - update_rule = Val(:good_broyden)) - UR = _unwrap_val(update_rule) - @assert UR ∈ (:good_broyden, :bad_broyden, :diagonal) - IJ = _unwrap_val(init_jacobian) - @assert IJ ∈ (:identity, :true_jacobian) - linesearch = linesearch isa LineSearch ? linesearch : LineSearch(; method = linesearch) - CJ = IJ === :true_jacobian - return Broyden{IJ, UR, CJ}(autodiff, max_resets, reset_tolerance, linesearch, - alpha) -end - -@concrete mutable struct BroydenCache{iip, IJ, UR} <: - AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - du - fu - fu_cache - dfu - p - uf - J⁻¹ - J⁻¹_cache - J⁻¹dfu - inv_alpha - alpha_initial - force_stop::Bool - resets::Int - max_resets::Int - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - reset_tolerance - reset_check - jac_cache - prob - stats::NLStats - ls_cache - tc_cache - trace -end - -function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::Broyden{IJ, UR}, - args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm::F = DEFAULT_NORM, - kwargs...) 
where {uType, iip, F, IJ, UR} - @unpack f, u0, p = prob - u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - @bb du = copy(u) - - inv_alpha = __initial_inv_alpha(alg_.alpha, u, fu, internalnorm) - - if IJ === :true_jacobian - alg = get_concrete_algorithm(alg_, prob) - uf, _, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); - lininit = Val(false)) - if UR === :diagonal - J⁻¹_cache = J - J⁻¹ = __diag(J) - else - J⁻¹_cache = nothing - J⁻¹ = J - end - elseif IJ === :identity - alg = alg_ - @bb du = similar(u) - uf, fu_cache, jac_cache, J⁻¹_cache = nothing, nothing, nothing, nothing - if UR === :diagonal - J⁻¹ = one.(fu) - @bb J⁻¹ .*= inv_alpha - else - J⁻¹ = __init_identity_jacobian(u, fu, inv_alpha) - end - end - - reset_tolerance = alg.reset_tolerance === nothing ? sqrt(eps(real(eltype(u)))) : - alg.reset_tolerance - reset_check = x -> abs(x) ≤ reset_tolerance - - @bb u_cache = copy(u) - @bb dfu = copy(fu) - @bb J⁻¹dfu = similar(u) - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J⁻¹), du; - uses_jac_inverse = Val(true), kwargs...) - - return BroydenCache{iip, IJ, UR}(f, alg, u, u_cache, du, fu, fu_cache, dfu, p, - uf, J⁻¹, J⁻¹_cache, J⁻¹dfu, inv_alpha, alg.alpha, false, 0, alg.max_resets, - maxiters, internalnorm, ReturnCode.Default, abstol, reltol, reset_tolerance, - reset_check, jac_cache, prob, NLStats(1, 0, 0, 0, 0), - init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) -end - -function perform_step!(cache::BroydenCache{iip, IJ, UR}) where {iip, IJ, UR} - T = eltype(cache.u) - - if IJ === :true_jacobian && cache.stats.nsteps == 0 - if UR === :diagonal - cache.J⁻¹_cache = __safe_inv(jacobian!!(cache.J⁻¹_cache, cache)) - cache.J⁻¹ = __get_diagonal!!(cache.J⁻¹, cache.J⁻¹_cache) - else - cache.J⁻¹ = __safe_inv(jacobian!!(cache.J⁻¹, cache)) - end - end - - if UR === :diagonal - @bb @. 
cache.du = cache.J⁻¹ * cache.fu - else - @bb cache.du = cache.J⁻¹ × vec(cache.fu) - end - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - @bb axpy!(-α, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) - - update_trace!(cache, α) - check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - - cache.force_stop && return nothing - - # Update the inverse jacobian - @bb @. cache.dfu = cache.fu - cache.dfu - - if all(cache.reset_check, cache.du) || all(cache.reset_check, cache.dfu) - if cache.resets ≥ cache.max_resets - cache.retcode = ReturnCode.ConvergenceFailure - cache.force_stop = true - return nothing - end - if IJ === :true_jacobian - if UR === :diagonal - cache.J⁻¹_cache = __safe_inv(jacobian!!(cache.J⁻¹_cache, cache)) - cache.J⁻¹ = __get_diagonal!!(cache.J⁻¹, cache.J⁻¹_cache) - else - cache.J⁻¹ = __safe_inv(jacobian!!(cache.J⁻¹, cache)) - end - else - cache.inv_alpha = __initial_inv_alpha(cache.inv_alpha, cache.alpha_initial, - cache.u, cache.fu, cache.internalnorm) - cache.J⁻¹ = __reinit_identity_jacobian!!(cache.J⁻¹, cache.inv_alpha) - end - cache.resets += 1 - else - @bb cache.du .*= -1 - if UR === :good_broyden - @bb cache.J⁻¹dfu = cache.J⁻¹ × vec(cache.dfu) - @bb cache.u_cache = transpose(cache.J⁻¹) × vec(cache.du) - denom = dot(cache.du, cache.J⁻¹dfu) - @bb @. cache.du = (cache.du - cache.J⁻¹dfu) / - ifelse(iszero(denom), T(1e-5), denom) - @bb cache.J⁻¹ += vec(cache.du) × transpose(_vec(cache.u_cache)) - elseif UR === :bad_broyden - @bb cache.J⁻¹dfu = cache.J⁻¹ × vec(cache.dfu) - dfu_norm = cache.internalnorm(cache.dfu)^2 - @bb @. cache.du = (cache.du - cache.J⁻¹dfu) / - ifelse(iszero(dfu_norm), T(1e-5), dfu_norm) - @bb cache.J⁻¹ += vec(cache.du) × transpose(_vec(cache.dfu)) - elseif UR === :diagonal - @bb @. cache.J⁻¹dfu = cache.du * cache.J⁻¹ * cache.dfu - denom = sum(cache.J⁻¹dfu) - @bb @. 
cache.J⁻¹ += (cache.du - cache.J⁻¹ * cache.dfu) * cache.du * cache.J⁻¹ / - ifelse(iszero(denom), T(1e-5), denom) - else - error("update_rule = Val(:$(UR)) is not implemented for Broyden.") - end - end - - @bb copyto!(cache.dfu, cache.fu) - @bb copyto!(cache.u_cache, cache.u) - - return nothing -end - -function __reinit_internal!(cache::BroydenCache; kwargs...) - cache.inv_alpha = __initial_inv_alpha(cache.inv_alpha, cache.alpha_initial, cache.u, - cache.fu, cache.internalnorm) - cache.J⁻¹ = __reinit_identity_jacobian!!(cache.J⁻¹, cache.inv_alpha) - cache.resets = 0 - return nothing -end diff --git a/src/core/approximate_jacobian.jl b/src/core/approximate_jacobian.jl new file mode 100644 index 000000000..54cf5b34a --- /dev/null +++ b/src/core/approximate_jacobian.jl @@ -0,0 +1,354 @@ +""" + ApproximateJacobianSolveAlgorithm{concrete_jac, name}(; linesearch = missing, + trustregion = missing, descent, update_rule, reinit_rule, initialization, + max_resets::Int = typemax(Int), max_shrink_times::Int = typemax(Int)) + ApproximateJacobianSolveAlgorithm(; concrete_jac = nothing, + name::Symbol = :unknown, kwargs...) + +Nonlinear Solve Algorithms using an Iterative Approximation of the Jacobian. Most common +examples include [`Broyden`](@ref)'s Method. + +### Keyword Arguments + + - `trustregion`: Globalization using a Trust Region Method. This needs to follow the + [`NonlinearSolve.AbstractTrustRegionMethod`](@ref) interface. + - `descent`: The descent method to use to compute the step. This needs to follow the + [`NonlinearSolve.AbstractDescentAlgorithm`](@ref) interface. + - `max_shrink_times`: The maximum number of times the trust region radius can be shrunk + before the algorithm terminates. + - `update_rule`: The update rule to use to update the Jacobian. This needs to follow the + [`NonlinearSolve.AbstractApproximateJacobianUpdateRule`](@ref) interface. + - `reinit_rule`: The reinitialization rule to use to reinitialize the Jacobian. 
This + needs to follow the [`NonlinearSolve.AbstractResetCondition`](@ref) interface. + - `initialization`: The initialization method to use to initialize the Jacobian. This + needs to follow the [`NonlinearSolve.AbstractJacobianInitialization`](@ref) interface. +""" +@concrete struct ApproximateJacobianSolveAlgorithm{concrete_jac, name} <: + AbstractNonlinearSolveAlgorithm{name} + linesearch + trustregion + descent + update_rule + reinit_rule + max_resets::Int + max_shrink_times::Int + initialization +end + +function __show_algorithm(io::IO, alg::ApproximateJacobianSolveAlgorithm, name, indent) + modifiers = String[] + __is_present(alg.linesearch) && push!(modifiers, "linesearch = $(alg.linesearch)") + __is_present(alg.trustregion) && push!(modifiers, "trustregion = $(alg.trustregion)") + push!(modifiers, "descent = $(alg.descent)") + push!(modifiers, "update_rule = $(alg.update_rule)") + push!(modifiers, "reinit_rule = $(alg.reinit_rule)") + push!(modifiers, "max_resets = $(alg.max_resets)") + push!(modifiers, "initialization = $(alg.initialization)") + store_inverse_jacobian(alg.update_rule) && push!(modifiers, "inverse_jacobian = true") + spacing = " "^indent * " " + spacing_last = " "^indent + print(io, "$(name)(\n$(spacing)$(join(modifiers, ",\n$(spacing)"))\n$(spacing_last))") +end + +function ApproximateJacobianSolveAlgorithm(; concrete_jac = nothing, + name::Symbol = :unknown, kwargs...) + return ApproximateJacobianSolveAlgorithm{concrete_jac, name}(; kwargs...) +end + +function ApproximateJacobianSolveAlgorithm{concrete_jac, name}(; linesearch = missing, + trustregion = missing, descent, update_rule, reinit_rule, initialization, + max_resets::Int = typemax(Int), + max_shrink_times::Int = typemax(Int)) where {concrete_jac, name} + if linesearch !== missing && !(linesearch isa AbstractNonlinearSolveLineSearchAlgorithm) + Base.depwarn("Passing in a `LineSearches.jl` algorithm directly is deprecated. 
\ + Please use `LineSearchesJL` instead.", + :GeneralizedFirstOrderAlgorithm) + linesearch = LineSearchesJL(; method = linesearch) + end + return ApproximateJacobianSolveAlgorithm{concrete_jac, name}(linesearch, trustregion, + descent, update_rule, reinit_rule, max_resets, max_shrink_times, initialization) +end + +@inline concrete_jac(::ApproximateJacobianSolveAlgorithm{CJ}) where {CJ} = CJ + +@concrete mutable struct ApproximateJacobianSolveCache{INV, GB, iip, timeit} <: + AbstractNonlinearSolveCache{iip, timeit} + # Basic Requirements + fu + u + u_cache + p + du # Aliased to `get_du(descent_cache)` + J # Aliased to `initialization_cache.J` if !INV + alg + prob + + # Internal Caches + initialization_cache + descent_cache + linesearch_cache + trustregion_cache + update_rule_cache + reinit_rule_cache + + inv_workspace + + # Counters + nf::Int + nsteps::Int + nresets::Int + max_resets::Int + maxiters::Int + maxtime + max_shrink_times::Int + steps_since_last_reset::Int + + # Timer + timer + total_time::Float64 # Simple Counter which works even if TimerOutput is disabled + + # Termination & Tracking + termination_cache + trace + retcode::ReturnCode.T + force_stop::Bool + force_reinit::Bool +end + +store_inverse_jacobian(::ApproximateJacobianSolveCache{INV}) where {INV} = INV + +function __reinit_internal!(cache::ApproximateJacobianSolveCache{INV, GB, iip}, args...; + p = cache.p, u0 = cache.u, alias_u0::Bool = false, maxiters = 1000, + maxtime = nothing, kwargs...) 
where {INV, GB, iip} + if iip + recursivecopy!(cache.u, u0) + cache.prob.f(cache.fu, cache.u, p) + else + cache.u = __maybe_unaliased(u0, alias_u0) + set_fu!(cache, cache.prob.f(cache.u, p)) + end + cache.p = p + + cache.nf = 1 + cache.nsteps = 0 + cache.nresets = 0 + cache.steps_since_last_reset = 0 + cache.maxiters = maxiters + cache.maxtime = maxtime + cache.total_time = 0.0 + cache.force_stop = false + cache.force_reinit = false + cache.retcode = ReturnCode.Default + + reset!(cache.trace) + reinit!(cache.termination_cache, get_fu(cache), get_u(cache); kwargs...) + reset_timer!(cache.timer) +end + +@internal_caches ApproximateJacobianSolveCache :initialization_cache :descent_cache :linesearch_cache :trustregion_cache :update_rule_cache :reinit_rule_cache + +function SciMLBase.__init(prob::AbstractNonlinearProblem{uType, iip}, + alg::ApproximateJacobianSolveAlgorithm, args...; alias_u0 = false, + maxtime = nothing, maxiters = 1000, abstol = nothing, reltol = nothing, + linsolve_kwargs = (;), termination_condition = nothing, + internalnorm::F = DEFAULT_NORM, kwargs...) where {uType, iip, F} + timer = get_timer_output() + @static_timeit timer "cache construction" begin + (; f, u0, p) = prob + u = __maybe_unaliased(u0, alias_u0) + fu = evaluate_f(prob, u) + @bb u_cache = copy(u) + + INV = store_inverse_jacobian(alg.update_rule) + + linsolve = get_linear_solver(alg.descent) + initialization_cache = __internal_init(prob, alg.initialization, alg, f, fu, u, p; + linsolve, + maxiters, internalnorm) + + abstol, reltol, termination_cache = init_termination_cache(abstol, reltol, fu, u, + termination_condition) + linsolve_kwargs = merge((; abstol, reltol), linsolve_kwargs) + + J = initialization_cache(nothing) + inv_workspace, J = INV ? 
__safe_inv_workspace(J) : (nothing, J) + descent_cache = __internal_init(prob, alg.descent, J, fu, u; abstol, reltol, + internalnorm, linsolve_kwargs, pre_inverted = Val(INV), timer) + du = get_du(descent_cache) + + reinit_rule_cache = __internal_init(alg.reinit_rule, J, fu, u, du) + + if alg.trustregion !== missing && alg.linesearch !== missing + error("TrustRegion and LineSearch methods are algorithmically incompatible.") + end + + GB = :None + linesearch_cache = nothing + trustregion_cache = nothing + + if alg.trustregion !== missing + supports_trust_region(alg.descent) || error("Trust Region not supported by \ + $(alg.descent).") + trustregion_cache = __internal_init(prob, alg.trustregion, f, fu, u, p; + internalnorm, kwargs...) + GB = :TrustRegion + end + + if alg.linesearch !== missing + supports_line_search(alg.descent) || error("Line Search not supported by \ + $(alg.descent).") + linesearch_cache = __internal_init(prob, alg.linesearch, f, fu, u, p; + internalnorm, + kwargs...) + GB = :LineSearch + end + + update_rule_cache = __internal_init(prob, alg.update_rule, J, fu, u, du; + internalnorm) + + trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; + uses_jacobian_inverse = Val(INV), kwargs...) 
+ + return ApproximateJacobianSolveCache{INV, GB, iip, maxtime !== nothing}(fu, u, + u_cache, p, du, J, alg, prob, initialization_cache, descent_cache, + linesearch_cache, trustregion_cache, update_rule_cache, reinit_rule_cache, + inv_workspace, 0, 0, 0, alg.max_resets, maxiters, maxtime, alg.max_shrink_times, + 0, timer, 0.0, termination_cache, trace, ReturnCode.Default, false, false) + end +end + +function __step!(cache::ApproximateJacobianSolveCache{INV, GB, iip}; + recompute_jacobian::Union{Nothing, Bool} = nothing) where {INV, GB, iip} + new_jacobian = true + @static_timeit cache.timer "jacobian init/reinit" begin + if get_nsteps(cache) == 0 # First Step is special ignore kwargs + J_init = __internal_solve!(cache.initialization_cache, + cache.fu, + cache.u, + Val(false)) + if INV + if jacobian_initialized_preinverted(cache.initialization_cache.alg) + cache.J = J_init + else + cache.J = __safe_inv!!(cache.inv_workspace, J_init) + end + else + if jacobian_initialized_preinverted(cache.initialization_cache.alg) + cache.J = __safe_inv!!(cache.inv_workspace, J_init) + else + cache.J = J_init + end + end + J = cache.J + cache.steps_since_last_reset += 1 + else + countable_reinit = false + if cache.force_reinit + reinit, countable_reinit = true, true + cache.force_reinit = false + elseif recompute_jacobian === nothing + # Standard Step + reinit = __internal_solve!(cache.reinit_rule_cache, cache.J, cache.fu, + cache.u, cache.du) + reinit && (countable_reinit = true) + elseif recompute_jacobian + reinit = true # Force ReInitialization: Don't count towards resetting + else + new_jacobian = false # Jacobian won't be updated in this step + reinit = false # Override Checks: Unsafe operation + end + + if countable_reinit + cache.nresets += 1 + if cache.nresets ≥ cache.max_resets + cache.retcode = ReturnCode.ConvergenceFailure + cache.force_stop = true + return + end + end + + if reinit + J_init = __internal_solve!(cache.initialization_cache, cache.fu, cache.u, + Val(true)) 
+ cache.J = INV ? __safe_inv!!(cache.inv_workspace, J_init) : J_init + J = cache.J + cache.steps_since_last_reset = 0 + else + J = cache.J + cache.steps_since_last_reset += 1 + end + end + end + + @static_timeit cache.timer "descent" begin + if cache.trustregion_cache !== nothing && + hasfield(typeof(cache.trustregion_cache), :trust_region) + δu, descent_success, descent_intermediates = __internal_solve!(cache.descent_cache, + J, cache.fu, cache.u; new_jacobian, + trust_region = cache.trustregion_cache.trust_region) + else + δu, descent_success, descent_intermediates = __internal_solve!(cache.descent_cache, + J, cache.fu, cache.u; new_jacobian) + end + end + + if descent_success + if GB === :LineSearch + @static_timeit cache.timer "linesearch" begin + needs_reset, α = __internal_solve!(cache.linesearch_cache, cache.u, δu) + end + if needs_reset && cache.steps_since_last_reset > 5 # Reset after a burn-in period + cache.force_reinit = true + else + @static_timeit cache.timer "step" begin + @bb axpy!(α, δu, cache.u) + evaluate_f!(cache, cache.u, cache.p) + end + end + elseif GB === :TrustRegion + @static_timeit cache.timer "trustregion" begin + tr_accepted, u_new, fu_new = __internal_solve!(cache.trustregion_cache, J, + cache.fu, cache.u, δu, descent_intermediates) + if tr_accepted + @bb copyto!(cache.u, u_new) + @bb copyto!(cache.fu, fu_new) + end + if hasfield(typeof(cache.trustregion_cache), :shrink_counter) && + cache.trustregion_cache.shrink_counter > cache.max_shrink_times + cache.retcode = ReturnCode.ShrinkThresholdExceeded + cache.force_stop = true + end + end + α = true + elseif GB === :None + @static_timeit cache.timer "step" begin + @bb axpy!(1, δu, cache.u) + evaluate_f!(cache, cache.u, cache.p) + end + α = true + else + error("Unknown Globalization Strategy: $(GB). 
Allowed values are (:LineSearch, \ + :TrustRegion, :None)") + end + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) + else + α = false + cache.force_reinit = true + end + + update_trace!(cache, α) + @bb copyto!(cache.u_cache, cache.u) + + if (cache.force_stop || cache.force_reinit || + (recompute_jacobian !== nothing && !recompute_jacobian)) + callback_into_cache!(cache) + return nothing + end + + @static_timeit cache.timer "jacobian update" begin + cache.J = __internal_solve!(cache.update_rule_cache, cache.J, cache.fu, cache.u, δu) + callback_into_cache!(cache) + end + + return nothing +end diff --git a/src/core/generalized_first_order.jl b/src/core/generalized_first_order.jl new file mode 100644 index 000000000..1ac5ae018 --- /dev/null +++ b/src/core/generalized_first_order.jl @@ -0,0 +1,282 @@ +""" + GeneralizedFirstOrderAlgorithm{concrete_jac, name}(; descent, linesearch = missing, + trustregion = missing, jacobian_ad = nothing, forward_ad = nothing, + reverse_ad = nothing, max_shrink_times::Int = typemax(Int)) + GeneralizedFirstOrderAlgorithm(; concrete_jac = nothing, name::Symbol = :unknown, + kwargs...) + +This is a Generalization of First-Order (uses Jacobian) Nonlinear Solve Algorithms. The most +common example of this is Newton-Raphson Method. + +First Order here refers to the order of differentiation, and should not be confused with the +order of convergence. + +`trustregion` and `linesearch` cannot be specified together. + +### Keyword Arguments + + - `trustregion`: Globalization using a Trust Region Method. This needs to follow the + [`NonlinearSolve.AbstractTrustRegionMethod`](@ref) interface. + - `descent`: The descent method to use to compute the step. This needs to follow the + [`NonlinearSolve.AbstractDescentAlgorithm`](@ref) interface. + - `max_shrink_times`: The maximum number of times the trust region radius can be shrunk + before the algorithm terminates. 
+""" +@concrete struct GeneralizedFirstOrderAlgorithm{concrete_jac, name} <: + AbstractNonlinearSolveAlgorithm{name} + linesearch + trustregion + descent + max_shrink_times::Int + jacobian_ad + forward_ad + reverse_ad +end + +function __show_algorithm(io::IO, alg::GeneralizedFirstOrderAlgorithm, name, indent) + modifiers = String[] + __is_present(alg.linesearch) && push!(modifiers, "linesearch = $(alg.linesearch)") + __is_present(alg.trustregion) && push!(modifiers, "trustregion = $(alg.trustregion)") + push!(modifiers, "descent = $(alg.descent)") + __is_present(alg.jacobian_ad) && push!(modifiers, "jacobian_ad = $(alg.jacobian_ad)") + __is_present(alg.forward_ad) && push!(modifiers, "forward_ad = $(alg.forward_ad)") + __is_present(alg.reverse_ad) && push!(modifiers, "reverse_ad = $(alg.reverse_ad)") + spacing = " "^indent * " " + spacing_last = " "^indent + print(io, "$(name)(\n$(spacing)$(join(modifiers, ",\n$(spacing)"))\n$(spacing_last))") +end + +function GeneralizedFirstOrderAlgorithm(; concrete_jac = nothing, + name::Symbol = :unknown, kwargs...) + return GeneralizedFirstOrderAlgorithm{concrete_jac, name}(; kwargs...) +end + +function GeneralizedFirstOrderAlgorithm{concrete_jac, name}(; descent, + linesearch = missing, trustregion = missing, jacobian_ad = nothing, + forward_ad = nothing, reverse_ad = nothing, + max_shrink_times::Int = typemax(Int)) where {concrete_jac, name} + forward_ad = ifelse(forward_ad !== nothing, forward_ad, + ifelse(jacobian_ad isa ADTypes.AbstractForwardMode, jacobian_ad, nothing)) + reverse_ad = ifelse(reverse_ad !== nothing, reverse_ad, + ifelse(jacobian_ad isa ADTypes.AbstractReverseMode, jacobian_ad, nothing)) + + if linesearch !== missing && !(linesearch isa AbstractNonlinearSolveLineSearchAlgorithm) + Base.depwarn("Passing in a `LineSearches.jl` algorithm directly is deprecated. 
\ + Please use `LineSearchesJL` instead.", + :GeneralizedFirstOrderAlgorithm) + linesearch = LineSearchesJL(; method = linesearch) + end + + return GeneralizedFirstOrderAlgorithm{concrete_jac, name}(linesearch, + trustregion, descent, max_shrink_times, jacobian_ad, forward_ad, reverse_ad) +end + +concrete_jac(::GeneralizedFirstOrderAlgorithm{CJ}) where {CJ} = CJ + +@concrete mutable struct GeneralizedFirstOrderAlgorithmCache{iip, GB, timeit} <: + AbstractNonlinearSolveCache{iip, timeit} + # Basic Requirements + fu + u + u_cache + p + du # Aliased to `get_du(descent_cache)` + J # Aliased to `jac_cache.J` + alg + prob + + # Internal Caches + jac_cache + descent_cache + linesearch_cache + trustregion_cache + + # Counters + nf::Int + nsteps::Int + maxiters::Int + maxtime + max_shrink_times::Int + + # Timer + timer + total_time::Float64 # Simple Counter which works even if TimerOutput is disabled + + # State Affect + make_new_jacobian::Bool + + # Termination & Tracking + termination_cache + trace + retcode::ReturnCode.T + force_stop::Bool +end + +function __reinit_internal!(cache::GeneralizedFirstOrderAlgorithmCache{iip}, args...; + p = cache.p, u0 = cache.u, alias_u0::Bool = false, maxiters = 1000, + maxtime = nothing, kwargs...) where {iip} + if iip + recursivecopy!(cache.u, u0) + cache.prob.f(cache.fu, cache.u, p) + else + cache.u = __maybe_unaliased(u0, alias_u0) + set_fu!(cache, cache.prob.f(cache.u, p)) + end + cache.p = p + + cache.nf = 1 + cache.nsteps = 0 + cache.maxiters = maxiters + cache.maxtime = maxtime + cache.total_time = 0.0 + cache.force_stop = false + cache.retcode = ReturnCode.Default + cache.make_new_jacobian = true + + reset!(cache.trace) + reinit!(cache.termination_cache, get_fu(cache), get_u(cache); kwargs...) 
+ reset_timer!(cache.timer) +end + +@internal_caches GeneralizedFirstOrderAlgorithmCache :jac_cache :descent_cache :linesearch_cache :trustregion_cache + +function SciMLBase.__init(prob::AbstractNonlinearProblem{uType, iip}, + alg::GeneralizedFirstOrderAlgorithm, args...; alias_u0 = false, maxiters = 1000, + abstol = nothing, reltol = nothing, maxtime = nothing, + termination_condition = nothing, internalnorm = DEFAULT_NORM, linsolve_kwargs = (;), + kwargs...) where {uType, iip} + timer = get_timer_output() + @static_timeit timer "cache construction" begin + (; f, u0, p) = prob + u = __maybe_unaliased(u0, alias_u0) + fu = evaluate_f(prob, u) + @bb u_cache = copy(u) + + linsolve = get_linear_solver(alg.descent) + + abstol, reltol, termination_cache = init_termination_cache(abstol, reltol, fu, u, + termination_condition) + linsolve_kwargs = merge((; abstol, reltol), linsolve_kwargs) + + jac_cache = JacobianCache(prob, alg, f, fu, u, p; autodiff = alg.jacobian_ad, + linsolve, jvp_autodiff = alg.forward_ad, vjp_autodiff = alg.reverse_ad) + J = jac_cache(nothing) + descent_cache = __internal_init(prob, alg.descent, J, fu, u; abstol, reltol, + internalnorm, linsolve_kwargs, timer) + du = get_du(descent_cache) + + if alg.trustregion !== missing && alg.linesearch !== missing + error("TrustRegion and LineSearch methods are algorithmically incompatible.") + end + + GB = :None + linesearch_cache = nothing + trustregion_cache = nothing + + if alg.trustregion !== missing + supports_trust_region(alg.descent) || error("Trust Region not supported by \ + $(alg.descent).") + trustregion_cache = __internal_init(prob, alg.trustregion, f, fu, u, p; + internalnorm, + kwargs...) + GB = :TrustRegion + end + + if alg.linesearch !== missing + supports_line_search(alg.descent) || error("Line Search not supported by \ + $(alg.descent).") + linesearch_cache = __internal_init(prob, alg.linesearch, f, fu, u, p; + internalnorm, + kwargs...) 
+ GB = :LineSearch + end + + trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) + + return GeneralizedFirstOrderAlgorithmCache{iip, GB, maxtime !== nothing}(fu, u, + u_cache, p, du, J, alg, prob, jac_cache, descent_cache, linesearch_cache, + trustregion_cache, 0, 0, maxiters, maxtime, alg.max_shrink_times, timer, 0.0, + true, termination_cache, trace, ReturnCode.Default, false) + end +end + +function __step!(cache::GeneralizedFirstOrderAlgorithmCache{iip, GB}; + recompute_jacobian::Union{Nothing, Bool} = nothing, kwargs...) where {iip, GB} + @static_timeit cache.timer "jacobian" begin + if (recompute_jacobian === nothing || recompute_jacobian) && cache.make_new_jacobian + J = cache.jac_cache(cache.u) + new_jacobian = true + else + J = cache.jac_cache(nothing) + new_jacobian = false + end + end + + @static_timeit cache.timer "descent" begin + if cache.trustregion_cache !== nothing && + hasfield(typeof(cache.trustregion_cache), :trust_region) + δu, descent_success, descent_intermediates = __internal_solve!(cache.descent_cache, + J, cache.fu, cache.u; new_jacobian, + trust_region = cache.trustregion_cache.trust_region) + else + δu, descent_success, descent_intermediates = __internal_solve!(cache.descent_cache, + J, cache.fu, cache.u; new_jacobian) + end + end + + if descent_success + cache.make_new_jacobian = true + if GB === :LineSearch + @static_timeit cache.timer "linesearch" begin + linesearch_failed, α = __internal_solve!(cache.linesearch_cache, + cache.u, δu) + end + if linesearch_failed + cache.retcode = ReturnCode.InternalLineSearchFailed + cache.force_stop = true + end + @static_timeit cache.timer "step" begin + @bb axpy!(α, δu, cache.u) + evaluate_f!(cache, cache.u, cache.p) + end + elseif GB === :TrustRegion + @static_timeit cache.timer "trustregion" begin + tr_accepted, u_new, fu_new = __internal_solve!(cache.trustregion_cache, J, + cache.fu, cache.u, δu, descent_intermediates) + if tr_accepted + @bb copyto!(cache.u, u_new) 
+ @bb copyto!(cache.fu, fu_new) + α = true + else + α = false + cache.make_new_jacobian = false + end + if hasfield(typeof(cache.trustregion_cache), :shrink_counter) && + cache.trustregion_cache.shrink_counter > cache.max_shrink_times + cache.retcode = ReturnCode.ShrinkThresholdExceeded + cache.force_stop = true + end + end + elseif GB === :None + @static_timeit cache.timer "step" begin + @bb axpy!(1, δu, cache.u) + evaluate_f!(cache, cache.u, cache.p) + end + α = true + else + error("Unknown Globalization Strategy: $(GB). Allowed values are (:LineSearch, \ + :TrustRegion, :None)") + end + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) + else + α = false + cache.make_new_jacobian = false + end + + update_trace!(cache, α) + @bb copyto!(cache.u_cache, cache.u) + + callback_into_cache!(cache) + + return nothing +end diff --git a/src/core/generic.jl b/src/core/generic.jl new file mode 100644 index 000000000..849a259f1 --- /dev/null +++ b/src/core/generic.jl @@ -0,0 +1,66 @@ +function SciMLBase.__solve(prob::Union{NonlinearProblem, NonlinearLeastSquaresProblem}, + alg::AbstractNonlinearSolveAlgorithm, args...; kwargs...) + cache = init(prob, alg, args...; kwargs...) 
+ return solve!(cache) +end + +function not_terminated(cache::AbstractNonlinearSolveCache) + return !cache.force_stop && get_nsteps(cache) < cache.maxiters +end + +function SciMLBase.solve!(cache::AbstractNonlinearSolveCache) + while not_terminated(cache) + step!(cache) + end + + # The solver might have set a different `retcode` + if cache.retcode == ReturnCode.Default + cache.retcode = ifelse(get_nsteps(cache) ≥ cache.maxiters, ReturnCode.MaxIters, + ReturnCode.Success) + end + + update_from_termination_cache!(cache.termination_cache, cache) + + update_trace!(cache.trace, get_nsteps(cache), get_u(cache), get_fu(cache), nothing, + nothing, nothing; last = True) + + stats = ImmutableNLStats(get_nf(cache), get_njacs(cache), get_nfactors(cache), + get_nsolve(cache), get_nsteps(cache)) + + return SciMLBase.build_solution(cache.prob, cache.alg, get_u(cache), get_fu(cache); + cache.retcode, stats, cache.trace) +end + +""" + step!(cache::AbstractNonlinearSolveCache; + recompute_jacobian::Union{Nothing, Bool} = nothing) + +Performs one step of the nonlinear solver. + +### Keyword Arguments + + - `recompute_jacobian`: allows controlling whether the jacobian is recomputed at the + current step. If `nothing`, then the algorithm determines whether to recompute the + jacobian. If `true` or `false`, then the jacobian is recomputed or not recomputed, + respectively. For algorithms that don't use jacobian information, this keyword is + ignored with a one-time warning. +""" +function SciMLBase.step!(cache::AbstractNonlinearSolveCache{iip, timeit}, args...; + kwargs...) where {iip, timeit} + timeit && (time_start = time()) + res = @static_timeit cache.timer "solve" begin + __step!(cache, args...; kwargs...) 
+ end + cache.nsteps += 1 + + if timeit + cache.total_time += time() - time_start + if !cache.force_stop && cache.retcode == ReturnCode.Default && + cache.total_time ≥ cache.maxtime + cache.retcode = ReturnCode.MaxTime + cache.force_stop = true + end + end + + return res +end diff --git a/src/core/spectral_methods.jl b/src/core/spectral_methods.jl new file mode 100644 index 000000000..31ef18343 --- /dev/null +++ b/src/core/spectral_methods.jl @@ -0,0 +1,209 @@ +# For spectral methods we currently only implement DF-SANE since after reading through +# papers, this seems to be the only one that is widely used. If we have a list of more +# papers we can see what is the right level of abstraction to implement here +""" + GeneralizedDFSane{name}(linesearch, σ_min, σ_max, σ_1) + +A generalized version of the DF-SANE algorithm. This algorithm is a Jacobian-Free Spectral +Method. + +### Arguments + + - `linesearch`: Globalization using a Line Search Method. This needs to follow the + [`NonlinearSolve.AbstractNonlinearSolveLineSearchAlgorithm`](@ref) interface. This + is not optional currently, but that restriction might be lifted in the future. + - `σ_min`: The minimum spectral parameter allowed. This is used to ensure that the + spectral parameter is not too small. + - `σ_max`: The maximum spectral parameter allowed. This is used to ensure that the + spectral parameter is not too large. + - `σ_1`: The initial spectral parameter. If this is not provided, then the algorithm + initializes it as `σ_1 = / `. 
+""" +@concrete struct GeneralizedDFSane{name} <: AbstractNonlinearSolveAlgorithm{name} + linesearch + σ_min + σ_max + σ_1 +end + +function __show_algorithm(io::IO, alg::GeneralizedDFSane, name, indent) + modifiers = String[] + __is_present(alg.linesearch) && push!(modifiers, "linesearch = $(alg.linesearch)") + push!(modifiers, "σ_min = $(alg.σ_min)") + push!(modifiers, "σ_max = $(alg.σ_max)") + push!(modifiers, "σ_1 = $(alg.σ_1)") + spacing = " "^indent * " " + spacing_last = " "^indent + print(io, "$(name)(\n$(spacing)$(join(modifiers, ",\n$(spacing)"))\n$(spacing_last))") +end + +concrete_jac(::GeneralizedDFSane) = nothing + +@concrete mutable struct GeneralizedDFSaneCache{iip, timeit} <: + AbstractNonlinearSolveCache{iip, timeit} + # Basic Requirements + fu + fu_cache + u + u_cache + p + du + alg + prob + + # Parameters + σ_n + σ_min + σ_max + + # Internal Caches + linesearch_cache + + # Counters + nf::Int + nsteps::Int + maxiters::Int + maxtime + + # Timer + timer + total_time::Float64 # Simple Counter which works even if TimerOutput is disabled + + # Termination & Tracking + termination_cache + trace + retcode::ReturnCode.T + force_stop::Bool +end + +function __reinit_internal!(cache::GeneralizedDFSaneCache{iip}, args...; p = cache.p, + u0 = cache.u, alias_u0::Bool = false, maxiters = 1000, maxtime = nothing, + kwargs...) 
where {iip} + if iip + recursivecopy!(cache.u, u0) + cache.prob.f(cache.fu, cache.u, p) + else + cache.u = __maybe_unaliased(u0, alias_u0) + set_fu!(cache, cache.prob.f(cache.u, p)) + end + cache.p = p + + if cache.alg.σ_1 === nothing + σ_n = dot(cache.u, cache.u) / dot(cache.u, cache.fu) + # Spectral parameter bounds check + if !(cache.alg.σ_min ≤ abs(σ_n) ≤ cache.alg.σ_max) + test_norm = dot(cache.fu, cache.fu) + σ_n = clamp(inv(test_norm), T(1), T(1e5)) + end + else + σ_n = T(cache.alg.σ_1) + end + cache.σ_n = σ_n + + reset_timer!(cache.timer) + cache.total_time = 0.0 + + reset!(cache.trace) + reinit!(cache.termination_cache, get_fu(cache), get_u(cache); kwargs...) + cache.nf = 1 + cache.nsteps = 0 + cache.maxiters = maxiters + cache.maxtime = maxtime + cache.force_stop = false + cache.retcode = ReturnCode.Default +end + +@internal_caches GeneralizedDFSaneCache :linesearch_cache + +function SciMLBase.__init(prob::AbstractNonlinearProblem, alg::GeneralizedDFSane, args...; + alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, + termination_condition = nothing, internalnorm::F = DEFAULT_NORM, maxtime = nothing, + kwargs...) where {F} + timer = get_timer_output() + @static_timeit timer "cache construction" begin + u = __maybe_unaliased(prob.u0, alias_u0) + T = eltype(u) + + @bb du = similar(u) + @bb u_cache = copy(u) + fu = evaluate_f(prob, u) + @bb fu_cache = copy(fu) + + linesearch_cache = __internal_init(prob, alg.linesearch, prob.f, fu, u, prob.p; + maxiters, internalnorm, kwargs...) + + abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u_cache, + termination_condition) + trace = init_nonlinearsolve_trace(alg, u, fu, nothing, du; kwargs...) 
+ + if alg.σ_1 === nothing + σ_n = dot(u, u) / dot(u, fu) + # Spectral parameter bounds check + if !(alg.σ_min ≤ abs(σ_n) ≤ alg.σ_max) + test_norm = dot(fu, fu) + σ_n = clamp(inv(test_norm), T(1), T(1e5)) + end + else + σ_n = T(alg.σ_1) + end + + return GeneralizedDFSaneCache{isinplace(prob), maxtime !== nothing}(fu, fu_cache, u, + u_cache, prob.p, du, alg, prob, σ_n, T(alg.σ_min), T(alg.σ_max), + linesearch_cache, 0, 0, maxiters, maxtime, timer, 0.0, tc_cache, trace, + ReturnCode.Default, false) + end +end + +function __step!(cache::GeneralizedDFSaneCache{iip}; + recompute_jacobian::Union{Nothing, Bool} = nothing, kwargs...) where {iip} + if recompute_jacobian !== nothing + @warn "GeneralizedDFSane is a Jacobian-Free Algorithm. Ignoring \ + `recompute_jacobian`" maxlog=1 + end + + @static_timeit cache.timer "descent" begin + @bb @. cache.du = -cache.σ_n * cache.fu + end + + @static_timeit cache.timer "linesearch" begin + linesearch_failed, α = __internal_solve!(cache.linesearch_cache, cache.u, cache.du) + end + + if linesearch_failed + cache.retcode = ReturnCode.InternalLineSearchFailed + cache.force_stop = true + return + end + + @static_timeit cache.timer "step" begin + @bb axpy!(α, cache.du, cache.u) + evaluate_f!(cache, cache.u, cache.p) + end + + update_trace!(cache, α) + check_and_update!(cache, cache.fu, cache.u, cache.u_cache) + + # Update Spectral Parameter + @static_timeit cache.timer "update spectral parameter" begin + @bb @. cache.u_cache = cache.u - cache.u_cache + @bb @. 
cache.fu_cache = cache.fu - cache.fu_cache + + cache.σ_n = __dot(cache.u_cache, cache.u_cache) / + __dot(cache.u_cache, cache.fu_cache) + + # Spectral parameter bounds check + if !(cache.σ_min ≤ abs(cache.σ_n) ≤ cache.σ_max) + test_norm = dot(cache.fu, cache.fu) + T = eltype(cache.σ_n) + cache.σ_n = clamp(inv(test_norm), T(1), T(1e5)) + end + end + + # Take step + @bb copyto!(cache.u_cache, cache.u) + @bb copyto!(cache.fu_cache, cache.fu) + + callback_into_cache!(cache, cache.linesearch_cache) + + return +end diff --git a/src/default.jl b/src/default.jl index abc4397d1..b83ee3f4e 100644 --- a/src/default.jl +++ b/src/default.jl @@ -1,3 +1,4 @@ +# Poly Algorithms """ NonlinearSolvePolyAlgorithm(algs, ::Val{pType} = Val(:NLS)) where {pType} @@ -6,7 +7,7 @@ A general way to define PolyAlgorithms for `NonlinearProblem` and tried in order until one succeeds. If none succeed, then the algorithm with the lowest residual is returned. -## Arguments +### Arguments - `algs`: a tuple of algorithms to try in-order! (If this is not a Tuple, then the returned algorithm is not type-stable). @@ -14,7 +15,7 @@ residual is returned. `NonlinearLeastSquaresProblem`. This is used to determine the correct problem type to dispatch on. 
-## Example +### Example ```julia using NonlinearSolve @@ -22,7 +23,7 @@ using NonlinearSolve alg = NonlinearSolvePolyAlgorithm((NewtonRaphson(), Broyden())) ``` """ -struct NonlinearSolvePolyAlgorithm{pType, N, A} <: AbstractNonlinearSolveAlgorithm +struct NonlinearSolvePolyAlgorithm{pType, N, A} <: AbstractNonlinearSolveAlgorithm{:PolyAlg} algs::A function NonlinearSolvePolyAlgorithm(algs, ::Val{pType} = Val(:NLS)) where {pType} @@ -35,19 +36,26 @@ end function Base.show(io::IO, alg::NonlinearSolvePolyAlgorithm{pType, N}) where {pType, N} problem_kind = ifelse(pType == :NLS, "NonlinearProblem", "NonlinearLeastSquaresProblem") println(io, "NonlinearSolvePolyAlgorithm for $(problem_kind) with $(N) algorithms") - for i in 1:(N - 1) - println(io, " $(i): $(alg.algs[i])") + for i in 1:N + num = " [$(i)]: " + print(io, num) + __show_algorithm(io, alg.algs[i], get_name(alg.algs[i]), length(num)) + i == N || println(io) end - print(io, " $(N): $(alg.algs[N])") end @concrete mutable struct NonlinearSolvePolyAlgorithmCache{iip, N} <: - AbstractNonlinearSolveCache{iip} + AbstractNonlinearSolveCache{iip, false} caches alg current::Int end +function reinit_cache!(cache::NonlinearSolvePolyAlgorithmCache, args...; kwargs...) + foreach(c -> reinit_cache!(c, args...; kwargs...), cache.caches) + cache.current = 1 +end + for (probType, pType) in ((:NonlinearProblem, :NLS), (:NonlinearLeastSquaresProblem, :NLLS)) algType = NonlinearSolvePolyAlgorithm{pType} @eval begin @@ -158,12 +166,6 @@ for (probType, pType) in ((:NonlinearProblem, :NLS), (:NonlinearLeastSquaresProb end end -function SciMLBase.reinit!(cache::NonlinearSolvePolyAlgorithmCache, args...; kwargs...) - for c in cache.caches - SciMLBase.reinit!(c, args...; kwargs...) - end -end - """ RobustMultiNewton(::Type{T} = Float64; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, autodiff = nothing) @@ -180,26 +182,6 @@ or more precision / more stable linear solver choice is required). 
- `T`: The eltype of the initial guess. It is only used to check if some of the algorithms are compatible with the problem type. Defaults to `Float64`. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing`. - - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). 
""" function RobustMultiNewton(::Type{T} = Float64; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, autodiff = nothing) where {T} @@ -210,8 +192,8 @@ function RobustMultiNewton(::Type{T} = Float64; concrete_jac = nothing, linsolve algs = (TrustRegion(; concrete_jac, linsolve, precs, autodiff), TrustRegion(; concrete_jac, linsolve, precs, autodiff, radius_update_scheme = RadiusUpdateSchemes.Bastin), - NewtonRaphson(; concrete_jac, linsolve, precs, linesearch = BackTracking(), - autodiff), + NewtonRaphson(; concrete_jac, linsolve, precs, + linesearch = LineSearchesJL(; method = BackTracking()), autodiff), TrustRegion(; concrete_jac, linsolve, precs, radius_update_scheme = RadiusUpdateSchemes.NLsolve, autodiff), TrustRegion(; concrete_jac, linsolve, precs, @@ -232,26 +214,6 @@ for more performance and then tries more robust techniques if the faster ones fa - `T`: The eltype of the initial guess. It is only used to check if some of the algorithms are compatible with the problem type. Defaults to `Float64`. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing`. - - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. 
For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). """ function FastShortcutNonlinearPolyalg(::Type{T} = Float64; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, must_use_jacobian::Val{JAC} = Val(false), @@ -262,8 +224,8 @@ function FastShortcutNonlinearPolyalg(::Type{T} = Float64; concrete_jac = nothin algs = (NewtonRaphson(; concrete_jac, linsolve, precs, autodiff),) else algs = (NewtonRaphson(; concrete_jac, linsolve, precs, autodiff), - NewtonRaphson(; concrete_jac, linsolve, precs, linesearch = BackTracking(), - autodiff), + NewtonRaphson(; concrete_jac, linsolve, precs, + linesearch = LineSearchesJL(; method = BackTracking()), autodiff), TrustRegion(; concrete_jac, linsolve, precs, autodiff), TrustRegion(; concrete_jac, linsolve, precs, radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff)) @@ -283,9 +245,7 @@ function FastShortcutNonlinearPolyalg(::Type{T} = Float64; concrete_jac = nothin SimpleKlement(), NewtonRaphson(; concrete_jac, linsolve, precs, autodiff), NewtonRaphson(; concrete_jac, linsolve, precs, - linesearch = BackTracking(), autodiff), - NewtonRaphson(; concrete_jac, linsolve, precs, - linesearch = BackTracking(), autodiff), + linesearch = LineSearchesJL(; method = BackTracking()), autodiff), TrustRegion(; concrete_jac, linsolve, precs, radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff)) end @@ -301,7 +261,7 @@ function FastShortcutNonlinearPolyalg(::Type{T} = Float64; concrete_jac = nothin Klement(; linsolve, precs), NewtonRaphson(; concrete_jac, linsolve, precs, autodiff), NewtonRaphson(; concrete_jac, linsolve, precs, - linesearch = BackTracking(), 
autodiff), + linesearch = LineSearchesJL(; method = BackTracking()), autodiff), TrustRegion(; concrete_jac, linsolve, precs, autodiff), TrustRegion(; concrete_jac, linsolve, precs, radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff)) @@ -322,40 +282,20 @@ for more performance and then tries more robust techniques if the faster ones fa - `T`: The eltype of the initial guess. It is only used to check if some of the algorithms are compatible with the problem type. Defaults to `Float64`. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `AutoForwardDiff()`. Valid choices are types from ADTypes.jl. - - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). 
""" function FastShortcutNLLSPolyalg(::Type{T} = Float64; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, kwargs...) where {T} if __is_complex(T) algs = (GaussNewton(; concrete_jac, linsolve, precs, kwargs...), - LevenbergMarquardt(; concrete_jac, linsolve, precs, kwargs...)) + LevenbergMarquardt(; linsolve, precs, kwargs...)) else algs = (GaussNewton(; concrete_jac, linsolve, precs, kwargs...), TrustRegion(; concrete_jac, linsolve, precs, kwargs...), - GaussNewton(; concrete_jac, linsolve, precs, linesearch = BackTracking(), - kwargs...), + GaussNewton(; concrete_jac, linsolve, precs, + linesearch = LineSearchesJL(; method = BackTracking()), kwargs...), TrustRegion(; concrete_jac, linsolve, precs, radius_update_scheme = RadiusUpdateSchemes.Bastin, kwargs...), - LevenbergMarquardt(; concrete_jac, linsolve, precs, kwargs...)) + LevenbergMarquardt(; linsolve, precs, kwargs...)) end return NonlinearSolvePolyAlgorithm(algs, Val(:NLLS)) end diff --git a/src/descent/damped_newton.jl b/src/descent/damped_newton.jl new file mode 100644 index 000000000..5a192a586 --- /dev/null +++ b/src/descent/damped_newton.jl @@ -0,0 +1,253 @@ +""" + DampedNewtonDescent(; linsolve = nothing, precs = DEFAULT_PRECS, initial_damping, + damping_fn) + +A Newton descent algorithm with damping. The damping factor is computed using the +`damping_fn` function. The descent direction is computed as ``(JᵀJ + λDᵀD) δu = -fu``. For +non-square Jacobians, we default to solving for `Jδx = -fu` and `√λ⋅D δx = 0` +simultaneously. If the linear solver can't handle non-square matrices, we use the normal +form equations ``(JᵀJ + λDᵀD) δu = Jᵀ fu``. Note that this factorization is often the faster +choice, but it is not as numerically stable as the least squares solver. + +The damping factor returned must be a non-negative number. + +### Keyword Arguments + + - `initial_damping`: The initial damping factor to use + - `damping_fn`: The function to use to compute the damping factor. 
This must satisfy the + [`NonlinearSolve.AbstractDampingFunction`](@ref) interface. +""" +@kwdef @concrete struct DampedNewtonDescent <: AbstractDescentAlgorithm + linsolve = nothing + precs = DEFAULT_PRECS + initial_damping + damping_fn +end + +function Base.show(io::IO, d::DampedNewtonDescent) + modifiers = String[] + d.linsolve !== nothing && push!(modifiers, "linsolve = $(d.linsolve)") + d.precs !== DEFAULT_PRECS && push!(modifiers, "precs = $(d.precs)") + push!(modifiers, "initial_damping = $(d.initial_damping)") + push!(modifiers, "damping_fn = $(d.damping_fn)") + print(io, "DampedNewtonDescent($(join(modifiers, ", ")))") +end + +supports_line_search(::DampedNewtonDescent) = true +supports_trust_region(::DampedNewtonDescent) = true + +@concrete mutable struct DampedNewtonDescentCache{pre_inverted, mode} <: + AbstractDescentCache + J + δu + δus + lincache + JᵀJ_cache + Jᵀfu_cache + rhs_cache + damping_fn_cache + timer +end + +@internal_caches DampedNewtonDescentCache :lincache :damping_fn_cache + +function __internal_init(prob::AbstractNonlinearProblem, alg::DampedNewtonDescent, J, fu, u; + pre_inverted::Val{INV} = False, linsolve_kwargs = (;), abstol = nothing, + timer = get_timer_output(), reltol = nothing, alias_J = true, + shared::Val{N} = Val(1), kwargs...) where {INV, N} + length(fu) != length(u) && + @assert !INV "Precomputed Inverse for Non-Square Jacobian doesn't make sense." + @bb δu = similar(u) + δus = N ≤ 1 ? nothing : map(2:N) do i + @bb δu_ = similar(u) + end + + normal_form_damping = returns_norm_form_damping(alg.damping_fn) + normal_form_linsolve = __needs_square_A(alg.linsolve, u) + if u isa Number + mode = :simple + elseif prob isa NonlinearProblem + mode = ifelse(!normal_form_damping, :simple, + ifelse(normal_form_linsolve, :normal_form, :least_squares)) + else + if normal_form_linsolve & !normal_form_damping + throw(ArgumentError("Linear Solver expects Normal Form but returned Damping is \ + not Normal Form. 
This is not supported.")) + end + mode = ifelse(normal_form_damping & !normal_form_linsolve, :least_squares, + ifelse(!normal_form_damping & !normal_form_linsolve, :simple, :normal_form)) + end + + if mode === :least_squares + if requires_normal_form_jacobian(alg.damping_fn) + JᵀJ = transpose(J) * J # Needed to compute the damping factor + jac_damp = JᵀJ + else + JᵀJ = nothing + jac_damp = J + end + if requires_normal_form_rhs(alg.damping_fn) + Jᵀfu = transpose(J) * _vec(fu) + rhs_damp = Jᵀfu + else + Jᵀfu = nothing + rhs_damp = fu + end + damping_fn_cache = __internal_init(prob, alg.damping_fn, alg.initial_damping, + jac_damp, rhs_damp, u, False; kwargs...) + D = damping_fn_cache(nothing) + D isa Number && (D = D * I) + rhs_cache = vcat(_vec(fu), _vec(u)) + J_cache = _vcat(J, D) + A, b = J_cache, rhs_cache + elseif mode === :simple + damping_fn_cache = __internal_init(prob, alg.damping_fn, alg.initial_damping, J, fu, + u, False; kwargs...) + J_cache = __maybe_unaliased(J, alias_J) + D = damping_fn_cache(nothing) + J_damped = __dampen_jacobian!!(J_cache, J, D) + J_cache = J_damped + A, b = J_damped, _vec(fu) + JᵀJ, Jᵀfu, rhs_cache = nothing, nothing, nothing + elseif mode === :normal_form + JᵀJ = transpose(J) * J + Jᵀfu = transpose(J) * _vec(fu) + jac_damp = requires_normal_form_jacobian(alg.damping_fn) ? JᵀJ : J + rhs_damp = requires_normal_form_rhs(alg.damping_fn) ? Jᵀfu : fu + damping_fn_cache = __internal_init(prob, alg.damping_fn, alg.initial_damping, + jac_damp, + rhs_damp, u, True; kwargs...) + D = damping_fn_cache(nothing) + @bb J_cache = similar(JᵀJ) + @bb @. J_cache = 0 + J_damped = __dampen_jacobian!!(J_cache, JᵀJ, D) + A, b = __maybe_symmetric(J_damped), _vec(Jᵀfu) + rhs_cache = nothing + end + + lincache = LinearSolverCache(alg, alg.linsolve, A, b, _vec(u); abstol, reltol, + linsolve_kwargs...) 
+ + return DampedNewtonDescentCache{INV, mode}(J_cache, δu, δus, lincache, JᵀJ, Jᵀfu, + rhs_cache, damping_fn_cache, timer) +end + +function __internal_solve!(cache::DampedNewtonDescentCache{INV, mode}, J, fu, u, + idx::Val{N} = Val(1); skip_solve::Bool = false, new_jacobian::Bool = true, + kwargs...) where {INV, N, mode} + δu = get_du(cache, idx) + skip_solve && return δu, true, (;) + + recompute_A = idx === Val(1) + + @static_timeit cache.timer "dampen" begin + if mode === :least_squares + if (J !== nothing || new_jacobian) && recompute_A + INV && (J = inv(J)) + if requires_normal_form_jacobian(cache.damping_fn_cache) + @bb cache.JᵀJ_cache = transpose(J) × J + jac_damp = cache.JᵀJ_cache + else + jac_damp = J + end + if requires_normal_form_rhs(cache.damping_fn_cache) + @bb cache.Jᵀfu_cache = transpose(J) × fu + rhs_damp = cache.Jᵀfu_cache + else + rhs_damp = fu + end + D = __internal_solve!(cache.damping_fn_cache, jac_damp, rhs_damp, False) + if __can_setindex(cache.J) + copyto!(@view(cache.J[1:size(J, 1), :]), J) + cache.J[(size(J, 1) + 1):end, :] .= sqrt.(D) + else + cache.J = _vcat(J, sqrt.(D)) + end + end + A = cache.J + if __can_setindex(cache.rhs_cache) + cache.rhs_cache[1:length(fu)] .= _vec(fu) + cache.rhs_cache[(length(fu) + 1):end] .= false + else + cache.rhs_cache = vcat(_vec(fu), zero(_vec(u))) + end + b = cache.rhs_cache + elseif mode === :simple + if (J !== nothing || new_jacobian) && recompute_A + INV && (J = inv(J)) + D = __internal_solve!(cache.damping_fn_cache, J, fu, False) + cache.J = __dampen_jacobian!!(cache.J, J, D) + end + A, b = cache.J, _vec(fu) + elseif mode === :normal_form + if (J !== nothing || new_jacobian) && recompute_A + INV && (J = inv(J)) + @bb cache.JᵀJ_cache = transpose(J) × J + @bb cache.Jᵀfu_cache = transpose(J) × vec(fu) + D = __internal_solve!(cache.damping_fn_cache, cache.JᵀJ_cache, + cache.Jᵀfu_cache, True) + cache.J = __dampen_jacobian!!(cache.J, cache.JᵀJ_cache, D) + A = __maybe_symmetric(cache.J) + elseif 
!recompute_A + @bb cache.Jᵀfu_cache = transpose(J) × vec(fu) + A = __maybe_symmetric(cache.J) + else + A = nothing + end + b = _vec(cache.Jᵀfu_cache) + else + error("Unknown mode: $(mode)") + end + end + + @static_timeit cache.timer "linear solve" begin + δu = cache.lincache(; A, b, + reuse_A_if_factorization = !new_jacobian && !recompute_A, + kwargs..., linu = _vec(δu)) + δu = _restructure(get_du(cache, idx), δu) + end + + @bb @. δu *= -1 + set_du!(cache, δu, idx) + return δu, true, (;) +end + +# Define special concatenation for certain Array combinations +@inline _vcat(x, y) = vcat(x, y) + +# J_cache is allowed to alias J +## Compute ``J + D`` +@inline __dampen_jacobian!!(J_cache, J::SciMLBase.AbstractSciMLOperator, D) = J + D +@inline __dampen_jacobian!!(J_cache, J::Number, D) = J + D +@inline function __dampen_jacobian!!(J_cache, J::AbstractMatrix, D::AbstractMatrix) + if __can_setindex(J_cache) + copyto!(J_cache, J) + if fast_scalar_indexing(J_cache) + @inbounds for i in axes(J_cache, 1) + J_cache[i, i] += D[i, i] + end + else + idxs = diagind(J_cache) + @.. broadcast=false @view(J_cache[idxs])=@view(J[idxs]) + @view(D[idxs]) + end + return J_cache + else + return @. J + D + end +end +@inline function __dampen_jacobian!!(J_cache, J::AbstractMatrix, D::Number) + if __can_setindex(J_cache) + copyto!(J_cache, J) + if fast_scalar_indexing(J_cache) + @inbounds for i in axes(J_cache, 1) + J_cache[i, i] += D + end + else + idxs = diagind(J_cache) + @.. broadcast=false @view(J_cache[idxs])=@view(J[idxs]) + D + end + return J_cache + else + return @. J + D + end +end diff --git a/src/descent/dogleg.jl b/src/descent/dogleg.jl new file mode 100644 index 000000000..e1a50832f --- /dev/null +++ b/src/descent/dogleg.jl @@ -0,0 +1,138 @@ +""" + Dogleg(; linsolve = nothing, precs = DEFAULT_PRECS) + +Switch between Newton's method and the steepest descent method depending on the size of the +trust region. 
The trust region is specified via keyword argument `trust_region` to +`solve!`. + +See also [`SteepestDescent`](@ref), [`NewtonDescent`](@ref), [`DampedNewtonDescent`](@ref). +""" +@concrete struct Dogleg <: AbstractDescentAlgorithm + newton_descent + steepest_descent +end + +function Base.show(io::IO, d::Dogleg) + print(io, + "Dogleg(newton_descent = $(d.newton_descent), steepest_descent = $(d.steepest_descent))") +end + +supports_trust_region(::Dogleg) = true +get_linear_solver(alg::Dogleg) = get_linear_solver(alg.newton_descent) + +function Dogleg(; linsolve = nothing, precs = DEFAULT_PRECS, damping = False, + damping_fn = missing, initial_damping = missing, kwargs...) + if damping === False + return Dogleg(NewtonDescent(; linsolve, precs), SteepestDescent(; linsolve, precs)) + end + if damping_fn === missing || initial_damping === missing + throw(ArgumentError("`damping_fn` and `initial_damping` must be supplied if \ + `damping = Val(true)`.")) + end + return Dogleg(DampedNewtonDescent(; linsolve, precs, damping_fn, initial_damping), + SteepestDescent(; linsolve, precs)) +end + +@concrete mutable struct DoglegCache{pre_inverted, normalform} <: + AbstractDescentCache + δu + δus + newton_cache + cauchy_cache + internalnorm + JᵀJ_cache + δu_cache_1 + δu_cache_2 + δu_cache_mul +end + +@internal_caches DoglegCache :newton_cache :cauchy_cache + +function __internal_init(prob::AbstractNonlinearProblem, alg::Dogleg, J, fu, u; + pre_inverted::Val{INV} = False, linsolve_kwargs = (;), abstol = nothing, + reltol = nothing, internalnorm::F = DEFAULT_NORM, shared::Val{N} = Val(1), + kwargs...) where {F, INV, N} + newton_cache = __internal_init(prob, alg.newton_descent, J, fu, u; pre_inverted, + linsolve_kwargs, abstol, reltol, shared, kwargs...) + cauchy_cache = __internal_init(prob, alg.steepest_descent, J, fu, u; pre_inverted, + linsolve_kwargs, abstol, reltol, shared, kwargs...) + @bb δu = similar(u) + δus = N ≤ 1 ? 
nothing : map(2:N) do i + @bb δu_ = similar(u) + end + @bb δu_cache_1 = similar(u) + @bb δu_cache_2 = similar(u) + @bb δu_cache_mul = similar(u) + + T = promote_type(eltype(u), eltype(fu)) + + normal_form = prob isa NonlinearLeastSquaresProblem && + __needs_square_A(alg.newton_descent.linsolve, u) + JᵀJ_cache = !normal_form ? J * _vec(δu) : nothing # TODO: Rename + + return DoglegCache{INV, normal_form}(δu, δus, newton_cache, cauchy_cache, internalnorm, + JᵀJ_cache, δu_cache_1, δu_cache_2, δu_cache_mul) +end + +# If TrustRegion is not specified, then use a Gauss-Newton step +function __internal_solve!(cache::DoglegCache{INV, NF}, J, fu, u, idx::Val{N} = Val(1); + trust_region = nothing, skip_solve::Bool = false, kwargs...) where {INV, NF, N} + @assert trust_region!==nothing "Trust Region must be specified for Dogleg. Use \ + `NewtonDescent` or `SteepestDescent` if you don't \ + want to use a Trust Region." + δu = get_du(cache, idx) + T = promote_type(eltype(u), eltype(fu)) + δu_newton, _, _ = __internal_solve!(cache.newton_cache, J, fu, u, idx; skip_solve, + kwargs...) + + # Newton's Step within the trust region + if cache.internalnorm(δu_newton) ≤ trust_region + @bb copyto!(δu, δu_newton) + set_du!(cache, δu, idx) + return δu, true, (; δuJᵀJδu = T(NaN)) + end + + # Take intersection of steepest descent direction and trust region if Cauchy point lies + # outside of trust region + if NF + δu_cauchy = cache.newton_cache.Jᵀfu_cache + JᵀJ = cache.newton_cache.JᵀJ_cache + @bb @. δu_cauchy *= -1 + + l_grad = cache.internalnorm(δu_cauchy) + @bb cache.δu_cache_mul = JᵀJ × vec(δu_cauchy) + δuJᵀJδu = __dot(δu_cauchy, cache.δu_cache_mul) + else + δu_cauchy, _, _ = __internal_solve!(cache.cauchy_cache, J, fu, u, idx; skip_solve, + kwargs...) + J_ = INV ? 
inv(J) : J + l_grad = cache.internalnorm(δu_cauchy) + @bb cache.JᵀJ_cache = J × vec(δu_cauchy) # TODO: Rename + δuJᵀJδu = __dot(cache.JᵀJ_cache, cache.JᵀJ_cache) + end + d_cauchy = (l_grad^3) / δuJᵀJδu + + if d_cauchy ≥ trust_region + λ = trust_region / l_grad + @bb @. δu = λ * δu_cauchy + set_du!(cache, δu, idx) + return δu, true, (; δuJᵀJδu = λ^2 * δuJᵀJδu) + end + + # FIXME: For anything other than 2-norm a quadratic root will give incorrect results + # We need to do a local search with a interval root finding algorithm + # optimistix has a proper implementation for this + # Take the intersection of dogleg with trust region if Cauchy point lies inside the + # trust region + @bb @. cache.δu_cache_1 = (d_cauchy / l_grad) * δu_cauchy + @bb @. cache.δu_cache_2 = δu_newton - cache.δu_cache_1 + a = dot(cache.δu_cache_2, cache.δu_cache_2) + b = 2 * dot(cache.δu_cache_1, cache.δu_cache_2) + c = d_cauchy^2 - trust_region^2 + aux = max(0, b^2 - 4 * a * c) + τ = (-b + sqrt(aux)) / (2 * a) + + @bb @. δu = cache.δu_cache_1 + τ * cache.δu_cache_2 + set_du!(cache, δu, idx) + return δu, true, (; δuJᵀJδu = T(NaN)) +end diff --git a/src/descent/geodesic_acceleration.jl b/src/descent/geodesic_acceleration.jl new file mode 100644 index 000000000..fcb1ec83e --- /dev/null +++ b/src/descent/geodesic_acceleration.jl @@ -0,0 +1,133 @@ +""" + GeodesicAcceleration(; descent, finite_diff_step_geodesic, α) + +Uses the `descent` algorithm to compute the velocity and acceleration terms for the +geodesic acceleration method. The velocity and acceleration terms are then combined to +compute the descent direction. + +This method in its current form was developed for [`LevenbergMarquardt`](@ref). Performance +for other methods are not theorectically or experimentally verified. + +### Keyword Arguments + + - `descent`: the descent algorithm to use for computing the velocity and acceleration. 
+ - `finite_diff_step_geodesic`: the step size used for finite differencing used to + calculate the geodesic acceleration. Defaults to `0.1` which means that the step size is + approximately 10% of the first-order step. See Section 3 of [1]. + - `α`: a factor that determines if a step is accepted or rejected. To incorporate + geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is + necessary that acceptable steps meet the condition + ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic + acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a + geodesic path) and `α_geodesic` is some number of order `1`. For most problems + `α_geodesic = 0.75` is a good value but for problems where convergence is difficult + `α_geodesic = 0.1` is an effective choice. Defaults to `0.75`. See Section 3 of + [transtrum2012improvements](@citet). +""" +@concrete struct GeodesicAcceleration <: AbstractDescentAlgorithm + descent + finite_diff_step_geodesic + α +end + +function Base.show(io::IO, alg::GeodesicAcceleration) + print(io, "GeodesicAcceleration(descent = $(alg.descent), finite_diff_step_geodesic = ", + "$(alg.finite_diff_step_geodesic), α = $(alg.α))") +end + +supports_trust_region(::GeodesicAcceleration) = true + +get_linear_solver(alg::GeodesicAcceleration) = get_linear_solver(alg.descent) + +@concrete mutable struct GeodesicAccelerationCache <: AbstractDescentCache + δu + δus + descent_cache + f + p + α + internalnorm + h + Jv + fu_cache + u_cache + last_step_accepted::Bool +end + +function __reinit_internal!(cache::GeodesicAccelerationCache, args...; p = cache.p, + kwargs...) 
+ cache.p = p + cache.last_step_accepted = false +end + +@internal_caches GeodesicAccelerationCache :descent_cache + +get_velocity(cache::GeodesicAccelerationCache) = get_du(cache.descent_cache, Val(1)) +function set_velocity!(cache::GeodesicAccelerationCache, δv) + set_du!(cache.descent_cache, δv, Val(1)) +end +function get_velocity(cache::GeodesicAccelerationCache, ::Val{N}) where {N} + get_du(cache.descent_cache, Val(2N - 1)) +end +function set_velocity!(cache::GeodesicAccelerationCache, δv, ::Val{N}) where {N} + set_du!(cache.descent_cache, δv, Val(2N - 1)) +end +get_acceleration(cache::GeodesicAccelerationCache) = get_du(cache.descent_cache, Val(2)) +function set_acceleration!(cache::GeodesicAccelerationCache, δa) + set_du!(cache.descent_cache, δa, Val(2)) +end +function get_acceleration(cache::GeodesicAccelerationCache, ::Val{N}) where {N} + get_du(cache.descent_cache, Val(2N)) +end +function set_acceleration!(cache::GeodesicAccelerationCache, δa, ::Val{N}) where {N} + set_du!(cache.descent_cache, δa, Val(2N)) +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::GeodesicAcceleration, J, fu, + u; shared::Val{N} = Val(1), pre_inverted::Val{INV} = False, linsolve_kwargs = (;), + abstol = nothing, reltol = nothing, internalnorm::F = DEFAULT_NORM, + kwargs...) where {INV, N, F} + T = promote_type(eltype(u), eltype(fu)) + @bb δu = similar(u) + δus = N ≤ 1 ? nothing : map(2:N) do i + @bb δu_ = similar(u) + end + descent_cache = __internal_init(prob, alg.descent, J, fu, u; shared = Val(N * 2), + pre_inverted, linsolve_kwargs, abstol, reltol, kwargs...) + @bb Jv = similar(fu) + @bb fu_cache = copy(fu) + @bb u_cache = similar(u) + return GeodesicAccelerationCache(δu, δus, descent_cache, prob.f, prob.p, T(alg.α), + internalnorm, T(alg.finite_diff_step_geodesic), Jv, fu_cache, u_cache, false) +end + +function __internal_solve!(cache::GeodesicAccelerationCache, J, fu, u, idx::Val{N} = Val(1); + skip_solve::Bool = false, kwargs...) 
where {N} + a, v, δu = get_acceleration(cache, idx), get_velocity(cache, idx), get_du(cache, idx) + skip_solve && return δu, true, (; a, v) + v, _, _ = __internal_solve!(cache.descent_cache, J, fu, u, Val(2N - 1); skip_solve, + kwargs...) + + @bb @. cache.u_cache = u + cache.h * v + cache.fu_cache = evaluate_f!!(cache.f, cache.fu_cache, cache.u_cache, cache.p) + + J !== nothing && @bb(cache.Jv=J × vec(v)) + Jv = _restructure(cache.fu_cache, cache.Jv) + @bb @. cache.fu_cache = (2 / cache.h) * ((cache.fu_cache - fu) / cache.h - Jv) + + a, _, _ = __internal_solve!(cache.descent_cache, J, cache.fu_cache, u, Val(2N); + skip_solve, kwargs..., reuse_A_if_factorization = true) + + norm_v = cache.internalnorm(v) + norm_a = cache.internalnorm(a) + + if 2 * norm_a ≤ norm_v * cache.α + @bb @. δu = v + a / 2 + set_du!(cache, δu, idx) + cache.last_step_accepted = true + else + cache.last_step_accepted = false + end + + return δu, cache.last_step_accepted, (; a, v) +end diff --git a/src/descent/newton.jl b/src/descent/newton.jl new file mode 100644 index 000000000..c8ba35ed9 --- /dev/null +++ b/src/descent/newton.jl @@ -0,0 +1,111 @@ +""" + NewtonDescent(; linsolve = nothing, precs = DEFAULT_PRECS) + +Compute the descent direction as ``J δu = -fu``. For non-square Jacobian problems, this is +commonly referred to as the Gauss-Newton Descent. + +See also [`Dogleg`](@ref), [`SteepestDescent`](@ref), [`DampedNewtonDescent`](@ref). 
+""" +@kwdef @concrete struct NewtonDescent <: AbstractDescentAlgorithm + linsolve = nothing + precs = DEFAULT_PRECS +end + +function Base.show(io::IO, d::NewtonDescent) + modifiers = String[] + d.linsolve !== nothing && push!(modifiers, "linsolve = $(d.linsolve)") + d.precs !== DEFAULT_PRECS && push!(modifiers, "precs = $(d.precs)") + print(io, "NewtonDescent($(join(modifiers, ", ")))") +end + +supports_line_search(::NewtonDescent) = true + +@concrete mutable struct NewtonDescentCache{pre_inverted, normalform} <: + AbstractDescentCache + δu + δus + lincache + JᵀJ_cache # For normal form else nothing + Jᵀfu_cache + timer +end + +@internal_caches NewtonDescentCache :lincache + +function __internal_init(prob::NonlinearProblem, alg::NewtonDescent, J, fu, u; + shared::Val{N} = Val(1), pre_inverted::Val{INV} = False, linsolve_kwargs = (;), + abstol = nothing, reltol = nothing, timer = get_timer_output(), + kwargs...) where {INV, N} + @bb δu = similar(u) + δus = N ≤ 1 ? nothing : map(2:N) do i + @bb δu_ = similar(u) + end + INV && return NewtonDescentCache{true, false}(δu, δus, nothing, nothing, nothing, timer) + lincache = LinearSolverCache(alg, alg.linsolve, J, _vec(fu), _vec(u); abstol, reltol, + linsolve_kwargs...) + return NewtonDescentCache{false, false}(δu, δus, lincache, nothing, nothing, timer) +end + +function __internal_init(prob::NonlinearLeastSquaresProblem, alg::NewtonDescent, J, fu, u; + pre_inverted::Val{INV} = False, linsolve_kwargs = (;), shared::Val{N} = Val(1), + abstol = nothing, reltol = nothing, timer = get_timer_output(), + kwargs...) where {INV, N} + length(fu) != length(u) && + @assert !INV "Precomputed Inverse for Non-Square Jacobian doesn't make sense." 
+ + normal_form = __needs_square_A(alg.linsolve, u) + if normal_form + JᵀJ = transpose(J) * J + Jᵀfu = transpose(J) * _vec(fu) + A, b = __maybe_symmetric(JᵀJ), Jᵀfu + else + JᵀJ, Jᵀfu = nothing, nothing + A, b = J, _vec(fu) + end + lincache = LinearSolverCache(alg, alg.linsolve, A, b, _vec(u); abstol, reltol, + linsolve_kwargs...) + @bb δu = similar(u) + δus = N ≤ 1 ? nothing : map(2:N) do i + @bb δu_ = similar(u) + end + return NewtonDescentCache{false, normal_form}(δu, δus, lincache, JᵀJ, Jᵀfu, timer) +end + +function __internal_solve!(cache::NewtonDescentCache{INV, false}, J, fu, u, + idx::Val = Val(1); skip_solve::Bool = false, new_jacobian::Bool = true, + kwargs...) where {INV} + δu = get_du(cache, idx) + skip_solve && return δu, true, (;) + if INV + @assert J!==nothing "`J` must be provided when `pre_inverted = Val(true)`." + @bb δu = J × vec(fu) + else + @static_timeit cache.timer "linear solve" begin + δu = cache.lincache(; A = J, b = _vec(fu), kwargs..., linu = _vec(δu), + du = _vec(δu), reuse_A_if_factorization = !new_jacobian || (idx !== Val(1))) + δu = _restructure(get_du(cache, idx), δu) + end + end + @bb @. δu *= -1 + set_du!(cache, δu, idx) + return δu, true, (;) +end + +function __internal_solve!(cache::NewtonDescentCache{false, true}, J, fu, u, + idx::Val = Val(1); skip_solve::Bool = false, new_jacobian::Bool = true, kwargs...) + δu = get_du(cache, idx) + skip_solve && return δu, true, (;) + if idx === Val(1) + @bb cache.JᵀJ_cache = transpose(J) × J + end + @bb cache.Jᵀfu_cache = transpose(J) × fu + @static_timeit cache.timer "linear solve" begin + δu = cache.lincache(; A = __maybe_symmetric(cache.JᵀJ_cache), b = cache.Jᵀfu_cache, + kwargs..., linu = _vec(δu), du = _vec(δu), + reuse_A_if_factorization = !new_jacobian || (idx !== Val(1))) + δu = _restructure(get_du(cache, idx), δu) + end + @bb @. 
δu *= -1 + set_du!(cache, δu, idx) + return δu, true, (;) +end diff --git a/src/descent/steepest.jl b/src/descent/steepest.jl new file mode 100644 index 000000000..d19505a86 --- /dev/null +++ b/src/descent/steepest.jl @@ -0,0 +1,67 @@ +""" + SteepestDescent(; linsolve = nothing, precs = DEFAULT_PRECS) + +Compute the descent direction as ``δu = -Jᵀfu``. The linear solver and preconditioner are +only used if `J` is provided in the inverted form. + +See also [`Dogleg`](@ref), [`NewtonDescent`](@ref), [`DampedNewtonDescent`](@ref). +""" +@kwdef @concrete struct SteepestDescent <: AbstractDescentAlgorithm + linsolve = nothing + precs = DEFAULT_PRECS +end + +function Base.show(io::IO, d::SteepestDescent) + modifiers = String[] + d.linsolve !== nothing && push!(modifiers, "linsolve = $(d.linsolve)") + d.precs !== DEFAULT_PRECS && push!(modifiers, "precs = $(d.precs)") + print(io, "SteepestDescent($(join(modifiers, ", ")))") +end + +supports_line_search(::SteepestDescent) = true + +@concrete mutable struct SteepestDescentCache{pre_inverted} <: AbstractDescentCache + δu + δus + lincache + timer +end + +@internal_caches SteepestDescentCache :lincache + +@inline function __internal_init(prob::AbstractNonlinearProblem, alg::SteepestDescent, J, + fu, u; shared::Val{N} = Val(1), pre_inverted::Val{INV} = False, + linsolve_kwargs = (;), abstol = nothing, reltol = nothing, + timer = get_timer_output(), kwargs...) where {INV, N} + INV && @assert length(fu)==length(u) "Non-Square Jacobian Inverse doesn't make sense." + @bb δu = similar(u) + δus = N ≤ 1 ? nothing : map(2:N) do i + @bb δu_ = similar(u) + end + if INV + lincache = LinearSolverCache(alg, alg.linsolve, transpose(J), _vec(fu), _vec(u); + abstol, reltol, linsolve_kwargs...) + else + lincache = nothing + end + return SteepestDescentCache{INV}(δu, δus, lincache, timer) +end + +function __internal_solve!(cache::SteepestDescentCache{INV}, J, fu, u, idx::Val = Val(1); + new_jacobian::Bool = true, kwargs...) 
where {INV} + δu = get_du(cache, idx) + if INV + A = J === nothing ? nothing : transpose(J) + @static_timeit cache.timer "linear solve" begin + δu = cache.lincache(; A, b = _vec(fu), kwargs..., linu = _vec(δu), + du = _vec(δu), reuse_A_if_factorization = !new_jacobian || idx !== Val(1)) + δu = _restructure(get_du(cache, idx), δu) + end + else + @assert J!==nothing "`J` must be provided when `pre_inverted = Val(false)`." + @bb δu = transpose(J) × vec(fu) + end + @bb @. δu *= -1 + set_du!(cache, δu, idx) + return δu, true, (;) +end diff --git a/src/dfsane.jl b/src/dfsane.jl deleted file mode 100644 index b91e75183..000000000 --- a/src/dfsane.jl +++ /dev/null @@ -1,206 +0,0 @@ -""" - DFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0, M::Int = 10, - γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5, n_exp::Int = 2, - η_strategy::Function = (fn_1, n, x_n, f_n) -> fn_1 / n^2, - max_inner_iterations::Int = 100) - -A low-overhead and allocation-free implementation of the df-sane method for solving large-scale nonlinear -systems of equations. For in depth information about all the parameters and the algorithm, -see the paper [1]. - -### Keyword Arguments - - - `σ_min`: the minimum value of the spectral coefficient `σₙ` which is related to the step - size in the algorithm. Defaults to `1e-10`. - - `σ_max`: the maximum value of the spectral coefficient `σₙ` which is related to the step - size in the algorithm. Defaults to `1e10`. - - `σ_1`: the initial value of the spectral coefficient `σₙ` which is related to the step - size in the algorithm.. Defaults to `1.0`. - - `M`: The monotonicity of the algorithm is determined by a this positive integer. - A value of 1 for `M` would result in strict monotonicity in the decrease of the L2-norm - of the function `f`. However, higher values allow for more flexibility in this reduction. 
- Despite this, the algorithm still ensures global convergence through the use of a - non-monotone line-search algorithm that adheres to the Grippo-Lampariello-Lucidi - condition. Values in the range of 5 to 20 are usually sufficient, but some cases may call - for a higher value of `M`. The default setting is 10. - - `γ`: a parameter that influences if a proposed step will be accepted. Higher value of `γ` - will make the algorithm more restrictive in accepting steps. Defaults to `1e-4`. - - `τ_min`: if a step is rejected the new step size will get multiplied by factor, and this - parameter is the minimum value of that factor. Defaults to `0.1`. - - `τ_max`: if a step is rejected the new step size will get multiplied by factor, and this - parameter is the maximum value of that factor. Defaults to `0.5`. - - `n_exp`: the exponent of the loss, i.e. ``f_n=||F(x_n)||^{n_exp}``. The paper uses - `n_exp ∈ {1,2}`. Defaults to `2`. - - `η_strategy`: function to determine the parameter `η`, which enables growth - of ``||f_n||^2``. Called as ``η = η_strategy(fn_1, n, x_n, f_n)`` with `fn_1` initialized as - ``fn_1=||f(x_1)||^{n_exp}``, `n` is the iteration number, `x_n` is the current `x`-value and - `f_n` the current residual. Should satisfy ``η > 0`` and ``∑ₖ ηₖ < ∞``. Defaults to - ``fn_1 / n^2``. - - `max_inner_iterations`: the maximum number of iterations allowed for the inner loop of the - algorithm. Defaults to `100`. - -### References - -[1] W LaCruz, JM Martinez, and M Raydan (2006), Spectral Residual Method without Gradient -Information for Solving Large-Scale Nonlinear Systems of Equations, Mathematics of -Computation, 75, 1429-1448. 
-""" -@kwdef @concrete struct DFSane <: AbstractNonlinearSolveAlgorithm - σ_min = 1e-10 - σ_max = 1e10 - σ_1 = 1.0 - M::Int = 10 - γ = 1e-4 - τ_min = 0.1 - τ_max = 0.5 - n_exp::Int = 2 - η_strategy = (fn_1, n, x_n, f_n) -> fn_1 / n^2 - max_inner_iterations::Int = 100 -end - -@concrete mutable struct DFSaneCache{iip} <: AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - u_cache_2 - fu - fu_cache - du - history - f_norm - f_norm_0 - M - σ_n - σ_min - σ_max - α_1 - γ - τ_min - τ_max - n_exp::Int - p - force_stop::Bool - maxiters::Int - internalnorm - retcode::SciMLBase.ReturnCode.T - abstol - reltol - prob - stats::NLStats - tc_cache - trace -end - -function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::DFSane, args...; - alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm::F = DEFAULT_NORM, - kwargs...) where {uType, iip, F} - u = __maybe_unaliased(prob.u0, alias_u0) - T = eltype(u) - - @bb du = similar(u) - @bb u_cache = copy(u) - @bb u_cache_2 = similar(u) - - fu = evaluate_f(prob, u) - @bb fu_cache = copy(fu) - - f_norm = internalnorm(fu)^alg.n_exp - f_norm_0 = f_norm - - history = fill(f_norm, alg.M) - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u_cache, - termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu, nothing, du; kwargs...) - - return DFSaneCache{iip}(prob.f, alg, u, u_cache, u_cache_2, fu, fu_cache, du, history, - f_norm, f_norm_0, alg.M, T(alg.σ_1), T(alg.σ_min), T(alg.σ_max), one(T), T(alg.γ), - T(alg.τ_min), T(alg.τ_max), alg.n_exp, prob.p, false, maxiters, internalnorm, - ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) -end - -function perform_step!(cache::DFSaneCache{iip}) where {iip} - @unpack alg, f_norm, σ_n, σ_min, σ_max, α_1, γ, τ_min, τ_max, n_exp, M, prob = cache - T = eltype(cache.u) - f_norm_old = f_norm - - # Line search direction - @bb @. 
cache.du = -σ_n * cache.fu - - η = alg.η_strategy(cache.f_norm_0, cache.stats.nsteps + 1, cache.u, cache.fu) - - f_bar = maximum(cache.history) - α₊ = α_1 - α₋ = α_1 - - @bb @. cache.u_cache_2 = cache.u + α₊ * cache.du - evaluate_f(cache, cache.u_cache_2, cache.p) - f_norm = cache.internalnorm(cache.fu)^n_exp - α = -α₊ - - inner_converged = false - for k in 1:(cache.alg.max_inner_iterations) - if f_norm ≤ f_bar + η - γ * α₊^2 * f_norm_old - α = -α₊ - inner_converged = true - break - end - - α₊ = α₊ * clamp(α₊ * f_norm_old / (f_norm + (T(2) * α₊ - T(1)) * f_norm_old), - τ_min, τ_max) - @bb @. cache.u_cache_2 = cache.u - α₋ * cache.du - evaluate_f(cache, cache.u_cache_2, cache.p) - f_norm = cache.internalnorm(cache.fu)^n_exp - - if f_norm ≤ f_bar + η - γ * α₋^2 * f_norm_old - α = α₋ - inner_converged = true - break - end - - α₋ = α₋ * clamp(α₋ * f_norm_old / (f_norm + (T(2) * α₋ - T(1)) * f_norm_old), - τ_min, τ_max) - @bb @. cache.u_cache_2 = cache.u + α₊ * cache.du - evaluate_f(cache, cache.u_cache_2, cache.p) - f_norm = cache.internalnorm(cache.fu)^n_exp - end - - if !inner_converged - cache.retcode = ReturnCode.ConvergenceFailure - cache.force_stop = true - end - - @bb copyto!(cache.u, cache.u_cache_2) - - update_trace!(cache, α) - check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - - # Update spectral parameter - @bb @. cache.u_cache = cache.u - cache.u_cache - @bb @. cache.fu_cache = cache.fu - cache.fu_cache - - cache.σ_n = dot(cache.u_cache, cache.u_cache) / dot(cache.fu_cache, cache.u_cache) - - # Spectral parameter bounds check - if !(σ_min ≤ abs(cache.σ_n) ≤ σ_max) - test_norm = dot(cache.fu, cache.fu) - cache.σ_n = clamp(inv(test_norm), T(1), T(1e5)) - end - - # Take step - @bb copyto!(cache.u_cache, cache.u) - @bb copyto!(cache.fu_cache, cache.fu) - cache.f_norm = f_norm - - # Update history - cache.history[cache.stats.nsteps % M + 1] = f_norm - return nothing -end - -function __reinit_internal!(cache::DFSaneCache; kwargs...) 
- cache.f_norm = cache.internalnorm(cache.fu)^cache.n_exp - cache.f_norm_0 = cache.f_norm - return -end diff --git a/src/function_wrappers.jl b/src/function_wrappers.jl deleted file mode 100644 index 599127f39..000000000 --- a/src/function_wrappers.jl +++ /dev/null @@ -1,188 +0,0 @@ -# NonlinearSolve can handle all NonlinearFunction specifications but that is not true for -# downstream packages. Make conversion to those easier. -function __construct_f(prob; alias_u0::Bool = false, can_handle_oop::Val{OOP} = Val(false), - can_handle_scalar::Val{SCALAR} = Val(false), make_fixed_point::Val{FP} = Val(false), - can_handle_arbitrary_dims::Val{DIMS} = Val(false), - force_oop::Val{FOOP} = Val(false)) where {SCALAR, OOP, DIMS, FP, FOOP} - if !OOP && SCALAR - error("Incorrect Specification: OOP not supported but scalar supported.") - end - - resid = evaluate_f(prob, prob.u0) - - if SCALAR || !(prob.u0 isa Number) - u0 = __maybe_unaliased(prob.u0, alias_u0) - else - u0 = [prob.u0] - end - - f = if FP - if isinplace(prob) - @closure (du, u, p) -> begin - prob.f(du, u, p) - @. 
du += u - end - else - @closure (u, p) -> prob.f(u, p) .+ u - end - else - prob.f - end - - ff = if isinplace(prob) - ninputs = 2 - if DIMS || u0 isa AbstractVector - @closure (du, u) -> (f(du, u, prob.p); du) - else - u0_size = size(u0) - du_size = size(resid) - @closure (du, u) -> (f(reshape(du, du_size), reshape(u, u0_size), prob.p); du) - end - else - if prob.u0 isa Number - if SCALAR - ninputs = 1 - @closure (u) -> f(u, prob.p) - elseif OOP - ninputs = 1 - @closure (u) -> [f(first(u), prob.p)] - else - ninputs = 2 - resid = [resid] - @closure (du, u) -> (du[1] = f(first(u), prob.p); du) - end - else - if OOP - ninputs = 1 - if DIMS - @closure (u) -> f(u, prob.p) - else - u0_size = size(u0) - @closure (u) -> _vec(f(reshape(u, u0_size), prob.p)) - end - else - ninputs = 2 - if DIMS - @closure (du, u) -> (copyto!(du, f(u, prob.p)); du) - else - u0_size = size(u0) - @closure (du, u) -> begin - copyto!(vec(du), vec(f(reshape(u, u0_size), prob.p))) - return du - end - end - end - end - end - - f_final = if FOOP - if ninputs == 1 - ff - else - du_ = DIMS ? 
similar(resid) : _vec(similar(resid)) - @closure (u) -> (ff(du_, u); du_) - end - else - ff - end - - return f_final, ifelse(DIMS, u0, _vec(u0)) -end - -function __construct_jac(prob, alg, u0; can_handle_oop::Val{OOP} = Val(false), - can_handle_scalar::Val{SCALAR} = Val(false), - can_handle_arbitrary_dims::Val{DIMS} = Val(false)) where {SCALAR, OOP, DIMS} - if SciMLBase.has_jac(prob.f) - jac = prob.f.jac - - jac_final = if isinplace(prob) - if DIMS || u0 isa AbstractVector - @closure (J, u) -> (jac(reshape(J, :, length(u)), u, prob.p); J) - else - u0_size = size(u0) - @closure (J, u) -> (jac(reshape(J, :, length(u)), reshape(u, u0_size), - prob.p); - J) - end - else - if prob.u0 isa Number - if SCALAR - @closure (u) -> jac(u, prob.p) - elseif OOP - @closure (u) -> [jac(first(u), prob.p)] - else - @closure (J, u) -> (J[1] = jac(first(u), prob.p); J) - end - else - if OOP - if DIMS - @closure (u) -> jac(u, prob.p) - else - u0_size = size(u0) - @closure (u) -> jac(reshape(u, u0_size), prob.p) - end - else - if DIMS - @closure (J, u) -> (copyto!(J, jac(u, prob.p)); J) - else - u0_size = size(u0) - @closure (J, u) -> begin - copyto!(J, jac(reshape(u, u0_size), prob.p)) - return J - end - end - end - end - end - - return jac_final - end - - hasfield(typeof(alg), :ad) || return nothing - - uf, _, J, fu, jac_cache, _, _, _ = jacobian_caches(alg, prob.f, u0, prob.p, - Val{isinplace(prob)}(); lininit = Val(false), linsolve_with_JᵀJ = Val(false)) - stats = SciMLBase.NLStats(0, 0, 0, 0, 0) - return JacobianFunctionCache{isinplace(prob)}(J, prob.f, uf, u0, prob.p, jac_cache, - alg, fu, stats) -end - -# Currently used only in some of the extensions. 
Plan is to eventually use it in all the -# native algorithms and other extensions to provide better jacobian support -@concrete struct JacobianFunctionCache{iip, U, P} <: Function - J - f - uf - u::U - p::P - jac_cache - alg - fu_cache - stats -end - -SciMLBase.isinplace(::JacobianFunctionCache{iip}) where {iip} = iip - -function (jac_cache::JacobianFunctionCache{iip, U, P})(J::AbstractMatrix, u::U, - p::P = jac_cache.p) where {iip, U, P} - jacobian!!(J, jac_cache; u, p) - return J -end -function (jac_cache::JacobianFunctionCache{iip, U, P})(u::U, p::P) where {iip, U, P} - return jacobian!!(cache.J, jac_cache; u, p) -end - -@concrete struct InplaceFunction{iip} <: Function - f - p -end - -(f::InplaceFunction{true})(du, u) = f.f(du, u, f.p) -(f::InplaceFunction{true})(du, u, p) = f.f(du, u, p) -(f::InplaceFunction{false})(du, u) = (du .= f.f(u, f.p)) -(f::InplaceFunction{false})(du, u, p) = (du .= f.f(u, p)) - -struct __make_inplace{iip} end - -@inline __make_inplace{iip}(f::F, p) where {iip, F} = InplaceFunction{iip}(f, p) -@inline __make_inplace{iip}(::Nothing, p) where {iip} = nothing diff --git a/src/gaussnewton.jl b/src/gaussnewton.jl deleted file mode 100644 index c7e99c912..000000000 --- a/src/gaussnewton.jl +++ /dev/null @@ -1,160 +0,0 @@ -""" - GaussNewton(; concrete_jac = nothing, linsolve = nothing, linesearch = nothing, - precs = DEFAULT_PRECS, adkwargs...) - -An advanced GaussNewton implementation with support for efficient handling of sparse -matrices via colored automatic differentiation and preconditioned linear solvers. Designed -for large-scale and numerically-difficult nonlinear least squares problems. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing` which means that a default is selected according to the problem specification! - Valid choices are types from ADTypes.jl. 
- - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref), - which means that no line search is performed. Algorithms from `LineSearches.jl` can be - used here directly, and they will be converted to the correct `LineSearch`. - - `vjp_autodiff`: Automatic Differentiation Backend used for vector-jacobian products. - This is applicable if the linear solver doesn't require a concrete jacobian, for eg., - Krylov Methods. Defaults to `nothing`, which means if the problem is out of place and - `Zygote` is loaded then, we use `AutoZygote`. In all other, cases `FiniteDiff` is used. 
-""" -@concrete struct GaussNewton{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - linsolve - precs - linesearch - vjp_autodiff -end - -function set_ad(alg::GaussNewton{CJ}, ad) where {CJ} - return GaussNewton{CJ}(ad, alg.linsolve, alg.precs, alg.linesearch, alg.vjp_autodiff) -end - -function GaussNewton(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, - linesearch = nothing, vjp_autodiff = nothing, autodiff = nothing) - linesearch = linesearch isa LineSearch ? linesearch : LineSearch(; method = linesearch) - return GaussNewton{_unwrap_val(concrete_jac)}(autodiff, linsolve, precs, linesearch, - vjp_autodiff) -end - -@concrete mutable struct GaussNewtonCache{iip} <: AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - fu - fu_cache - du - dfu - p - uf - linsolve - J - JᵀJ - Jᵀf - jac_cache - force_stop - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - prob - stats::NLStats - tc_cache_1 - tc_cache_2 - ls_cache - trace -end - -function SciMLBase.__init(prob::NonlinearLeastSquaresProblem{uType, iip}, alg_::GaussNewton, - args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm::F = DEFAULT_NORM, - kwargs...) where {uType, iip, F} - alg = get_concrete_algorithm(alg_, prob) - @unpack f, u0, p = prob - - u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - - uf, linsolve, J, fu_cache, jac_cache, du, JᵀJ, Jᵀf = jacobian_caches(alg, f, u, p, - Val(iip); linsolve_with_JᵀJ = Val(__needs_square_A(alg, u))) - - abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - _, _, tc_cache_2 = init_termination_cache(abstol, reltol, fu, u, termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) 
- - @bb u_cache = copy(u) - @bb dfu = copy(fu) - - return GaussNewtonCache{iip}(f, alg, u, u_cache, fu, fu_cache, du, dfu, p, uf, - linsolve, J, JᵀJ, Jᵀf, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, - abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, - init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), trace) -end - -function perform_step!(cache::GaussNewtonCache{iip}) where {iip} - cache.J = jacobian!!(cache.J, cache) - - # Use normal form to solve the Linear Problem - if cache.JᵀJ !== nothing - __update_JᵀJ!(cache) - __update_Jᵀf!(cache) - A, b = __maybe_symmetric(cache.JᵀJ), _vec(cache.Jᵀf) - else - A, b = cache.J, _vec(cache.fu) - end - - linres = dolinsolve(cache, cache.alg.precs, cache.linsolve; A, b, linu = _vec(cache.du), - cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - cache.du = _restructure(cache.du, linres.u) - - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - @bb axpy!(-α, cache.du, cache.u) - evaluate_f(cache, cache.u, cache.p) - update_trace!(cache, α) - - check_and_update!(cache.tc_cache_1, cache, cache.fu, cache.u, cache.u_cache) - if !cache.force_stop - @bb @. cache.dfu = cache.fu .- cache.dfu - check_and_update!(cache.tc_cache_2, cache, cache.dfu, cache.u, cache.u_cache) - end - - @bb copyto!(cache.u_cache, cache.u) - @bb copyto!(cache.dfu, cache.fu) - - return nothing -end - -# FIXME: Reinit `JᵀJ` operator if `p` is changed -function __reinit_internal!(cache::GaussNewtonCache; - termination_condition = get_termination_mode(cache.tc_cache_1), kwargs...) 
- abstol, reltol, tc_cache_1 = init_termination_cache(cache.abstol, cache.reltol, - cache.fu, cache.u, termination_condition) - _, _, tc_cache_2 = init_termination_cache(cache.abstol, cache.reltol, cache.fu, - cache.u, termination_condition) - - cache.tc_cache_1 = tc_cache_1 - cache.tc_cache_2 = tc_cache_2 - cache.abstol = abstol - cache.reltol = reltol - return nothing -end diff --git a/src/globalization/line_search.jl b/src/globalization/line_search.jl new file mode 100644 index 000000000..73f88dc4e --- /dev/null +++ b/src/globalization/line_search.jl @@ -0,0 +1,372 @@ +""" + NoLineSearch <: AbstractNonlinearSolveLineSearchAlgorithm + +Don't perform a line search. Just return the initial step length of `1`. +""" +struct NoLineSearch <: AbstractNonlinearSolveLineSearchAlgorithm end + +@concrete mutable struct NoLineSearchCache <: AbstractNonlinearSolveLineSearchCache + α +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::NoLineSearch, f::F, fu, u, + p, args...; kwargs...) where {F} + return NoLineSearchCache(promote_type(eltype(fu), eltype(u))(true)) +end + +reinit_cache!(cache::NoLineSearchCache, args...; p = cache.p, kwargs...) = nothing + +__internal_solve!(cache::NoLineSearchCache, u, du) = false, cache.α + +""" + LineSearchesJL(; method = LineSearches.Static(), autodiff = nothing, α = true) + +Wrapper over algorithms from +[LineSearches.jl](https://github.com/JuliaNLSolvers/LineSearches.jl/). Allows automatic +construction of the objective functions for the line search algorithms utilizing automatic +differentiation for fast Vector Jacobian Products. + +### Arguments + + - `method`: the line search algorithm to use. Defaults to + `method = LineSearches.Static()`, which means that the step size is fixed to the value + of `alpha`. + - `autodiff`: the automatic differentiation backend to use for the line search. Using a + reverse mode automatic differentiation backend if recommended. + - `α`: the initial step size to use. 
Defaults to `true` (which is equivalent to `1`). +""" +@concrete struct LineSearchesJL <: AbstractNonlinearSolveLineSearchAlgorithm + method + initial_alpha + autodiff +end + +function Base.show(io::IO, alg::LineSearchesJL) + str = "$(nameof(typeof(alg)))(" + modifiers = String[] + __is_present(alg.autodiff) && + push!(modifiers, "autodiff = $(nameof(typeof(alg.autodiff)))()") + alg.initial_alpha != true && push!(modifiers, "initial_alpha = $(alg.initial_alpha)") + push!(modifiers, "method = $(nameof(typeof(alg.method)))()") + print(io, str, join(modifiers, ", "), ")") +end + +LineSearchesJL(method; kwargs...) = LineSearchesJL(; method, kwargs...) +function LineSearchesJL(; method = LineSearches.Static(), autodiff = nothing, α = true) + if method isa AbstractNonlinearSolveLineSearchAlgorithm + Base.depwarn("Passing a native NonlinearSolve line search algorithm to \ + `LineSearchesJL` or `LineSearch` is deprecated. Pass the method \ + directly instead.", :LineSearchesJL) + return method + end + return LineSearchesJL(method, α, autodiff) +end + +Base.@deprecate_binding LineSearch LineSearchesJL true + +# Wrapper over LineSearches.jl algorithms +@concrete mutable struct LineSearchesJLCache <: AbstractNonlinearSolveLineSearchCache + f + p + ϕ + dϕ + ϕdϕ + method + alpha + grad_op + u_cache + fu_cache + nf::Base.RefValue{Int} +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::LineSearchesJL, f::F, fu, u, + p, args...; internalnorm::IN = DEFAULT_NORM, kwargs...) 
where {F, IN} + T = promote_type(eltype(fu), eltype(u)) + if u isa Number + grad_op = @closure (u, fu, p) -> last(__value_derivative(Base.Fix2(f, p), u)) * fu + else + if SciMLBase.has_jvp(f) + if isinplace(prob) + g_cache = similar(u) + grad_op = @closure (u, fu, p) -> f.vjp(g_cache, fu, u, p) + else + grad_op = @closure (u, fu, p) -> f.vjp(fu, u, p) + end + else + autodiff = get_concrete_reverse_ad(alg.autodiff, prob; + check_forward_mode = true) + vjp_op = VecJacOperator(prob, fu, u; autodiff) + if isinplace(prob) + g_cache = similar(u) + grad_op = @closure (u, fu, p) -> vjp_op(g_cache, fu, u, p) + else + grad_op = @closure (u, fu, p) -> vjp_op(fu, u, p) + end + end + end + + @bb u_cache = similar(u) + @bb fu_cache = similar(fu) + nf = Base.RefValue(0) + + ϕ = @closure (f, p, u, du, α, u_cache, fu_cache) -> begin + @bb @. u_cache = u + α * du + fu_cache = evaluate_f!!(f, fu_cache, u_cache, p) + nf[] += 1 + return @fastmath internalnorm(fu_cache)^2 / 2 + end + + dϕ = @closure (f, p, u, du, α, u_cache, fu_cache, grad_op) -> begin + @bb @. u_cache = u + α * du + fu_cache = evaluate_f!!(f, fu_cache, u_cache, p) + nf[] += 1 + g₀ = grad_op(u_cache, fu_cache, p) + return dot(g₀, du) + end + + ϕdϕ = @closure (f, p, u, du, α, u_cache, fu_cache, grad_op) -> begin + @bb @. u_cache = u + α * du + fu_cache = evaluate_f!!(f, fu_cache, u_cache, p) + nf[] += 1 + g₀ = grad_op(u_cache, fu_cache, p) + obj = @fastmath internalnorm(fu_cache)^2 / 2 + return obj, dot(g₀, du) + end + + return LineSearchesJLCache(f, p, ϕ, dϕ, ϕdϕ, alg.method, T(alg.initial_alpha), grad_op, + u_cache, fu_cache, nf) +end + +function __internal_solve!(cache::LineSearchesJLCache, u, du; kwargs...) 
+ ϕ = @closure α -> cache.ϕ(cache.f, cache.p, u, du, α, cache.u_cache, cache.fu_cache) + dϕ = @closure α -> cache.dϕ(cache.f, cache.p, u, du, α, cache.u_cache, cache.fu_cache, + cache.grad_op) + ϕdϕ = @closure α -> cache.ϕdϕ(cache.f, cache.p, u, du, α, cache.u_cache, cache.fu_cache, + cache.grad_op) + + ϕ₀, dϕ₀ = ϕdϕ(zero(eltype(u))) + + # Here we should be resetting the search direction for some algorithms especially + # if we start mixing in jacobian reuse and such + dϕ₀ ≥ 0 && return (true, one(eltype(u))) + + # We can technically reduce 1 axpy by reusing the returned value from cache.method + # but it's not worth the extra complexity + cache.alpha = first(cache.method(ϕ, dϕ, ϕdϕ, cache.alpha, ϕ₀, dϕ₀)) + return (false, cache.alpha) +end + +""" + RobustNonMonotoneLineSearch(; gamma = 1 // 10000, sigma_0 = 1, M::Int = 10, + tau_min = 1 // 10, tau_max = 1 // 2, n_exp::Int = 2, maxiters::Int = 100, + η_strategy = (fn₁, n, uₙ, fₙ) -> fn₁ / n^2) + +Robust NonMonotone Line Search is a derivative free line search method from DF Sane +[la2006spectral](@cite). + +### Keyword Arguments + + - `M`: The monotonicity of the algorithm is determined by a this positive integer. + A value of 1 for `M` would result in strict monotonicity in the decrease of the L2-norm + of the function `f`. However, higher values allow for more flexibility in this reduction. + Despite this, the algorithm still ensures global convergence through the use of a + non-monotone line-search algorithm that adheres to the Grippo-Lampariello-Lucidi + condition. Values in the range of 5 to 20 are usually sufficient, but some cases may + call for a higher value of `M`. The default setting is 10. + - `gamma`: a parameter that influences if a proposed step will be accepted. Higher value + of `gamma` will make the algorithm more restrictive in accepting steps. Defaults to + `1e-4`. 
+ - `tau_min`: if a step is rejected the new step size will get multiplied by factor, and + this parameter is the minimum value of that factor. Defaults to `0.1`. + - `tau_max`: if a step is rejected the new step size will get multiplied by factor, and + this parameter is the maximum value of that factor. Defaults to `0.5`. + - `n_exp`: the exponent of the loss, i.e. ``f_n=||F(x_n)||^{n\\_exp}``. The paper uses + `n_exp ∈ {1, 2}`. Defaults to `2`. + - `η_strategy`: function to determine the parameter `η`, which enables growth + of ``||f_n||^2``. Called as `η = η_strategy(fn_1, n, x_n, f_n)` with `fn_1` initialized + as ``fn_1=||f(x_1)||^{n\\_exp}``, `n` is the iteration number, `x_n` is the current + `x`-value and `f_n` the current residual. Should satisfy ``η > 0`` and ``∑ₖ ηₖ < ∞``. + Defaults to ``fn_1 / n^2``. + - `maxiters`: the maximum number of iterations allowed for the inner loop of the + algorithm. Defaults to `100`. +""" +@kwdef @concrete struct RobustNonMonotoneLineSearch <: + AbstractNonlinearSolveLineSearchAlgorithm + gamma = 1 // 10000 + sigma_1 = 1 + M::Int = 10 + tau_min = 1 // 10 + tau_max = 1 // 2 + n_exp::Int = 2 + maxiters::Int = 100 + η_strategy = (fn₁, n, uₙ, fₙ) -> fn₁ / n^2 +end + +@concrete mutable struct RobustNonMonotoneLineSearchCache <: + AbstractNonlinearSolveLineSearchCache + f + p + ϕ + u_cache + fu_cache + internalnorm + maxiters::Int + history + γ + σ₁ + M::Int + τ_min + τ_max + nsteps::Int + η_strategy + n_exp::Int + nf::Base.RefValue{Int} +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::RobustNonMonotoneLineSearch, + f::F, fu, u, p, args...; internalnorm::IN = DEFAULT_NORM, kwargs...) where {F, IN} + @bb u_cache = similar(u) + @bb fu_cache = similar(fu) + T = promote_type(eltype(fu), eltype(u)) + + nf = Base.RefValue(0) + ϕ = @closure (f, p, u, du, α, u_cache, fu_cache) -> begin + @bb @. 
u_cache = u + α * du + fu_cache = evaluate_f!!(f, fu_cache, u_cache, p) + nf[] += 1 + return internalnorm(fu_cache)^alg.n_exp + end + + fn₁ = internalnorm(fu)^alg.n_exp + η_strategy = @closure (n, xₙ, fₙ) -> alg.η_strategy(fn₁, n, xₙ, fₙ) + + return RobustNonMonotoneLineSearchCache(f, p, ϕ, u_cache, fu_cache, internalnorm, + alg.maxiters, fill(fn₁, alg.M), T(alg.gamma), T(alg.sigma_1), alg.M, T(alg.tau_min), + T(alg.tau_max), 0, η_strategy, alg.n_exp, nf) +end + +function __internal_solve!(cache::RobustNonMonotoneLineSearchCache, u, du; kwargs...) + T = promote_type(eltype(u), eltype(du)) + ϕ = @closure α -> cache.ϕ(cache.f, cache.p, u, du, α, cache.u_cache, cache.fu_cache) + f_norm_old = ϕ(eltype(u)(0)) + α₊, α₋ = T(cache.σ₁), T(cache.σ₁) + η = cache.η_strategy(cache.nsteps, u, f_norm_old) + f_bar = maximum(cache.history) + + for k in 1:(cache.maxiters) + f_norm = ϕ(α₊) + f_norm ≤ f_bar + η - cache.γ * α₊ * f_norm_old && return (false, α₊) + + α₊ *= clamp(α₊ * f_norm_old / (f_norm + (T(2) * α₊ - T(1)) * f_norm_old), + cache.τ_min, cache.τ_max) + + f_norm = ϕ(-α₋) + f_norm ≤ f_bar + η - cache.γ * α₋ * f_norm_old && return (false, -α₋) + + α₋ *= clamp(α₋ * f_norm_old / (f_norm + (T(2) * α₋ - T(1)) * f_norm_old), + cache.τ_min, cache.τ_max) + end + + return true, T(cache.σ₁) +end + +function callback_into_cache!(topcache, cache::RobustNonMonotoneLineSearchCache, args...) + fu = get_fu(topcache) + cache.history[mod1(cache.nsteps, cache.M)] = cache.internalnorm(fu)^cache.n_exp + cache.nsteps += 1 + return +end + +""" + LiFukushimaLineSearch(; lambda_0 = 1, beta = 1 // 2, sigma_1 = 1 // 1000, + sigma_2 = 1 // 1000, eta = 1 // 10, nan_max_iter::Int = 5, maxiters::Int = 100) + +A derivative-free line search and global convergence of Broyden-like method for nonlinear +equations [li2000derivative](@cite). 
+""" +@kwdef @concrete struct LiFukushimaLineSearch <: AbstractNonlinearSolveLineSearchAlgorithm + lambda_0 = 1 + beta = 1 // 2 + sigma_1 = 1 // 1000 + sigma_2 = 1 // 1000 + eta = 1 // 10 + rho = 9 // 10 + nan_max_iter::Int = 5 # TODO (breaking): Change this to nan_maxiters for uniformity + maxiters::Int = 100 +end + +@concrete mutable struct LiFukushimaLineSearchCache <: AbstractNonlinearSolveLineSearchCache + ϕ + f + p + internalnorm + u_cache + fu_cache + λ₀ + β + σ₁ + σ₂ + η + ρ + α + nan_maxiters::Int + maxiters::Int + nf::Base.RefValue{Int} +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::LiFukushimaLineSearch, + f::F, fu, u, p, args...; internalnorm::IN = DEFAULT_NORM, kwargs...) where {F, IN} + @bb u_cache = similar(u) + @bb fu_cache = similar(fu) + T = promote_type(eltype(fu), eltype(u)) + + nf = Base.RefValue(0) + ϕ = @closure (f, p, u, du, α, u_cache, fu_cache) -> begin + @bb @. u_cache = u + α * du + fu_cache = evaluate_f!!(f, fu_cache, u_cache, p) + nf[] += 1 + return internalnorm(fu_cache) + end + + return LiFukushimaLineSearchCache(ϕ, f, p, internalnorm, u_cache, fu_cache, + T(alg.lambda_0), T(alg.beta), T(alg.sigma_1), T(alg.sigma_2), T(alg.eta), + T(alg.rho), T(true), alg.nan_max_iter, alg.maxiters, nf) +end + +function __internal_solve!(cache::LiFukushimaLineSearchCache, u, du; kwargs...) + T = promote_type(eltype(u), eltype(du)) + ϕ = @closure α -> cache.ϕ(cache.f, cache.p, u, du, α, cache.u_cache, cache.fu_cache) + + fx_norm = ϕ(T(0)) + + # Non-Blocking exit if the norm is NaN or Inf + !isfinite(fx_norm) && return (true, cache.α) + + # Early Terminate based on Eq. 
2.7 + du_norm = cache.internalnorm(du) + fxλ_norm = ϕ(cache.α) + fxλ_norm ≤ cache.ρ * fx_norm - cache.σ₂ * du_norm^2 && return (false, cache.α) + + λ₂, λ₁ = cache.λ₀, cache.λ₀ + fxλp_norm = ϕ(λ₂) + + if !isfinite(fxλp_norm) + nan_converged = false + for _ in 1:(cache.nan_maxiters) + λ₁, λ₂ = λ₂, cache.β * λ₂ + fxλp_norm = ϕ(λ₂) + nan_converged = isfinite(fxλp_norm) + nan_converged && break + end + nan_converged || return (true, cache.α) + end + + for i in 1:(cache.maxiters) + fxλp_norm = ϕ(λ₂) + converged = fxλp_norm ≤ (1 + cache.η) * fx_norm - cache.σ₁ * λ₂^2 * du_norm^2 + converged && return (false, λ₂) + λ₁, λ₂ = λ₂, cache.β * λ₂ + end + + return true, cache.α +end diff --git a/src/globalization/trust_region.jl b/src/globalization/trust_region.jl new file mode 100644 index 000000000..4e3b2f387 --- /dev/null +++ b/src/globalization/trust_region.jl @@ -0,0 +1,546 @@ +""" + LevenbergMarquardtTrustRegion(b_uphill) + +Trust Region method for [`LevenbergMarquardt`](@ref). This method is tightly coupled with +the Levenberg-Marquardt method and works by directly updating the damping parameter instead +of specifying a trust region radius. + +### Arguments + + - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard + choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost + and reject all steps that increase the cost. Although this is a natural and safe choice, + it is often not the most efficient. Therefore downhill moves are always accepted, but + uphill moves are only conditionally accepted. To decide whether an uphill move will be + accepted at each iteration ``i``, we compute + ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle + between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted + step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. 
To + specify, uphill moves are accepted if + ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at + iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with + `b_uphill = 2.0` allowing higher uphill moves than `b_uphill = 1.0`. When + `b_uphill = 0.0`, no uphill moves will be accepted. Defaults to `1.0`. See Section 4 of + [transtrum2012improvements](@citet). +""" +@concrete struct LevenbergMarquardtTrustRegion <: AbstractTrustRegionMethod + β_uphill +end + +function Base.show(io::IO, alg::LevenbergMarquardtTrustRegion) + print(io, "LevenbergMarquardtTrustRegion(β_uphill = $(alg.β_uphill))") +end + +@concrete mutable struct LevenbergMarquardtTrustRegionCache <: + AbstractTrustRegionMethodCache + f + p + loss_old + v_cache + norm_v_old + internalnorm + β_uphill + last_step_accepted::Bool + u_cache + fu_cache + nf::Int +end + +function reinit_cache!(cache::LevenbergMarquardtTrustRegionCache, args...; p = cache.p, + u0 = cache.v_cache, kwargs...) + cache.p = p + @bb copyto!(cache.v_cache, u0) + cache.loss_old = oftype(cache.loss_old, Inf) + cache.norm_v_old = oftype(cache.norm_v_old, Inf) + cache.last_step_accepted = false + cache.nf = 0 +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::LevenbergMarquardtTrustRegion, + f::F, fu, u, p, args...; internalnorm::IF = DEFAULT_NORM, kwargs...) where {F, IF} + T = promote_type(eltype(u), eltype(fu)) + @bb v = copy(u) + @bb u_cache = similar(u) + @bb fu_cache = similar(fu) + return LevenbergMarquardtTrustRegionCache(f, p, T(Inf), v, T(Inf), internalnorm, + alg.β_uphill, false, u_cache, fu_cache, 0) +end + +function __internal_solve!(cache::LevenbergMarquardtTrustRegionCache, J, fu, u, δu, + descent_stats) + # This should be true if Geodesic Acceleration is being used + v = hasfield(typeof(descent_stats), :v) ? descent_stats.v : δu + norm_v = cache.internalnorm(v) + β = dot(v, cache.v_cache) / (norm_v * cache.norm_v_old) + + @bb @. 
cache.u_cache = u + δu + cache.fu_cache = evaluate_f!!(cache.f, cache.fu_cache, cache.u_cache, cache.p) + cache.nf += 1 + + loss = cache.internalnorm(cache.fu_cache) + + if (1 - β)^cache.β_uphill * loss ≤ cache.loss_old # Accept Step + cache.last_step_accepted = true + cache.norm_v_old = norm_v + @bb copyto!(cache.v_cache, v) + else + cache.last_step_accepted = false + end + + return cache.last_step_accepted, cache.u_cache, cache.fu_cache +end + +# Don't Pollute the namespace +""" + RadiusUpdateSchemes + +`RadiusUpdateSchemes` is provides different types of radius update schemes implemented in +the Trust Region method. These schemes specify how the radius of the so-called trust region +is updated after each iteration of the algorithm. The specific role and caveats associated +with each scheme are provided below. + +## Using `RadiusUpdateSchemes` + +Simply put the desired scheme as follows: +`sol = solve(prob, alg = TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))`. +""" +module RadiusUpdateSchemes +# The weird definitions here are needed to main compatibility with the older enum variants + +abstract type AbstractRadiusUpdateScheme end + +function Base.show(io::IO, rus::AbstractRadiusUpdateScheme) + print(io, "RadiusUpdateSchemes.$(string(nameof(typeof(rus)))[3:end])") +end + +const T = AbstractRadiusUpdateScheme + +struct __Simple <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.Simple + +The simple or conventional radius update scheme. This scheme is chosen by default and +follows the conventional approach to update the trust region radius, i.e. if the trial +step is accepted it increases the radius by a fixed factor (bounded by a maximum radius) +and if the trial step is rejected, it shrinks the radius by a fixed factor. 
+""" +const Simple = __Simple() + +struct __NLsolve <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.NLsolve + +The same updating scheme as in NLsolve's (https://github.com/JuliaNLSolvers/NLsolve.jl) +trust region dogleg implementation. +""" +const NLsolve = __NLsolve() + +struct __NocedalWright <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.NocedalWright + +Trust region updating scheme as in Nocedal and Wright [see Alg 11.5, page 291]. +""" +const NocedalWright = __NocedalWright() + +struct __Hei <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.Hei + +This scheme is proposed in [hei2003self](@citet). The trust region radius depends on the +size (norm) of the current step size. The hypothesis is to let the radius converge to zero +as the iterations progress, which is more reliable and robust for ill-conditioned as well +as degenerate problems. +""" +const Hei = __Hei() + +struct __Yuan <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.Yuan + +This scheme is proposed by [yuan2015recent](@citet). Similar to Hei's scheme, the +trust region is updated in a way so that it converges to zero, however here, the radius +depends on the size (norm) of the current gradient of the objective (merit) function. The +hypothesis is that the step size is bounded by the gradient size, so it makes sense to let +the radius depend on the gradient. +""" +const Yuan = __Yuan() + +struct __Bastin <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.Bastin + +This scheme is proposed by [bastin2010retrospective](@citet). The scheme is called a +retrospective update scheme as it uses the model function at the current iteration to +compute the ratio of the actual reduction and the predicted reduction in the previous trial +step, and use this ratio to update the trust region radius. 
The hypothesis is to exploit the +information made available during the optimization process in order to vary the accuracy +of the objective function computation. +""" +const Bastin = __Bastin() + +struct __Fan <: AbstractRadiusUpdateScheme end +""" + RadiusUpdateSchemes.Fan + +This scheme is proposed by [fan2006convergence](@citet). It is very much similar to Hei's +and Yuan's schemes as it lets the trust region radius depend on the current size (norm) of +the objective (merit) function itself. These new update schemes are known to improve local +convergence. +""" +const Fan = __Fan() + +end + +const RUS = RadiusUpdateSchemes + +""" + GenericTrustRegionScheme(; method = RadiusUpdateSchemes.Simple, + max_trust_radius = nothing, initial_trust_radius = nothing, + step_threshold = nothing, shrink_threshold = nothing, expand_threshold = nothing, + shrink_factor = nothing, expand_factor = nothing, forward_ad = nothing, + reverse_ad = nothing) + +Trust Region Method that updates and stores the current trust region radius in +`trust_region`. For any of the keyword arguments, if the value is `nothing`, then we use +the value used in the respective paper. + +### Keyword Arguments + + - `radius_update_scheme`: the choice of radius update scheme to be used. Defaults to + `RadiusUpdateSchemes.Simple` which follows the conventional approach. Other available + schemes are documented in [`RadiusUpdateSchemes`](@ref),. These schemes have the trust + region radius converging to zero that is seen to improve convergence. For more details, + see [1]. + - `max_trust_radius`: the maximal trust region radius. Defaults to + `max(norm(fu), maximum(u) - minimum(u))`, except for `RadiusUpdateSchemes.NLsolve` + where it defaults to `Inf`. + - `initial_trust_radius`: the initial trust region radius. Defaults to + `max_trust_radius / 11`, except for `RadiusUpdateSchemes.NLsolve` where it defaults + to `u0_norm > 0 ? u0_norm : 1`. + - `step_threshold`: the threshold for taking a step. 
In every iteration, the threshold is + compared with a value `r`, which is the actual reduction in the objective function + divided by the predicted reduction. If `step_threshold > r` the model is not a good + approximation, and the step is rejected. Defaults to `nothing`. + - `shrink_threshold`: the threshold for shrinking the trust region radius. In every + iteration, the threshold is compared with a value `r` which is the actual reduction in + the objective function divided by the predicted reduction. If `shrink_threshold > r` the + trust region radius is shrunk by `shrink_factor`. Defaults to `nothing`. + - `expand_threshold`: the threshold for expanding the trust region radius. If a step is + taken, i.e `step_threshold < r` (with `r` defined in `shrink_threshold`), a check is + also made to see if `expand_threshold < r`. If that is true, the trust region radius is + expanded by `expand_factor`. Defaults to `nothing`. + - `shrink_factor`: the factor to shrink the trust region radius with if + `shrink_threshold > r` (with `r` defined in `shrink_threshold`). Defaults to `0.25`. + - `expand_factor`: the factor to expand the trust region radius with if + `expand_threshold < r` (with `r` defined in `shrink_threshold`). Defaults to `2.0`. 
+""" +@kwdef @concrete struct GenericTrustRegionScheme{ + M <: RadiusUpdateSchemes.AbstractRadiusUpdateScheme} + method::M = RadiusUpdateSchemes.Simple + step_threshold = nothing + shrink_threshold = nothing + shrink_factor = nothing + expand_factor = nothing + expand_threshold = nothing + max_trust_radius = nothing + initial_trust_radius = nothing + forward_ad = nothing + reverse_ad = nothing +end + +function Base.show(io::IO, alg::GenericTrustRegionScheme) + print(io, "GenericTrustRegionScheme(method = $(alg.method))") +end + +@concrete mutable struct GenericTrustRegionSchemeCache <: AbstractTrustRegionMethodCache + method + f + p + max_trust_radius + initial_trust_radius + trust_region + step_threshold + shrink_threshold + expand_threshold + shrink_factor + expand_factor + p1 + p2 + p3 + p4 + ϵ + ρ + vjp_operator + jvp_operator + Jᵀfu_cache + Jδu_cache + δu_cache + internalnorm + u_cache + fu_cache + last_step_accepted::Bool + shrink_counter::Int + nf::Int + alg +end + +function reinit_cache!(cache::GenericTrustRegionSchemeCache, args...; u0 = nothing, + p = cache.p, kwargs...) + T = eltype(cache.u_cache) + cache.p = p + if u0 !== nothing + u0_norm = cache.internalnorm(u0) + cache.trust_region = __initial_trust_radius(cache.alg.initial_trust_radius, T, + cache.alg.method, cache.max_trust_radius, u0_norm, u0_norm) # FIXME: scheme specific + end + cache.last_step_accepted = false + cache.shrink_counter = 0 + cache.nf = 0 +end + +# Defaults +for func in (:__max_trust_radius, :__initial_trust_radius, :__step_threshold, + :__shrink_threshold, :__shrink_factor, :__expand_threshold, :__expand_factor) + @eval begin + @inline function $(func)(val, ::Type{T}, args...) where {T} + val_T = T(val) + iszero(val_T) && return $(func)(nothing, T, args...) 
+ return val_T + end + end +end + +@inline __max_trust_radius(::Nothing, ::Type{T}, method, u, fu_norm) where {T} = T(Inf) +@inline function __max_trust_radius(::Nothing, ::Type{T}, + ::Union{RUS.__Simple, RUS.__NocedalWright}, u, fu_norm) where {T} + u_min, u_max = extrema(u) + return max(T(fu_norm), u_max - u_min) +end + +@inline function __initial_trust_radius(::Nothing, ::Type{T}, method, max_tr, + u0_norm, fu_norm) where {T} + method isa RUS.__NLsolve && return T(ifelse(u0_norm > 0, u0_norm, 1)) + (method isa RUS.__Hei || method isa RUS.__Bastin) && return T(1) + method isa RUS.__Fan && return T((fu_norm^0.99) / 10) + return T(max_tr / 11) +end + +@inline function __step_threshold(::Nothing, ::Type{T}, method) where {T} + method isa RUS.__Hei && return T(0) + method isa RUS.__Yuan && return T(1 // 1000) + method isa RUS.__Bastin && return T(1 // 20) + return T(1 // 10000) +end + +@inline function __shrink_threshold(::Nothing, ::Type{T}, method) where {T} + method isa RUS.__Hei && return T(0) + (method isa RUS.__NLsolve || method isa RUS.__Bastin) && return T(1 // 20) + return T(1 // 4) +end + +@inline function __expand_threshold(::Nothing, ::Type{T}, method) where {T} + method isa RUS.__NLsolve && return T(9 // 10) + method isa RUS.__Hei && return T(0) + method isa RUS.__Bastin && return T(9 // 10) + return T(3 // 4) +end + +@inline function __shrink_factor(::Nothing, ::Type{T}, method) where {T} + method isa RUS.__NLsolve && return T(1 // 2) + method isa RUS.__Hei && return T(0) + method isa RUS.__Bastin && return T(1 // 20) + return T(1 // 4) +end + +@inline function __get_parameters(::Type{T}, method) where {T} + method isa RUS.__NLsolve && return (T(1 // 2), T(0), T(0), T(0)) + method isa RUS.__Hei && return (T(5), T(1 // 10), T(15 // 100), T(15 // 100)) + method isa RUS.__Yuan && return (T(2), T(1 // 6), T(6), T(0)) + method isa RUS.__Fan && return (T(1 // 10), T(1 // 4), T(12), T(1e18)) + method isa RUS.__Bastin && return (T(5 // 2), T(1 // 4), T(0), 
T(0)) + return (T(0), T(0), T(0), T(0)) +end + +@inline __expand_factor(::Nothing, ::Type{T}, method) where {T} = T(2) + +function __internal_init(prob::AbstractNonlinearProblem, alg::GenericTrustRegionScheme, + f::F, fu, u, p, args...; internalnorm::IF = DEFAULT_NORM, kwargs...) where {F, IF} + T = promote_type(eltype(u), eltype(fu)) + u0_norm = internalnorm(u) + fu_norm = internalnorm(fu) + + # Common Setup + max_trust_radius = __max_trust_radius(alg.max_trust_radius, T, alg.method, u, fu_norm) + initial_trust_radius = __initial_trust_radius(alg.initial_trust_radius, T, alg.method, + max_trust_radius, u0_norm, fu_norm) + step_threshold = __step_threshold(alg.step_threshold, T, alg.method) + shrink_threshold = __shrink_threshold(alg.shrink_threshold, T, alg.method) + expand_threshold = __expand_threshold(alg.expand_threshold, T, alg.method) + shrink_factor = __shrink_factor(alg.shrink_factor, T, alg.method) + expand_factor = __expand_factor(alg.expand_factor, T, alg.method) + + # Scheme Specific Setup + p1, p2, p3, p4 = __get_parameters(T, alg.method) + ϵ = T(1e-8) + + vjp_operator = alg.method isa RUS.__Yuan || alg.method isa RUS.__Bastin ? + VecJacOperator(prob, fu, u; autodiff = alg.reverse_ad) : nothing + + jvp_operator = alg.method isa RUS.__Bastin ? 
+ JacVecOperator(prob, fu, u; autodiff = alg.forward_ad) : nothing + + if alg.method isa RUS.__Yuan + Jᵀfu_cache = StatefulJacobianOperator(vjp_operator, u, prob.p) * _vec(fu) + initial_trust_radius = T(p1 * internalnorm(Jᵀfu_cache)) + else + if u isa Number + Jᵀfu_cache = u + else + @bb Jᵀfu_cache = similar(u) + end + end + + if alg.method isa RUS.__Bastin + @bb δu_cache = similar(u) + else + δu_cache = nothing + end + + @bb u_cache = similar(u) + @bb fu_cache = similar(fu) + @bb Jδu_cache = similar(fu) + + return GenericTrustRegionSchemeCache(alg.method, f, p, max_trust_radius, + initial_trust_radius, initial_trust_radius, step_threshold, shrink_threshold, + expand_threshold, shrink_factor, expand_factor, p1, p2, p3, p4, ϵ, T(0), + vjp_operator, jvp_operator, Jᵀfu_cache, Jδu_cache, δu_cache, internalnorm, + u_cache, fu_cache, false, 0, 0, alg) +end + +function __internal_solve!(cache::GenericTrustRegionSchemeCache, J, fu, u, δu, + descent_stats) + T = promote_type(eltype(u), eltype(fu)) + @bb @. 
cache.u_cache = u + δu + cache.fu_cache = evaluate_f!!(cache.f, cache.fu_cache, cache.u_cache, cache.p) + cache.nf += 1 + + if hasfield(typeof(descent_stats), :δuJᵀJδu) && !isnan(descent_stats.δuJᵀJδu) + δuJᵀJδu = descent_stats.δuJᵀJδu + else + @bb cache.Jδu_cache = J × vec(δu) + δuJᵀJδu = __dot(cache.Jδu_cache, cache.Jδu_cache) + end + @bb cache.Jᵀfu_cache = transpose(J) × vec(fu) + num = (cache.internalnorm(cache.fu_cache)^2 - cache.internalnorm(fu)^2) / 2 + denom = __dot(δu, cache.Jᵀfu_cache) + δuJᵀJδu / 2 + cache.ρ = num / denom + + if cache.ρ > cache.step_threshold + cache.last_step_accepted = true + else + cache.last_step_accepted = false + end + + if cache.method isa RUS.__Simple + if cache.ρ < cache.shrink_threshold + cache.trust_region *= cache.shrink_factor + cache.shrink_counter += 1 + else + cache.shrink_counter = 0 + if cache.ρ > cache.expand_threshold && cache.ρ > cache.step_threshold + cache.trust_region = cache.expand_factor * cache.trust_region + end + end + elseif cache.method isa RUS.__NLsolve + if cache.ρ < cache.shrink_threshold + cache.trust_region *= cache.shrink_factor + cache.shrink_counter += 1 + else + cache.shrink_counter = 0 + if cache.ρ ≥ cache.expand_threshold + cache.trust_region = cache.expand_factor * cache.internalnorm(δu) + elseif cache.ρ ≥ cache.p1 + cache.trust_region = max(cache.trust_region, + cache.expand_factor * cache.internalnorm(δu)) + end + end + elseif cache.method isa RUS.__NocedalWright + if cache.ρ < cache.shrink_threshold + cache.trust_region = cache.shrink_factor * cache.internalnorm(δu) + cache.shrink_counter += 1 + else + cache.shrink_counter = 0 + if cache.ρ > cache.expand_threshold && + abs(cache.internalnorm(δu) - cache.trust_region) < + 1e-6 * cache.trust_region + cache.trust_region = cache.expand_factor * cache.trust_region + end + end + elseif cache.method isa RUS.__Hei + tr_new = __rfunc(cache.ρ, cache.shrink_threshold, cache.p1, cache.p3, cache.p4, + cache.p2) * cache.internalnorm(δu) + if tr_new < 
cache.trust_region + cache.shrink_counter += 1 + else + cache.shrink_counter = 0 + end + cache.trust_region = tr_new + elseif cache.method isa RUS.__Yuan + if cache.ρ < cache.shrink_threshold + cache.p1 = cache.p2 * cache.p1 + cache.shrink_counter += 1 + else + if cache.ρ ≥ cache.expand_threshold && + 2 * cache.internalnorm(δu) > cache.trust_region + cache.p1 = cache.p3 * cache.p1 + end + cache.shrink_counter = 0 + end + operator = StatefulJacobianOperator(cache.vjp_operator, cache.u_cache, cache.p) + @bb cache.Jᵀfu_cache = operator × vec(cache.fu_cache) + cache.trust_region = cache.p1 * cache.internalnorm(cache.Jᵀfu_cache) + elseif cache.method isa RUS.__Fan + if cache.ρ < cache.shrink_threshold + cache.p1 *= cache.p2 + cache.shrink_counter += 1 + else + cache.shrink_counter = 0 + cache.ρ > cache.expand_threshold && (cache.p1 = min(cache.p1 * cache.p3, + cache.p4)) + end + cache.trust_region = cache.p1 * (cache.internalnorm(cache.fu_cache)^T(0.99)) + elseif cache.method isa RUS.__Bastin + if cache.ρ > cache.step_threshold + jvp_op = StatefulJacobianOperator(cache.jvp_operator, cache.u_cache, + cache.p) + vjp_op = StatefulJacobianOperator(cache.vjp_operator, cache.u_cache, + cache.p) + @bb cache.Jδu_cache = jvp_op × vec(δu) + @bb cache.Jᵀfu_cache = vjp_op × vec(cache.fu_cache) + denom_1 = dot(_vec(δu), cache.Jᵀfu_cache) + @bb cache.Jᵀfu_cache = vjp_op × vec(cache.Jδu_cache) + denom_2 = dot(_vec(δu), cache.Jᵀfu_cache) + denom = denom_1 + denom_2 / 2 + ρ = num / denom + if ρ ≥ cache.expand_threshold + cache.trust_region = cache.p1 * cache.internalnorm(δu) + end + cache.shrink_counter = 0 + else + cache.trust_region *= cache.p2 + cache.shrink_counter += 1 + end + end + + cache.trust_region = min(cache.trust_region, cache.max_trust_radius) + + return cache.last_step_accepted, cache.u_cache, cache.fu_cache +end + +# R-function for adaptive trust region method +function __rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} + return ifelse(r ≥ c2, + (2 * (M - 1 
- γ2) * atan(r - c2) + (1 + γ2)) / R(π), + (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β))) +end diff --git a/src/internal/approximate_initialization.jl b/src/internal/approximate_initialization.jl new file mode 100644 index 000000000..bb9898009 --- /dev/null +++ b/src/internal/approximate_initialization.jl @@ -0,0 +1,281 @@ +# Jacobian Structure +""" + DiagonalStructure() + +Preserves only the Diagonal of the Matrix. +""" +struct DiagonalStructure <: AbstractApproximateJacobianStructure end + +get_full_jacobian(cache, ::DiagonalStructure, J::Number) = J +get_full_jacobian(cache, ::DiagonalStructure, J) = Diagonal(_vec(J)) + +function (::DiagonalStructure)(J::AbstractMatrix; alias::Bool = false) + @assert size(J, 1)==size(J, 2) "Diagonal Jacobian Structure must be square!" + return diag(J) +end +(::DiagonalStructure)(J::AbstractVector; alias::Bool = false) = alias ? J : @bb(copy(J)) +(::DiagonalStructure)(J::Number; alias::Bool = false) = J + +(::DiagonalStructure)(::Number, J_new::Number) = J_new +function (::DiagonalStructure)(J::AbstractVector, J_new::AbstractMatrix) + if __can_setindex(J) + if fast_scalar_indexing(J) + @inbounds for i in eachindex(J) + J[i] = J_new[i, i] + end + else + @.. broadcast=false J=@view(J_new[diagind(J_new)]) + end + return J + end + return diag(J_new) +end +function (st::DiagonalStructure)(J::AbstractArray, J_new::AbstractMatrix) + return _restructure(J, st(vec(J), J_new)) +end + +""" + FullStructure() + +Stores the full matrix. +""" +struct FullStructure <: AbstractApproximateJacobianStructure end + +stores_full_jacobian(::FullStructure) = true + +(::FullStructure)(J; alias::Bool = false) = alias ? J : @bb(copy(J)) + +function (::FullStructure)(J, J_new) + J === J_new && return J + @bb copyto!(J, J_new) + return J +end + +# Initialization Strategies +""" + IdentityInitialization(alpha, structure) + +Initialize the Jacobian to be an Identity Matrix scaled by `alpha` and maintain the +structure as specified by `structure`. 
+""" +@concrete struct IdentityInitialization <: AbstractJacobianInitialization + alpha + structure +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::IdentityInitialization, + solver, + f::F, fu, u::Number, p; internalnorm::IN = DEFAULT_NORM, kwargs...) where {F, IN} + α = __initial_alpha(alg.alpha, u, fu, internalnorm) + return InitializedApproximateJacobianCache(α, alg.structure, alg, nothing, true, + internalnorm) +end +function __internal_init(prob::AbstractNonlinearProblem, alg::IdentityInitialization, + solver, + f::F, fu::StaticArray, u::StaticArray, p; internalnorm::IN = DEFAULT_NORM, + kwargs...) where {IN, F} + α = __initial_alpha(alg.alpha, u, fu, internalnorm) + if alg.structure isa DiagonalStructure + @assert length(u)==length(fu) "Diagonal Jacobian Structure must be square!" + J = one.(_vec(fu)) .* α + else + T = promote_type(eltype(u), eltype(fu)) + if fu isa SArray + J_ = SArray{Tuple{prod(Size(fu)), prod(Size(u))}, T}(I * α) + else + J_ = MArray{Tuple{prod(Size(fu)), prod(Size(u))}, T}(I * α) + end + J = alg.structure(J_; alias = true) + end + return InitializedApproximateJacobianCache(J, alg.structure, alg, nothing, true, + internalnorm) +end +function __internal_init(prob::AbstractNonlinearProblem, alg::IdentityInitialization, + solver, f::F, fu, u, p; internalnorm::IN = DEFAULT_NORM, kwargs...) where {F, IN} + α = __initial_alpha(alg.alpha, u, fu, internalnorm) + if alg.structure isa DiagonalStructure + @assert length(u)==length(fu) "Diagonal Jacobian Structure must be square!" 
+ J = one.(_vec(fu)) .* α + else + J_ = similar(fu, promote_type(eltype(fu), eltype(u)), length(fu), length(u)) + J = alg.structure(__make_identity!!(J_, α); alias = true) + end + return InitializedApproximateJacobianCache(J, alg.structure, alg, nothing, true, + internalnorm) +end + +@inline function __initial_alpha(α, u, fu, internalnorm::F) where {F} + return convert(promote_type(eltype(u), eltype(fu)), α) +end +@inline function __initial_alpha(::Nothing, u, fu, internalnorm::F) where {F} + fu_norm = internalnorm(fu) + return ifelse(fu_norm ≥ 1e-5, (2 * fu_norm) / max(norm(u), true), + __initial_alpha(true, u, fu, internalnorm)) +end + +@inline __make_identity!!(A::Number, α) = one(A) * α +@inline __make_identity!!(A::AbstractVector, α) = __can_setindex(A) ? (A .= α) : + (one.(A) .* α) +@inline function __make_identity!!(A::AbstractMatrix{T}, α) where {T} + if A isa SMatrix + Sz = Size(A) + return SArray{Tuple{Sz[1], Sz[2]}, eltype(Sz)}(I * α) + end + @assert __can_setindex(A) "__make_identity!!(::AbstractMatrix) only works on mutable arrays!" + fill!(A, false) + if fast_scalar_indexing(A) + @inbounds for i in axes(A, 1) + A[i, i] = α + end + else + A[diagind(A)] .= α + end + return A +end + +""" + TrueJacobianInitialization(structure, autodiff) + +Initialize the Jacobian to be the true Jacobian and maintain the structure as specified +by `structure`. `autodiff` is used to compute the true Jacobian and if not specified we +make a selection automatically. +""" +@concrete struct TrueJacobianInitialization <: AbstractJacobianInitialization + structure + autodiff +end + +function __internal_init(prob::AbstractNonlinearProblem, alg::TrueJacobianInitialization, + solver, f::F, fu, u, p; linsolve = missing, internalnorm::IN = DEFAULT_NORM, + kwargs...) where {F, IN} + autodiff = get_concrete_forward_ad(alg.autodiff, prob; check_reverse_mode = false, + kwargs...) 
+ jac_cache = JacobianCache(prob, solver, prob.f, fu, u, p; autodiff, linsolve) + J = alg.structure(jac_cache(nothing)) + return InitializedApproximateJacobianCache(J, alg.structure, alg, jac_cache, false, + internalnorm) +end + +""" + InitializedApproximateJacobianCache(J, structure, alg, cache, initialized::Bool, + internalnorm) + +A cache for Approximate Jacobian. + +### Arguments + + - `J`: The current Jacobian. + - `structure`: The structure of the Jacobian. + - `alg`: The initialization algorithm. + - `cache`: The Jacobian cache [`NonlinearSolve.JacobianCache`](@ref) (if needed). + - `initialized`: A boolean indicating whether the Jacobian has been initialized. + - `internalnorm`: The norm to be used. + +### Interface + +```julia +(cache::InitializedApproximateJacobianCache)(::Nothing) +``` + +Returns the current Jacobian `cache.J` with the proper `structure`. + +```julia +__internal_solve!(cache::InitializedApproximateJacobianCache, fu, u, ::Val{reinit}) +``` + +Solves for the Jacobian `cache.J` and returns it. If `reinit` is `true`, then the Jacobian +is reinitialized. +""" +@concrete mutable struct InitializedApproximateJacobianCache + J + structure + alg + cache + initialized::Bool + internalnorm +end + +function __reinit_internal!(cache::InitializedApproximateJacobianCache, args...; kwargs...) 
+ cache.initialized = false +end + +@internal_caches InitializedApproximateJacobianCache :cache + +function (cache::InitializedApproximateJacobianCache)(::Nothing) + return get_full_jacobian(cache, cache.structure, cache.J) +end + +function __internal_solve!(cache::InitializedApproximateJacobianCache, fu, u, + ::Val{reinit}) where {reinit} + if reinit || !cache.initialized + cache(cache.alg, fu, u) + cache.initialized = true + end + if stores_full_jacobian(cache.structure) + full_J = cache.J + else + full_J = get_full_jacobian(cache, cache.structure, cache.J) + end + return full_J +end + +function (cache::InitializedApproximateJacobianCache)(alg::IdentityInitialization, fu, u) + α = __initial_alpha(alg.alpha, u, fu, cache.internalnorm) + cache.J = __make_identity!!(cache.J, α) + return +end + +function (cache::InitializedApproximateJacobianCache)(alg::TrueJacobianInitialization, fu, + u) + J_new = cache.cache(u) + cache.J = cache.structure(cache.J, J_new) + return +end + +# Matrix Inversion +@inline __safe_inv_workspace(A) = nothing, A +@inline __safe_inv_workspace(A::ApplyArray) = __safe_inv_workspace(X) +@inline __safe_inv_workspace(A::SparseMatrixCSC) = Matrix(A), Matrix(A) + +@inline __safe_inv!!(workspace, A::Number) = pinv(A) +@inline __safe_inv!!(workspace, A::AbstractMatrix) = pinv(A) +@inline function __safe_inv!!(workspace, A::Diagonal) + D = A.diag + @bb @. D = pinv(D) + return Diagonal(D) +end +@inline function __safe_inv!!(workspace, A::AbstractVector{T}) where {T} + @. 
A = ifelse(iszero(A), zero(T), one(T) / A) + return A +end +@inline __safe_inv!!(workspace, A::ApplyArray) = __safe_inv!!(workspace, A.f(A.args...)) +@inline function __safe_inv!!(workspace::AbstractMatrix, A::SparseMatrixCSC) + copyto!(workspace, A) + return __safe_inv!!(nothing, workspace) +end +@inline function __safe_inv!!(workspace, A::StridedMatrix{T}) where {T} + LinearAlgebra.checksquare(A) + if istriu(A) + issingular = any(iszero, @view(A[diagind(A)])) + A_ = UpperTriangular(A) + !issingular && return triu!(parent(inv(A_))) + elseif istril(A) + A_ = LowerTriangular(A) + issingular = any(iszero, @view(A_[diagind(A_)])) + !issingular && return tril!(parent(inv(A_))) + else + F = lu(A; check = false) + if issuccess(F) + Ai = LinearAlgebra.inv!(F) + return convert(typeof(parent(Ai)), Ai) + end + end + return pinv(A) +end + +@inline __safe_inv(x) = __safe_inv!!(first(__safe_inv_workspace(x)), x) + +LazyArrays.applied_eltype(::typeof(__safe_inv), x) = eltype(x) +LazyArrays.applied_ndims(::typeof(__safe_inv), x) = ndims(x) +LazyArrays.applied_size(::typeof(__safe_inv), x) = size(x) +LazyArrays.applied_axes(::typeof(__safe_inv), x) = axes(x) diff --git a/src/internal/forward_diff.jl b/src/internal/forward_diff.jl new file mode 100644 index 000000000..3e0937b20 --- /dev/null +++ b/src/internal/forward_diff.jl @@ -0,0 +1,72 @@ +# Not part of public API but helps reduce code duplication +import SimpleNonlinearSolve: __nlsolve_ad, + __nlsolve_dual_soln, __nlsolve_∂f_∂p, __nlsolve_∂f_∂u + +function SciMLBase.solve(prob::NonlinearProblem{<:Union{Number, <:AbstractArray}, + iip, <:Union{<:Dual{T, V, P}, <:AbstractArray{<:Dual{T, V, P}}}}, + alg::Union{Nothing, AbstractNonlinearAlgorithm}, args...; + kwargs...) where {T, V, P, iip} + sol, partials = __nlsolve_ad(prob, alg, args...; kwargs...) 
+ dual_soln = __nlsolve_dual_soln(sol.u, partials, prob.p) + return SciMLBase.build_solution(prob, alg, dual_soln, sol.resid; sol.retcode, sol.stats, + sol.original) +end + +@concrete mutable struct NonlinearSolveForwardDiffCache + cache + prob + alg + p + values_p + partials_p +end + +@internal_caches NonlinearSolveForwardDiffCache :cache + +function reinit_cache!(cache::NonlinearSolveForwardDiffCache; p = cache.p, + u0 = get_u(cache.cache), kwargs...) + inner_cache = reinit_cache!(cache.cache; p = __value(p), u0 = __value(u0), + kwargs...) + cache.cache = inner_cache + cache.p = p + cache.values_p = __value(p) + cache.partials_p = ForwardDiff.partials(p) + return cache +end + +function SciMLBase.init(prob::NonlinearProblem{<:Union{Number, <:AbstractArray}, + iip, <:Union{<:Dual{T, V, P}, <:AbstractArray{<:Dual{T, V, P}}}}, + alg::Union{Nothing, AbstractNonlinearAlgorithm}, args...; + kwargs...) where {T, V, P, iip} + p = __value(prob.p) + newprob = NonlinearProblem(prob.f, __value(prob.u0), p; prob.kwargs...) + cache = init(newprob, alg, args...; kwargs...) 
+ return NonlinearSolveForwardDiffCache(cache, newprob, alg, prob.p, p, + ForwardDiff.partials(prob.p)) +end + +function SciMLBase.solve!(cache::NonlinearSolveForwardDiffCache) + sol = solve!(cache.cache) + prob = cache.prob + + uu = sol.u + f_p = __nlsolve_∂f_∂p(prob, prob.f, uu, cache.values_p) + f_x = __nlsolve_∂f_∂u(prob, prob.f, uu, cache.values_p) + + z_arr = -f_x \ f_p + + sumfun = ((z, p),) -> map(zᵢ -> zᵢ * ForwardDiff.partials(p), z) + if cache.p isa Number + partials = sumfun((z_arr, cache.p)) + else + partials = sum(sumfun, zip(eachcol(z_arr), cache.p)) + end + + dual_soln = __nlsolve_dual_soln(sol.u, partials, cache.p) + return SciMLBase.build_solution(prob, cache.alg, dual_soln, sol.resid; sol.retcode, + sol.stats, sol.original) +end + +@inline __value(x) = x +@inline __value(x::Dual) = ForwardDiff.value(x) +@inline __value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x) diff --git a/src/internal/helpers.jl b/src/internal/helpers.jl new file mode 100644 index 000000000..f9e90b7f8 --- /dev/null +++ b/src/internal/helpers.jl @@ -0,0 +1,260 @@ +# Evaluate the residual function at a given point +function evaluate_f(prob::AbstractNonlinearProblem{uType, iip}, u) where {uType, iip} + (; f, u0, p) = prob + if iip + fu = f.resid_prototype === nothing ? 
similar(u) : + promote_type(eltype(u), eltype(f.resid_prototype)).(f.resid_prototype) + f(fu, u, p) + else + fu = f(u, p) + end + return fu +end + +function evaluate_f!(cache, u, p) + cache.nf += 1 + if isinplace(cache) + cache.prob.f(get_fu(cache), u, p) + else + set_fu!(cache, cache.prob.f(u, p)) + end +end + +evaluate_f!!(prob::AbstractNonlinearProblem, fu, u, p) = evaluate_f!!(prob.f, fu, u, p) +function evaluate_f!!(f::NonlinearFunction{iip}, fu, u, p) where {iip} + if iip + f(fu, u, p) + return fu + end + return f(u, p) +end + +# AutoDiff Selection Functions +struct NonlinearSolveTag end + +function ForwardDiff.checktag(::Type{<:ForwardDiff.Tag{<:NonlinearSolveTag, <:T}}, f::F, + x::AbstractArray{T}) where {T, F} + return true +end + +function get_concrete_forward_ad(autodiff::Union{ADTypes.AbstractForwardMode, + ADTypes.AbstractFiniteDifferencesMode}, prob, sp::Val{test_sparse} = True, + args...; kwargs...) where {test_sparse} + return autodiff +end +function get_concrete_forward_ad(autodiff::ADTypes.AbstractADType, prob, + sp::Val{test_sparse} = True, args...; + check_reverse_mode = true, kwargs...) where {test_sparse} + if check_reverse_mode + @warn "$(autodiff)::$(typeof(autodiff)) is not a \ + `Abstract(Forward/FiniteDifferences)Mode`. Use with caution." maxlog=1 + end + return autodiff +end +function get_concrete_forward_ad(autodiff, prob, sp::Val{test_sparse} = True, args...; + kwargs...) where {test_sparse} + if test_sparse + (; sparsity, jac_prototype) = prob.f + use_sparse_ad = sparsity !== nothing || jac_prototype !== nothing + else + use_sparse_ad = false + end + ad = if !ForwardDiff.can_dual(eltype(prob.u0)) # Use Finite Differencing + use_sparse_ad ? AutoSparseFiniteDiff() : AutoFiniteDiff() + else + tag = ForwardDiff.Tag(NonlinearSolveTag(), eltype(prob.u0)) + (use_sparse_ad ? 
AutoSparseForwardDiff : AutoForwardDiff)(; tag) + end + return ad +end + +function get_concrete_reverse_ad(autodiff::Union{ADTypes.AbstractReverseMode, + ADTypes.AbstractFiniteDifferencesMode}, prob, sp::Val{test_sparse} = True, + args...; kwargs...) where {test_sparse} + return autodiff +end +function get_concrete_reverse_ad(autodiff::Union{AutoZygote, AutoSparseZygote}, prob, + sp::Val{test_sparse} = True, args...; kwargs...) where {test_sparse} + if isinplace(prob) + @warn "Attempting to use Zygote.jl for inplace problems. Switching to FiniteDiff. \ + Sparsity even if present will be ignored for correctness purposes. Set \ + the reverse ad option to `nothing` to automatically select the best option \ + and exploit sparsity." + return AutoFiniteDiff() # colorvec confusion will occur if we use FiniteDiff + end + return autodiff +end +function get_concrete_reverse_ad(autodiff::ADTypes.AbstractADType, prob, + sp::Val{test_sparse} = True, args...; check_reverse_mode = true, + kwargs...) where {test_sparse} + if check_reverse_mode + @warn "$(autodiff)::$(typeof(autodiff)) is not a \ + `Abstract(Forward/FiniteDifferences)Mode`. Use with caution." maxlog=1 + end + return autodiff +end +function get_concrete_reverse_ad(autodiff, prob, sp::Val{test_sparse} = True, args...; + kwargs...) where {test_sparse} + if test_sparse + (; sparsity, jac_prototype) = prob.f + use_sparse_ad = sparsity !== nothing || jac_prototype !== nothing + else + use_sparse_ad = false + end + ad = if isinplace(prob) || !is_extension_loaded(Val(:Zygote)) # Use Finite Differencing + use_sparse_ad ? AutoSparseFiniteDiff() : AutoFiniteDiff() + else + use_sparse_ad ? AutoSparseZygote() : AutoZygote() + end + return ad +end + +# Callbacks +""" + callback_into_cache!(cache, internalcache, args...) + +Define custom operations on `internalcache` tightly coupled with the calling `cache`. +`args...` contain the sequence of caches calling into `internalcache`. 
+ +This unfortunately makes code very tightly coupled and not modular. It is recommended to not +use this functionality unless it can't be avoided (like in [`LevenbergMarquardt`](@ref)). +""" +@inline callback_into_cache!(cache, internalcache, args...) = nothing # By default do nothing + +# Extension Algorithm Helpers +function __test_termination_condition(termination_condition, alg) + termination_condition !== AbsNormTerminationMode && termination_condition !== nothing && + error("`$(alg)` does not support termination conditions!") +end + +function __construct_extension_f(prob::AbstractNonlinearProblem; alias_u0::Bool = false, + can_handle_oop::Val = False, can_handle_scalar::Val = False, + make_fixed_point::Val = False, force_oop::Val = False) + if can_handle_oop === False && can_handle_scalar === True + error("Incorrect Specification: OOP not supported but scalar supported.") + end + + resid = evaluate_f(prob, prob.u0) + u0 = can_handle_scalar === True || !(prob.u0 isa Number) ? + __maybe_unaliased(prob.u0, alias_u0) : [prob.u0] + + fₚ = if make_fixed_point === True + if isinplace(prob) + @closure (du, u) -> (prob.f(du, u, prob.p); du .+= u) + else + @closure u -> prob.f(u, prob.p) .+ u + end + else + if isinplace(prob) + @closure (du, u) -> prob.f(du, u, prob.p) + else + @closure u -> prob.f(u, prob.p) + end + end + + 𝐟 = if isinplace(prob) + u0_size, du_size = size(u0), size(resid) + @closure (du, u) -> (fₚ(reshape(du, du_size), reshape(u, u0_size)); du) + else + if prob.u0 isa Number + if can_handle_scalar === True + fₚ + elseif can_handle_oop === True + @closure u -> [fₚ(first(u))] + else + @closure (du, u) -> (du[1] = fₚ(first(u)); du) + end + else + u0_size = size(u0) + if can_handle_oop === True + @closure u -> vec(fₚ(reshape(u, u0_size))) + else + @closure (du, u) -> (copyto!(du, fₚ(reshape(u, u0_size))); du) + end + end + end + + 𝐅 = if force_oop === True && applicable(𝐟, u0, u0) + _resid = resid isa Number ? 
[resid] : _vec(resid) + du = _vec(similar(_resid)) + @closure u -> begin + 𝐟(du, u) + return du + end + else + 𝐟 + end + + return 𝐅, _vec(u0), (resid isa Number ? [resid] : _vec(resid)) +end + +function __construct_extension_jac(prob, alg, u0, fu; can_handle_oop::Val = False, + can_handle_scalar::Val = False, kwargs...) + Jₚ = JacobianCache(prob, alg, prob.f, fu, u0, prob.p; kwargs...) + + 𝓙 = (can_handle_scalar === False && prob.u0 isa Number) ? @closure(u->[Jₚ(u[1])]) : Jₚ + + 𝐉 = (can_handle_oop === False && !isinplace(prob)) ? + @closure((J, u)->copyto!(J, 𝓙(u))) : 𝓙 + + return 𝐉 +end + +# Query Statistics +for stat in (:nsolve, :nfactors, :nsteps, :njacs, :nf) + fname = Symbol("get_$(stat)") + @eval @inline $(fname)(cache) = __query_stat(cache, $(Val(stat))) +end + +@inline __query_stat(cache, stat::Val) = __direct_query_stat(cache, stat) +@inline @generated function __direct_query_stat(cache::T, ::Val{stat}) where {T, stat} + hasfield(T, stat) || return :(0) + return :(__get_data(cache.$(stat))) +end + +@inline __get_data(x::Number) = x +@inline __get_data(x::Base.RefValue{Int}) = x[] + +function reinit_cache! end +reinit_cache!(cache::Nothing, args...; kwargs...) = nothing +reinit_cache!(cache, args...; kwargs...) = nothing + +function __reinit_internal! end +__reinit_internal!(::Nothing, args...; kwargs...) = nothing +__reinit_internal!(cache, args...; kwargs...) = nothing + +# Auto-generate some of the helper functions +macro internal_caches(cType, internal_cache_names...) 
+ return __internal_caches(__source__, __module__, cType, internal_cache_names) +end + +function __internal_caches(__source__, __module__, cType, internal_cache_names::Tuple) + fields = map(name -> :($(__query_stat)(getproperty(cache, $(name)), ST)), + internal_cache_names) + callback_caches = map(name -> :($(callback_into_cache!)(cache, + getproperty(internalcache, $(name)), internalcache, args...)), + internal_cache_names) + callbacks_self = map(name -> :($(callback_into_cache!)(internalcache, + getproperty(internalcache, $(name)))), internal_cache_names) + reinit_caches = map(name -> :($(reinit_cache!)(getproperty(cache, $(name)), + args...; kwargs...)), internal_cache_names) + return esc(quote + function __query_stat(cache::$(cType), ST::Val{stat}) where {stat} + val = $(__direct_query_stat)(cache, ST) + return +($(fields...)) + val + end + function __query_stat(cache::$(cType), ST::Val{:nsteps}) + return $(__direct_query_stat)(cache, ST) + end + function callback_into_cache!(cache, internalcache::$(cType), args...) + $(callback_caches...) + end + function callback_into_cache!(internalcache::$(cType)) + $(callbacks_self...) + end + function reinit_cache!(cache::$(cType), args...; kwargs...) + $(reinit_caches...) + $(__reinit_internal!)(cache, args...; kwargs...) + end + end) +end diff --git a/src/internal/jacobian.jl b/src/internal/jacobian.jl new file mode 100644 index 000000000..4ab451408 --- /dev/null +++ b/src/internal/jacobian.jl @@ -0,0 +1,191 @@ +""" + JacobianCache(prob, alg, f::F, fu, u, p; autodiff = nothing, + vjp_autodiff = nothing, jvp_autodiff = nothing, linsolve = missing) where {F} + +Construct a cache for the Jacobian of `f` w.r.t. `u`. + +### Arguments + + - `prob`: A [`NonlinearProblem`](@ref) or a [`NonlinearLeastSquaresProblem`](@ref). + - `alg`: A [`AbstractNonlinearSolveAlgorithm`](@ref). Used to check for + [`concrete_jac`](@ref). + - `f`: The function to compute the Jacobian of. + - `fu`: The evaluation of `f(u, p)` or `f(_, u, p)`. 
Used to determine the size of the + result cache and Jacobian. + - `u`: The current value of the state. + - `p`: The current value of the parameters. + +### Keyword Arguments + + - `autodiff`: Automatic Differentiation or Finite Differencing backend for computing the + jacobian. By default, selects a backend based on sparsity parameters, type of state, + function properties, etc. + - `vjp_autodiff`: Automatic Differentiation or Finite Differencing backend for computing + the vector-Jacobian product. + - `jvp_autodiff`: Automatic Differentiation or Finite Differencing backend for computing + the Jacobian-vector product. + - `linsolve`: Linear Solver Algorithm used to determine if we need a concrete jacobian + or if possible we can just use a [`NonlinearSolve.JacobianOperator`](@ref) instead. +""" +@concrete mutable struct JacobianCache{iip} <: AbstractNonlinearSolveJacobianCache{iip} + J + f + uf + fu + u + p + jac_cache + alg + njacs::Int + autodiff + vjp_autodiff + jvp_autodiff +end + +function reinit_cache!(cache::JacobianCache{iip}, args...; p = cache.p, u0 = cache.u, + kwargs...) 
where {iip} + cache.njacs = 0 + cache.u = u0 + cache.p = p + cache.uf = JacobianWrapper{iip}(cache.f, p) +end + +function JacobianCache(prob, alg, f::F, fu_, u, p; autodiff = nothing, + vjp_autodiff = nothing, jvp_autodiff = nothing, linsolve = missing) where {F} + iip = isinplace(prob) + uf = JacobianWrapper{iip}(f, p) + + autodiff = get_concrete_forward_ad(autodiff, prob; check_reverse_mode = false) + jvp_autodiff = get_concrete_forward_ad(jvp_autodiff, prob, Val(false); + check_reverse_mode = true) + vjp_autodiff = get_concrete_reverse_ad(vjp_autodiff, prob, Val(false); + check_forward_mode = false) + + has_analytic_jac = SciMLBase.has_jac(f) + linsolve_needs_jac = concrete_jac(alg) === nothing && (linsolve === missing || + (linsolve === nothing || __needs_concrete_A(linsolve))) + alg_wants_jac = concrete_jac(alg) !== nothing && concrete_jac(alg) + needs_jac = linsolve_needs_jac || alg_wants_jac + + @bb fu = similar(fu_) + + if !has_analytic_jac && needs_jac + sd = __sparsity_detection_alg(f, autodiff) + jac_cache = iip ? sparse_jacobian_cache(autodiff, sd, uf, fu, u) : + sparse_jacobian_cache(autodiff, sd, uf, __maybe_mutable(u, autodiff); + fx = fu) + else + jac_cache = nothing + end + + J = if !needs_jac + JacobianOperator(prob, fu, u; jvp_autodiff, vjp_autodiff) + else + if has_analytic_jac + f.jac_prototype === nothing ? undefmatrix(u) : f.jac_prototype + elseif f.jac_prototype === nothing + init_jacobian(jac_cache; preserve_immutable = Val(true)) + else + f.jac_prototype + end + end + + return JacobianCache{iip}(J, f, uf, fu, u, p, jac_cache, alg, 0, autodiff, vjp_autodiff, + jvp_autodiff) +end + +function JacobianCache(prob, alg, f::F, ::Number, u::Number, p; kwargs...) 
where {F} + uf = JacobianWrapper{false}(f, p) + return JacobianCache{false}(u, f, uf, u, u, p, nothing, alg, 0, nothing, nothing, + nothing) +end + +@inline (cache::JacobianCache)(u = cache.u) = cache(cache.J, u, cache.p) +@inline function (cache::JacobianCache)(::Nothing) + J = cache.J + J isa JacobianOperator && return StatefulJacobianOperator(J, cache.u, cache.p) + return J +end + +function (cache::JacobianCache)(J::JacobianOperator, u, p = cache.p) + return StatefulJacobianOperator(J, u, p) +end +function (cache::JacobianCache)(::Number, u, p = cache.p) # Scalar + cache.njacs += 1 + J = last(__value_derivative(cache.uf, u)) + return J +end +# Compute the Jacobian +function (cache::JacobianCache{iip})(J::Union{AbstractMatrix, Nothing}, u, + p = cache.p) where {iip} + cache.njacs += 1 + if iip + if has_jac(cache.f) + cache.f.jac(J, u, p) + else + sparse_jacobian!(J, cache.autodiff, cache.jac_cache, cache.uf, cache.fu, u) + end + J_ = J + else + J_ = if has_jac(cache.f) + cache.f.jac(u, p) + elseif __can_setindex(typeof(J)) + sparse_jacobian!(J, cache.autodiff, cache.jac_cache, cache.uf, u) + J + else + sparse_jacobian(cache.autodiff, cache.jac_cache, cache.uf, u) + end + end + return J_ +end + +# Sparsity Detection Choices +@inline __sparsity_detection_alg(_, _) = NoSparsityDetection() +@inline function __sparsity_detection_alg(f::NonlinearFunction, ad::AbstractSparseADType) + if f.sparsity === nothing + if f.jac_prototype === nothing + if is_extension_loaded(Val(:Symbolics)) + return SymbolicsSparsityDetection() + else + return ApproximateJacobianSparsity() + end + else + jac_prototype = f.jac_prototype + end + elseif f.sparsity isa AbstractSparsityDetection + if f.jac_prototype === nothing + return f.sparsity + else + jac_prototype = f.jac_prototype + end + elseif f.sparsity isa AbstractMatrix + jac_prototype = f.sparsity + elseif f.jac_prototype isa AbstractMatrix + jac_prototype = f.jac_prototype + else + error("`sparsity::typeof($(typeof(f.sparsity)))` & \ + 
`jac_prototype::typeof($(typeof(f.jac_prototype)))` is not supported. \ + Use `sparsity::AbstractMatrix` or `sparsity::AbstractSparsityDetection` or \ + set to `nothing`. `jac_prototype` can be set to `nothing` or an \ + `AbstractMatrix`.") + end + + if SciMLBase.has_colorvec(f) + return PrecomputedJacobianColorvec(; jac_prototype, f.colorvec, + partition_by_rows = ad isa ADTypes.AbstractSparseReverseMode) + else + return JacPrototypeSparsityDetection(; jac_prototype) + end +end + +@inline function __value_derivative(f::F, x::R) where {F, R} + T = typeof(ForwardDiff.Tag(f, R)) + out = f(ForwardDiff.Dual{T}(x, one(x))) + return ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out) +end + +@inline function __scalar_jacvec(f::F, x::R, v::V) where {F, R, V} + T = typeof(ForwardDiff.Tag(f, R)) + out = f(ForwardDiff.Dual{T}(x, v)) + return ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out) +end diff --git a/src/internal/linear_solve.jl b/src/internal/linear_solve.jl new file mode 100644 index 000000000..184edf660 --- /dev/null +++ b/src/internal/linear_solve.jl @@ -0,0 +1,195 @@ +import LinearSolve: AbstractFactorization, DefaultAlgorithmChoice, DefaultLinearSolver + +""" + LinearSolverCache(alg, linsolve, A, b, u; kwargs...) + +Construct a cache for solving linear systems of the form `A * u = b`. Following cases are +handled: + + 1. `A` is Number, then we solve it with `u = b / A` + 2. `A` is `SMatrix`, then we solve it with `u = A \\ b` (using the defaults from base + Julia) + 3. `A` is `Diagonal`, then we solve it with `u = b ./ A.diag` + 4. In all other cases, we use `alg` to solve the linear system using + [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl). + +### Solving the System + +```julia +(cache::LinearSolverCache)(; A = nothing, b = nothing, linu = nothing, + du = nothing, p = nothing, weight = nothing, cachedata = nothing, + reuse_A_if_factorization = false, kwargs...) 
+``` + +Returns the solution of the system `u` and stores the updated cache in `cache.lincache`. + +#### Keyword Arguments + + - `reuse_A_if_factorization`: If `true`, then the factorization of `A` is reused if + possible. This is useful when solving the same system with different `b` values. + If the algorithm is an iterative solver, then we reset the internal linear solve cache. + +One distinct feature of this compared to the cache from LinearSolve is that it respects the +aliasing arguments even after cache construction, i.e., if we passed in an `A` that `A` is +not mutated, we do this by copying over `A` to a preconstructed cache. +""" +@concrete mutable struct LinearSolverCache <: AbstractLinearSolverCache + lincache + linsolve + A + b + precs + nsolve::Int + nfactors::Int +end + +# FIXME: Do we need to reinit the precs? +function reinit_cache!(cache::LinearSolverCache, args...; kwargs...) + cache.nsolve = 0 + cache.nfactors = 0 +end + +@inline function LinearSolverCache(alg, linsolve, A::Number, b::Number, u; kwargs...) + return LinearSolverCache(nothing, nothing, A, b, nothing, 0, 0) +end +@inline function LinearSolverCache(alg, ::Nothing, A::SMatrix, b, u; kwargs...) + # Default handling for SArrays caching in LinearSolve is not the best. Override it here + return LinearSolverCache(nothing, nothing, A, b, nothing, 0, 0) +end +@inline function LinearSolverCache(alg, linsolve, A::Diagonal, b, u; kwargs...) + return LinearSolverCache(nothing, nothing, A, b, nothing, 0, 0) +end +function LinearSolverCache(alg, linsolve, A, b, u; kwargs...) + @bb b_ = copy(b) + @bb u_ = copy(u) + linprob = LinearProblem(A, b_; u0 = u_, kwargs...) 
+ + weight = __init_ones(u) + if __hasfield(alg, Val(:precs)) + precs = alg.precs + Pl_, Pr_ = precs(A, nothing, u, nothing, nothing, nothing, nothing, nothing, + nothing) + else + precs, Pl_, Pr_ = nothing, nothing, nothing + end + Pl, Pr = __wrapprecs(Pl_, Pr_, weight) + + # Unalias here, we will later use these as caches + lincache = init(linprob, linsolve; alias_A = false, alias_b = false, Pl, Pr) + + return LinearSolverCache(lincache, linsolve, nothing, nothing, precs, 0, 0) +end + +# Direct Linear Solve Case without Caching +function (cache::LinearSolverCache{Nothing})(; A = nothing, b = nothing, linu = nothing, + kwargs...) + cache.nsolve += 1 + cache.nfactors += 1 + A === nothing || (cache.A = A) + b === nothing || (cache.b = b) + if A isa Diagonal + _diag = _restructure(cache.b, cache.A.diag) + @bb @. linu = cache.b / _diag + res = linu + else + res = cache.A \ cache.b + end + return res +end +# Use LinearSolve.jl +function (cache::LinearSolverCache)(; A = nothing, b = nothing, linu = nothing, + du = nothing, p = nothing, weight = nothing, cachedata = nothing, + reuse_A_if_factorization = false, kwargs...) + cache.nsolve += 1 + + __update_A!(cache, A, reuse_A_if_factorization) + b !== nothing && (cache.lincache.b = b) + linu !== nothing && (cache.lincache.u = linu) + + Plprev = cache.lincache.Pl isa ComposePreconditioner ? cache.lincache.Pl.outer : + cache.lincache.Pl + Prprev = cache.lincache.Pr isa ComposePreconditioner ? cache.lincache.Pr.outer : + cache.lincache.Pr + + if cache.precs === nothing + _Pl, _Pr = nothing, nothing + else + _Pl, _Pr = cache.precs(cache.lincache.A, du, linu, p, nothing, A !== nothing, + Plprev, Prprev, cachedata) + end + + if (_Pl !== nothing || _Pr !== nothing) + _weight = weight === nothing ? + (cache.lincache.Pr isa Diagonal ? 
cache.lincache.Pr.diag : + cache.lincache.Pr.inner.diag) : weight + Pl, Pr = __wrapprecs(_Pl, _Pr, _weight) + cache.lincache.Pl = Pl + cache.lincache.Pr = Pr + end + + linres = solve!(cache.lincache) + cache.lincache = linres.cache + + return linres.u +end + +@inline __update_A!(cache::LinearSolverCache, ::Nothing, reuse) = cache +@inline function __update_A!(cache::LinearSolverCache, A, reuse) + return __update_A!(cache, __getproperty(cache.lincache, Val(:alg)), A, reuse) +end +@inline function __update_A!(cache, alg, A, reuse) + # Not a Factorization Algorithm so don't update `nfactors` + __set_lincache_A(cache.lincache, A) + return cache +end +@inline function __update_A!(cache, ::AbstractFactorization, A, reuse) + reuse && return cache + __set_lincache_A(cache.lincache, A) + cache.nfactors += 1 + return cache +end +@inline function __update_A!(cache, alg::DefaultLinearSolver, A, reuse) + if alg == DefaultLinearSolver(DefaultAlgorithmChoice.KrylovJL_GMRES) + # Force a reset of the cache. 
This is not properly handled in LinearSolve.jl + __set_lincache_A(cache.lincache, A) + return cache + end + reuse && return cache + __set_lincache_A(cache.lincache, A) + cache.nfactors += 1 + return cache +end + +function __set_lincache_A(lincache, new_A) + if LinearSolve.default_alias_A(lincache.alg, new_A, lincache.b) + lincache.A = new_A + else + if can_setindex(lincache.A) + copyto!(lincache.A, new_A) + lincache.A = lincache.A + else + lincache.A = new_A + end + end +end + +@inline function __wrapprecs(_Pl, _Pr, weight) + if _Pl !== nothing + Pl = ComposePreconditioner(InvPreconditioner(Diagonal(_vec(weight))), _Pl) + else + Pl = InvPreconditioner(Diagonal(_vec(weight))) + end + + if _Pr !== nothing + Pr = ComposePreconditioner(Diagonal(_vec(weight)), _Pr) + else + Pr = Diagonal(_vec(weight)) + end + + return Pl, Pr +end + +@inline __needs_square_A(_, ::Number) = false +@inline __needs_square_A(::Nothing, ::Number) = false +@inline __needs_square_A(::Nothing, _) = false +@inline __needs_square_A(linsolve, _) = LinearSolve.needs_square_A(linsolve) diff --git a/src/internal/operators.jl b/src/internal/operators.jl new file mode 100644 index 000000000..6412cfa07 --- /dev/null +++ b/src/internal/operators.jl @@ -0,0 +1,278 @@ +# We want a general form of this in SciMLOperators. However, we use this extensively and we +# can have a custom implementation here till +# https://github.com/SciML/SciMLOperators.jl/issues/223 is resolved. +""" + JacobianOperator{vjp, iip, T} <: AbstractNonlinearSolveOperator{T} + +A Jacobian Operator Provides both JVP and VJP without materializing either (if possible). + +This is an internal operator, and is not guaranteed to have a stable API. It might even be +moved out of NonlinearSolve.jl in the future, without a deprecation cycle. Usage of this +outside NonlinearSolve.jl (by everyone except Avik) is strictly prohibited. + +`T` denotes if the Jacobian is transposed or not. 
`T = true` means that the Jacobian is +transposed, and `T = false` means that the Jacobian is not transposed. + +### Constructor + +```julia +JacobianOperator(prob::AbstractNonlinearProblem, fu, u; jvp_autodiff = nothing, + vjp_autodiff = nothing, skip_vjp::Val{NoVJP} = False, + skip_jvp::Val{NoJVP} = False) where {NoVJP, NoJVP} +``` + +See also [`NonlinearSolve.VecJacOperator`](@ref) and +[`NonlinearSolve.JacVecOperator`](@ref). +""" +@concrete struct JacobianOperator{vjp, iip, T} <: AbstractNonlinearSolveOperator{T} + jvp_op + vjp_op + + input_cache + output_cache +end + +Base.size(J::JacobianOperator) = prod(size(J.output_cache)), prod(size(J.input_cache)) +function Base.size(J::JacobianOperator, d::Integer) + if d == 1 + return prod(size(J.output_cache)) + elseif d == 2 + return prod(size(J.input_cache)) + else + error("Invalid dimension $d for JacobianOperator") + end +end + +for op in (:adjoint, :transpose) + @eval function Base.$(op)(operator::JacobianOperator{vjp, iip, T}) where {vjp, iip, T} + return JacobianOperator{!vjp, iip, T}(operator.jvp_op, operator.vjp_op, + operator.output_cache, operator.input_cache) + end +end + +function JacobianOperator(prob::AbstractNonlinearProblem, fu, u; jvp_autodiff = nothing, + vjp_autodiff = nothing, skip_vjp::Val{NoVJP} = False, + skip_jvp::Val{NoJVP} = False) where {NoVJP, NoJVP} + f = prob.f + iip = isinplace(prob) + uf = JacobianWrapper{iip}(f, prob.p) + + vjp_op = if NoVJP + nothing + elseif SciMLBase.has_vjp(f) + f.vjp + elseif u isa Number # Ignore vjp directives + if ForwardDiff.can_dual(typeof(u)) + @closure (v, u, p) -> last(__value_derivative(uf, u)) * v + else + @closure (v, u, p) -> FiniteDiff.finite_difference_derivative(uf, u) * v + end + else + vjp_autodiff = __get_nonsparse_ad(get_concrete_reverse_ad(vjp_autodiff, + prob, False)) + if vjp_autodiff isa AutoZygote + iip && error("`AutoZygote` cannot handle inplace problems.") + @closure (v, u, p) -> auto_vecjac(uf, u, v) + elseif vjp_autodiff isa 
AutoFiniteDiff + if iip + cache1 = similar(fu) + cache2 = similar(fu) + @closure (Jv, v, u, p) -> num_vecjac!(Jv, uf, u, v, cache1, cache2) + else + @closure (v, u, p) -> num_vecjac(uf, __mutable(u), v) + end + else + error("`vjp_autodiff` = `$(typeof(vjp_autodiff))` is not supported in \ + JacobianOperator.") + end + end + + jvp_op = if NoJVP + nothing + elseif SciMLBase.has_jvp(f) + f.jvp + elseif u isa Number # Ignore jvp directives + if ForwardDiff.can_dual(typeof(u)) + @closure (v, u, p) -> last(__scalar_jacvec(uf, u, v)) * v + else + @closure (v, u, p) -> FiniteDiff.finite_difference_derivative(uf, u) * v + end + else + jvp_autodiff = __get_nonsparse_ad(get_concrete_forward_ad(jvp_autodiff, + prob, False)) + if jvp_autodiff isa AutoForwardDiff || jvp_autodiff isa AutoPolyesterForwardDiff + if iip + # FIXME: Technically we should propagate the tag but ignoring that for now + cache1 = Dual{ + typeof(ForwardDiff.Tag(NonlinearSolveTag(), eltype(u))), eltype(u), 1, + }.(similar(u), ForwardDiff.Partials.(tuple.(u))) + cache2 = Dual{ + typeof(ForwardDiff.Tag(NonlinearSolveTag(), eltype(fu))), eltype(fu), 1, + }.(similar(fu), ForwardDiff.Partials.(tuple.(fu))) + @closure (Jv, v, u, p) -> auto_jacvec!(Jv, uf, u, v, cache1, cache2) + else + @closure (v, u, p) -> auto_jacvec(uf, u, v) + end + elseif jvp_autodiff isa AutoFiniteDiff + if iip + cache1 = similar(fu) + cache2 = similar(u) + @closure (Jv, v, u, p) -> num_jacvec!(Jv, uf, u, v, cache1, cache2) + else + @closure (v, u, p) -> num_jacvec(uf, u, v) + end + else + error("`jvp_autodiff` = `$(typeof(jvp_autodiff))` is not supported in \ + JacobianOperator.") + end + end + + return JacobianOperator{false, iip, promote_type(eltype(fu), eltype(u))}(jvp_op, vjp_op, + u, fu) +end + +""" + VecJacOperator(args...; autodiff = nothing, kwargs...) + +Constructs a [`JacobianOperator`](@ref) which only provides the VJP using the +`vjp_autodiff = autodiff`. 
+ +This is very similar to `SparseDiffTools.VecJac` but is geared towards +[`NonlinearProblem`](@ref)s. For arguments and keyword arguments see +[`JacobianOperator`](@ref). +""" +function VecJacOperator(args...; autodiff = nothing, kwargs...) + return JacobianOperator(args...; kwargs..., skip_jvp = True, vjp_autodiff = autodiff)' +end + +""" + JacVecOperator(args...; autodiff = nothing, kwargs...) + +Constructs a [`JacobianOperator`](@ref) which only provides the JVP using the +`jvp_autodiff = autodiff`. + +This is very similar to `SparseDiffTools.JacVec` but is geared towards +[`NonlinearProblem`](@ref)s. For arguments and keyword arguments see +[`JacobianOperator`](@ref). +""" +function JacVecOperator(args...; autodiff = nothing, kwargs...) + return JacobianOperator(args...; kwargs..., skip_vjp = True, jvp_autodiff = autodiff) +end + +function (op::JacobianOperator{vjp, iip})(v, u, p) where {vjp, iip} + if vjp + if iip + res = similar(op.output_cache) + op.vjp_op(res, v, u, p) + return res + else + return op.vjp_op(v, u, p) + end + else + if iip + res = similar(op.output_cache) + op.jvp_op(res, v, u, p) + return res + else + return op.jvp_op(v, u, p) + end + end +end + +# Prevent Ambiguity +function (op::JacobianOperator{vjp, iip})(Jv::Number, v::Number, u, p) where {vjp, iip} + error("Inplace Jacobian Operator not possible for scalars.") +end + +function (op::JacobianOperator{vjp, iip})(Jv, v, u, p) where {vjp, iip} + if vjp + if iip + op.vjp_op(Jv, v, u, p) + else + copyto!(Jv, op.vjp_op(v, u, p)) + end + else + if iip + op.jvp_op(Jv, v, u, p) + else + copyto!(Jv, op.jvp_op(v, u, p)) + end + end + return Jv +end + +""" + StatefulJacobianOperator(jac_op::JacobianOperator, u, p) + +Wrapper over a [`JacobianOperator`](@ref) which stores the input `u` and `p` and defines +`mul!` and `*` for computing VJPs and JVPs. 
+""" +@concrete struct StatefulJacobianOperator{vjp, iip, T, + J <: JacobianOperator{vjp, iip, T}} <: AbstractNonlinearSolveOperator{T} + jac_op::J + u + p +end + +Base.size(J::StatefulJacobianOperator) = size(J.jac_op) +Base.size(J::StatefulJacobianOperator, d::Integer) = size(J.jac_op, d) + +for op in (:adjoint, :transpose) + @eval function Base.$op(operator::StatefulJacobianOperator) + return StatefulJacobianOperator($(op)(operator.jac_op), operator.u, operator.p) + end +end + +Base.:*(J::StatefulJacobianOperator, v::AbstractArray) = J.jac_op(v, J.u, J.p) +function Base.:*(J_op::StatefulJacobianOperator{vjp, iip, T, J, <:Number}, + v::Number) where {vjp, iip, T, J} + return J_op.jac_op(v, J_op.u, J_op.p) +end + +function LinearAlgebra.mul!(Jv::AbstractArray, J::StatefulJacobianOperator, + v::AbstractArray) + J.jac_op(Jv, v, J.u, J.p) + return Jv +end + +""" + StatefulJacobianNormalFormOperator(vjp_operator, jvp_operator, cache) + +This constructs a Normal Form Jacobian Operator, i.e. it constructs the operator +corresponding to `JᵀJ` where `J` is the Jacobian Operator. This is not meant to be directly +constructed, rather it is constructed with `*` on two [`StatefulJacobianOperator`](@ref)s. 
+""" +@concrete mutable struct StatefulJacobianNormalFormOperator{T} <: + AbstractNonlinearSolveOperator{T} + vjp_operator + jvp_operator + cache +end + +function Base.size(J::StatefulJacobianNormalFormOperator) + return size(J.vjp_operator, 1), size(J.jvp_operator, 2) +end + +function Base.:*(J1::StatefulJacobianOperator{true}, J2::StatefulJacobianOperator{false}) + cache = J2 * J2.jac_op.input_cache + T = promote_type(eltype(J1), eltype(J2)) + return StatefulJacobianNormalFormOperator{T}(J1, J2, cache) +end + +function LinearAlgebra.mul!(C::StatefulJacobianNormalFormOperator, + A::StatefulJacobianOperator{true}, B::StatefulJacobianOperator{false}) + C.vjp_operator = A + C.jvp_operator = B + return C +end + +function Base.:*(JᵀJ::StatefulJacobianNormalFormOperator, x::AbstractArray) + return JᵀJ.vjp_operator * (JᵀJ.jvp_operator * x) +end + +function LinearAlgebra.mul!(JᵀJx::AbstractArray, JᵀJ::StatefulJacobianNormalFormOperator, + x::AbstractArray) + mul!(JᵀJ.cache, JᵀJ.jvp_operator, x) + mul!(JᵀJx, JᵀJ.vjp_operator, JᵀJ.cache) + return JᵀJx +end diff --git a/src/internal/termination.jl b/src/internal/termination.jl new file mode 100644 index 000000000..59d8905f5 --- /dev/null +++ b/src/internal/termination.jl @@ -0,0 +1,45 @@ +function init_termination_cache(abstol, reltol, du, u, ::Nothing) + return init_termination_cache(abstol, reltol, du, u, + AbsSafeBestTerminationMode(; max_stalled_steps = 32)) +end +function init_termination_cache(abstol, reltol, du, u, tc::AbstractNonlinearTerminationMode) + tc_cache = init(du, u, tc; abstol, reltol, use_deprecated_retcodes = Val(false)) + return DiffEqBase.get_abstol(tc_cache), DiffEqBase.get_reltol(tc_cache), tc_cache +end + +function check_and_update!(cache, fu, u, uprev) + return check_and_update!(cache.termination_cache, cache, fu, u, uprev) +end + +function check_and_update!(tc_cache, cache, fu, u, uprev) + return check_and_update!(tc_cache, cache, fu, u, uprev, + DiffEqBase.get_termination_mode(tc_cache)) +end + 
+function check_and_update!(tc_cache, cache, fu, u, uprev, mode) + if tc_cache(fu, u, uprev) + cache.retcode = tc_cache.retcode + update_from_termination_cache!(tc_cache, cache, mode, u) + cache.force_stop = true + end +end + +function update_from_termination_cache!(tc_cache, cache, u = get_u(cache)) + return update_from_termination_cache!(tc_cache, cache, + DiffEqBase.get_termination_mode(tc_cache), u) +end + +function update_from_termination_cache!(tc_cache, cache, + mode::AbstractNonlinearTerminationMode, u = get_u(cache)) + evaluate_f!(cache, u, cache.p) +end + +function update_from_termination_cache!(tc_cache, cache, + mode::AbstractSafeBestNonlinearTerminationMode, u = get_u(cache)) + if isinplace(cache) + copyto!(get_u(cache), tc_cache.u) + else + set_u!(cache, tc_cache.u) + end + evaluate_f!(cache, get_u(cache), cache.p) +end diff --git a/src/trace.jl b/src/internal/tracing.jl similarity index 76% rename from src/trace.jl rename to src/internal/tracing.jl index 5a7c88342..667c6ce07 100644 --- a/src/trace.jl +++ b/src/internal/tracing.jl @@ -1,5 +1,3 @@ -abstract type AbstractNonlinearSolveTraceLevel end - """ TraceMinimal(freq) TraceMinimal(; print_frequency = 1, store_frequency::Int = 1) @@ -10,16 +8,7 @@ Trace Minimal Information 2. f(u) inf-norm 3. Step 2-norm -## Arguments - - - `freq`: Sets both `print_frequency` and `store_frequency` to `freq`. - -## Keyword Arguments - - - `print_frequency`: Print the trace every `print_frequency` iterations if - `show_trace == Val(true)`. - - `store_frequency`: Store the trace every `store_frequency` iterations if - `store_trace == Val(true)`. +See also [`TraceWithJacobianConditionNumber`](@ref) and [`TraceAll`](@ref). """ @kwdef struct TraceMinimal <: AbstractNonlinearSolveTraceLevel print_frequency::Int = 1 @@ -30,18 +19,9 @@ end TraceWithJacobianConditionNumber(freq) TraceWithJacobianConditionNumber(; print_frequency = 1, store_frequency::Int = 1) -`TraceMinimal` + Print the Condition Number of the Jacobian. 
- -## Arguments - - - `freq`: Sets both `print_frequency` and `store_frequency` to `freq`. +[`TraceMinimal`](@ref) + Print the Condition Number of the Jacobian. -## Keyword Arguments - - - `print_frequency`: Print the trace every `print_frequency` iterations if - `show_trace == Val(true)`. - - `store_frequency`: Store the trace every `store_frequency` iterations if - `store_trace == Val(true)`. +See also [`TraceMinimal`](@ref) and [`TraceAll`](@ref). """ @kwdef struct TraceWithJacobianConditionNumber <: AbstractNonlinearSolveTraceLevel print_frequency::Int = 1 @@ -52,22 +32,13 @@ end TraceAll(freq) TraceAll(; print_frequency = 1, store_frequency::Int = 1) -`TraceWithJacobianConditionNumber` + Store the Jacobian, u, f(u), and δu. +[`TraceWithJacobianConditionNumber`](@ref) + Store the Jacobian, u, f(u), and δu. !!! warning This is very expensive and makes copyies of the Jacobian, u, f(u), and δu. -## Arguments - - - `freq`: Sets both `print_frequency` and `store_frequency` to `freq`. - -## Keyword Arguments - - - `print_frequency`: Print the trace every `print_frequency` iterations if - `show_trace == Val(true)`. - - `store_frequency`: Store the trace every `store_frequency` iterations if - `store_trace == Val(true)`. +See also [`TraceMinimal`](@ref) and [`TraceWithJacobianConditionNumber`](@ref). """ @kwdef struct TraceAll <: AbstractNonlinearSolveTraceLevel print_frequency::Int = 1 @@ -133,16 +104,6 @@ function NonlinearSolveTraceEntry(iteration, fu, δu, J, u) __copy(J), __copy(u), __copy(fu), __copy(δu)) end -__cond(J::AbstractMatrix) = cond(J) -__cond(J::SVector) = __cond(Diagonal(MVector(J))) -__cond(J::AbstractVector) = __cond(Diagonal(J)) -__cond(J::ApplyArray) = __cond(J.f(J.args...)) -__cond(J) = -1 # Covers cases where `J` is a Operator, nothing, etc. 
- -__copy(x::AbstractArray) = copy(x) -__copy(x::Number) = x -__copy(x) = x - @concrete struct NonlinearSolveTrace{show_trace, store_trace, Tr <: AbstractNonlinearSolveTraceLevel} history @@ -227,16 +188,13 @@ function update_trace!(cache::AbstractNonlinearSolveCache, α = true) J = __getproperty(cache, Val(:J)) if J === nothing - J_inv = __getproperty(cache, Val(:J⁻¹)) - if J_inv === nothing - update_trace!(trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), - nothing, cache.du, α) - else - update_trace!(trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), - ApplyArray(__safe_inv, J_inv), cache.du, α) - end + update_trace!(trace, get_nsteps(cache) + 1, get_u(cache), get_fu(cache), + nothing, cache.du, α) + elseif cache isa ApproximateJacobianSolveCache && store_inverse_jacobian(cache) + update_trace!(trace, get_nsteps(cache) + 1, get_u(cache), get_fu(cache), + ApplyArray(__safe_inv, J), cache.du, α) else - update_trace!(trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), J, + update_trace!(trace, get_nsteps(cache) + 1, get_u(cache), get_fu(cache), J, cache.du, α) end end diff --git a/src/jacobian.jl b/src/jacobian.jl deleted file mode 100644 index 20825ebda..000000000 --- a/src/jacobian.jl +++ /dev/null @@ -1,303 +0,0 @@ -@concrete struct KrylovJᵀJ - JᵀJ - Jᵀ -end - -__maybe_symmetric(x::KrylovJᵀJ) = x.JᵀJ - -isinplace(JᵀJ::KrylovJᵀJ) = isinplace(JᵀJ.Jᵀ) - -# Select if we are going to use sparse differentiation or not -sparsity_detection_alg(_, _) = NoSparsityDetection() -function sparsity_detection_alg(f::NonlinearFunction, ad::AbstractSparseADType) - if f.sparsity === nothing - if f.jac_prototype === nothing - if is_extension_loaded(Val(:Symbolics)) - return SymbolicsSparsityDetection() - else - return ApproximateJacobianSparsity() - end - else - jac_prototype = f.jac_prototype - end - elseif f.sparsity isa SparseDiffTools.AbstractSparsityDetection - if f.jac_prototype === nothing - return f.sparsity - else - jac_prototype = f.jac_prototype 
- end - elseif f.sparsity isa AbstractMatrix - jac_prototype = f.sparsity - elseif f.jac_prototype isa AbstractMatrix - jac_prototype = f.jac_prototype - else - error("`sparsity::typeof($(typeof(f.sparsity)))` & \ - `jac_prototype::typeof($(typeof(f.jac_prototype)))` is not supported. \ - Use `sparsity::AbstractMatrix` or `sparsity::AbstractSparsityDetection` or \ - set to `nothing`. `jac_prototype` can be set to `nothing` or an \ - `AbstractMatrix`.") - end - - if SciMLBase.has_colorvec(f) - return PrecomputedJacobianColorvec(; jac_prototype, f.colorvec, - partition_by_rows = ad isa ADTypes.AbstractSparseReverseMode) - else - return JacPrototypeSparsityDetection(; jac_prototype) - end -end - -# NoOp for Jacobian if it is not a Abstract Array -- For eg, JacVec Operator -jacobian!!(J, cache; u = nothing, p = nothing) = J -# `!!` notation is from BangBang.jl since J might be jacobian in case of oop `f.jac` -# and we don't want wasteful `copyto!` -function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache; u = cache.u, - p = cache.p) - @unpack f, uf, jac_cache, alg, fu_cache = cache - cache.stats.njacs += 1 - iip = isinplace(cache) - if iip - if has_jac(f) - f.jac(J, u, p) - else - sparse_jacobian!(J, alg.ad, jac_cache, uf, fu_cache, u) - end - return J - else - if has_jac(f) - return f.jac(u, p) - elseif can_setindex(typeof(J)) - return sparse_jacobian!(J, alg.ad, jac_cache, uf, u) - else - return sparse_jacobian(alg.ad, jac_cache, uf, u) - end - end -end -# Scalar case -function jacobian!!(::Number, cache; u = cache.u, p = cache.p) - cache.stats.njacs += 1 - return last(value_derivative(cache.uf, u)) -end - -# Build Jacobian Caches -function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u, p, ::Val{iip}; - linsolve_kwargs = (;), lininit::Val{linsolve_init} = Val(true), - linsolve_with_JᵀJ::Val{needsJᵀJ} = Val(false)) where {iip, needsJᵀJ, linsolve_init, F} - uf = SciMLBase.JacobianWrapper{iip}(f, p) - - haslinsolve = hasfield(typeof(alg), 
:linsolve) - - has_analytic_jac = has_jac(f) - linsolve_needs_jac = (concrete_jac(alg) === nothing && - (!haslinsolve || (haslinsolve && (alg.linsolve === nothing || - needs_concrete_A(alg.linsolve))))) - alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg)) - - # NOTE: The deepcopy is needed here since we are using the resid_prototype elsewhere - fu = f.resid_prototype === nothing ? (iip ? zero(u) : f(u, p)) : - (iip ? deepcopy(f.resid_prototype) : f.resid_prototype) - if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac) - sd = sparsity_detection_alg(f, alg.ad) - ad = alg.ad - jac_cache = iip ? sparse_jacobian_cache(ad, sd, uf, fu, u) : - sparse_jacobian_cache(ad, sd, uf, __maybe_mutable(u, ad); fx = fu) - else - jac_cache = nothing - end - - J = if !(linsolve_needs_jac || alg_wants_jac) - if f.jvp === nothing - # We don't need to construct the Jacobian - JacVec(uf, u; fu, autodiff = __get_nonsparse_ad(alg.ad)) - else - if iip - jvp = (_, u, v) -> (du_ = similar(fu); f.jvp(du_, v, u, p); du_) - jvp! = (du_, _, u, v) -> f.jvp(du_, v, u, p) - else - jvp = (_, u, v) -> f.jvp(v, u, p) - jvp! = (du_, _, u, v) -> (du_ .= f.jvp(v, u, p)) - end - op = SparseDiffTools.FwdModeAutoDiffVecProd(f, u, (), jvp, jvp!) - FunctionOperator(op, u, fu; isinplace = Val(true), outofplace = Val(false), - p, islinear = true) - end - else - if has_analytic_jac - f.jac_prototype === nothing ? 
undefmatrix(u) : f.jac_prototype - elseif f.jac_prototype === nothing - init_jacobian(jac_cache; preserve_immutable = Val(true)) - else - f.jac_prototype - end - end - - du = copy(u) - - if needsJᵀJ - JᵀJ, Jᵀfu = __init_JᵀJ(J, _vec(fu), uf, u; f, - vjp_autodiff = __get_nonsparse_ad(__getproperty(alg, Val(:vjp_autodiff))), - jvp_autodiff = __get_nonsparse_ad(alg.ad)) - else - JᵀJ, Jᵀfu = nothing, nothing - end - - if linsolve_init - if alg isa PseudoTransient && J isa SciMLOperators.AbstractSciMLOperator - linprob_A = J - inv(convert(eltype(u), alg.alpha_initial)) * I - else - linprob_A = needsJᵀJ ? __maybe_symmetric(JᵀJ) : J - end - linsolve = linsolve_caches(linprob_A, needsJᵀJ ? Jᵀfu : fu, du, p, alg; - linsolve_kwargs) - else - linsolve = nothing - end - - return uf, linsolve, J, fu, jac_cache, du, JᵀJ, Jᵀfu -end - -## Special Handling for Scalars -function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f::F, u::Number, p, - ::Val{false}; linsolve_with_JᵀJ::Val{needsJᵀJ} = Val(false), - kwargs...) where {needsJᵀJ, F} - # NOTE: Scalar `u` assumes scalar output from `f` - uf = SciMLBase.JacobianWrapper{false}(f, p) - return uf, FakeLinearSolveJLCache(u, u), u, zero(u), nothing, u, u, u -end - -# Linear Solve Cache -function linsolve_caches(A, b, u, p, alg; linsolve_kwargs = (;)) - if A isa Number || - (alg.linsolve === nothing && A isa SMatrix && linsolve_kwargs === (;)) - # Default handling for SArrays in LinearSolve is not great. Some parts are patched - # but there are quite a few unnecessary allocations - return FakeLinearSolveJLCache(A, _vec(b)) - end - - linprob = LinearProblem(A, _vec(b); u0 = _vec(u), linsolve_kwargs...) 
- - weight = __init_ones(u) - - Pl, Pr = wrapprecs(alg.precs(A, nothing, u, p, nothing, nothing, nothing, nothing, - nothing)..., weight) - return init(linprob, alg.linsolve; alias_A = true, alias_b = true, Pl, Pr) -end -linsolve_caches(A::KrylovJᵀJ, b, u, p, alg) = linsolve_caches(A.JᵀJ, b, u, p, alg) - -__init_JᵀJ(J::Number, args...; kwargs...) = zero(J), zero(J) -function __init_JᵀJ(J::AbstractArray, fu, args...; kwargs...) - JᵀJ = J' * J - Jᵀfu = J' * fu - return JᵀJ, Jᵀfu -end -function __init_JᵀJ(J::StaticArray, fu, args...; kwargs...) - JᵀJ = MArray{Tuple{size(J, 2), size(J, 2)}, eltype(J)}(undef) - return JᵀJ, J' * fu -end -function __init_JᵀJ(J::FunctionOperator, fu, uf, u, args...; f = nothing, - vjp_autodiff = nothing, jvp_autodiff = nothing, kwargs...) - # FIXME: Proper fix to this requires the FunctionOperator patch - if f !== nothing && f.vjp !== nothing - @warn "Currently we don't make use of user provided `jvp`. This is planned to be \ - fixed in the near future." - end - autodiff = __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) - Jᵀ = VecJac(uf, u; fu, autodiff) - JᵀJ_op = SciMLOperators.cache_operator(Jᵀ * J, u) - JᵀJ = KrylovJᵀJ(JᵀJ_op, Jᵀ) - Jᵀfu = Jᵀ * fu - return JᵀJ, Jᵀfu -end - -function __concrete_vjp_autodiff(vjp_autodiff, jvp_autodiff, uf) - if vjp_autodiff === nothing - if isinplace(uf) - # VecJac can be only FiniteDiff - return AutoFiniteDiff() - else - # Short circuit if we see that FiniteDiff was used for J computation - jvp_autodiff isa AutoFiniteDiff && return jvp_autodiff - # Check if Zygote is loaded then use Zygote else use FiniteDiff - is_extension_loaded(Val{:Zygote}()) && return AutoZygote() - return AutoFiniteDiff() - end - else - ad = __get_nonsparse_ad(vjp_autodiff) - if isinplace(uf) && ad isa AutoZygote - @warn "Attempting to use Zygote.jl for linesearch on an in-place problem. \ - Falling back to finite differencing." 
- return AutoFiniteDiff() - end - return ad - end -end - -# jvp fallback scalar -function __gradient_operator(uf, u; autodiff, kwargs...) - if !(autodiff isa AutoFiniteDiff || autodiff isa AutoZygote) - _ad = autodiff - number_ad = ifelse(ForwardDiff.can_dual(eltype(u)), AutoForwardDiff(), - AutoFiniteDiff()) - if u isa Number - autodiff = number_ad - else - if isinplace(uf) - autodiff = AutoFiniteDiff() - else - autodiff = ifelse(is_extension_loaded(Val{:Zygote}()), AutoZygote(), - AutoFiniteDiff()) - end - end - if _ad !== nothing && _ad !== autodiff - @warn "$(_ad) not supported for VecJac. Using $(autodiff) instead." - end - end - return u isa Number ? GradientScalar(uf, u, autodiff) : - VecJac(uf, u; autodiff, kwargs...) -end - -@concrete mutable struct GradientScalar - uf - u - autodiff -end - -function Base.:*(jvp::GradientScalar, v::Number) - if jvp.autodiff isa AutoForwardDiff - T = typeof(ForwardDiff.Tag(typeof(jvp.uf), typeof(jvp.u))) - out = jvp.uf(ForwardDiff.Dual{T}(jvp.u, one(v))) - return ForwardDiff.extract_derivative(T, out) - elseif jvp.autodiff isa AutoFiniteDiff - J = FiniteDiff.finite_difference_derivative(jvp.uf, jvp.u, jvp.autodiff.fdtype) - return J - else - error("Only ForwardDiff & FiniteDiff is currently supported.") - end -end - -# Generic Handling of Krylov Methods for Normal Form Linear Solves -function __update_JᵀJ!(cache::AbstractNonlinearSolveCache, J = nothing) - if !(cache.JᵀJ isa KrylovJᵀJ) - J_ = ifelse(J === nothing, cache.J, J) - @bb cache.JᵀJ = transpose(J_) × J_ - end -end - -function __update_Jᵀf!(cache::AbstractNonlinearSolveCache, J = nothing) - if cache.JᵀJ isa KrylovJᵀJ - @bb cache.Jᵀf = cache.JᵀJ.Jᵀ × cache.fu - else - J_ = ifelse(J === nothing, cache.J, J) - @bb cache.Jᵀf = transpose(J_) × vec(cache.fu) - end -end - -# Left-Right Multiplication -__lr_mul(cache::AbstractNonlinearSolveCache) = __lr_mul(cache, cache.JᵀJ, cache.Jᵀf) -function __lr_mul(cache::AbstractNonlinearSolveCache, JᵀJ::KrylovJᵀJ, Jᵀf) - @bb 
cache.lr_mul_cache = JᵀJ.JᵀJ × vec(Jᵀf) - return dot(_vec(Jᵀf), _vec(cache.lr_mul_cache)) -end -function __lr_mul(cache::AbstractNonlinearSolveCache, JᵀJ, Jᵀf) - @bb cache.lr_mul_cache = JᵀJ × vec(Jᵀf) - return dot(_vec(Jᵀf), _vec(cache.lr_mul_cache)) -end diff --git a/src/klement.jl b/src/klement.jl deleted file mode 100644 index a49a3eda9..000000000 --- a/src/klement.jl +++ /dev/null @@ -1,259 +0,0 @@ -""" - Klement(; max_resets = 100, linsolve = nothing, linesearch = nothing, - precs = DEFAULT_PRECS, alpha = true, init_jacobian::Val = Val(:identity), - autodiff = nothing) - -An implementation of `Klement` with line search, preconditioning and customizable linear -solves. It is recommended to use `Broyden` for most problems over this. - -## Keyword Arguments - - - `max_resets`: the maximum number of resets to perform. Defaults to `100`. - - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref), - which means that no line search is performed. Algorithms from `LineSearches.jl` can be - used here directly, and they will be converted to the correct `LineSearch`. - - `alpha`: If `init_jacobian` is set to `Val(:identity)`, then the initial Jacobian - inverse is set to be `αI`. Defaults to `1`. Can be set to `nothing` which implies - `α = max(norm(u), 1) / (2 * norm(fu))`. 
- - `init_jacobian`: the method to use for initializing the jacobian. Defaults to - `Val(:identity)`. Choices include: - - + `Val(:identity)`: Identity Matrix. - + `Val(:true_jacobian)`: True Jacobian. Our tests suggest that this is not very - stable. Instead using `Broyden` with `Val(:true_jacobian)` gives faster and more - reliable convergence. - + `Val(:true_jacobian_diagonal)`: Diagonal of True Jacobian. This is a good choice for - differentiable problems. - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing` which means that a default is selected according to the problem specification! - Valid choices are types from ADTypes.jl. (Used if `init_jacobian = Val(:true_jacobian)`) -""" -@concrete struct Klement{IJ, CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - max_resets::Int - linsolve - precs - linesearch - alpha -end - -function __alg_print_modifiers(alg::Klement{IJ}) where {IJ} - modifiers = String[] - IJ !== :identity && push!(modifiers, "init_jacobian = Val(:$(IJ))") - alg.alpha !== nothing && push!(modifiers, "alpha = $(alg.alpha)") - return modifiers -end - -function set_ad(alg::Klement{IJ, CJ}, ad) where {IJ, CJ} - return Klement{IJ, CJ}(ad, alg.max_resets, alg.linsolve, alg.precs, - alg.linesearch, alg.alpha) -end - -function Klement(; max_resets::Int = 100, linsolve = nothing, alpha = true, - linesearch = nothing, precs = DEFAULT_PRECS, init_jacobian::Val = Val(:identity), - autodiff = nothing) - IJ = _unwrap_val(init_jacobian) - @assert IJ ∈ (:identity, :true_jacobian, :true_jacobian_diagonal) - linesearch = linesearch isa LineSearch ? 
linesearch : LineSearch(; method = linesearch) - CJ = IJ !== :identity - return Klement{IJ, CJ}(autodiff, max_resets, linsolve, precs, linesearch, - alpha) -end - -@concrete mutable struct KlementCache{iip, IJ} <: AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - fu - fu_cache - fu_cache_2 - du - p - uf - linsolve - J - J_cache - J_cache_2 - Jdu - Jdu_cache - alpha - alpha_initial - resets - force_stop - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - prob - jac_cache - stats::NLStats - ls_cache - tc_cache - trace -end - -function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::Klement{IJ}, - args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm::F = DEFAULT_NORM, - linsolve_kwargs = (;), kwargs...) where {uType, iip, F, IJ} - @unpack f, u0, p = prob - u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - - alpha = __initial_alpha(alg_.alpha, u, fu, internalnorm) - - if IJ === :true_jacobian - alg = get_concrete_algorithm(alg_, prob) - uf, _, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); - lininit = Val(false)) - elseif IJ === :true_jacobian_diagonal - alg = get_concrete_algorithm(alg_, prob) - uf, _, J_cache, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); - lininit = Val(false)) - J = __diag(J_cache) - elseif IJ === :identity - alg = alg_ - @bb du = similar(u) - uf, fu_cache, jac_cache = nothing, nothing, nothing - J = one.(u) # Identity Init Jacobian for Klement maintains a Diagonal Structure - @bb J .*= alpha - else - error("Invalid `init_jacobian` value") - end - - if IJ === :true_jacobian - linsolve = linsolve_caches(J, _vec(fu), _vec(du), p, alg_; linsolve_kwargs) - else - linsolve = nothing - end - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu, J, du; kwargs...) 
- - @bb u_cache = copy(u) - @bb fu_cache_2 = copy(fu) - @bb Jdu = similar(fu) - if IJ === :true_jacobian - @bb J_cache = similar(J) - @bb J_cache_2 = similar(J) - @bb Jdu_cache = similar(fu) - else - IJ === :identity && (J_cache = nothing) - J_cache_2, Jdu_cache = nothing, nothing - end - - return KlementCache{iip, IJ}(f, alg, u, u_cache, fu, fu_cache, fu_cache_2, du, p, - uf, linsolve, J, J_cache, J_cache_2, Jdu, Jdu_cache, alpha, alg.alpha, 0, false, - maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, jac_cache, - NLStats(1, 0, 0, 0, 0), - init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) -end - -function perform_step!(cache::KlementCache{iip, IJ}) where {iip, IJ} - @unpack linsolve, alg = cache - T = eltype(cache.J) - - if IJ === :true_jacobian - cache.stats.nsteps == 0 && (cache.J = jacobian!!(cache.J, cache)) - ill_conditioned = __is_ill_conditioned(cache.J) - elseif IJ === :true_jacobian_diagonal - if cache.stats.nsteps == 0 - cache.J_cache = jacobian!!(cache.J_cache, cache) - cache.J = __get_diagonal!!(cache.J, cache.J_cache) - end - ill_conditioned = __is_ill_conditioned(_vec(cache.J)) - elseif IJ === :identity - ill_conditioned = __is_ill_conditioned(_vec(cache.J)) - end - - if ill_conditioned - if cache.resets == alg.max_resets - cache.force_stop = true - cache.retcode = ReturnCode.ConvergenceFailure - return nothing - end - if IJ === :true_jacobian && cache.stats.nsteps != 0 - cache.J = jacobian!!(cache.J, cache) - elseif IJ === :true_jacobian_diagonal && cache.stats.nsteps != 0 - cache.J_cache = jacobian!!(cache.J_cache, cache) - cache.J = __get_diagonal!!(cache.J, cache.J_cache) - elseif IJ === :identity - cache.alpha = __initial_alpha(cache.alpha, cache.alpha_initial, cache.u, - cache.fu, cache.internalnorm) - cache.J = __reinit_identity_jacobian!!(cache.J, cache.alpha) - end - cache.resets += 1 - end - - if IJ === :true_jacobian_diagonal || IJ === :identity - @bb @. 
cache.du = cache.fu / cache.J - else - # u = u - J \ fu - linres = dolinsolve(cache, alg.precs, cache.linsolve; A = cache.J, - b = _vec(cache.fu), linu = _vec(cache.du), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - cache.du = _restructure(cache.du, linres.u) - end - - # Line Search - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - @bb axpy!(-α, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) - - update_trace!(cache, α) - check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - - @bb copyto!(cache.u_cache, cache.u) - - cache.force_stop && return nothing - - # Update the Jacobian - @bb cache.du .*= -1 - if IJ === :true_jacobian_diagonal || IJ === :identity - @bb @. cache.Jdu = (cache.J^2) * (cache.du^2) - @bb @. cache.J += ((cache.fu - cache.fu_cache_2 - cache.J * cache.du) / - ifelse(iszero(cache.Jdu), T(1e-5), cache.Jdu)) * cache.du * - (cache.J^2) - elseif IJ === :true_jacobian - # Klement Updates to the Full Jacobian don't work for most problems, we should - # probably be using the Broyden Update Rule here - @bb @. cache.J_cache = cache.J'^2 - @bb @. cache.Jdu = cache.du^2 - @bb cache.Jdu_cache = cache.J_cache × vec(cache.Jdu) - @bb cache.Jdu = cache.J × vec(cache.du) - @bb @. cache.fu_cache_2 = (cache.fu - cache.fu_cache_2 - cache.Jdu) / - ifelse(iszero(cache.Jdu_cache), T(1e-5), cache.Jdu_cache) - @bb cache.J_cache = vec(cache.fu_cache_2) × transpose(_vec(cache.du)) - @bb @. cache.J_cache *= cache.J - @bb cache.J_cache_2 = cache.J_cache × cache.J - @bb cache.J .+= cache.J_cache_2 - else - error("Invalid `init_jacobian` value") - end - - @bb copyto!(cache.fu_cache_2, cache.fu) - - return nothing -end - -function __reinit_internal!(cache::KlementCache; kwargs...) 
- cache.alpha = __initial_alpha(cache.alpha, cache.alpha_initial, cache.u, cache.fu, - cache.internalnorm) - cache.J = __reinit_identity_jacobian!!(cache.J, cache.alpha) - cache.resets = 0 - return nothing -end diff --git a/src/lbroyden.jl b/src/lbroyden.jl deleted file mode 100644 index 811e3400d..000000000 --- a/src/lbroyden.jl +++ /dev/null @@ -1,215 +0,0 @@ -""" - LimitedMemoryBroyden(; max_resets::Int = 3, linesearch = nothing, - threshold::Int = 10, reset_tolerance = nothing) - -An implementation of `LimitedMemoryBroyden` with resetting and line search. - -## Arguments - - - `max_resets`: the maximum number of resets to perform. Defaults to `3`. - - `reset_tolerance`: the tolerance for the reset check. Defaults to - `sqrt(eps(real(eltype(u))))`. - - `threshold`: the number of vectors to store in the low rank approximation. Defaults - to `10`. - - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref), - which means that no line search is performed. Algorithms from `LineSearches.jl` can be - used here directly, and they will be converted to the correct `LineSearch`. It is - recommended to use [`LiFukushimaLineSearch`](@ref) -- a derivative free linesearch - specifically designed for Broyden's method. -""" -@concrete struct LimitedMemoryBroyden{threshold} <: AbstractNewtonAlgorithm{false, Nothing} - max_resets::Int - linesearch - reset_tolerance -end - -function LimitedMemoryBroyden(; max_resets::Int = 3, linesearch = nothing, - threshold::Union{Val, Int} = Val(27), reset_tolerance = nothing) - linesearch = linesearch isa LineSearch ? 
linesearch : LineSearch(; method = linesearch) - return LimitedMemoryBroyden{SciMLBase._unwrap_val(threshold)}(max_resets, linesearch, - reset_tolerance) -end - -__get_threshold(::LimitedMemoryBroyden{threshold}) where {threshold} = Val(threshold) -__get_unwrapped_threshold(::LimitedMemoryBroyden{threshold}) where {threshold} = threshold - -@concrete mutable struct LimitedMemoryBroydenCache{iip} <: AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - du - fu - fu_cache - dfu - p - U - Vᵀ - threshold_cache - mat_cache - vᵀ_cache - force_stop::Bool - resets::Int - iterations_since_reset::Int - max_resets::Int - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - reset_tolerance - reset_check - prob - stats::NLStats - ls_cache - tc_cache - trace -end - -function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LimitedMemoryBroyden, - args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm::F = DEFAULT_NORM, - kwargs...) where {uType, iip, F} - @unpack f, u0, p = prob - threshold = __get_threshold(alg) - η = min(__get_unwrapped_threshold(alg), maxiters) - if u0 isa Number || length(u0) ≤ η - # If u is a number or very small problem then we simply use Broyden - return SciMLBase.__init(prob, - Broyden(; alg.max_resets, alg.reset_tolerance, alg.linesearch), args...; - alias_u0, maxiters, abstol, internalnorm, kwargs...) - end - u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - U, Vᵀ = __init_low_rank_jacobian(u, fu, threshold) - - @bb du = copy(fu) - @bb u_cache = copy(u) - @bb fu_cache = copy(fu) - @bb dfu = similar(fu) - @bb vᵀ_cache = similar(u) - @bb mat_cache = similar(u) - - reset_tolerance = alg.reset_tolerance === nothing ? 
sqrt(eps(real(eltype(u)))) : - alg.reset_tolerance - reset_check = x -> abs(x) ≤ reset_tolerance - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - - U_part = selectdim(U, 1, 1:0) - Vᵀ_part = selectdim(Vᵀ, 2, 1:0) - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(*, Vᵀ_part, U_part), du; - kwargs...) - - threshold_cache = __lbroyden_threshold_cache(u, threshold) - - return LimitedMemoryBroydenCache{iip}(f, alg, u, u_cache, du, fu, fu_cache, dfu, p, - U, Vᵀ, threshold_cache, mat_cache, vᵀ_cache, false, 0, 0, alg.max_resets, maxiters, - internalnorm, ReturnCode.Default, abstol, reltol, reset_tolerance, reset_check, - prob, NLStats(1, 0, 0, 0, 0), - init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)), tc_cache, trace) -end - -function perform_step!(cache::LimitedMemoryBroydenCache{iip}) where {iip} - T = eltype(cache.u) - - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - @bb axpy!(-α, cache.du, cache.u) - evaluate_f(cache, cache.u, cache.p) - - idx = min(cache.iterations_since_reset, size(cache.U, 2)) - U_part = selectdim(cache.U, 2, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 1, 1:idx) - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), cache.fu, - ApplyArray(*, Vᵀ_part, U_part), cache.du, α) - - check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - - cache.force_stop && return nothing - - # Update the Inverse Jacobian Approximation - @bb @. 
cache.dfu = cache.fu - cache.fu_cache - - # Only try to reset if we have enough iterations since last reset - if cache.iterations_since_reset > size(cache.U, 1) && - (all(cache.reset_check, cache.du) || all(cache.reset_check, cache.dfu)) - if cache.resets ≥ cache.max_resets - cache.retcode = ReturnCode.ConvergenceFailure - cache.force_stop = true - return nothing - end - cache.iterations_since_reset = 0 - cache.resets += 1 - @bb copyto!(cache.du, cache.fu) - else - @bb cache.du .*= -1 - - cache.vᵀ_cache = _rmatvec!!(cache.vᵀ_cache, cache.threshold_cache, U_part, Vᵀ_part, - cache.du) - cache.mat_cache = _matvec!!(cache.mat_cache, cache.threshold_cache, U_part, Vᵀ_part, - cache.dfu) - - denom = dot(cache.vᵀ_cache, cache.dfu) - @bb @. cache.u_cache = (cache.du - cache.mat_cache) / - ifelse(iszero(denom), T(1e-5), denom) - - idx = mod1(cache.iterations_since_reset + 1, size(cache.U, 2)) - selectdim(cache.U, 2, idx) .= _vec(cache.u_cache) - selectdim(cache.Vᵀ, 1, idx) .= _vec(cache.vᵀ_cache) - - idx = min(cache.iterations_since_reset + 1, size(cache.U, 2)) - U_part = selectdim(cache.U, 2, 1:idx) - Vᵀ_part = selectdim(cache.Vᵀ, 1, 1:idx) - cache.du = _matvec!!(cache.du, cache.threshold_cache, U_part, Vᵀ_part, cache.fu) - - cache.iterations_since_reset += 1 - end - - @bb copyto!(cache.u_cache, cache.u) - @bb copyto!(cache.fu_cache, cache.fu) - - return nothing -end - -function __reinit_internal!(cache::LimitedMemoryBroydenCache; kwargs...) - cache.iterations_since_reset = 0 - return nothing -end - -function _rmatvec!!(y, xᵀU, U, Vᵀ, x) - # xᵀ × (-I + UVᵀ) - η = size(U, 2) - if η == 0 - @bb @. y = -x - return y - end - x_ = vec(x) - xᵀU_ = view(xᵀU, 1:η) - @bb xᵀU_ = transpose(U) × x_ - @bb y = transpose(Vᵀ) × vec(xᵀU_) - @bb @. y -= x - return y -end - -function _matvec!!(y, Vᵀx, U, Vᵀ, x) - # (-I + UVᵀ) × x - η = size(U, 2) - if η == 0 - @bb @. y = -x - return y - end - x_ = vec(x) - Vᵀx_ = view(Vᵀx, 1:η) - @bb Vᵀx_ = Vᵀ × x_ - @bb y = U × vec(Vᵀx_) - @bb @. 
y -= x - return y -end - -@inline function __lbroyden_threshold_cache(x, ::Val{threshold}) where {threshold} - return similar(x, threshold) -end -@inline function __lbroyden_threshold_cache(x::SArray, ::Val{threshold}) where {threshold} - return zeros(SVector{threshold, eltype(x)}) -end diff --git a/src/levenberg.jl b/src/levenberg.jl deleted file mode 100644 index 95daa3084..000000000 --- a/src/levenberg.jl +++ /dev/null @@ -1,395 +0,0 @@ -""" - LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing, - precs = DEFAULT_PRECS, damping_initial::Real = 1.0, - damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0, - finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75, - b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...) - -An advanced Levenberg-Marquardt implementation with the improvements suggested in the -[paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt -algorithm for nonlinear least-squares minimization". Designed for large-scale and -numerically-difficult nonlinear systems. - -### How to Choose the Linear Solver? - -There are 2 ways to perform the LM Step - - 1. Solve `(JᵀJ + λDᵀD) δx = Jᵀf` directly using a linear solver - 2. Solve for `Jδx = f` and `√λ⋅D δx = 0` simultaneously (to derive this simply compute the - normal form for this) - -The second form tends to be more robust and can be solved using any Least Squares Solver. -If no `linsolve` or a least squares solver is provided, then we will solve the 2nd form. -However, in most cases, this means losing structure in `J` which is not ideal. Note that -whatever you do, do not specify solvers like `linsolve = NormalCholeskyFactorization()` or -any such solver which converts the equation to normal form before solving. These don't use -cache efficiently and we already support the normal form natively. 
- -Additionally, note that the first form leads to a positive definite system, so we can use -more efficient solvers like `linsolve = CholeskyFactorization()`. If you know that the -problem is very well conditioned, then you might want to solve the normal form directly. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing` which means that a default is selected according to the problem specification! - Valid choices are types from ADTypes.jl. - - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `damping_initial`: the starting value for the damping factor. The damping factor is - inversely proportional to the step size. The damping factor is adjusted during each - iteration. Defaults to `1.0`. 
For more details, see section 2.1 of - [this paper](https://arxiv.org/abs/1201.5885). - - `damping_increase_factor`: the factor by which the damping is increased if a step is - rejected. Defaults to `2.0`. For more details, see section 2.1 of - [this paper](https://arxiv.org/abs/1201.5885). - - `damping_decrease_factor`: the factor by which the damping is decreased if a step is - accepted. Defaults to `3.0`. For more details, see section 2.1 of - [this paper](https://arxiv.org/abs/1201.5885). - - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate - the geodesic acceleration. Defaults to `0.1` which means that the step size is - approximately 10% of the first-order step. For more details, see section 3 of - [this paper](https://arxiv.org/abs/1201.5885). - - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate - geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary - that acceptable steps meet the condition - ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic - acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic - path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75` - is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an - effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of - [this paper](https://arxiv.org/abs/1201.5885). - - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard - choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost - and reject all steps that increase the cost. Although this is a natural and safe choice, - it is often not the most efficient. Therefore downhill moves are always accepted, but - uphill moves are only conditionally accepted. 
To decide whether an uphill move will be - accepted at each iteration ``i``, we compute - ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle - between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted - step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To - specify, uphill moves are accepted if - ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at - iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0` - allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves - will be accepted. Defaults to `1.0`. For more details, see section 4 of - [this paper](https://arxiv.org/abs/1201.5885). - - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix - `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered, - where `J` is the Jacobian. It is suggested by - [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in - `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`. 
-""" -@concrete struct LevenbergMarquardt{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - linsolve - precs - damping_initial - damping_increase_factor - damping_decrease_factor - finite_diff_step_geodesic - α_geodesic - b_uphill - min_damping_D -end - -function set_ad(alg::LevenbergMarquardt{CJ}, ad) where {CJ} - return LevenbergMarquardt{CJ}(ad, alg.linsolve, alg.precs, alg.damping_initial, - alg.damping_increase_factor, alg.damping_decrease_factor, - alg.finite_diff_step_geodesic, alg.α_geodesic, alg.b_uphill, alg.min_damping_D) -end - -function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing, - precs = DEFAULT_PRECS, damping_initial::Real = 1.0, α_geodesic::Real = 0.75, - damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0, - finite_diff_step_geodesic::Real = 0.1, b_uphill::Real = 1.0, - min_damping_D::Real = 1e-8, autodiff = nothing) - _concrete_jac = ifelse(concrete_jac === nothing, true, concrete_jac) - return LevenbergMarquardt{_unwrap_val(_concrete_jac)}(autodiff, linsolve, precs, - damping_initial, damping_increase_factor, damping_decrease_factor, - finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D) -end - -@concrete mutable struct LevenbergMarquardtCache{iip, fastls} <: - AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - u_cache_2 - fu - fu_cache - fu_cache_2 - J - JᵀJ - Jv - DᵀD - v - v_cache - a - mat_tmp - rhs_tmp - p - uf - linsolve - jac_cache - force_stop::Bool - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - prob - λ - λ_factor - damping_increase_factor - damping_decrease_factor - h - α_geodesic - b_uphill - min_damping_D - norm_v_old - loss_old - make_new_J::Bool - stats::NLStats - tc_cache_1 - tc_cache_2 - trace -end - -function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, - NonlinearLeastSquaresProblem{uType, iip}}, alg_::LevenbergMarquardt, - args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - 
termination_condition = nothing, internalnorm::F = DEFAULT_NORM, - linsolve_kwargs = (;), kwargs...) where {uType, iip, F} - alg = get_concrete_algorithm(alg_, prob) - @unpack f, u0, p = prob - - u = __maybe_unaliased(u0, alias_u0) - T = eltype(u) - fu = evaluate_f(prob, u) - - fastls = prob isa NonlinearProblem && !__needs_square_A(alg, u0) - - if !fastls - uf, linsolve, J, fu_cache, jac_cache, du, JᵀJ, v = jacobian_caches(alg, f, u, p, - Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(true)) - else - uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, - Val(iip); linsolve_kwargs, linsolve_with_JᵀJ = Val(false)) - u_ = _vec(u) - @bb JᵀJ = similar(u_) - @bb v = similar(du) - end - - λ = T(alg.damping_initial) - λ_factor = T(alg.damping_increase_factor) - damping_increase_factor = T(alg.damping_increase_factor) - damping_decrease_factor = T(alg.damping_decrease_factor) - h = T(alg.finite_diff_step_geodesic) - α_geodesic = T(alg.α_geodesic) - b_uphill = T(alg.b_uphill) - min_damping_D = T(alg.min_damping_D) - - DᵀD = __init_diagonal(u, min_damping_D) - - loss = internalnorm(fu) - - a = du # `du` is not used anywhere, use it to store `a` - - make_new_J = true - - abstol, reltol, tc_cache_1 = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - if prob isa NonlinearLeastSquaresProblem - _, _, tc_cache_2 = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - else - tc_cache_2 = nothing - end - - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) 
- - if !fastls - @bb mat_tmp = zero(JᵀJ) - rhs_tmp = nothing - else - mat_tmp = _vcat(J, DᵀD) - @bb mat_tmp .*= T(0) - rhs_tmp = vcat(_vec(fu), _vec(u)) - @bb rhs_tmp .*= T(0) - linsolve = linsolve_caches(mat_tmp, rhs_tmp, u, p, alg; linsolve_kwargs) - end - - @bb u_cache = copy(u) - @bb u_cache_2 = similar(u) - @bb fu_cache_2 = similar(fu) - Jv = J * _vec(v) - @bb v_cache = zero(v) - - return LevenbergMarquardtCache{iip, fastls}(f, alg, u, u_cache, u_cache_2, fu, fu_cache, - fu_cache_2, J, JᵀJ, Jv, DᵀD, v, v_cache, a, mat_tmp, rhs_tmp, p, uf, - linsolve, jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, - reltol, prob, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, - α_geodesic, b_uphill, min_damping_D, loss, loss, make_new_J, - NLStats(1, 0, 0, 0, 0), tc_cache_1, tc_cache_2, trace) -end - -function perform_step!(cache::LevenbergMarquardtCache{iip, fastls}) where {iip, fastls} - @unpack alg, linsolve = cache - - if cache.make_new_J - cache.J = jacobian!!(cache.J, cache) - if fastls - cache.JᵀJ = __sum_JᵀJ!!(cache.JᵀJ, cache.J) - else - @bb cache.JᵀJ = transpose(cache.J) × cache.J - end - cache.DᵀD = __update_LM_diagonal!!(cache.DᵀD, cache.JᵀJ) - cache.make_new_J = false - end - - # Usual Levenberg-Marquardt step ("velocity"). 
- # The following lines do: cache.v = -cache.mat_tmp \ cache.u_tmp - if fastls - if setindex_trait(cache.mat_tmp) === CanSetindex() - copyto!(@view(cache.mat_tmp[1:length(cache.fu), :]), cache.J) - cache.mat_tmp[(length(cache.fu) + 1):end, :] .= sqrt.(cache.λ .* cache.DᵀD) - else - cache.mat_tmp = _vcat(cache.J, sqrt.(cache.λ .* cache.DᵀD)) - end - if setindex_trait(cache.rhs_tmp) === CanSetindex() - cache.rhs_tmp[1:length(cache.fu)] .= _vec(cache.fu) - else - cache.rhs_tmp = _vcat(_vec(cache.fu), zero(_vec(cache.u))) - end - linres = dolinsolve(cache, alg.precs, linsolve; A = cache.mat_tmp, - b = cache.rhs_tmp, linu = _vec(cache.v), cache.p, reltol = cache.abstol) - else - @bb cache.u_cache_2 = transpose(cache.J) × cache.fu - @bb @. cache.mat_tmp = cache.JᵀJ + cache.λ * cache.DᵀD - linres = dolinsolve(cache, alg.precs, linsolve; - A = __maybe_symmetric(cache.mat_tmp), b = _vec(cache.u_cache_2), - linu = _vec(cache.v), cache.p, reltol = cache.abstol) - end - cache.linsolve = linres.cache - linu = _restructure(cache.v, linres.u) - @bb @. cache.v = -linu - - update_trace!(cache.trace, cache.stats.nsteps + 1, get_u(cache), get_fu(cache), cache.J, - cache.v) - - # Geodesic acceleration (step_size = v + a / 2). - @bb @. cache.u_cache_2 = cache.u + cache.h * cache.v - evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) - - # The following lines do: cache.a = -cache.mat_tmp \ cache.fu_tmp - # NOTE: Don't pass `A` in again, since we want to reuse the previous solve - @bb cache.Jv = cache.J × vec(cache.v) - Jv = _restructure(cache.fu_cache_2, cache.Jv) - @bb @. 
cache.fu_cache_2 = (2 / cache.h) * ((cache.fu_cache_2 - cache.fu) / cache.h - Jv) - if fastls - if setindex_trait(cache.rhs_tmp) === CanSetindex() - cache.rhs_tmp[1:length(cache.fu)] .= _vec(cache.fu_cache_2) - else - cache.rhs_tmp = _vcat(_vec(cache.fu_cache_2), zero(_vec(cache.u))) - end - linres = dolinsolve(cache, alg.precs, linsolve; b = cache.rhs_tmp, - linu = _vec(cache.a), cache.p, reltol = cache.abstol) - else - @bb cache.u_cache_2 = transpose(cache.J) × cache.fu_cache_2 - linres = dolinsolve(cache, alg.precs, linsolve; b = _vec(cache.u_cache_2), - linu = _vec(cache.a), cache.p, reltol = cache.abstol) - end - cache.linsolve = linres.cache - linu = _restructure(cache.a, linres.u) - @bb @. cache.a = -linu - - # Require acceptable steps to satisfy the following condition. - norm_v = cache.internalnorm(cache.v) - if 2 * cache.internalnorm(cache.a) ≤ cache.α_geodesic * norm_v - @bb @. cache.u_cache_2 = cache.u + cache.v + cache.a / 2 - evaluate_f(cache, cache.u_cache_2, cache.p, Val(:fu_cache_2)) - loss = cache.internalnorm(cache.fu_cache_2) - - # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1). - β = dot(cache.v, cache.v_cache) / (norm_v * cache.norm_v_old) - if (1 - β)^cache.b_uphill * loss ≤ cache.loss_old - # Accept step. - @bb copyto!(cache.u, cache.u_cache_2) - check_and_update!(cache.tc_cache_1, cache, cache.fu_cache_2, cache.u, - cache.u_cache) - if !cache.force_stop && cache.tc_cache_2 !== nothing # For NLLS Problems - @bb @. 
cache.fu = cache.fu_cache_2 - cache.fu - check_and_update!(cache.tc_cache_2, cache, cache.fu, cache.u, cache.u_cache) - end - @bb copyto!(cache.fu, cache.fu_cache_2) - @bb copyto!(cache.v_cache, cache.v) - cache.norm_v_old = norm_v - cache.loss_old = loss - cache.λ_factor = 1 / cache.damping_decrease_factor - cache.make_new_J = true - end - end - - @bb copyto!(cache.u_cache, cache.u) - cache.λ *= cache.λ_factor - cache.λ_factor = cache.damping_increase_factor - return nothing -end - -@inline __update_LM_diagonal!!(y::Number, x::Number) = max(y, x) -@inline function __update_LM_diagonal!!(y::Diagonal, x::AbstractVector) - if setindex_trait(y.diag) === CanSetindex() - @. y.diag = max(y.diag, x) - return y - else - return Diagonal(max.(y.diag, x)) - end -end -@inline function __update_LM_diagonal!!(y::Diagonal, x::AbstractMatrix) - if setindex_trait(y.diag) === CanSetindex() - if fast_scalar_indexing(y.diag) - @inbounds for i in axes(x, 1) - y.diag[i] = max(y.diag[i], x[i, i]) - end - return y - else - idxs = diagind(x) - @.. broadcast=false y.diag=max(y.diag, @view(x[idxs])) - return y - end - else - idxs = diagind(x) - return Diagonal(@.. broadcast=false max(y.diag, @view(x[idxs]))) - end -end - -function __reinit_internal!(cache::LevenbergMarquardtCache; - termination_condition = get_termination_mode(cache.tc_cache_1), kwargs...) 
- abstol, reltol, tc_cache_1 = init_termination_cache(cache.abstol, cache.reltol, - cache.fu, cache.u, termination_condition) - if cache.tc_cache_2 !== nothing - _, _, tc_cache_2 = init_termination_cache(cache.abstol, cache.reltol, cache.fu, - cache.u, termination_condition) - cache.tc_cache_2 = tc_cache_2 - end - - cache.tc_cache_1 = tc_cache_1 - cache.abstol = abstol - cache.reltol = reltol - return nothing -end diff --git a/src/linesearch.jl b/src/linesearch.jl deleted file mode 100644 index 33de25ae7..000000000 --- a/src/linesearch.jl +++ /dev/null @@ -1,305 +0,0 @@ -""" - LineSearch(; method = nothing, autodiff = nothing, alpha = true) - -Wrapper over algorithms from -[LineSearches.jl](https://github.com/JuliaNLSolvers/LineSearches.jl/). Allows automatic -construction of the objective functions for the line search algorithms utilizing automatic -differentiation for fast Vector Jacobian Products. - -### Arguments - - - `method`: the line search algorithm to use. Defaults to `nothing`, which means that the - step size is fixed to the value of `alpha`. - - `autodiff`: the automatic differentiation backend to use for the line search. Defaults to - `AutoFiniteDiff()`, which means that finite differencing is used to compute the VJP. - `AutoZygote()` will be faster in most cases, but it requires `Zygote.jl` to be manually - installed and loaded. - - `alpha`: the initial step size to use. Defaults to `true` (which is equivalent to `1`). 
-""" -@concrete struct LineSearch - method - autodiff - α -end - -function LineSearch(; method = nothing, autodiff = nothing, alpha = true) - return LineSearch(method, autodiff, alpha) -end - -@inline function init_linesearch_cache(ls::LineSearch, f::F, u, p, fu, iip) where {F} - return init_linesearch_cache(ls.method, ls, f, u, p, fu, iip) -end - -@concrete struct NoLineSearchCache - α -end - -function init_linesearch_cache(::Nothing, ls::LineSearch, f::F, u, p, fu, iip) where {F} - return NoLineSearchCache(convert(eltype(u), ls.α)) -end - -perform_linesearch!(cache::NoLineSearchCache, u, du) = cache.α - -# LineSearches.jl doesn't have a supertype so default to that -function init_linesearch_cache(_, ls::LineSearch, f::F, u, p, fu, iip) where {F} - return LineSearchesJLCache(ls, f, u, p, fu, iip) -end - -# FIXME: The closures lead to too many unnecessary runtime dispatches which leads to the -# massive increase in precompilation times. -# Wrapper over LineSearches.jl algorithms -@concrete mutable struct LineSearchesJLCache - f - ϕ - dϕ - ϕdϕ - α - ls -end - -function LineSearchesJLCache(ls::LineSearch, f::F, u::Number, p, _, ::Val{false}) where {F} - eval_f(u, du, α) = eval_f(u - α * du) - eval_f(u) = f(u, p) - - ls.method isa Static && return LineSearchesJLCache(eval_f, nothing, nothing, nothing, - convert(typeof(u), ls.α), ls) - - g(u, fu) = last(value_derivative(Base.Fix2(f, p), u)) * fu - - function ϕ(u, du) - function ϕ_internal(α) - u_ = u - α * du - _fu = eval_f(u_) - return dot(_fu, _fu) / 2 - end - return ϕ_internal - end - - function dϕ(u, du) - function dϕ_internal(α) - u_ = u - α * du - _fu = eval_f(u_) - g₀ = g(u_, _fu) - return dot(g₀, -du) - end - return dϕ_internal - end - - function ϕdϕ(u, du) - function ϕdϕ_internal(α) - u_ = u - α * du - _fu = eval_f(u_) - g₀ = g(u_, _fu) - return dot(_fu, _fu) / 2, dot(g₀, -du) - end - return ϕdϕ_internal - end - - return LineSearchesJLCache(eval_f, ϕ, dϕ, ϕdϕ, convert(eltype(u), ls.α), ls) -end - -function 
LineSearchesJLCache(ls::LineSearch, f::F, u, p, fu1, IIP::Val{iip}) where {iip, F} - fu = iip ? deepcopy(fu1) : nothing - u_ = _mutable_zero(u) - - function eval_f(u, du, α) - @. u_ = u - α * du - return eval_f(u_) - end - eval_f(u) = evaluate_f(f, u, p, IIP; fu) - - ls.method isa Static && return LineSearchesJLCache(eval_f, nothing, nothing, nothing, - convert(eltype(u), ls.α), ls) - - g₀ = _mutable_zero(u) - - autodiff = if ls.autodiff === nothing - if !iip && is_extension_loaded(Val{:Zygote}()) - AutoZygote() - else - AutoFiniteDiff() - end - else - if iip && (ls.autodiff isa AutoZygote || ls.autodiff isa AutoSparseZygote) - @warn "Attempting to use Zygote.jl for linesearch on an in-place problem. \ - Falling back to finite differencing." - AutoFiniteDiff() - else - ls.autodiff - end - end - - function g!(u, fu) - if f.jvp !== nothing - @warn "Currently we don't make use of user provided `jvp` in linesearch. This \ - is planned to be fixed in the near future." maxlog=1 - end - op = VecJac(SciMLBase.JacobianWrapper(f, p), u; fu = fu1, autodiff) - if iip - mul!(g₀, op, fu) - return g₀ - else - return op * fu - end - end - - function ϕ(u, du) - function ϕ_internal(α) - @. u_ = u - α * du - _fu = eval_f(u_) - return dot(_fu, _fu) / 2 - end - return ϕ_internal - end - - function dϕ(u, du) - function dϕ_internal(α) - @. u_ = u - α * du - _fu = eval_f(u_) - g₀ = g!(u_, _fu) - return dot(g₀, -du) - end - return dϕ_internal - end - - function ϕdϕ(u, du) - function ϕdϕ_internal(α) - @. 
u_ = u - α * du - _fu = eval_f(u_) - g₀ = g!(u_, _fu) - return dot(_fu, _fu) / 2, dot(g₀, -du) - end - return ϕdϕ_internal - end - - return LineSearchesJLCache(eval_f, ϕ, dϕ, ϕdϕ, convert(eltype(u), ls.α), ls) -end - -function perform_linesearch!(cache::LineSearchesJLCache, u, du) - cache.ls.method isa Static && return cache.α - - ϕ = cache.ϕ(u, du) - dϕ = cache.dϕ(u, du) - ϕdϕ = cache.ϕdϕ(u, du) - - ϕ₀, dϕ₀ = ϕdϕ(zero(eltype(u))) - - return first(cache.ls.method(ϕ, dϕ, ϕdϕ, cache.α, ϕ₀, dϕ₀)) -end - -""" - LiFukushimaLineSearch(; lambda_0 = 1.0, beta = 0.5, sigma_1 = 0.001, - eta = 0.1, nan_max_iter = 5, maxiters = 50) - -A derivative-free line search and global convergence of Broyden-like method for nonlinear -equations by Dong-Hui Li & Masao Fukushima. For more details see -https://doi.org/10.1080/10556780008805782 -""" -struct LiFukushimaLineSearch{T} <: AbstractNonlinearSolveLineSearchAlgorithm - λ₀::T - β::T - σ₁::T - σ₂::T - η::T - ρ::T - nan_max_iter::Int - maxiters::Int -end - -function LiFukushimaLineSearch(; lambda_0 = 1.0, beta = 0.1, sigma_1 = 0.001, - sigma_2 = 0.001, eta = 0.1, rho = 0.9, nan_max_iter = 5, maxiters = 50) - T = promote_type(typeof(lambda_0), typeof(beta), typeof(sigma_1), typeof(eta), - typeof(rho), typeof(sigma_2)) - return LiFukushimaLineSearch{T}(lambda_0, beta, sigma_1, sigma_2, eta, rho, - nan_max_iter, maxiters) -end - -@concrete mutable struct LiFukushimaLineSearchCache{iip} - f - p - u_cache - fu_cache - alg - α -end - -function init_linesearch_cache(alg::LiFukushimaLineSearch, ls::LineSearch, f::F, _u, p, _fu, - ::Val{iip}) where {iip, F} - fu = iip ? deepcopy(_fu) : nothing - u = iip ? 
deepcopy(_u) : nothing - return LiFukushimaLineSearchCache{iip}(f, p, u, fu, alg, ls.α) -end - -function perform_linesearch!(cache::LiFukushimaLineSearchCache{iip}, u, du) where {iip} - (; β, σ₁, σ₂, η, λ₀, ρ, nan_max_iter, maxiters) = cache.alg - λ₂ = λ₀ - λ₁ = λ₂ - - if iip - cache.f(cache.fu_cache, u, cache.p) - fx_norm = norm(cache.fu_cache, 2) - else - fx_norm = norm(cache.f(u, cache.p), 2) - end - - # Non-Blocking exit if the norm is NaN or Inf - !isfinite(fx_norm) && return cache.α - - # Early Terminate based on Eq. 2.7 - if iip - cache.u_cache .= u .- du - cache.f(cache.fu_cache, cache.u_cache, cache.p) - fxλ_norm = norm(cache.fu_cache, 2) - else - fxλ_norm = norm(cache.f(u .- du, cache.p), 2) - end - - fxλ_norm ≤ ρ * fx_norm - σ₂ * norm(du, 2)^2 && return cache.α - - if iip - cache.u_cache .= u .- λ₂ .* du - cache.f(cache.fu_cache, cache.u_cache, cache.p) - fxλp_norm = norm(cache.fu_cache, 2) - else - fxλp_norm = norm(cache.f(u .- λ₂ .* du, cache.p), 2) - end - - if !isfinite(fxλp_norm) - # Backtrack a finite number of steps - nan_converged = false - for _ in 1:nan_max_iter - λ₁, λ₂ = λ₂, β * λ₂ - - if iip - cache.u_cache .= u .+ λ₂ .* du - cache.f(cache.fu_cache, cache.u_cache, cache.p) - fxλp_norm = norm(cache.fu_cache, 2) - else - fxλp_norm = norm(cache.f(u .+ λ₂ .* du, cache.p), 2) - end - - nan_converged = isfinite(fxλp_norm) - nan_converged && break - end - - # Non-Blocking exit if the norm is still NaN or Inf - !nan_converged && return cache.α - end - - for _ in 1:maxiters - if iip - cache.u_cache .= u .- λ₂ .* du - cache.f(cache.fu_cache, cache.u_cache, cache.p) - fxλp_norm = norm(cache.fu_cache, 2) - else - fxλp_norm = norm(cache.f(u .- λ₂ .* du, cache.p), 2) - end - - converged = fxλp_norm ≤ (1 + η) * fx_norm - σ₁ * λ₂^2 * norm(du, 2)^2 - - converged && break - λ₁, λ₂ = λ₂, β * λ₂ - end - - return λ₂ -end diff --git a/src/pseudotransient.jl b/src/pseudotransient.jl deleted file mode 100644 index 7045e38cd..000000000 --- a/src/pseudotransient.jl 
+++ /dev/null @@ -1,156 +0,0 @@ -""" - PseudoTransient(; concrete_jac = nothing, linsolve = nothing, - precs = DEFAULT_PRECS, alpha_initial = 1e-3, adkwargs...) - -An implementation of PseudoTransient method that is used to solve steady state problems in -an accelerated manner. It uses an adaptive time-stepping to integrate an initial value of -nonlinear problem until sufficient accuracy in the desired steady-state is achieved to -switch over to Newton's method and gain a rapid convergence. This implementation -specifically uses "switched evolution relaxation" SER method. For detail information about -the time-stepping and algorithm, please see the paper: -[Coffey, Todd S. and Kelley, C. T. and Keyes, David E. (2003), Pseudotransient Continuation and Differential-Algebraic Equations, -SIAM Journal on Scientific Computing,25, 553-569.](https://doi.org/10.1137/S106482750241044X) - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing` which means that a default is selected according to the problem specification! - Valid choices are types from ADTypes.jl. - - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. 
For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `alpha_initial` : the initial pseudo time step. it defaults to 1e-3. If it is small, - you are going to need more iterations to converge but it can be more stable. -""" -@concrete struct PseudoTransient{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - linsolve - precs - alpha_initial -end - -function set_ad(alg::PseudoTransient{CJ}, ad) where {CJ} - return PseudoTransient{CJ}(ad, alg.linsolve, alg.precs, alg.alpha_initial) -end - -function PseudoTransient(; concrete_jac = nothing, linsolve = nothing, - precs = DEFAULT_PRECS, alpha_initial = 1e-3, autodiff = nothing) - return PseudoTransient{_unwrap_val(concrete_jac)}(autodiff, linsolve, precs, - alpha_initial) -end - -@concrete mutable struct PseudoTransientCache{iip} <: AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - fu - fu_cache - du - p - alpha - res_norm - uf - linsolve - J - jac_cache - force_stop - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - prob - stats::NLStats - tc_cache - trace -end - -function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::PseudoTransient, - args...; alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm = DEFAULT_NORM, - linsolve_kwargs = (;), kwargs...) 
where {uType, iip} - alg = get_concrete_algorithm(alg_, prob) - - @unpack f, u0, p = prob - u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); - linsolve_kwargs) - alpha = convert(eltype(u), alg.alpha_initial) - res_norm = internalnorm(fu) - - @bb u_cache = copy(u) - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) - - return PseudoTransientCache{iip}(f, alg, u, u_cache, fu, fu_cache, du, p, alpha, - res_norm, uf, linsolve, J, jac_cache, false, maxiters, internalnorm, - ReturnCode.Default, abstol, reltol, prob, NLStats(1, 0, 0, 0, 0), tc_cache, trace) -end - -function perform_step!(cache::PseudoTransientCache{iip}) where {iip} - @unpack alg = cache - - cache.J = jacobian!!(cache.J, cache) - - inv_α = inv(cache.alpha) - if cache.J isa SciMLOperators.AbstractSciMLOperator - A = cache.J - inv_α * I - elseif setindex_trait(cache.J) === CanSetindex() - if fast_scalar_indexing(cache.J) - @inbounds for i in axes(cache.J, 1) - cache.J[i, i] = cache.J[i, i] - inv_α - end - else - idxs = diagind(cache.J) - @.. 
broadcast=false @view(cache.J[idxs])=@view(cache.J[idxs]) - inv_α - end - A = cache.J - else - cache.J = cache.J - inv_α * I - A = cache.J - end - - # u = u - J \ fu - linres = dolinsolve(cache, alg.precs, cache.linsolve; A, b = _vec(cache.fu), - linu = _vec(cache.du), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - cache.du = _restructure(cache.du, linres.u) - - @bb axpy!(-true, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) - - update_trace!(cache, true) - - new_norm = cache.internalnorm(cache.fu) - cache.alpha *= cache.res_norm / new_norm - cache.res_norm = new_norm - - check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - - @bb copyto!(cache.u_cache, cache.u) - return nothing -end - -function __reinit_internal!(cache::PseudoTransientCache; alpha = cache.alg.alpha_initial, - kwargs...) - cache.alpha = convert(eltype(cache.u), alpha) - cache.res_norm = cache.internalnorm(cache.fu) - return nothing -end diff --git a/src/raphson.jl b/src/raphson.jl deleted file mode 100644 index 0fa918232..000000000 --- a/src/raphson.jl +++ /dev/null @@ -1,122 +0,0 @@ -""" - NewtonRaphson(; concrete_jac = nothing, linsolve = nothing, linesearch = nothing, - precs = DEFAULT_PRECS, adkwargs...) - -An advanced NewtonRaphson implementation with support for efficient handling of sparse -matrices via colored automatic differentiation and preconditioned linear solvers. Designed -for large-scale and numerically-difficult nonlinear systems. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to - `nothing` which means that a default is selected according to the problem specification! - Valid choices are types from ADTypes.jl. - - `concrete_jac`: whether to build a concrete Jacobian. 
If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `linesearch`: the line search algorithm to use. Defaults to [`LineSearch()`](@ref), - which means that no line search is performed. Algorithms from `LineSearches.jl` can be - used here directly, and they will be converted to the correct `LineSearch`. -""" -@concrete struct NewtonRaphson{CJ, AD} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - linsolve - precs - linesearch -end - -function set_ad(alg::NewtonRaphson{CJ}, ad) where {CJ} - return NewtonRaphson{CJ}(ad, alg.linsolve, alg.precs, alg.linesearch) -end - -function NewtonRaphson(; concrete_jac = nothing, linsolve = nothing, linesearch = nothing, - precs = DEFAULT_PRECS, autodiff = nothing) - linesearch = linesearch isa LineSearch ? 
linesearch : LineSearch(; method = linesearch) - return NewtonRaphson{_unwrap_val(concrete_jac)}(autodiff, linsolve, precs, linesearch) -end - -@concrete mutable struct NewtonRaphsonCache{iip} <: AbstractNonlinearSolveCache{iip} - f - alg - u - fu - u_cache - fu_cache - du - p - uf - linsolve - J - jac_cache - force_stop - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - prob - stats::NLStats - ls_cache - tc_cache - trace -end - -function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg_::NewtonRaphson, args...; - alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm = DEFAULT_NORM, linsolve_kwargs = (;), - kwargs...) where {uType, iip} - alg = get_concrete_algorithm(alg_, prob) - @unpack f, u0, p = prob - u = __maybe_unaliased(u0, alias_u0) - fu = evaluate_f(prob, u) - uf, linsolve, J, fu_cache, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip); - linsolve_kwargs) - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - - ls_cache = init_linesearch_cache(alg.linesearch, f, u, p, fu, Val(iip)) - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) 
- - @bb u_cache = copy(u) - - return NewtonRaphsonCache{iip}(f, alg, u, fu, u_cache, fu_cache, du, p, uf, linsolve, J, - jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, - NLStats(1, 0, 0, 0, 0), ls_cache, tc_cache, trace) -end - -function perform_step!(cache::NewtonRaphsonCache{iip}) where {iip} - @unpack alg = cache - - cache.J = jacobian!!(cache.J, cache) - - # u = u - J \ fu - linres = dolinsolve(cache, alg.precs, cache.linsolve; A = cache.J, b = _vec(cache.fu), - linu = _vec(cache.du), cache.p, reltol = cache.abstol) - cache.linsolve = linres.cache - cache.du = _restructure(cache.du, linres.u) - - # Line Search - α = perform_linesearch!(cache.ls_cache, cache.u, cache.du) - @bb axpy!(-α, cache.du, cache.u) - - evaluate_f(cache, cache.u, cache.p) - - update_trace!(cache, α) - check_and_update!(cache, cache.fu, cache.u, cache.u_cache) - - @bb copyto!(cache.u_cache, cache.u) - return nothing -end diff --git a/src/timer_outputs.jl b/src/timer_outputs.jl new file mode 100644 index 000000000..510e1d5ed --- /dev/null +++ b/src/timer_outputs.jl @@ -0,0 +1,56 @@ +# Timer Outputs has some overhead, so we only use it if we are debugging +# Even `@static_timeit` has overhead so we write our custom version of that using +# Preferences +const TIMER_OUTPUTS_ENABLED = @load_preference("enable_timer_outputs", false) + +@static if TIMER_OUTPUTS_ENABLED + using TimerOutputs +end + +""" + enable_timer_outputs() + +Enable `TimerOutput` for all `NonlinearSolve` algorithms. This is useful for debugging +but has some overhead, so it is disabled by default. +""" +function enable_timer_outputs() + @set_preferences!("enable_timer_outputs"=>true) + @info "Timer Outputs Enabled. Restart the Julia session for this to take effect." +end + +""" + disable_timer_outputs() + +Disable `TimerOutput` for all `NonlinearSolve` algorithms. This should be used when +`NonlinearSolve` is being used in performance-critical code. 
+""" +function disable_timer_outputs() + @set_preferences!("enable_timer_outputs"=>false) + @info "Timer Outputs Disabled. Restart the Julia session for this to take effect." +end + +function get_timer_output() + @static if TIMER_OUTPUTS_ENABLED + return TimerOutput() + else + return nothing + end +end + +""" + @static_timeit to name expr + +Like `TimerOutputs.@timeit_debug` but has zero overhead if `TimerOutputs` is disabled via +[`NonlinearSolve.disable_timer_outputs()`](@ref). +""" +macro static_timeit(to, name, expr) + @static if TIMER_OUTPUTS_ENABLED + return TimerOutputs.timer_expr(__module__, false, to, name, expr) + else + return esc(expr) + end +end + +@static if !TIMER_OUTPUTS_ENABLED + @inline reset_timer!(::Nothing) = nothing +end diff --git a/src/trustRegion.jl b/src/trustRegion.jl deleted file mode 100644 index 3312cbc63..000000000 --- a/src/trustRegion.jl +++ /dev/null @@ -1,595 +0,0 @@ -""" - RadiusUpdateSchemes - -`RadiusUpdateSchemes` is the standard enum interface for different types of radius update -schemes implemented in the Trust Region method. These schemes specify how the radius of the -so-called trust region is updated after each iteration of the algorithm. The specific role -and caveats associated with each scheme are provided below. - -## Using `RadiusUpdateSchemes` - -`RadiusUpdateSchemes` uses the standard -[EnumX Interface](https://github.com/fredrikekre/EnumX.jl), and hence inherits all -properties of being an EnumX, including the type of each constituent enum states as -`RadiusUpdateSchemes.T`. Simply put the desired scheme as follows: -`TrustRegion(radius_update_scheme = your desired update scheme)`. For example, -`sol = solve(prob, alg=TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))`. -""" -@enumx RadiusUpdateSchemes begin - """ - RadiusUpdateSchemes.Simple - - The simple or conventional radius update scheme. 
This scheme is chosen by default and - follows the conventional approach to update the trust region radius, i.e. if the trial - step is accepted it increases the radius by a fixed factor (bounded by a maximum radius) - and if the trial step is rejected, it shrinks the radius by a fixed factor. - """ - Simple - - """ - RadiusUpdateSchemes.NLsolve - - The same updating scheme as in NLsolve's (https://github.com/JuliaNLSolvers/NLsolve.jl) - trust region dogleg implementation. - """ - NLsolve - - """ - RadiusUpdateSchemes.NocedalWright - - Trust region updating scheme as in Nocedal and Wright [see Alg 11.5, page 291]. - """ - NocedalWright - - """ - RadiusUpdateSchemes.Hei - - This scheme is proposed by Hei, L. [1]. The trust region radius depends on the size - (norm) of the current step size. The hypothesis is to let the radius converge to zero as - the iterations progress, which is more reliable and robust for ill-conditioned as well - as degenerate problems. - - [1] Hei, Long. "A self-adaptive trust region algorithm." Journal of Computational - Mathematics (2003): 229-236. - """ - Hei - - """ - RadiusUpdateSchemes.Yuan - - This scheme is proposed by Yuan, Y [1]. Similar to Hei's scheme, the trust region is - updated in a way so that it converges to zero, however here, the radius depends on the - size (norm) of the current gradient of the objective (merit) function. The hypothesis is - that the step size is bounded by the gradient size, so it makes sense to let the radius - depend on the gradient. - - [1] Fan, Jinyan, Jianyu Pan, and Hongyan Song. "A retrospective trust region algorithm - with trust region converging to zero." Journal of Computational Mathematics 34.4 (2016): - 421-436. - """ - Yuan - - """ - RadiusUpdateSchemes.Bastin - - This scheme is proposed by Bastin, et al. [1]. 
The scheme is called a retrospective - update scheme as it uses the model function at the current iteration to compute the - ratio of the actual reduction and the predicted reduction in the previous trial step, - and use this ratio to update the trust region radius. The hypothesis is to exploit the - information made available during the optimization process in order to vary the accuracy - of the objective function computation. - - [1] Bastin, Fabian, et al. "A retrospective trust-region method for unconstrained - optimization." Mathematical programming 123 (2010): 395-418. - """ - Bastin - - """ - RadiusUpdateSchemes.Fan - - This scheme is proposed by Fan, J. [1]. It is very much similar to Hei's and Yuan's - schemes as it lets the trust region radius depend on the current size (norm) of the - objective (merit) function itself. These new update schemes are known to improve local - convergence. - - [1] Fan, Jinyan. "Convergence rate of the trust region method for nonlinear equations - under local error bound condition." Computational Optimization and Applications 34.2 - (2006): 215-227. - """ - Fan -end - -""" - TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, - radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple, - max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1, - step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4, - expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4, - expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...) - -An advanced TrustRegion implementation with support for efficient handling of sparse -matrices via colored automatic differentiation and preconditioned linear solvers. Designed -for large-scale and numerically-difficult nonlinear systems. - -### Keyword Arguments - - - `autodiff`: determines the backend used for the Jacobian. Note that this argument is - ignored if an analytical Jacobian is passed, as that will be used instead. 
Defaults to - `nothing` which means that a default is selected according to the problem specification!. - Valid choices are types from ADTypes.jl. - - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used, - then the Jacobian will not be constructed and instead direct Jacobian-vector products - `J*v` are computed using forward-mode automatic differentiation or finite differencing - tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed, - for example for a preconditioner, `concrete_jac = true` can be passed in order to force - the construction of the Jacobian. - - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the - linear solves within the Newton method. Defaults to `nothing`, which means it uses the - LinearSolve.jl default algorithm choice. For more information on available algorithm - choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `precs`: the choice of preconditioners for the linear solver. Defaults to using no - preconditioners. For more information on specifying preconditioners for LinearSolve - algorithms, consult the - [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/). - - `radius_update_scheme`: the choice of radius update scheme to be used. Defaults to `RadiusUpdateSchemes.Simple` - which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`, - `RadiusUpdateSchemes.Yuan`, `RadiusUpdateSchemes.Bastin`, `RadiusUpdateSchemes.Fan`. These schemes - have the trust region radius converging to zero that is seen to improve convergence. For more details, see the - [Yuan, Yx](https://link.springer.com/article/10.1007/s10107-015-0893-2#Sec4). - - `max_trust_radius`: the maximal trust region radius. - Defaults to `max(norm(fu), maximum(u) - minimum(u))`. - - `initial_trust_radius`: the initial trust region radius. Defaults to - `max_trust_radius / 11`. 
- - `step_threshold`: the threshold for taking a step. In every iteration, the threshold is - compared with a value `r`, which is the actual reduction in the objective function divided - by the predicted reduction. If `step_threshold > r` the model is not a good approximation, - and the step is rejected. Defaults to `0.1`. For more details, see - [Rahpeymaii, F.](https://link.springer.com/article/10.1007/s40096-020-00339-4) - - `shrink_threshold`: the threshold for shrinking the trust region radius. In every - iteration, the threshold is compared with a value `r` which is the actual reduction in the - objective function divided by the predicted reduction. If `shrink_threshold > r` the trust - region radius is shrunk by `shrink_factor`. Defaults to `0.25`. For more details, see - [Rahpeymaii, F.](https://link.springer.com/article/10.1007/s40096-020-00339-4) - - `expand_threshold`: the threshold for expanding the trust region radius. If a step is - taken, i.e `step_threshold < r` (with `r` defined in `shrink_threshold`), a check is also - made to see if `expand_threshold < r`. If that is true, the trust region radius is - expanded by `expand_factor`. Defaults to `0.75`. - - `shrink_factor`: the factor to shrink the trust region radius with if - `shrink_threshold > r` (with `r` defined in `shrink_threshold`). Defaults to `0.25`. - - `expand_factor`: the factor to expand the trust region radius with if - `expand_threshold < r` (with `r` defined in `shrink_threshold`). Defaults to `2.0`. - - `max_shrink_times`: the maximum number of times to shrink the trust region radius in a - row, `max_shrink_times` is exceeded, the algorithm returns. Defaults to `32`. - - `vjp_autodiff`: Automatic Differentiation Backend used for vector-jacobian products. - This is applicable if the linear solver doesn't require a concrete jacobian, for eg., - Krylov Methods. Defaults to `nothing`, which means if the problem is out of place and - `Zygote` is loaded then, we use `AutoZygote`. 
In all other, cases `FiniteDiff` is used. -""" -@concrete struct TrustRegion{CJ, AD, MTR} <: AbstractNewtonAlgorithm{CJ, AD} - ad::AD - linsolve - precs - radius_update_scheme::RadiusUpdateSchemes.T - max_trust_radius - initial_trust_radius::MTR - step_threshold::MTR - shrink_threshold::MTR - expand_threshold::MTR - shrink_factor::MTR - expand_factor::MTR - max_shrink_times::Int - vjp_autodiff -end - -function set_ad(alg::TrustRegion{CJ}, ad) where {CJ} - return TrustRegion{CJ}(ad, alg.linsolve, alg.precs, alg.radius_update_scheme, - alg.max_trust_radius, alg.initial_trust_radius, alg.step_threshold, - alg.shrink_threshold, alg.expand_threshold, alg.shrink_factor, alg.expand_factor, - alg.max_shrink_times, alg.vjp_autodiff) -end - -function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAULT_PRECS, - radius_update_scheme::RadiusUpdateSchemes.T = RadiusUpdateSchemes.Simple, - max_trust_radius::Real = 0 // 1, initial_trust_radius::Real = 0 // 1, - step_threshold::Real = 1 // 10000, shrink_threshold::Real = 1 // 4, - expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4, - expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, vjp_autodiff = nothing, - autodiff = nothing) - return TrustRegion{_unwrap_val(concrete_jac)}(autodiff, linsolve, precs, - radius_update_scheme, max_trust_radius, initial_trust_radius, step_threshold, - shrink_threshold, expand_threshold, shrink_factor, expand_factor, max_shrink_times, - vjp_autodiff) -end - -@concrete mutable struct TrustRegionCache{iip} <: AbstractNonlinearSolveCache{iip} - f - alg - u - u_cache - u_cache_2 - u_gauss_newton - u_cauchy - fu - fu_cache - fu_cache_2 - J - J_cache - JᵀJ - Jᵀf - p - uf - du - lr_mul_cache - linsolve - jac_cache - force_stop::Bool - maxiters::Int - internalnorm - retcode::ReturnCode.T - abstol - reltol - prob - radius_update_scheme::RadiusUpdateSchemes.T - trust_r - max_trust_r - step_threshold - shrink_threshold - expand_threshold - shrink_factor - expand_factor - 
loss - loss_new - shrink_counter::Int - make_new_J::Bool - r - p1 - p2 - p3 - p4 - ϵ - vjp_operator # For Yuan - stats::NLStats - tc_cache - trace -end - -function SciMLBase.__init(prob::Union{NonlinearProblem{uType, iip}, - NonlinearLeastSquaresProblem{uType, iip}}, alg_::TrustRegion, args...; - alias_u0 = false, maxiters = 1000, abstol = nothing, reltol = nothing, - termination_condition = nothing, internalnorm = DEFAULT_NORM, - linsolve_kwargs = (;), kwargs...) where {uType, iip} - alg = get_concrete_algorithm(alg_, prob) - @unpack f, u0, p = prob - u = __maybe_unaliased(u0, alias_u0) - @bb u_cache = copy(u) - @bb u_cache_2 = similar(u) - fu = evaluate_f(prob, u) - @bb fu_cache_2 = zero(fu) - - loss = __trust_region_loss(internalnorm, fu) - - uf, _, J, fu_cache, jac_cache, du, JᵀJ, Jᵀf = jacobian_caches(alg, f, u, p, Val(iip); - linsolve_kwargs, linsolve_with_JᵀJ = Val(true), lininit = Val(false)) - linsolve = linsolve_caches(J, fu_cache, du, p, alg) - - @bb u_cache_2 = similar(u) - @bb u_cauchy = similar(u) - @bb u_gauss_newton = similar(u) - J_cache = J isa SciMLOperators.AbstractSciMLOperator || - setindex_trait(J) === CannotSetindex() ? J : similar(J) - @bb lr_mul_cache = similar(du) - - loss_new = loss - shrink_counter = 0 - make_new_J = true - r = loss - - floatType = typeof(r) - - # set trust region update scheme - radius_update_scheme = alg.radius_update_scheme - - # set default type for all trust region parameters - trustType = floatType - if radius_update_scheme == RadiusUpdateSchemes.NLsolve - max_trust_radius = convert(trustType, Inf) - initial_trust_radius = internalnorm(u0) > 0 ? 
convert(trustType, internalnorm(u0)) : - one(trustType) - else - max_trust_radius = convert(trustType, alg.max_trust_radius) - if iszero(max_trust_radius) - max_trust_radius = convert(trustType, - max(internalnorm(fu), maximum(u) - minimum(u))) - end - initial_trust_radius = convert(trustType, alg.initial_trust_radius) - if iszero(initial_trust_radius) - initial_trust_radius = convert(trustType, max_trust_radius / 11) - end - end - step_threshold = convert(trustType, alg.step_threshold) - shrink_threshold = convert(trustType, alg.shrink_threshold) - expand_threshold = convert(trustType, alg.expand_threshold) - shrink_factor = convert(trustType, alg.shrink_factor) - expand_factor = convert(trustType, alg.expand_factor) - - # Parameters for the Schemes - p1 = convert(floatType, 0.0) - p2 = convert(floatType, 0.0) - p3 = convert(floatType, 0.0) - p4 = convert(floatType, 0.0) - ϵ = convert(floatType, 1.0e-8) - vjp_operator = nothing - if radius_update_scheme === RadiusUpdateSchemes.NLsolve - p1 = convert(floatType, 0.5) - elseif radius_update_scheme === RadiusUpdateSchemes.Hei - step_threshold = convert(trustType, 0.0) - shrink_threshold = convert(trustType, 0.25) - expand_threshold = convert(trustType, 0.25) - p1 = convert(floatType, 5.0) # M - p2 = convert(floatType, 0.1) # β - p3 = convert(floatType, 0.15) # γ1 - p4 = convert(floatType, 0.15) # γ2 - initial_trust_radius = convert(trustType, 1.0) - elseif radius_update_scheme === RadiusUpdateSchemes.Yuan - step_threshold = convert(trustType, 0.0001) - shrink_threshold = convert(trustType, 0.25) - expand_threshold = convert(trustType, 0.25) - p1 = convert(floatType, 2.0) # μ - p2 = convert(floatType, 1 / 6) # c5 - p3 = convert(floatType, 6.0) # c6 - vjp_operator = __gradient_operator(uf, u; fu, - autodiff = __get_nonsparse_ad(alg.vjp_autodiff)) - @bb Jᵀf = vjp_operator × fu - initial_trust_radius = convert(trustType, p1 * internalnorm(Jᵀf)) - elseif radius_update_scheme === RadiusUpdateSchemes.Fan - step_threshold = 
convert(trustType, 0.0001) - shrink_threshold = convert(trustType, 0.25) - expand_threshold = convert(trustType, 0.75) - p1 = convert(floatType, 0.1) # μ - p2 = convert(floatType, 0.25) # c5 - p3 = convert(floatType, 12.0) # c6 - p4 = convert(floatType, 1.0e18) # M - initial_trust_radius = convert(trustType, p1 * (internalnorm(fu)^0.99)) - elseif radius_update_scheme === RadiusUpdateSchemes.Bastin - step_threshold = convert(trustType, 0.05) - shrink_threshold = convert(trustType, 0.05) - expand_threshold = convert(trustType, 0.9) - p1 = convert(floatType, 2.5) # alpha_1 - p2 = convert(floatType, 0.25) # alpha_2 - initial_trust_radius = convert(trustType, 1.0) - end - - abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fu, u, - termination_condition) - trace = init_nonlinearsolve_trace(alg, u, fu, ApplyArray(__zero, J), du; kwargs...) - - return TrustRegionCache{iip}(f, alg, u, u_cache, u_cache_2, u_gauss_newton, u_cauchy, - fu, fu_cache, fu_cache_2, J, J_cache, JᵀJ, Jᵀf, p, uf, du, lr_mul_cache, linsolve, - jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, reltol, prob, - radius_update_scheme, initial_trust_radius, max_trust_radius, step_threshold, - shrink_threshold, expand_threshold, shrink_factor, expand_factor, loss, loss_new, - shrink_counter, make_new_J, r, p1, p2, p3, p4, ϵ, vjp_operator, - NLStats(1, 0, 0, 0, 0), tc_cache, trace) -end - -function perform_step!(cache::TrustRegionCache{iip}) where {iip} - if cache.make_new_J - cache.J = jacobian!!(cache.J, cache) - - __update_JᵀJ!(cache) - __update_Jᵀf!(cache) - - # do not use A = cache.H, b = _vec(cache.g) since it is equivalent - # to A = cache.J, b = _vec(fu) as long as the Jacobian is non-singular - linres = dolinsolve(cache, cache.alg.precs, cache.linsolve, A = cache.J, - b = _vec(cache.fu), linu = _vec(cache.u_gauss_newton), p = cache.p, - reltol = cache.abstol) - cache.linsolve = linres.cache - cache.u_gauss_newton = _restructure(cache.u_gauss_newton, linres.u) - @bb 
@. cache.u_gauss_newton *= -1 - end - - # compute dogleg step - dogleg!(cache) - - # compute the potentially new u - @bb @. cache.u_cache_2 = cache.u + cache.du - evaluate_f(cache, cache.u_cache_2, cache.p, Val{:fu_cache_2}()) - trust_region_step!(cache) - return nothing -end - -function retrospective_step!(cache::TrustRegionCache{iip}) where {iip} - J = jacobian!!(cache.J_cache, cache) - __update_JᵀJ!(cache, J) - __update_Jᵀf!(cache, J) - - num = __trust_region_loss(cache, cache.fu) - __trust_region_loss(cache, cache.fu_cache) - denom = dot(_vec(cache.du), _vec(cache.Jᵀf)) + __lr_mul(cache, cache.JᵀJ, cache.du) / 2 - return num / denom -end - -function trust_region_step!(cache::TrustRegionCache) - cache.loss_new = __trust_region_loss(cache, cache.fu_cache_2) - - # Compute the ratio of the actual reduction to the predicted reduction. - cache.r = -(cache.loss - cache.loss_new) / - (dot(_vec(cache.du), _vec(cache.Jᵀf)) + - __lr_mul(cache, cache.JᵀJ, _vec(cache.du)) / 2) - - @unpack r, radius_update_scheme = cache - make_new_J = false - if r > cache.step_threshold - take_step!(cache) - cache.loss = cache.loss_new - make_new_J = true - end - - if radius_update_scheme === RadiusUpdateSchemes.Simple - if r < cache.shrink_threshold - cache.trust_r *= cache.shrink_factor - cache.shrink_counter += 1 - else - cache.shrink_counter = 0 - if r > cache.step_threshold && r > cache.expand_threshold - cache.trust_r = min(cache.expand_factor * cache.trust_r, cache.max_trust_r) - end - end - elseif radius_update_scheme === RadiusUpdateSchemes.NLsolve - if r < 1 // 10 - cache.shrink_counter += 1 - cache.trust_r *= 1 // 2 - else - cache.shrink_counter = 0 - if r ≥ 9 // 10 - cache.trust_r = 2 * cache.internalnorm(cache.du) - elseif r ≥ 1 // 2 - cache.trust_r = max(cache.trust_r, 2 * cache.internalnorm(cache.du)) - end - end - elseif radius_update_scheme === RadiusUpdateSchemes.NocedalWright - if r < 1 // 4 - cache.shrink_counter += 1 - cache.trust_r = (1 // 4) * 
cache.internalnorm(cache.du) - else - cache.shrink_counter = 0 - if r > 3 // 4 && - abs(cache.internalnorm(cache.du) - cache.trust_r) < 1e-6 * cache.trust_r - cache.trust_r = min(2 * cache.trust_r, cache.max_trust_r) - end - end - elseif radius_update_scheme === RadiusUpdateSchemes.Hei - @unpack shrink_threshold, p1, p2, p3, p4 = cache - tr_new = __rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(cache.du) - if tr_new < cache.trust_r - cache.shrink_counter += 1 - else - cache.shrink_counter = 0 - end - cache.trust_r = tr_new - - cache.internalnorm(cache.Jᵀf) < cache.ϵ && (cache.force_stop = true) - elseif radius_update_scheme === RadiusUpdateSchemes.Yuan - if r < cache.shrink_threshold - cache.p1 = cache.p2 * cache.p1 - cache.shrink_counter += 1 - else - if r ≥ cache.expand_threshold && - cache.internalnorm(cache.du) > cache.trust_r / 2 - cache.p1 = cache.p3 * cache.p1 - end - cache.shrink_counter = 0 - end - - @bb cache.Jᵀf = cache.vjp_operator × vec(cache.fu) - cache.trust_r = cache.p1 * cache.internalnorm(cache.Jᵀf) - - cache.internalnorm(cache.Jᵀf) < cache.ϵ && (cache.force_stop = true) - elseif radius_update_scheme === RadiusUpdateSchemes.Fan - if r < cache.shrink_threshold - cache.p1 *= cache.p2 - cache.shrink_counter += 1 - else - cache.shrink_counter = 0 - r > cache.expand_threshold && (cache.p1 = min(cache.p1 * cache.p3, cache.p4)) - end - cache.trust_r = cache.p1 * (cache.internalnorm(cache.fu)^0.99) - cache.internalnorm(cache.Jᵀf) < cache.ϵ && (cache.force_stop = true) - elseif radius_update_scheme === RadiusUpdateSchemes.Bastin - if r > cache.step_threshold - if retrospective_step!(cache) ≥ cache.expand_threshold - cache.trust_r = max(cache.p1 * cache.internalnorm(cache.du), cache.trust_r) - end - cache.shrink_counter = 0 - else - cache.trust_r *= cache.p2 - cache.shrink_counter += 1 - end - end - - update_trace!(cache.trace, cache.stats.nsteps + 1, cache.u, cache.fu, cache.J, - @~(cache.u.-cache.u_cache)) - check_and_update!(cache, 
cache.fu, cache.u, cache.u_cache) -end - -function dogleg!(cache::TrustRegionCache{iip}) where {iip} - # Take the full Gauss-Newton step if lies within the trust region. - if cache.internalnorm(cache.u_gauss_newton) ≤ cache.trust_r - @bb copyto!(cache.du, cache.u_gauss_newton) - return - end - - # Take intersection of steepest descent direction and trust region if Cauchy point lies - # outside of trust region - l_grad = cache.internalnorm(cache.Jᵀf) # length of the gradient - d_cauchy = l_grad^3 / __lr_mul(cache) - g = _restructure(cache.du, cache.Jᵀf) - if d_cauchy ≥ cache.trust_r - # step to the end of the trust region - @bb @. cache.du = -(cache.trust_r / l_grad) * g - return - end - - # Take the intersection of dogleg with trust region if Cauchy point lies inside the - # trust region - @bb @. cache.u_cauchy = -(d_cauchy / l_grad) * g # compute Cauchy point - @bb @. cache.u_cache_2 = cache.u_gauss_newton - cache.u_cauchy # calf of the dogleg - - a = dot(cache.u_cache_2, cache.u_cache_2) - b = 2 * dot(cache.u_cauchy, cache.u_cache_2) - c = d_cauchy^2 - cache.trust_r^2 - # technically guaranteed to be non-negative but hedging against floating point issues - aux = max(b^2 - 4 * a * c, 0) - # stepsize along dogleg to trust region boundary - τ = (-b + sqrt(aux)) / (2 * a) - - @bb @. 
cache.du = cache.u_cauchy + τ * cache.u_cache_2 - return -end - -function take_step!(cache::TrustRegionCache) - @bb copyto!(cache.u_cache, cache.u) - @bb copyto!(cache.u, cache.u_cache_2) - @bb copyto!(cache.fu_cache, cache.fu) - @bb copyto!(cache.fu, cache.fu_cache_2) -end - -function not_terminated(cache::TrustRegionCache) - non_shrink_terminated = cache.force_stop || cache.stats.nsteps ≥ cache.maxiters - # Terminated due to convergence or maxiters - non_shrink_terminated && return false - # Terminated due to too many shrink - shrink_terminated = cache.shrink_counter ≥ cache.alg.max_shrink_times - if shrink_terminated - cache.retcode = ReturnCode.ConvergenceFailure - return false - end - return true -end - -# FIXME: Reinit `JᵀJ` operator if `p` is changed -function __reinit_internal!(cache::TrustRegionCache; kwargs...) - if cache.vjp_operator !== nothing - cache.vjp_operator = __gradient_operator(cache.uf, cache.u; cache.fu, - autodiff = __get_nonsparse_ad(cache.alg.ad)) - @bb cache.Jᵀf = cache.vjp_operator × cache.fu - end - cache.loss = __trust_region_loss(cache, cache.fu) - cache.loss_new = cache.loss - cache.shrink_counter = 0 - cache.trust_r = convert(eltype(cache.u), - ifelse(cache.alg.initial_trust_radius == 0, cache.max_trust_r / 11, - cache.alg.initial_trust_radius)) - cache.make_new_J = true - return nothing -end - -__trust_region_loss(cache::TrustRegionCache, x) = __trust_region_loss(cache.internalnorm, x) -__trust_region_loss(nf::F, x) where {F} = nf(x)^2 / 2 - -# R-function for adaptive trust region method -function __rfunc(r::R, c2::R, M::R, γ1::R, γ2::R, β::R) where {R <: Real} - return ifelse(r ≥ c2, - (2 * (M - 1 - γ2) * atan(r - c2) + (1 + γ2)) / R(π), - (1 - γ1 - β) * (exp(r - c2) + β / (1 - γ1 - β))) -end diff --git a/src/utils.jl b/src/utils.jl index 9bf4f6987..ef8fdf713 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,358 +1,63 @@ -const DEFAULT_NORM = DiffEqBase.NONLINEARSOLVE_DEFAULT_NORM - +# Defaults +@inline DEFAULT_NORM(args...) 
= DiffEqBase.NONLINEARSOLVE_DEFAULT_NORM(args...) +@inline DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing @inline DEFAULT_TOLERANCE(args...) = DiffEqBase._get_tolerance(args...) -@concrete mutable struct FakeLinearSolveJLCache - A - b -end - -@concrete struct FakeLinearSolveJLResult - cache - u -end - -# Ignores NaN -function __findmin(f, x) - return findmin(x) do xᵢ - fx = f(xᵢ) - return isnan(fx) ? Inf : fx +# Helper Functions +@static if VERSION ≤ v"1.10-" + @inline @generated function __hasfield(::T, ::Val{field}) where {T, field} + return :($(field ∉ fieldnames(T))) end +else + @inline __hasfield(::T, ::Val{field}) where {T, field} = hasfield(T, field) end -struct NonlinearSolveTag end - -function ForwardDiff.checktag(::Type{<:ForwardDiff.Tag{<:NonlinearSolveTag, <:T}}, f::F, - x::AbstractArray{T}) where {T, F} - return true +@generated function __getproperty(s::S, ::Val{X}) where {S, X} + hasfield(S, X) && return :(s.$X) + return :(missing) end -""" - value_derivative(f, x) +@inline __needs_concrete_A(::Nothing) = false +@inline __needs_concrete_A(linsolve) = needs_concrete_A(linsolve) -Compute `f(x), d/dx f(x)` in the most efficient way. 
-""" -function value_derivative(f::F, x::R) where {F, R} - T = typeof(ForwardDiff.Tag(f, R)) - out = f(ForwardDiff.Dual{T}(x, one(x))) - ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out) -end +@inline __maybe_mutable(x, ::AutoSparseEnzyme) = __mutable(x) +@inline __maybe_mutable(x, _) = x -@inline value(x) = x -@inline value(x::Dual) = ForwardDiff.value(x) -@inline value(x::AbstractArray{<:Dual}) = map(ForwardDiff.value, x) - -@inline _vec(v) = vec(v) +@inline @generated function _vec(v) + hasmethod(vec, Tuple{typeof(v)}) || return :(vec(v)) + return :(v) +end @inline _vec(v::Number) = v @inline _vec(v::AbstractVector) = v @inline _restructure(y, x) = restructure(y, x) @inline _restructure(y::Number, x::Number) = x -DEFAULT_PRECS(W, du, u, p, t, newW, Plprev, Prprev, cachedata) = nothing, nothing - -function dolinsolve(cache, precs::P, linsolve::FakeLinearSolveJLCache; A = nothing, - linu = nothing, b = nothing, du = nothing, p = nothing, weight = nothing, - cachedata = nothing, reltol = nothing, reuse_A_if_factorization = false) where {P} - # Update Statistics - cache.stats.nsolve += 1 - cache.stats.nfactors += !(A isa Number) - - A !== nothing && (linsolve.A = A) - b !== nothing && (linsolve.b = b) - linres = linsolve.A \ linsolve.b - return FakeLinearSolveJLResult(linsolve, linres) -end - -function dolinsolve(cache, precs::P, linsolve; A = nothing, linu = nothing, b = nothing, - du = nothing, p = nothing, weight = nothing, cachedata = nothing, reltol = nothing, - reuse_A_if_factorization = false) where {P} - # Update Statistics - cache.stats.nsolve += 1 - cache.stats.nfactors += 1 - - # Some Algorithms would reuse factorization but it causes the cache to not reset in - # certain cases - if A !== nothing - alg = __getproperty(linsolve, Val(:alg)) - if alg !== nothing && ((alg isa LinearSolve.AbstractFactorization) || - (alg isa LinearSolve.DefaultLinearSolver && !(alg == - 
LinearSolve.DefaultLinearSolver(LinearSolve.DefaultAlgorithmChoice.KrylovJL_GMRES)))) - # Factorization Algorithm - if reuse_A_if_factorization - cache.stats.nfactors -= 1 - else - linsolve.A = A - end - else - linsolve.A = A - end - else - cache.stats.nfactors -= 1 - end - b !== nothing && (linsolve.b = b) - linu !== nothing && (linsolve.u = linu) - - Plprev = linsolve.Pl isa ComposePreconditioner ? linsolve.Pl.outer : linsolve.Pl - Prprev = linsolve.Pr isa ComposePreconditioner ? linsolve.Pr.outer : linsolve.Pr - - _Pl, _Pr = precs(linsolve.A, du, linu, p, nothing, A !== nothing, Plprev, Prprev, - cachedata) - if (_Pl !== nothing || _Pr !== nothing) - _weight = weight === nothing ? - (linsolve.Pr isa Diagonal ? linsolve.Pr.diag : linsolve.Pr.inner.diag) : - weight - Pl, Pr = wrapprecs(_Pl, _Pr, _weight) - linsolve.Pl = Pl - linsolve.Pr = Pr - end - - linres = reltol === nothing ? solve!(linsolve) : solve!(linsolve; reltol) - - return linres -end - -function wrapprecs(_Pl, _Pr, weight) - if _Pl !== nothing - Pl = ComposePreconditioner(InvPreconditioner(Diagonal(_vec(weight))), _Pl) - else - Pl = InvPreconditioner(Diagonal(_vec(weight))) - end - - if _Pr !== nothing - Pr = ComposePreconditioner(Diagonal(_vec(weight)), _Pr) - else - Pr = Diagonal(_vec(weight)) - end - - return Pl, Pr -end - -concrete_jac(_) = nothing -concrete_jac(::AbstractNewtonAlgorithm{CJ}) where {CJ} = CJ - -_mutable_zero(x) = zero(x) -_mutable_zero(x::SArray) = MArray(x) - -_mutable(x) = x -_mutable(x::SArray) = MArray(x) - -# __maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x) -# The shadow allocated for Enzyme needs to be mutable -__maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x) -__maybe_mutable(x, _) = x - -# Helper function to get value of `f(u, p)` -function evaluate_f(prob::Union{NonlinearProblem{uType, iip}, - NonlinearLeastSquaresProblem{uType, iip}}, u) where {uType, iip} - @unpack f, u0, p = prob - if iip - fu = f.resid_prototype === nothing ? 
similar(u) : f.resid_prototype - f(fu, u, p) - else - fu = f(u, p) - end - return fu -end - -function evaluate_f(f::F, u, p, ::Val{iip}; fu = nothing) where {F, iip} - if iip - f(fu, u, p) - return fu - else - return f(u, p) - end -end - -function evaluate_f(cache::AbstractNonlinearSolveCache, u, p, - fu_sym::Val{FUSYM} = Val(nothing)) where {FUSYM} - cache.stats.nf += 1 - if FUSYM === nothing - if isinplace(cache) - cache.prob.f(get_fu(cache), u, p) - else - set_fu!(cache, cache.prob.f(u, p)) - end - else - if isinplace(cache) - cache.prob.f(__getproperty(cache, fu_sym), u, p) - else - setproperty!(cache, FUSYM, cache.prob.f(u, p)) - end - end - return nothing -end - -# Concretize Algorithms -function get_concrete_algorithm(alg, prob) - !hasfield(typeof(alg), :ad) && return alg - alg.ad isa ADTypes.AbstractADType && return alg - - # Figure out the default AD - # Now that we have handed trivial cases, we can allow extending this function - # for specific algorithms - return __get_concrete_algorithm(alg, prob) -end - -function __get_concrete_algorithm(alg, prob) - @unpack sparsity, jac_prototype = prob.f - use_sparse_ad = sparsity !== nothing || jac_prototype !== nothing - ad = if !ForwardDiff.can_dual(eltype(prob.u0)) - # Use Finite Differencing - use_sparse_ad ? AutoSparseFiniteDiff() : AutoFiniteDiff() - else - (use_sparse_ad ? 
AutoSparseForwardDiff : AutoForwardDiff)(; - tag = ForwardDiff.Tag(NonlinearSolveTag(), eltype(prob.u0))) - end - return set_ad(alg, ad) -end - -function init_termination_cache(abstol, reltol, du, u, ::Nothing) - return init_termination_cache(abstol, reltol, du, u, AbsSafeBestTerminationMode()) -end -function init_termination_cache(abstol, reltol, du, u, tc::AbstractNonlinearTerminationMode) - tc_cache = init(du, u, tc; abstol, reltol) - return DiffEqBase.get_abstol(tc_cache), DiffEqBase.get_reltol(tc_cache), tc_cache -end - -function check_and_update!(cache, fu, u, uprev) - return check_and_update!(cache.tc_cache, cache, fu, u, uprev) -end -function check_and_update!(tc_cache, cache, fu, u, uprev) - return check_and_update!(tc_cache, cache, fu, u, uprev, - DiffEqBase.get_termination_mode(tc_cache)) -end -function check_and_update!(tc_cache, cache, fu, u, uprev, - mode::AbstractNonlinearTerminationMode) - if tc_cache(fu, u, uprev) - # Just a sanity measure! - if isinplace(cache) - cache.prob.f(get_fu(cache), u, cache.prob.p) - else - set_fu!(cache, cache.prob.f(u, cache.prob.p)) - end - cache.force_stop = true - end -end -function check_and_update!(tc_cache, cache, fu, u, uprev, - mode::AbstractSafeNonlinearTerminationMode) - if tc_cache(fu, u, uprev) - if tc_cache.retcode == NonlinearSafeTerminationReturnCode.Success - cache.retcode = ReturnCode.Success - end - if tc_cache.retcode == NonlinearSafeTerminationReturnCode.PatienceTermination - cache.retcode = ReturnCode.ConvergenceFailure - end - if tc_cache.retcode == NonlinearSafeTerminationReturnCode.ProtectiveTermination - cache.retcode = ReturnCode.Unstable - end - # Just a sanity measure! 
- if isinplace(cache) - cache.prob.f(get_fu(cache), u, cache.prob.p) - else - set_fu!(cache, cache.prob.f(u, cache.prob.p)) - end - cache.force_stop = true - end -end -function check_and_update!(tc_cache, cache, fu, u, uprev, - mode::AbstractSafeBestNonlinearTerminationMode) - if tc_cache(fu, u, uprev) - if tc_cache.retcode == NonlinearSafeTerminationReturnCode.Success - cache.retcode = ReturnCode.Success - end - if tc_cache.retcode == NonlinearSafeTerminationReturnCode.PatienceTermination - cache.retcode = ReturnCode.ConvergenceFailure - end - if tc_cache.retcode == NonlinearSafeTerminationReturnCode.ProtectiveTermination - cache.retcode = ReturnCode.Unstable - end - if isinplace(cache) - copyto!(get_u(cache), tc_cache.u) - cache.prob.f(get_fu(cache), get_u(cache), cache.prob.p) - else - set_u!(cache, tc_cache.u) - set_fu!(cache, cache.prob.f(get_u(cache), cache.prob.p)) - end - cache.force_stop = true - end -end - -@inline __init_identity_jacobian(u::Number, fu, α = true) = oftype(u, α) -@inline @views function __init_identity_jacobian(u, fu, α = true) - J = similar(fu, promote_type(eltype(fu), eltype(u)), length(fu), length(u)) - fill!(J, zero(eltype(J))) - if fast_scalar_indexing(J) - @inbounds for i in axes(J, 1) - J[i, i] = α - end - else - J[diagind(J)] .= α - end - return J -end -@inline function __init_identity_jacobian(u::StaticArray, fu::StaticArray, α = true) - T = promote_type(eltype(fu), eltype(u)) - return MArray{Tuple{prod(Size(fu)), prod(Size(u))}, T}(I * α) -end -@inline function __init_identity_jacobian(u::SArray, fu::SArray, α = true) - T = promote_type(eltype(fu), eltype(u)) - return SArray{Tuple{prod(Size(fu)), prod(Size(u))}, T}(I * α) -end - -@inline __reinit_identity_jacobian!!(J::Number, α = true) = oftype(J, α) -@inline __reinit_identity_jacobian!!(J::AbstractVector, α = true) = fill!(J, α) -@inline @views function __reinit_identity_jacobian!!(J::AbstractMatrix, α = true) - fill!(J, zero(eltype(J))) - if fast_scalar_indexing(J) - 
@inbounds for i in axes(J, 1) - J[i, i] = α - end - else - J[diagind(J)] .= α - end - return J -end -@inline function __reinit_identity_jacobian!!(J::SVector, α = true) - return ones(SArray{Tuple{Size(J)[1]}, eltype(J)}) .* α -end -@inline function __reinit_identity_jacobian!!(J::SMatrix, α = true) - S = Size(J) - return SArray{Tuple{S[1], S[2]}, eltype(J)}(I) .* α -end - -function __init_low_rank_jacobian(u::StaticArray{S1, T1}, fu::StaticArray{S2, T2}, - ::Val{threshold}) where {S1, S2, T1, T2, threshold} - T = promote_type(T1, T2) - fuSize, uSize = Size(fu), Size(u) - Vᵀ = MArray{Tuple{threshold, prod(uSize)}, T}(undef) - U = MArray{Tuple{prod(fuSize), threshold}, T}(undef) - return U, Vᵀ -end -function __init_low_rank_jacobian(u, fu, ::Val{threshold}) where {threshold} - Vᵀ = similar(u, threshold, length(u)) - U = similar(u, length(fu), threshold) - return U, Vᵀ +@inline function __init_ones(x) + w = similar(x) + recursivefill!(w, true) + return w end +@inline __init_ones(x::StaticArray) = ones(typeof(x)) -@inline __is_ill_conditioned(x::Number) = iszero(x) -@inline __is_ill_conditioned(x::AbstractMatrix) = cond(x) ≥ - inv(eps(real(eltype(x)))^(1 // 2)) -@inline __is_ill_conditioned(x::AbstractVector) = any(iszero, x) -@inline __is_ill_conditioned(x) = false - -# Safe getproperty -@generated function __getproperty(s::S, ::Val{X}) where {S, X} - hasfield(S, X) && return :(s.$X) - return :(nothing) +@inline __maybe_unaliased(x::Union{Number, SArray}, ::Bool) = x +@inline function __maybe_unaliased(x::AbstractArray, alias::Bool) + # Spend time coping iff we will mutate the array + (alias || !__can_setindex(typeof(x))) && return x + return deepcopy(x) end +@inline __maybe_unaliased(x::AbstractNonlinearSolveOperator, alias::Bool) = x -# Non-square matrix -@inline __needs_square_A(_, ::Number) = true -@inline __needs_square_A(alg, _) = LinearSolve.needs_square_A(alg.linsolve) +@inline __cond(J::AbstractMatrix) = cond(J) +@inline __cond(J::SVector) = 
__cond(Diagonal(MVector(J))) +@inline __cond(J::AbstractVector) = __cond(Diagonal(J)) +@inline __cond(J::ApplyArray) = __cond(J.f(J.args...)) +@inline __cond(J::SparseMatrixCSC) = __cond(Matrix(J)) +@inline __cond(J) = -1 # Covers cases where `J` is a Operator, nothing, etc. -# Define special concatenation for certain Array combinations -@inline _vcat(x, y) = vcat(x, y) +@inline __copy(x::AbstractArray) = copy(x) +@inline __copy(x::Number) = x +@inline __copy(x) = x # LazyArrays for tracing __zero(x::AbstractArray) = zero(x) @@ -362,43 +67,6 @@ LazyArrays.applied_ndims(::typeof(__zero), x) = ndims(x) LazyArrays.applied_size(::typeof(__zero), x) = size(x) LazyArrays.applied_axes(::typeof(__zero), x) = axes(x) -# Safe Inverse: Try to use `inv` but if lu fails use `pinv` -@inline __safe_inv(A::Number) = pinv(A) -@inline __safe_inv(A::AbstractMatrix) = pinv(A) -@inline __safe_inv(A::AbstractVector) = __safe_inv(Diagonal(A)).diag -@inline __safe_inv(A::ApplyArray) = __safe_inv(A.f(A.args...)) -@inline function __safe_inv(A::StridedMatrix{T}) where {T} - LinearAlgebra.checksquare(A) - if istriu(A) - A_ = UpperTriangular(A) - issingular = any(iszero, @view(A_[diagind(A_)])) - !issingular && return triu!(parent(inv(A_))) - elseif istril(A) - A_ = LowerTriangular(A) - issingular = any(iszero, @view(A_[diagind(A_)])) - !issingular && return tril!(parent(inv(A_))) - else - F = lu(A; check = false) - if issuccess(F) - Ai = LinearAlgebra.inv!(F) - return convert(typeof(parent(Ai)), Ai) - end - end - return pinv(A) -end -@inline __safe_inv(A::SparseMatrixCSC) = __safe_inv(Matrix(A)) - -LazyArrays.applied_eltype(::typeof(__safe_inv), x) = eltype(x) -LazyArrays.applied_ndims(::typeof(__safe_inv), x) = ndims(x) -LazyArrays.applied_size(::typeof(__safe_inv), x) = size(x) -LazyArrays.applied_axes(::typeof(__safe_inv), x) = axes(x) - -# SparseAD --> NonSparseAD -@inline __get_nonsparse_ad(::AutoSparseForwardDiff) = AutoForwardDiff() -@inline __get_nonsparse_ad(::AutoSparseFiniteDiff) 
= AutoFiniteDiff() -@inline __get_nonsparse_ad(::AutoSparseZygote) = AutoZygote() -@inline __get_nonsparse_ad(ad) = ad - # Use Symmetric Matrices if known to be efficient @inline __maybe_symmetric(x) = Symmetric(x) @inline __maybe_symmetric(x::Number) = x @@ -407,100 +75,85 @@ LazyArrays.applied_axes(::typeof(__safe_inv), x) = axes(x) @inline __maybe_symmetric(x::SparseArrays.AbstractSparseMatrix) = x @inline __maybe_symmetric(x::SciMLOperators.AbstractSciMLOperator) = x -# Unalias -@inline __maybe_unaliased(x::Union{Number, SArray}, ::Bool) = x -@inline function __maybe_unaliased(x::AbstractArray, alias::Bool) - # Spend time coping iff we will mutate the array - (alias || !can_setindex(typeof(x))) && return x - return deepcopy(x) -end +# SparseAD --> NonSparseAD +@inline __get_nonsparse_ad(::AutoSparseForwardDiff) = AutoForwardDiff() +@inline __get_nonsparse_ad(::AutoSparsePolyesterForwardDiff) = AutoPolyesterForwardDiff() +@inline __get_nonsparse_ad(::AutoSparseFiniteDiff) = AutoFiniteDiff() +@inline __get_nonsparse_ad(::AutoSparseZygote) = AutoZygote() +@inline __get_nonsparse_ad(ad) = ad -# Init ones -@inline function __init_ones(x) - w = similar(x) - recursivefill!(w, true) - return w -end -@inline __init_ones(x::StaticArray) = ones(typeof(x)) +# Simple Checks +@inline __is_present(::Nothing) = false +@inline __is_present(::Missing) = false +@inline __is_present(::Any) = true +@inline __is_present(::NoLineSearch) = false -# Diagonal of type `u` -__init_diagonal(u::Number, v) = oftype(u, v) -function __init_diagonal(u::SArray, v) - u_ = vec(u) - return Diagonal(ones(typeof(u_)) * v) -end -function __init_diagonal(u, v) - d = similar(vec(u)) - d .= v - return Diagonal(d) -end +@inline __is_complex(::Type{ComplexF64}) = true +@inline __is_complex(::Type{ComplexF32}) = true +@inline __is_complex(::Type{Complex}) = true +@inline __is_complex(::Type{T}) where {T} = false -# Reduce sum -function __sum_JᵀJ!!(y, J) - if setindex_trait(y) === CanSetindex() - sum!(abs2, 
y, J') - return y - else - return sum(abs2, J'; dims = 1) +function __findmin(f, x) + return findmin(x) do xᵢ + fx = f(xᵢ) + return isnan(fx) ? Inf : fx end end -# Alpha for Initial Jacobian Guess -# The values are somewhat different from SciPy, these were tuned to the 23 test problems -@inline function __initial_inv_alpha(α::Number, u, fu, norm::F) where {F} - return convert(promote_type(eltype(u), eltype(fu)), inv(α)) -end -@inline function __initial_inv_alpha(::Nothing, u, fu, norm::F) where {F} - norm_fu = norm(fu) - return ifelse(norm_fu ≥ 1e-5, max(norm(u), true) / (2 * norm_fu), - convert(promote_type(eltype(u), eltype(fu)), true)) -end -@inline __initial_inv_alpha(inv_α, α::Number, u, fu, norm::F) where {F} = inv_α -@inline function __initial_inv_alpha(inv_α, α::Nothing, u, fu, norm::F) where {F} - return __initial_inv_alpha(α, u, fu, norm) -end +@inline __can_setindex(x) = can_setindex(x) +@inline __can_setindex(::Number) = false -@inline function __initial_alpha(α::Number, u, fu, norm::F) where {F} - return convert(promote_type(eltype(u), eltype(fu)), α) -end -@inline function __initial_alpha(::Nothing, u, fu, norm::F) where {F} - norm_fu = norm(fu) - return ifelse(1e-5 ≤ norm_fu ≤ 1e5, max(norm(u), true) / (2 * norm_fu), - convert(promote_type(eltype(u), eltype(fu)), true)) -end -@inline __initial_alpha(α_initial, α::Number, u, fu, norm::F) where {F} = α_initial -@inline function __initial_alpha(α_initial, α::Nothing, u, fu, norm::F) where {F} - return __initial_alpha(α, u, fu, norm) +@inline function __mutable(x) + __can_setindex(x) && return x + y = similar(x) + copyto!(y, x) + return y end +@inline __mutable(x::SArray) = MArray(x) -# Diagonal -@inline function __get_diagonal!!(J::AbstractVector, J_full::AbstractMatrix) - if can_setindex(J) - if fast_scalar_indexing(J) - @inbounds for i in eachindex(J) - J[i] = J_full[i, i] - end - else - J .= view(J_full, diagind(J_full)) - end - else - J = __diag(J_full) - end - return J +@inline __dot(x, y) = 
dot(_vec(x), _vec(y)) + +# Return an ImmutableNLStats object when we know that NLStats won't be updated +""" + ImmutableNLStats(nf, njacs, nfactors, nsolve, nsteps) + +Statistics from the nonlinear equation solver about the solution process. + +## Fields + + - nf: Number of function evaluations. + - njacs: Number of Jacobians created during the solve. + - nfactors: Number of factorzations of the jacobian required for the solve. + - nsolve: Number of linear solves `W\b` required for the solve. + - nsteps: Total number of iterations for the nonlinear solver. +""" +struct ImmutableNLStats + nf::Int + njacs::Int + nfactors::Int + nsolve::Int + nsteps::Int end -@inline function __get_diagonal!!(J::AbstractArray, J_full::AbstractMatrix) - return _restructure(J, __get_diagonal!!(_vec(J), J_full)) + +function Base.show(io::IO, ::MIME"text/plain", s::ImmutableNLStats) + println(io, summary(s)) + @printf io "%-50s %-d\n" "Number of function evaluations:" s.nf + @printf io "%-50s %-d\n" "Number of Jacobians created:" s.njacs + @printf io "%-50s %-d\n" "Number of factorizations:" s.nfactors + @printf io "%-50s %-d\n" "Number of linear solves:" s.nsolve + @printf io "%-50s %-d" "Number of nonlinear solver iterations:" s.nsteps end -@inline __get_diagonal!!(J::Number, J_full::Number) = J_full -@inline __diag(x::AbstractMatrix) = diag(x) -@inline __diag(x::AbstractVector) = x -@inline __diag(x::Number) = x +function Base.merge(s1::ImmutableNLStats, s2::ImmutableNLStats) + return ImmutableNLStats(s1.nf + s2.nf, s1.njacs + s2.njacs, s1.nfactors + s2.nfactors, + s1.nsolve + s2.nsolve, s1.nsteps + s2.nsteps) +end -@inline __is_complex(::Type{ComplexF64}) = true -@inline __is_complex(::Type{ComplexF32}) = true -@inline __is_complex(::Type{Complex}) = true -@inline __is_complex(::Type{T}) where {T} = false +""" + pickchunksize(x) = pickchunksize(length(x)) + pickchunksize(x::Int) -@inline __reshape(x::Number, args...) = x -@inline __reshape(x::AbstractArray, args...) 
= reshape(x, args...) +Determine the chunk size for ForwardDiff and PolyesterForwardDiff based on the input length. +""" +@inline pickchunksize(x) = pickchunksize(length(x)) +@inline pickchunksize(x::Int) = ForwardDiff.pickchunksize(x) diff --git a/test/core/23_test_problems.jl b/test/core/23_test_problems.jl index f3eeb58e6..8f1f07322 100644 --- a/test/core/23_test_problems.jl +++ b/test/core/23_test_problems.jl @@ -17,11 +17,11 @@ function test_on_library(problems, dicts, alg_ops, broken_tests, ϵ = 1e-4; skip = skip_tests !== nothing && idx in skip_tests[alg] if skip - @test_skip norm(res) ≤ ϵ + @test_skip norm(res, Inf) ≤ ϵ continue end broken = idx in broken_tests[alg] ? true : false - @test norm(res)≤ϵ broken=broken + @test norm(res, Inf)≤ϵ broken=broken catch err @error err broken = idx in broken_tests[alg] ? true : false @@ -45,27 +45,28 @@ end test_on_library(problems, dicts, alg_ops, broken_tests) end -@testset "TrustRegion 23 Test Problems" begin - alg_ops = (TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Simple), - TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan), - TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei), - TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan), - TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin), - TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.NLsolve)) - - broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [11, 21] - broken_tests[alg_ops[2]] = [11, 21] - broken_tests[alg_ops[3]] = [11, 21] - broken_tests[alg_ops[4]] = [11, 21] - broken_tests[alg_ops[5]] = [21] - broken_tests[alg_ops[6]] = [21] - - test_on_library(problems, dicts, alg_ops, broken_tests) -end +# @testset "TrustRegion 23 Test Problems" begin +# alg_ops = (TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Simple), +# TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan), +# TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei), +# 
TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan), +# TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin), +# TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.NLsolve)) + +# broken_tests = Dict(alg => Int[] for alg in alg_ops) +# broken_tests[alg_ops[1]] = [11, 21] +# broken_tests[alg_ops[2]] = [11, 21] +# broken_tests[alg_ops[3]] = [11, 21] +# broken_tests[alg_ops[4]] = [11, 21] +# broken_tests[alg_ops[5]] = [21] +# broken_tests[alg_ops[6]] = [21] + +# test_on_library(problems, dicts, alg_ops, broken_tests) +# end @testset "LevenbergMarquardt 23 Test Problems" begin - alg_ops = (LevenbergMarquardt(), LevenbergMarquardt(; α_geodesic = 0.1), + alg_ops = (LevenbergMarquardt(), + LevenbergMarquardt(; α_geodesic = 0.1), LevenbergMarquardt(; linsolve = CholeskyFactorization())) broken_tests = Dict(alg => Int[] for alg in alg_ops) @@ -86,19 +87,16 @@ end end @testset "Broyden 23 Test Problems" begin - alg_ops = (Broyden(), Broyden(; init_jacobian = Val(:true_jacobian)), + alg_ops = (Broyden(), + Broyden(; init_jacobian = Val(:true_jacobian)), Broyden(; update_rule = Val(:bad_broyden)), - Broyden(; init_jacobian = Val(:true_jacobian), update_rule = Val(:bad_broyden)), - Broyden(; update_rule = Val(:diagonal)), - Broyden(; init_jacobian = Val(:true_jacobian), update_rule = Val(:diagonal))) + Broyden(; init_jacobian = Val(:true_jacobian), update_rule = Val(:bad_broyden))) broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [1, 5, 11] + broken_tests[alg_ops[1]] = [1, 5, 11, 15] broken_tests[alg_ops[2]] = [1, 5, 8, 11, 18] broken_tests[alg_ops[3]] = [1, 5, 9, 11] - broken_tests[alg_ops[4]] = [1, 5, 6, 8, 11] - broken_tests[alg_ops[5]] = [1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 21] - broken_tests[alg_ops[6]] = [2, 3, 4, 5, 6, 8, 9, 11, 12, 21, 22] + broken_tests[alg_ops[4]] = [5, 6, 8, 11] test_on_library(problems, dicts, alg_ops, broken_tests) end @@ -107,17 +105,19 @@ end alg_ops = (Klement(), Klement(; init_jacobian = 
Val(:true_jacobian_diagonal))) broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [1, 2, 4, 5, 11, 22] + broken_tests[alg_ops[1]] = [1, 2, 4, 5, 11, 18, 22] broken_tests[alg_ops[2]] = [2, 4, 5, 7, 18, 22] test_on_library(problems, dicts, alg_ops, broken_tests) end @testset "PseudoTransient 23 Test Problems" begin - alg_ops = (PseudoTransient(; alpha_initial = 10.0),) + # PT relies on the root being a stable equilibrium for convergence, so it won't work on + # most problems + alg_ops = (PseudoTransient(),) broken_tests = Dict(alg => Int[] for alg in alg_ops) - broken_tests[alg_ops[1]] = [1, 9, 18, 21, 22] + broken_tests[alg_ops[1]] = [1, 2, 3, 11, 15, 16] test_on_library(problems, dicts, alg_ops, broken_tests) end diff --git a/test/core/nlls.jl b/test/core/nlls.jl index 331f84faa..07c0dbff2 100644 --- a/test/core/nlls.jl +++ b/test/core/nlls.jl @@ -29,7 +29,7 @@ prob_iip = NonlinearLeastSquaresProblem(NonlinearFunction(loss_function; nlls_problems = [prob_oop, prob_iip] solvers = [] -for linsolve in [nothing, LUFactorization(), KrylovJL_GMRES()] +for linsolve in [nothing, LUFactorization(), KrylovJL_GMRES(), KrylovJL_LSMR()] vjp_autodiffs = linsolve isa KrylovJL ? 
[nothing, AutoZygote(), AutoFiniteDiff()] : [nothing] for linesearch in [Static(), BackTracking(), HagerZhang(), StrongWolfe(), MoreThuente()], @@ -42,6 +42,8 @@ append!(solvers, [ LevenbergMarquardt(), LevenbergMarquardt(; linsolve = LUFactorization()), + LevenbergMarquardt(; linsolve = KrylovJL_GMRES()), + LevenbergMarquardt(; linsolve = KrylovJL_LSMR()), nothing, ]) for radius_update_scheme in [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.NocedalWright, @@ -66,7 +68,7 @@ end function vjp!(Jv, v, θ, p) resid = zeros(length(p)) J = ForwardDiff.jacobian((resid, θ) -> loss_function(resid, θ, p), resid, θ) - mul!(vec(Jv), v', J) + mul!(vec(Jv), transpose(J), v) return nothing end @@ -78,10 +80,6 @@ probs = [ ] for prob in probs, solver in solvers - !(solver isa GaussNewton) && continue - !(solver.linsolve isa KrylovJL) && continue - @test_warn "Currently we don't make use of user provided `jvp`. This is planned to be \ - fixed in the near future." sol=solve(prob, solver; maxiters = 10000, abstol = 1e-8) sol = solve(prob, solver; maxiters = 10000, abstol = 1e-8) @test maximum(abs, sol.resid) < 1e-6 end diff --git a/test/core/rootfind.jl b/test/core/rootfind.jl index 7092e18d8..ff26c3a08 100644 --- a/test/core/rootfind.jl +++ b/test/core/rootfind.jl @@ -1,6 +1,16 @@ using BenchmarkTools, LinearSolve, NonlinearSolve, StaticArrays, Random, LinearAlgebra, Test, ForwardDiff, Zygote, Enzyme, SparseDiffTools, DiffEqBase +function __autosparseenzyme() + @static if Sys.iswindows() + @warn "Enzyme on Windows stalls. Using AutoSparseFiniteDiff instead till \ + https://github.com/EnzymeAD/Enzyme.jl/issues/1236 is resolved." + return AutoSparseFiniteDiff() + else + return AutoSparseEnzyme() + end +end + _nameof(x) = applicable(nameof, x) ? 
nameof(x) : _nameof(typeof(x)) quadratic_f(u, p) = u .* u .- p @@ -43,7 +53,7 @@ const TERMINATION_CONDITIONS = [ StrongWolfe(), BackTracking(), HagerZhang(), MoreThuente()), ad in (AutoFiniteDiff(), AutoZygote()) - linesearch = LineSearch(; method = lsmethod, autodiff = ad) + linesearch = LineSearchesJL(; method = lsmethod, autodiff = ad) u0s = ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s @@ -95,7 +105,7 @@ const TERMINATION_CONDITIONS = [ @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p) @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (AutoSparseForwardDiff(), - AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0]) + AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), __autosparseenzyme()), u0 in (1.0, [1.0, 1.0]) probN = NonlinearProblem(quadratic_f, u0, 2.0) @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0)) end @@ -134,8 +144,6 @@ end @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme) linear_solver: $(linsolve)" for u0 in u0s, radius_update_scheme in radius_update_schemes, linsolve in linear_solvers - !(u0 isa Array) && linsolve !== nothing && continue - abstol = ifelse(linsolve isa KrylovJL, 1e-6, 1e-9) sol = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme, linsolve, abstol) @@ -177,7 +185,7 @@ end @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p) @testset "ADType: $(autodiff) u0: $(_nameof(u0)) radius_update_scheme: $(radius_update_scheme)" for autodiff in (AutoSparseForwardDiff(), - AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0]), + AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), __autosparseenzyme()), u0 in (1.0, [1.0, 1.0]), radius_update_scheme in radius_update_schemes probN = NonlinearProblem(quadratic_f, u0, 2.0) @@ -281,7 +289,7 @@ end end @testset "ADType: $(autodiff) u0: 
$(_nameof(u0))" for autodiff in (AutoSparseForwardDiff(), - AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0]) + AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), __autosparseenzyme()), u0 in (1.0, [1.0, 1.0]) probN = NonlinearProblem(quadratic_f, u0, 2.0) @test all(solve(probN, LevenbergMarquardt(; autodiff); abstol = 1e-9, reltol = 1e-9).u .≈ sqrt(2.0)) @@ -416,7 +424,7 @@ end probN = NonlinearProblem{false}(quadratic_f, [1.0, 1.0], 2.0) sol = solve(probN, alg, abstol = 1e-11) - @test all(abs.(quadratic_f(sol.u, 2.0)) .< 1e-10) + @test all(abs.(quadratic_f(sol.u, 2.0)) .< 1e-6) end end @@ -496,20 +504,12 @@ end @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p) @testset "ADType: $(autodiff) u0: $(_nameof(u0))" for autodiff in (AutoSparseForwardDiff(), - AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0]) + AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(), __autosparseenzyme()), u0 in (1.0, [1.0, 1.0]) probN = NonlinearProblem(quadratic_f, u0, 2.0) @test all(solve(probN, PseudoTransient(; alpha_initial = 10.0, autodiff)).u .≈ sqrt(2.0)) end - @testset "NewtonRaphson Fails but PT passes" begin # Test that `PseudoTransient` passes a test that `NewtonRaphson` fails on. 
- p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] - u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0] - probN = NonlinearProblem{false}(newton_fails, u0, p) - sol = solve(probN, PseudoTransient(alpha_initial = 1.0), abstol = 1e-10) - @test all(abs.(newton_fails(sol.u, p)) .< 1e-10) - end - @testset "Termination condition: $(termination_condition) u0: $(_nameof(u0))" for termination_condition in TERMINATION_CONDITIONS, u0 in (1.0, [1.0, 1.0]) @@ -543,7 +543,7 @@ end init_jacobian in (Val(:identity), Val(:true_jacobian)), update_rule in (Val(:good_broyden), Val(:bad_broyden), Val(:diagonal)) - linesearch = LineSearch(; method = lsmethod, autodiff = ad) + linesearch = LineSearchesJL(; method = lsmethod, autodiff = ad) u0s = ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s @@ -614,7 +614,7 @@ end ad in (AutoFiniteDiff(), AutoZygote()), init_jacobian in (Val(:identity), Val(:true_jacobian), Val(:true_jacobian_diagonal)) - linesearch = LineSearch(; method = lsmethod, autodiff = ad) + linesearch = LineSearchesJL(; method = lsmethod, autodiff = ad) u0s = ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s @@ -686,7 +686,7 @@ end LiFukushimaLineSearch()), ad in (AutoFiniteDiff(), AutoZygote()) - linesearch = LineSearch(; method = lsmethod, autodiff = ad) + linesearch = LineSearchesJL(; method = lsmethod, autodiff = ad) u0s = ([1.0, 1.0], @SVector[1.0, 1.0], 1.0) @testset "[OOP] u0: $(typeof(u0))" for u0 in u0s @@ -765,11 +765,17 @@ end prob = NonlinearProblem(NonlinearFunction{false}(F; jvp = JVP), u0, u0) sol = solve(prob, NewtonRaphson(; linsolve = KrylovJL_GMRES()); abstol = 1e-13) - - @test norm(F(sol.u, u0)) ≤ 1e-6 + @test norm(sol.resid, Inf) ≤ 1e-6 + sol = solve(prob, + TrustRegion(; linsolve = KrylovJL_GMRES(), vjp_autodiff = AutoFiniteDiff()); + abstol = 1e-13) + @test norm(sol.resid, Inf) ≤ 1e-6 prob = NonlinearProblem(NonlinearFunction{true}(F!; jvp = JVP!), u0, u0) sol = solve(prob, NewtonRaphson(; 
linsolve = KrylovJL_GMRES()); abstol = 1e-13) - - @test norm(F(sol.u, u0)) ≤ 1e-6 + @test norm(sol.resid, Inf) ≤ 1e-6 + sol = solve(prob, + TrustRegion(; linsolve = KrylovJL_GMRES(), vjp_autodiff = AutoFiniteDiff()); + abstol = 1e-13) + @test norm(sol.resid, Inf) ≤ 1e-6 end diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml index 371205fea..2c366f675 100644 --- a/test/gpu/Project.toml +++ b/test/gpu/Project.toml @@ -2,6 +2,7 @@ CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae" NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec" +StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" [compat] CUDA = "5" diff --git a/test/gpu/core.jl b/test/gpu/core.jl index 8459a2a2a..eff394853 100644 --- a/test/gpu/core.jl +++ b/test/gpu/core.jl @@ -1,27 +1,35 @@ -using CUDA, NonlinearSolve, LinearSolve +using CUDA, NonlinearSolve, LinearSolve, StableRNGs, Test CUDA.allowscalar(false) -A = cu(rand(4, 4)) -u0 = cu(rand(4)) -b = cu(rand(4)) +A = cu(rand(StableRNG(0), 4, 4)) +u0 = cu(rand(StableRNG(0), 4)) +b = cu(rand(StableRNG(0), 4)) linear_f(du, u, p) = (du .= A * u .+ b) prob = NonlinearProblem(linear_f, u0) -for alg in (NewtonRaphson(), LevenbergMarquardt(; linsolve = QRFactorization()), - PseudoTransient(; alpha_initial = 1.0f0), Klement(), Broyden(), - LimitedMemoryBroyden(), TrustRegion()) - @test_nowarn sol = solve(prob, alg; abstol = 1.0f-8, reltol = 1.0f-8) +SOLVERS = (NewtonRaphson(), LevenbergMarquardt(; linsolve = QRFactorization()), + LevenbergMarquardt(; linsolve = KrylovJL_GMRES()), PseudoTransient(), Klement(), + Broyden(; linesearch = LiFukushimaLineSearch()), + LimitedMemoryBroyden(; threshold = 2, linesearch = LiFukushimaLineSearch()), + DFSane(), TrustRegion(; linsolve = QRFactorization()), + TrustRegion(; linsolve = KrylovJL_GMRES(), concrete_jac = true), # Needed if Zygote not loaded + nothing) + +@testset "[IIP] GPU Solvers" begin + for alg in SOLVERS + @test_nowarn sol = solve(prob, alg; abstol = 
1.0f-5, reltol = 1.0f-5) + end end linear_f(u, p) = A * u .+ b prob = NonlinearProblem{false}(linear_f, u0) -for alg in (NewtonRaphson(), LevenbergMarquardt(; linsolve = QRFactorization()), - PseudoTransient(; alpha_initial = 1.0f0), Klement(), Broyden(), - LimitedMemoryBroyden(), TrustRegion()) - @test_nowarn sol = solve(prob, alg; abstol = 1.0f-8, reltol = 1.0f-8) +@testset "[OOP] GPU Solvers" begin + for alg in SOLVERS + @test_nowarn sol = solve(prob, alg; abstol = 1.0f-5, reltol = 1.0f-5) + end end diff --git a/test/misc/bruss.jl b/test/misc/bruss.jl index 729629c38..96f1a4241 100644 --- a/test/misc/bruss.jl +++ b/test/misc/bruss.jl @@ -40,14 +40,16 @@ end u0 = init_brusselator_2d(xyd_brusselator) prob_brusselator_2d = NonlinearProblem(brusselator_2d_loop, u0, p) -sol = solve(prob_brusselator_2d, NewtonRaphson()) -@test norm(sol.resid) < 1e-8 +sol = solve(prob_brusselator_2d, NewtonRaphson(); abstol = 1e-8) +@test norm(sol.resid, Inf) < 1e-8 -sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseForwardDiff())) -@test norm(sol.resid) < 1e-8 +sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseForwardDiff()); + abstol = 1e-8) +@test norm(sol.resid, Inf) < 1e-8 -sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseFiniteDiff())) -@test norm(sol.resid) < 1e-8 +sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseFiniteDiff()); + abstol = 1e-8) +@test norm(sol.resid, Inf) < 1e-8 du0 = copy(u0) jac_sparsity = Symbolics.jacobian_sparsity((du, u) -> brusselator_2d_loop(du, u, p), du0, @@ -56,16 +58,17 @@ jac_prototype = float.(jac_sparsity) fill!(jac_prototype, 0) @test all(iszero, jac_prototype) -ff = NonlinearFunction(brusselator_2d_loop; jac_prototype) -prob_brusselator_2d = NonlinearProblem(ff, u0, p) +ff_iip = NonlinearFunction(brusselator_2d_loop; jac_prototype) +prob_brusselator_2d = NonlinearProblem(ff_iip, u0, p) -sol = solve(prob_brusselator_2d, NewtonRaphson()) -@test norm(sol.resid) < 1e-8 +sol = 
solve(prob_brusselator_2d, NewtonRaphson(); abstol = 1e-8) +@test norm(sol.resid, Inf) < 1e-8 @test !all(iszero, jac_prototype) -sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseFiniteDiff())) -@test norm(sol.resid) < 1e-8 +sol = solve(prob_brusselator_2d, NewtonRaphson(autodiff = AutoSparseFiniteDiff()); + abstol = 1e-8) +@test norm(sol.resid, Inf) < 1e-8 cache = init(prob_brusselator_2d, NewtonRaphson(; autodiff = AutoSparseForwardDiff())); -@test maximum(cache.jac_cache.coloring.colorvec) == 12 -@test cache.alg.ad isa AutoSparseForwardDiff +@test maximum(cache.jac_cache.jac_cache.coloring.colorvec) == 12 +@test cache.jac_cache.autodiff isa AutoSparseForwardDiff diff --git a/test/misc/infeasible.jl b/test/misc/infeasible.jl deleted file mode 100644 index 74ec4128e..000000000 --- a/test/misc/infeasible.jl +++ /dev/null @@ -1,65 +0,0 @@ -using LinearAlgebra, NonlinearSolve, StaticArrays, Test - -# this is infeasible -function f1!(out, u, p) - μ = 3.986004415e14 - x = 7000.0e3 - y = -6.970561549987071e-9 - z = -3.784706123246018e-9 - v_x = 8.550491684548064e-12 + u[1] - v_y = 6631.60076191005 + u[2] - v_z = 3600.665431405663 + u[3] - r = @SVector [x, y, z] - v = @SVector [v_x, v_y, v_z] - h = cross(r, v) - ev = cross(v, h) / μ - r / norm(r) - i = acos(h[3] / norm(h)) - e = norm(ev) - a = 1 / (2 / norm(r) - (norm(v)^2 / μ)) - out .= [a - 42.0e6, e - 1e-5, i - 1e-5] - return nothing -end - -# this is unfeasible -function f1(u, p) - μ = 3.986004415e14 - x = 7000.0e3 - y = -6.970561549987071e-9 - z = -3.784706123246018e-9 - v_x = 8.550491684548064e-12 + u[1] - v_y = 6631.60076191005 + u[2] - v_z = 3600.665431405663 + u[3] - r = [x, y, z] - v = [v_x, v_y, v_z] - h = cross(r, v) - ev = cross(v, h) / μ - r / norm(r) - i = acos(h[3] / norm(h)) - e = norm(ev) - a = 1 / (2 / norm(r) - (norm(v)^2 / μ)) - return [a - 42.0e6, e - 1e-5, i - 1e-5] -end - -@testset "[IIP] Infeasible" begin - u0 = [0.0, 0.0, 0.0] - prob = NonlinearProblem(f1!, u0) - sol = 
solve(prob) - - @test all(!isnan, sol.u) - @test !SciMLBase.successful_retcode(sol.retcode) -end - -@testset "[OOP] Infeasible" begin - u0 = [0.0, 0.0, 0.0] - prob = NonlinearProblem(f1, u0) - sol = solve(prob) - - @test all(!isnan, sol.u) - @test !SciMLBase.successful_retcode(sol.retcode) - - u0 = @SVector [0.0, 0.0, 0.0] - prob = NonlinearProblem(f1, u0) - sol = solve(prob) - - @test all(!isnan, sol.u) - @test !SciMLBase.successful_retcode(sol.retcode) -end diff --git a/test/misc/no_ad.jl b/test/misc/no_ad.jl deleted file mode 100644 index 4dc8a1a8e..000000000 --- a/test/misc/no_ad.jl +++ /dev/null @@ -1,23 +0,0 @@ -using LinearAlgebra, NonlinearSolve, Test - -@testset "[IIP] no AD" begin - f_iip = Base.Experimental.@opaque (du, u, p) -> du .= u .* u .- p - u0 = [0.0] - prob = NonlinearProblem(f_iip, u0, 1.0) - for alg in [RobustMultiNewton(autodiff = AutoFiniteDiff()())] - sol = solve(prob, alg) - @test isapprox(only(sol.u), 1.0) - @test SciMLBase.successful_retcode(sol.retcode) - end -end - -@testset "[OOP] no AD" begin - f_oop = Base.Experimental.@opaque (u, p) -> u .* u .- p - u0 = [0.0] - prob = NonlinearProblem{false}(f_oop, u0, 1.0) - for alg in [RobustMultiNewton(autodiff = AutoFiniteDiff())] - sol = solve(prob, alg) - @test isapprox(only(sol.u), 1.0) - @test SciMLBase.successful_retcode(sol.retcode) - end -end diff --git a/test/misc/polyalgs.jl b/test/misc/polyalgs.jl index 9eb42599a..e36c066fc 100644 --- a/test/misc/polyalgs.jl +++ b/test/misc/polyalgs.jl @@ -1,85 +1,180 @@ -using NonlinearSolve, Test, NaNMath, OrdinaryDiffEq - -f(u, p) = u .* u .- 2 -u0 = [1.0, 1.0] -probN = NonlinearProblem{false}(f, u0) - -custom_polyalg = NonlinearSolvePolyAlgorithm((Broyden(), LimitedMemoryBroyden())) - -# Uses the `__solve` function -@time solver = solve(probN; abstol = 1e-9) -@test SciMLBase.successful_retcode(solver) -@time solver = solve(probN, RobustMultiNewton(); abstol = 1e-9) -@test SciMLBase.successful_retcode(solver) -@time solver = solve(probN, 
FastShortcutNonlinearPolyalg(); abstol = 1e-9) -@test SciMLBase.successful_retcode(solver) -@time solver = solve(probN, custom_polyalg; abstol = 1e-9) -@test SciMLBase.successful_retcode(solver) - -# Test the caching interface -cache = init(probN; abstol = 1e-9); -@time solver = solve!(cache) -@test SciMLBase.successful_retcode(solver) -cache = init(probN, RobustMultiNewton(); abstol = 1e-9); -@time solver = solve!(cache) -@test SciMLBase.successful_retcode(solver) -cache = init(probN, FastShortcutNonlinearPolyalg(); abstol = 1e-9); -@time solver = solve!(cache) -@test SciMLBase.successful_retcode(solver) -cache = init(probN, custom_polyalg; abstol = 1e-9); -@time solver = solve!(cache) -@test SciMLBase.successful_retcode(solver) - -# https://github.com/SciML/NonlinearSolve.jl/issues/153 -function f(du, u, p) - s1, s1s2, s2 = u - k1, c1, Δt = p - - du[1] = -0.25 * c1 * k1 * s1 * s2 - du[2] = 0.25 * c1 * k1 * s1 * s2 - du[3] = -0.25 * c1 * k1 * s1 * s2 +using NonlinearSolve, Test, NaNMath, OrdinaryDiffEq, StaticArrays, LinearAlgebra + +@testset "Basic PolyAlgorithms" begin + f(u, p) = u .* u .- 2 + u0 = [1.0, 1.0] + probN = NonlinearProblem{false}(f, u0) + + custom_polyalg = NonlinearSolvePolyAlgorithm((Broyden(), LimitedMemoryBroyden())) + + # Uses the `__solve` function + @time solver = solve(probN; abstol = 1e-9) + @test SciMLBase.successful_retcode(solver) + @time solver = solve(probN, RobustMultiNewton(); abstol = 1e-9) + @test SciMLBase.successful_retcode(solver) + @time solver = solve(probN, FastShortcutNonlinearPolyalg(); abstol = 1e-9) + @test SciMLBase.successful_retcode(solver) + @time solver = solve(probN, custom_polyalg; abstol = 1e-9) + @test SciMLBase.successful_retcode(solver) + + # Test the caching interface + cache = init(probN; abstol = 1e-9) + @time solver = solve!(cache) + @test SciMLBase.successful_retcode(solver) + cache = init(probN, RobustMultiNewton(); abstol = 1e-9) + @time solver = solve!(cache) + @test 
SciMLBase.successful_retcode(solver) + cache = init(probN, FastShortcutNonlinearPolyalg(); abstol = 1e-9) + @time solver = solve!(cache) + @test SciMLBase.successful_retcode(solver) + cache = init(probN, custom_polyalg; abstol = 1e-9) + @time solver = solve!(cache) + @test SciMLBase.successful_retcode(solver) end -prob = NonlinearProblem(f, [2.0, 2.0, 2.0], [1.0, 2.0, 2.5]) -sol = solve(prob; abstol = 1e-9) -@test SciMLBase.successful_retcode(sol) +@testset "Testing #153 Singular Exception" begin + # https://github.com/SciML/NonlinearSolve.jl/issues/153 + function f(du, u, p) + s1, s1s2, s2 = u + k1, c1, Δt = p + + du[1] = -0.25 * c1 * k1 * s1 * s2 + du[2] = 0.25 * c1 * k1 * s1 * s2 + du[3] = -0.25 * c1 * k1 * s1 * s2 + end + + prob = NonlinearProblem(f, [2.0, 2.0, 2.0], [1.0, 2.0, 2.5]) + sol = solve(prob; abstol = 1e-9) + @test SciMLBase.successful_retcode(sol) +end -# https://github.com/SciML/NonlinearSolve.jl/issues/187 -# If we use a General Nonlinear Solver the solution might go out of the domain! -ff_interval(u, p) = 0.5 / 1.5 * NaNMath.log.(u ./ (1.0 .- u)) .- 2.0 * u .+ 1.0 +@testset "Simple Scalar Problem #187" begin + # https://github.com/SciML/NonlinearSolve.jl/issues/187 + # If we use a General Nonlinear Solver the solution might go out of the domain! 
+ ff_interval(u, p) = 0.5 / 1.5 * NaNMath.log.(u ./ (1.0 .- u)) .- 2.0 * u .+ 1.0 -uspan = (0.02, 0.1) -prob = IntervalNonlinearProblem(ff_interval, uspan) -sol = solve(prob; abstol = 1e-9) -@test SciMLBase.successful_retcode(sol) + uspan = (0.02, 0.1) + prob = IntervalNonlinearProblem(ff_interval, uspan) + sol = solve(prob; abstol = 1e-9) + @test SciMLBase.successful_retcode(sol) -u0 = 0.06 -p = 2.0 -prob = NonlinearProblem(ff_interval, u0, p) -sol = solve(prob; abstol = 1e-9) -@test SciMLBase.successful_retcode(sol) + u0 = 0.06 + p = 2.0 + prob = NonlinearProblem(ff_interval, u0, p) + sol = solve(prob; abstol = 1e-9) + @test SciMLBase.successful_retcode(sol) +end # Shooting Problem: Taken from BoundaryValueDiffEq.jl # Testing for Complex Valued Root Finding. For Complex valued inputs we drop some of the # algorithms which dont support those. -function ode_func!(du, u, p, t) - du[1] = u[2] - du[2] = -u[1] - return nothing +@testset "Complex Valued Problems: Single-Shooting" begin + function ode_func!(du, u, p, t) + du[1] = u[2] + du[2] = -u[1] + return nothing + end + + function objective_function!(resid, u0, p) + odeprob = ODEProblem{true}(ode_func!, u0, (0.0, 100.0), p) + sol = solve(odeprob, Tsit5(), abstol = 1e-9, reltol = 1e-9, verbose = false) + resid[1] = sol(0.0)[1] + resid[2] = sol(100.0)[1] - 1.0 + return nothing + end + + prob = NonlinearProblem{true}(objective_function!, [0.0, 1.0] .+ 1im) + sol = solve(prob; abstol = 1e-10) + @test SciMLBase.successful_retcode(sol) + # This test is not meant to return success but test that all the default solvers can handle + # complex valued problems + @test_nowarn solve(prob; abstol = 1e-19, maxiters = 10) + @test_nowarn solve(prob, RobustMultiNewton(eltype(prob.u0)); abstol = 1e-19, + maxiters = 10) +end + +@testset "[IIP] no AD" begin + f_iip = Base.Experimental.@opaque (du, u, p) -> du .= u .* u .- p + u0 = [0.0] + prob = NonlinearProblem(f_iip, u0, 1.0) + for alg in [RobustMultiNewton(autodiff = 
AutoFiniteDiff())] + sol = solve(prob, alg) + @test isapprox(only(sol.u), 1.0) + @test SciMLBase.successful_retcode(sol.retcode) + end +end + +@testset "[OOP] no AD" begin + f_oop = Base.Experimental.@opaque (u, p) -> u .* u .- p + u0 = [0.0] + prob = NonlinearProblem{false}(f_oop, u0, 1.0) + for alg in [RobustMultiNewton(autodiff = AutoFiniteDiff())] + sol = solve(prob, alg) + @test isapprox(only(sol.u), 1.0) + @test SciMLBase.successful_retcode(sol.retcode) + end end -function objective_function!(resid, u0, p) - odeprob = ODEProblem{true}(ode_func!, u0, (0.0, 100.0), p) - sol = solve(odeprob, Tsit5(), abstol = 1e-9, reltol = 1e-9, verbose = false) - resid[1] = sol(0.0)[1] - resid[2] = sol(100.0)[1] - 1.0 +# this is infeasible +function f1_infeasible!(out, u, p) + μ = 3.986004415e14 + x = 7000.0e3 + y = -6.970561549987071e-9 + z = -3.784706123246018e-9 + v_x = 8.550491684548064e-12 + u[1] + v_y = 6631.60076191005 + u[2] + v_z = 3600.665431405663 + u[3] + r = @SVector [x, y, z] + v = @SVector [v_x, v_y, v_z] + h = cross(r, v) + ev = cross(v, h) / μ - r / norm(r) + i = acos(h[3] / norm(h)) + e = norm(ev) + a = 1 / (2 / norm(r) - (norm(v)^2 / μ)) + out .= [a - 42.0e6, e - 1e-5, i - 1e-5] return nothing end -prob = NonlinearProblem{true}(objective_function!, [0.0, 1.0] .+ 1im) -sol = solve(prob; abstol = 1e-10) -@test SciMLBase.successful_retcode(sol) -# This test is not meant to return success but test that all the default solvers can handle -# complex valued problems -@test_nowarn solve(prob; abstol = 1e-19, maxiters = 10) -@test_nowarn solve(prob, RobustMultiNewton(eltype(prob.u0)); abstol = 1e-19, maxiters = 10) +# this is unfeasible +function f1_infeasible(u, p) + μ = 3.986004415e14 + x = 7000.0e3 + y = -6.970561549987071e-9 + z = -3.784706123246018e-9 + v_x = 8.550491684548064e-12 + u[1] + v_y = 6631.60076191005 + u[2] + v_z = 3600.665431405663 + u[3] + r = [x, y, z] + v = [v_x, v_y, v_z] + h = cross(r, v) + ev = cross(v, h) / μ - r / norm(r) + i = acos(h[3] / 
norm(h)) + e = norm(ev) + a = 1 / (2 / norm(r) - (norm(v)^2 / μ)) + return [a - 42.0e6, e - 1e-5, i - 1e-5] +end + +@testset "[IIP] Infeasible" begin + u0 = [0.0, 0.0, 0.0] + prob = NonlinearProblem(f1_infeasible!, u0) + sol = solve(prob) + + @test all(!isnan, sol.u) + @test !SciMLBase.successful_retcode(sol.retcode) +end + +@testset "[OOP] Infeasible" begin + u0 = [0.0, 0.0, 0.0] + prob = NonlinearProblem(f1_infeasible, u0) + sol = solve(prob) + + @test all(!isnan, sol.u) + @test !SciMLBase.successful_retcode(sol.retcode) + + u0 = @SVector [0.0, 0.0, 0.0] + prob = NonlinearProblem(f1_infeasible, u0) + sol = solve(prob) + + @test all(!isnan, sol.u) + @test !SciMLBase.successful_retcode(sol.retcode) +end diff --git a/test/misc/qa.jl b/test/misc/qa.jl index 9d123470d..4629c2132 100644 --- a/test/misc/qa.jl +++ b/test/misc/qa.jl @@ -7,7 +7,8 @@ using NonlinearSolve, Aqua Aqua.test_piracies(NonlinearSolve, treat_as_own = [NonlinearProblem, NonlinearLeastSquaresProblem]) Aqua.test_project_extras(NonlinearSolve) - Aqua.test_stale_deps(NonlinearSolve) + # Timer Outputs needs to be enabled via Preferences + Aqua.test_stale_deps(NonlinearSolve; ignore = [:TimerOutputs]) Aqua.test_unbound_args(NonlinearSolve) Aqua.test_undefined_exports(NonlinearSolve) end diff --git a/test/runtests.jl b/test/runtests.jl index f48303249..5a1be8e22 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -33,9 +33,7 @@ end @time @safetestset "Sparsity Tests: Bruss Steady State" include("misc/bruss.jl") @time @safetestset "Polyalgs" include("misc/polyalgs.jl") @time @safetestset "Matrix Resizing" include("misc/matrix_resizing.jl") - @time @safetestset "Infeasible Problems" include("misc/infeasible.jl") @time @safetestset "Banded Matrices" include("misc/banded_matrices.jl") - @time @safetestset "No AD" include("misc/no_ad.jl") end if GROUP == "GPU" diff --git a/test/wrappers/fixedpoint.jl b/test/wrappers/fixedpoint.jl index 282c8c124..87d8e9d7b 100644 --- a/test/wrappers/fixedpoint.jl +++ 
b/test/wrappers/fixedpoint.jl @@ -1,5 +1,5 @@ -using NonlinearSolve, - FixedPointAcceleration, SpeedMapping, NLsolve, SIAMFANLEquations, LinearAlgebra, Test +using NonlinearSolve, LinearAlgebra, Test +import SIAMFANLEquations, FixedPointAcceleration, SpeedMapping, NLsolve # Simple Scalar Problem @testset "Simple Scalar Problem" begin diff --git a/test/wrappers/nlls.jl b/test/wrappers/nlls.jl index 3e31b47de..dfd8aa5fe 100644 --- a/test/wrappers/nlls.jl +++ b/test/wrappers/nlls.jl @@ -1,6 +1,5 @@ - using NonlinearSolve, - LinearSolve, LinearAlgebra, Test, StableRNGs, Random, ForwardDiff, Zygote + LinearAlgebra, Test, StableRNGs, StaticArrays, Random, ForwardDiff, Zygote import FastLevenbergMarquardt, LeastSquaresOptim, MINPACK true_function(x, θ) = @. θ[1] * exp(θ[2] * x) * cos(θ[3] * x + θ[4]) @@ -10,7 +9,7 @@ true_function(y, x, θ) = (@. y = θ[1] * exp(θ[2] * x) * cos(θ[3] * x + θ[4] x = [-1.0, -0.5, 0.0, 0.5, 1.0] -y_target = true_function(x, θ_true) +const y_target = true_function(x, θ_true) function loss_function(θ, p) ŷ = true_function(p, θ) @@ -30,45 +29,13 @@ prob_iip = NonlinearLeastSquaresProblem(NonlinearFunction(loss_function; nlls_problems = [prob_oop, prob_iip] -solvers = [ - LeastSquaresOptimJL(:lm), - LeastSquaresOptimJL(:dogleg), -] +solvers = [LeastSquaresOptimJL(alg; autodiff) for alg in (:lm, :dogleg), +autodiff in (nothing, AutoForwardDiff(), AutoFiniteDiff(), :central, :forward)] for prob in nlls_problems, solver in solvers @time sol = solve(prob, solver; maxiters = 10000, abstol = 1e-8) @test SciMLBase.successful_retcode(sol) - @test norm(sol.resid) < 1e-6 -end - -# This is just for testing that we can use vjp provided by the user -function vjp(v, θ, p) - resid = zeros(length(p)) - J = ForwardDiff.jacobian((resid, θ) -> loss_function(resid, θ, p), resid, θ) - return vec(v' * J) -end - -function vjp!(Jv, v, θ, p) - resid = zeros(length(p)) - J = ForwardDiff.jacobian((resid, θ) -> loss_function(resid, θ, p), resid, θ) - mul!(vec(Jv), v', J) - 
return nothing -end - -probs = [ - NonlinearLeastSquaresProblem(NonlinearFunction{true}(loss_function; - resid_prototype = zero(y_target), vjp = vjp!), θ_init, x), - NonlinearLeastSquaresProblem(NonlinearFunction{false}(loss_function; - resid_prototype = zero(y_target), vjp = vjp), θ_init, x), -] - -for prob in probs, solver in solvers - !(solver isa GaussNewton) && continue - !(solver.linsolve isa KrylovJL) && continue - @test_warn "Currently we don't make use of user provided `jvp`. This is planned to be \ - fixed in the near future." sol=solve(prob, solver; maxiters = 10000, abstol = 1e-8) - sol = solve(prob, solver; maxiters = 10000, abstol = 1e-8) - @test norm(sol.resid) < 1e-6 + @test norm(sol.resid, Inf) < 1e-6 end function jac!(J, θ, p) @@ -110,5 +77,21 @@ append!(solvers, [CMINPACK(; method) for method in (:auto, :lm, :lmdif)]) for solver in solvers, prob in probs @time sol = solve(prob, solver; maxiters = 10000, abstol = 1e-8) - @test norm(sol.resid) < 1e-6 + @test norm(sol.resid, Inf) < 1e-6 end + +# Static Arrays -- Fast Levenberg-Marquardt +x_sa = SA[-1.0, -0.5, 0.0, 0.5, 1.0] + +const y_target_sa = true_function(x_sa, θ_true) + +function loss_function_sa(θ, p) + ŷ = true_function(p, θ) + return ŷ .- y_target_sa +end + +θ_init_sa = SVector{4}(θ_init) +prob_sa = NonlinearLeastSquaresProblem{false}(loss_function_sa, θ_init_sa, x) + +@time sol = solve(prob_sa, FastLevenbergMarquardtJL()) +@test norm(sol.resid, Inf) < 1e-6