diff --git a/README.md b/README.md index d7556a3e..cd21ff94 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,8 @@ This package provides an implementation of four classic algorithms for unconstra > high-order regularized models. *Mathematical Programming*, 163(1), 359-368. > DOI: [10.1007/s10107-016-1065-8](https://doi.org/10.1007/s10107-016-1065-8) - +- `fomo`: a first-order method with momentum for unconstrained optimization; + - `tron`: a pure Julia implementation of TRON, a trust-region solver for bound-constrained optimization described in > Chih-Jen Lin and Jorge J. Moré, *Newton's Method for Large Bound-Constrained diff --git a/docs/src/solvers.md b/docs/src/solvers.md index 06fe0eed..322f7c2e 100644 --- a/docs/src/solvers.md +++ b/docs/src/solvers.md @@ -6,10 +6,11 @@ - [`tron`](@ref) - [`trunk`](@ref) - [`R2`](@ref) +- [`fomo`](@ref) | Problem type | Solvers | | --------------------- | -------- | -| Unconstrained NLP | [`lbfgs`](@ref), [`tron`](@ref), [`trunk`](@ref), [`R2`](@ref)| +| Unconstrained NLP | [`lbfgs`](@ref), [`tron`](@ref), [`trunk`](@ref), [`R2`](@ref), [`fomo`](@ref)| | Unconstrained NLS | [`trunk`](@ref), [`tron`](@ref) | | Bound-constrained NLP | [`tron`](@ref) | | Bound-constrained NLS | [`tron`](@ref) | @@ -21,4 +22,5 @@ lbfgs tron trunk R2 +fomo ``` diff --git a/src/JSOSolvers.jl b/src/JSOSolvers.jl index cd65c9b2..85afc4fe 100644 --- a/src/JSOSolvers.jl +++ b/src/JSOSolvers.jl @@ -13,7 +13,7 @@ export solve! # Unconstrained solvers include("lbfgs.jl") include("trunk.jl") -include("R2.jl") +include("fomo.jl") # Unconstrained solvers for NLS include("trunkls.jl") diff --git a/src/R2.jl b/src/R2.jl deleted file mode 100644 index b7304dc3..00000000 --- a/src/R2.jl +++ /dev/null @@ -1,231 +0,0 @@ -export R2, R2Solver - -""" - R2(nlp; kwargs...) - -A first-order quadratic regularization method for unconstrained optimization. - -For advanced usage, first define a `R2Solver` to preallocate the memory used in the algorithm, and then call `solve!`: - - solver = R2Solver(nlp) - solve!(solver, nlp; kwargs...) - -# Arguments -- `nlp::AbstractNLPModel{T, V}` is the model to solve, see `NLPModels.jl`. - -# Keyword arguments -- `x::V = nlp.meta.x0`: the initial guess. -- `atol::T = √eps(T)`: absolute tolerance. -- `rtol::T = √eps(T)`: relative tolerance: algorithm stops when ‖∇f(xᵏ)‖ ≤ atol + rtol * ‖∇f(x⁰)‖. -- `η1 = eps(T)^(1/4)`, `η2 = T(0.95)`: step acceptance parameters. -- `γ1 = T(1/2)`, `γ2 = 1/γ1`: regularization update parameters. -- `αmax = 1/eps(T)`: maximum value for step size parameter for R2 algorithm. -- `max_eval::Int = -1`: maximum number of evaluation of the objective function. -- `max_time::Float64 = 30.0`: maximum time limit in seconds. -- `max_iter::Int = typemax(Int)`: maximum number of iterations. -- `β = T(0) ∈ [0,1]` is the constant in the momentum term. If `β == 0`, R2 does not use momentum. -- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration. - -# Output -The value returned is a `GenericExecutionStats`, see `SolverCore.jl`. - -# Callback -The callback is called at each iteration. -The expected signature of the callback is `callback(nlp, solver, stats)`, and its output is ignored. -Changing any of the input arguments will affect the subsequent iterations. -In particular, setting `stats.status = :user` will stop the algorithm. -All relevant information should be available in `nlp` and `solver`. 
-Notably, you can access, and modify, the following: -- `solver.x`: current iterate; -- `solver.gx`: current gradient; -- `stats`: structure holding the output of the algorithm (`GenericExecutionStats`), which contains, among other things: - - `stats.dual_feas`: norm of current gradient; - - `stats.iter`: current iteration counter; - - `stats.objective`: current objective function value; - - `stats.status`: current status of the algorithm. Should be `:unknown` unless the algorithm has attained a stopping criterion. Changing this to anything will stop the algorithm, but you should use `:user` to properly indicate the intention. - - `stats.elapsed_time`: elapsed time in seconds. - -# Examples -```jldoctest -using JSOSolvers, ADNLPModels -nlp = ADNLPModel(x -> sum(x.^2), ones(3)) -stats = R2(nlp) - -# output - -"Execution stats: first-order stationary" -``` - -```jldoctest -using JSOSolvers, ADNLPModels -nlp = ADNLPModel(x -> sum(x.^2), ones(3)) -solver = R2Solver(nlp); -stats = solve!(solver, nlp) - -# output - -"Execution stats: first-order stationary" -``` -""" -mutable struct R2Solver{T, V} <: AbstractOptimizationSolver - x::V - gx::V - cx::V - d::V # used for momentum term - α::T -end - -function R2Solver(nlp::AbstractNLPModel{T, V}) where {T, V} - x = similar(nlp.meta.x0) - gx = similar(nlp.meta.x0) - cx = similar(nlp.meta.x0) - d = fill!(similar(nlp.meta.x0), 0) - α = zero(T) # init it to zero for now - return R2Solver{T, V}(x, gx, cx, d, α) -end - -@doc (@doc R2Solver) function R2(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V} - solver = R2Solver(nlp) - return solve!(solver, nlp; kwargs...) -end - -function SolverCore.reset!(solver::R2Solver{T}) where {T} - solver.d .= zero(T) - solver -end -SolverCore.reset!(solver::R2Solver, ::AbstractNLPModel) = reset!(solver) - -function SolverCore.solve!( - solver::R2Solver{T, V}, - nlp::AbstractNLPModel{T, V}, - stats::GenericExecutionStats{T, V}; - callback = (args...) 
-> nothing, - x::V = nlp.meta.x0, - atol::T = √eps(T), - rtol::T = √eps(T), - η1 = eps(T)^(1 / 4), - η2 = T(0.95), - γ1 = T(1 / 2), - γ2 = 1 / γ1, - αmax = T(Inf), - max_time::Float64 = 30.0, - max_eval::Int = -1, - max_iter::Int = typemax(Int), - β::T = T(0), - verbose::Int = 0, -) where {T, V} - unconstrained(nlp) || error("R2 should only be called on unconstrained problems.") - - reset!(stats) - start_time = time() - set_time!(stats, 0.0) - - x = solver.x .= x - ∇fk = solver.gx - ck = solver.cx - d = solver.d - αk = solver.α - - set_iter!(stats, 0) - set_objective!(stats, obj(nlp, x)) - - grad!(nlp, x, ∇fk) - norm_∇fk = norm(∇fk) - set_dual_residual!(stats, norm_∇fk) - - αk = 1 / 2^round(log2(norm_∇fk + 1)) - # Stopping criterion: - ϵ = atol + rtol * norm_∇fk - optimal = norm_∇fk ≤ ϵ - if optimal - @info("Optimal point found at initial point") - @info @sprintf "%5s %9s %7s %7s " "iter" "f" "‖∇f‖" "α" - @info @sprintf "%5d %9.2e %7.1e %7.1e" stats.iter stats.objective norm_∇fk αk - end - if verbose > 0 && mod(stats.iter, verbose) == 0 - @info @sprintf "%5s %9s %7s %7s " "iter" "f" "‖∇f‖" "α" - infoline = @sprintf "%5d %9.2e %7.1e %7.1e" stats.iter stats.objective norm_∇fk αk - end - - set_status!( - stats, - get_status( - nlp, - elapsed_time = stats.elapsed_time, - optimal = optimal, - max_eval = max_eval, - iter = stats.iter, - max_iter = max_iter, - max_time = max_time, - ), - ) - - solver.α = αk - callback(nlp, solver, stats) - αk = solver.α - - done = stats.status != :unknown - - while !done - if β == 0 - ck .= x .- (∇fk .* αk) - else - d .= ∇fk .* (T(1) - β) .+ d .* β - ck .= x .- (d .* αk) - end - ΔTk = norm_∇fk^2 * αk - fck = obj(nlp, ck) - if fck == -Inf - set_status!(stats, :unbounded) - break - end - - ρk = (stats.objective - fck) / ΔTk - - # Update regularization parameters - if ρk >= η2 - αk = min(αmax, γ2 * αk) - elseif ρk < η1 - αk = αk * γ1 - end - - # Acceptance of the new candidate - if ρk >= η1 - x .= ck - set_objective!(stats, fck) - grad!(nlp, x, ∇fk) - norm_∇fk = norm(∇fk) - end - - set_iter!(stats, stats.iter + 1) - set_time!(stats, time() - start_time) - set_dual_residual!(stats, norm_∇fk) - optimal = norm_∇fk ≤ ϵ - - if verbose > 0 && mod(stats.iter, verbose) == 0 - @info infoline - infoline = @sprintf "%5d %9.2e %7.1e %7.1e" stats.iter stats.objective norm_∇fk αk - end - - set_status!( - stats, - get_status( - nlp, - elapsed_time = stats.elapsed_time, - optimal = optimal, - max_eval = max_eval, - iter = stats.iter, - max_iter = max_iter, - max_time = max_time, - ), - ) - solver.α = αk - callback(nlp, solver, stats) - αk = solver.α - - done = stats.status != :unknown - end - - set_solution!(stats, x) - return stats -end diff --git a/src/fomo.jl b/src/fomo.jl new file mode 100644 index 00000000..d8e206bb --- /dev/null +++ b/src/fomo.jl @@ -0,0 +1,482 @@ +export fomo, FomoSolver, FoSolver, fo, R2, TR, tr_step, r2_step + +abstract type AbstractFirstOrderSolver <: AbstractOptimizationSolver end + +abstract type AbstractFOMethod end +struct tr_step <: AbstractFOMethod end +struct r2_step <: AbstractFOMethod end + +""" + fomo(nlp; kwargs...) + +A First-Order with MOmentum (FOMO) model-based method for unconstrained optimization. Supports quadratic regularization and trust region method with linear model. 
+
+# Algorithm description
+
+The step is computed along
+d = - (1-βmax) .* ∇f(xk) - βmax .* mk
+with mk the memory of past gradients (initialized at 0), updated at each successful iteration as
+mk .= ∇f(xk) .* (1 - β) .+ mk .* β
+and βmax ∈ [0,β] chosen so as to ensure that d is gradient-related, i.e., that the following two conditions are satisfied:
+(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk ≥ θ1 * ‖∇f(xk)‖² (1)
+‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) + βmax .* mk‖ (2)
+
+# Advanced usage
+
+For advanced usage, first define a `FomoSolver` to preallocate the memory used in the algorithm, and then call `solve!`:
+
+    solver = FomoSolver(nlp)
+    solve!(solver, nlp; kwargs...)
+
+**No momentum**: if the user does not wish to use momentum (`β = 0`), it is recommended to use the memory-optimized `fo` method.
+
+# Arguments
+
+- `nlp::AbstractNLPModel{T, V}` is the model to solve, see `NLPModels.jl`.
+
+# Keyword arguments
+
+- `x::V = nlp.meta.x0`: the initial guess.
+- `atol::T = √eps(T)`: absolute tolerance.
+- `rtol::T = √eps(T)`: relative tolerance: algorithm stops when ‖∇f(xᵏ)‖ ≤ atol + rtol * ‖∇f(x⁰)‖.
+- `η1 = eps(T)^(1/4)`, `η2 = T(0.95)`: step acceptance parameters.
+- `γ1 = T(1/2)`, `γ2 = T(2)`: regularization update parameters.
+- `γ3 = T(1/2)`: update parameter for the momentum factor βmax in case of unsuccessful iterations.
+- `αmax = 1/eps(T)`: maximum value of the step size parameter.
+- `max_eval::Int = -1`: maximum number of objective evaluations.
+- `max_time::Float64 = 30.0`: maximum time limit in seconds.
+- `max_iter::Int = typemax(Int)`: maximum number of iterations.
+- `β = T(0.9) ∈ [0,1)`: target decay rate for the momentum.
+- `θ1 = T(0.1)`: momentum contribution parameter for convergence condition (1).
+- `θ2 = T(eps(T)^(1/3))`: momentum contribution parameter for convergence condition (2).
+- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
+- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for a quadratic regularization step and `tr_step()` for a first-order trust-region step.
+
+# Output
+
+The value returned is a `GenericExecutionStats`, see `SolverCore.jl`.
+
+# Callback
+
+The callback is called at each iteration.
+The expected signature of the callback is `callback(nlp, solver, stats)`, and its output is ignored.
+Changing any of the input arguments will affect the subsequent iterations.
+In particular, setting `stats.status = :user` will stop the algorithm.
+All relevant information should be available in `nlp` and `solver`.
+Notably, you can access, and modify, the following:
+- `solver.x`: current iterate;
+- `solver.g`: current gradient;
+- `stats`: structure holding the output of the algorithm (`GenericExecutionStats`), which contains, among other things:
+  - `stats.dual_feas`: norm of current gradient;
+  - `stats.iter`: current iteration counter;
+  - `stats.objective`: current objective function value;
+  - `stats.status`: current status of the algorithm. Should be `:unknown` unless the algorithm has attained a stopping criterion. Changing this to anything will stop the algorithm, but you should use `:user` to properly indicate the intention.
+  - `stats.elapsed_time`: elapsed time in seconds.
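+
+As an illustration, here is a minimal callback sketch that stops `fomo` after eight iterations by setting `stats.status = :user`; the same pattern is exercised in `test/callback.jl`:
+
+```julia
+using ADNLPModels, JSOSolvers
+
+nlp = ADNLPModel(x -> (x[1] - 1)^2 + 4 * (x[2] - x[1]^2)^2, [-1.2; 1.0])
+cb = (nlp, solver, stats) -> begin
+  # request a stop once the iteration counter reaches 8
+  if stats.iter == 8
+    stats.status = :user
+  end
+end
+stats = fomo(nlp, callback = cb)
+stats.iter # 8
+```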
+
+# Examples
+
+## `fomo`
+
+```jldoctest
+using JSOSolvers, ADNLPModels
+nlp = ADNLPModel(x -> sum(x.^2), ones(3))
+stats = fomo(nlp)
+
+# output
+
+"Execution stats: first-order stationary"
+```
+
+```jldoctest
+using JSOSolvers, ADNLPModels
+nlp = ADNLPModel(x -> sum(x.^2), ones(3))
+solver = FomoSolver(nlp);
+stats = solve!(solver, nlp)
+
+# output
+
+"Execution stats: first-order stationary"
+```
+"""
+mutable struct FomoSolver{T, V} <: AbstractFirstOrderSolver
+  x::V
+  g::V
+  c::V
+  m::V
+  d::V
+  p::V
+  α::T
+end
+
+function FomoSolver(nlp::AbstractNLPModel{T, V}) where {T, V}
+  x = similar(nlp.meta.x0)
+  g = similar(nlp.meta.x0)
+  c = similar(nlp.meta.x0)
+  m = fill!(similar(nlp.meta.x0), 0)
+  d = fill!(similar(nlp.meta.x0), 0)
+  p = similar(nlp.meta.x0)
+  return FomoSolver{T, V}(x, g, c, m, d, p, T(0))
+end
+
+@doc (@doc FomoSolver) function fomo(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V}
+  solver = FomoSolver(nlp)
+  solver_specific = Dict(:avgβmax => T(0.0))
+  stats = GenericExecutionStats(nlp; solver_specific = solver_specific)
+  return solve!(solver, nlp, stats; kwargs...)
+end
+
+function SolverCore.reset!(solver::FomoSolver{T}) where {T}
+  fill!(solver.m, 0)
+  solver
+end
+
+SolverCore.reset!(solver::FomoSolver, ::AbstractNLPModel) = reset!(solver)
+
+"""
+    fo(nlp; kwargs...)
+    R2(nlp; kwargs...)
+    TR(nlp; kwargs...)
+
+A First-Order (FO) model-based method for unconstrained optimization. Supports quadratic regularization and trust-region steps with a linear model.
+
+For advanced usage, first define a `FoSolver` to preallocate the memory used in the algorithm, and then call `solve!`:
+
+    solver = FoSolver(nlp)
+    solve!(solver, nlp; kwargs...)
+
+`R2` and `TR` run `fo` with the dedicated `step_backend` keyword argument.
+
+# Arguments
+
+- `nlp::AbstractNLPModel{T, V}` is the model to solve, see `NLPModels.jl`.
+
+# Keyword arguments
+
+- `x::V = nlp.meta.x0`: the initial guess.
+- `atol::T = √eps(T)`: absolute tolerance.
+- `rtol::T = √eps(T)`: relative tolerance: algorithm stops when ‖∇f(xᵏ)‖ ≤ atol + rtol * ‖∇f(x⁰)‖.
+- `η1 = eps(T)^(1/4)`, `η2 = T(0.95)`: step acceptance parameters.
+- `γ1 = T(1/2)`, `γ2 = T(2)`: regularization update parameters.
+- `αmax = 1/eps(T)`: maximum value of the step size parameter.
+- `max_eval::Int = -1`: maximum number of evaluations of the objective function.
+- `max_time::Float64 = 30.0`: maximum time limit in seconds.
+- `max_iter::Int = typemax(Int)`: maximum number of iterations.
+- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
+- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for a quadratic regularization step and `tr_step()` for a first-order trust-region step.
+
+# Output
+
+The value returned is a `GenericExecutionStats`, see `SolverCore.jl`.
+
+# Callback
+
+The callback is called at each iteration.
+The expected signature of the callback is `callback(nlp, solver, stats)`, and its output is ignored.
+Changing any of the input arguments will affect the subsequent iterations.
+In particular, setting `stats.status = :user` will stop the algorithm.
+All relevant information should be available in `nlp` and `solver`.
+Notably, you can access, and modify, the following:
+- `solver.x`: current iterate;
+- `solver.g`: current gradient;
+- `stats`: structure holding the output of the algorithm (`GenericExecutionStats`), which contains, among other things:
+  - `stats.dual_feas`: norm of current gradient;
+  - `stats.iter`: current iteration counter;
+  - `stats.objective`: current objective function value;
+  - `stats.status`: current status of the algorithm. Should be `:unknown` unless the algorithm has attained a stopping criterion. Changing this to anything will stop the algorithm, but you should use `:user` to properly indicate the intention.
+  - `stats.elapsed_time`: elapsed time in seconds.
+
+# Examples
+
+```jldoctest
+using JSOSolvers, ADNLPModels
+nlp = ADNLPModel(x -> sum(x.^2), ones(3))
+stats = fo(nlp) # run with step_backend = r2_step(), equivalent to R2(nlp)
+
+# output
+
+"Execution stats: first-order stationary"
+```
+
+```jldoctest
+using JSOSolvers, ADNLPModels
+nlp = ADNLPModel(x -> sum(x.^2), ones(3))
+solver = FoSolver(nlp);
+stats = solve!(solver, nlp)
+
+# output
+
+"Execution stats: first-order stationary"
+```
+"""
+mutable struct FoSolver{T, V} <: AbstractFirstOrderSolver
+  x::V
+  g::V
+  c::V
+  α::T
+end
+
+function FoSolver(nlp::AbstractNLPModel{T, V}) where {T, V}
+  x = similar(nlp.meta.x0)
+  g = similar(nlp.meta.x0)
+  c = similar(nlp.meta.x0)
+  return FoSolver{T, V}(x, g, c, T(0))
+end
+
+"""
+    `R2Solver` is deprecated, please check the documentation of `R2`.
+"""
+mutable struct R2Solver{T, V} <: AbstractOptimizationSolver end
+
+Base.@deprecate R2Solver(nlp::AbstractNLPModel; kwargs...) FoSolver(nlp::AbstractNLPModel; kwargs...)
+
+@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V}
+  solver = FoSolver(nlp)
+  stats = GenericExecutionStats(nlp)
+  return solve!(solver, nlp, stats; step_backend = r2_step(), kwargs...)
+end
+
+@doc (@doc FoSolver) function R2(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V}
+  fo(nlp; step_backend = r2_step(), kwargs...)
+end
+
+@doc (@doc FoSolver) function TR(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V}
+  fo(nlp; step_backend = tr_step(), kwargs...)
+end
+
+function SolverCore.reset!(solver::FoSolver{T}) where {T}
+  solver
+end
+
+SolverCore.reset!(solver::FoSolver, ::AbstractNLPModel) = reset!(solver)
+
+function SolverCore.solve!(
+  solver::Union{FoSolver, FomoSolver},
+  nlp::AbstractNLPModel{T, V},
+  stats::GenericExecutionStats{T, V};
+  callback = (args...) -> nothing,
+  x::V = nlp.meta.x0,
+  atol::T = √eps(T),
+  rtol::T = √eps(T),
+  η1::T = T(eps(T)^(1 / 4)),
+  η2::T = T(0.95),
+  γ1::T = T(1 / 2),
+  γ2::T = T(2),
+  γ3::T = T(1 / 2),
+  αmax::T = 1 / eps(T),
+  max_time::Float64 = 30.0,
+  max_eval::Int = -1,
+  max_iter::Int = typemax(Int),
+  β::T = T(0.9),
+  θ1::T = T(0.1),
+  θ2::T = T(eps(T)^(1 / 3)),
+  verbose::Int = 0,
+  step_backend = r2_step(),
+) where {T, V}
+  use_momentum = typeof(solver) <: FomoSolver
+  is_r2 = typeof(step_backend) <: r2_step
+  unconstrained(nlp) || error("fomo should only be called on unconstrained problems.")
+
+  reset!(stats)
+  start_time = time()
+  set_time!(stats, 0.0)
+
+  x = solver.x .= x
+  ∇fk = solver.g
+  c = solver.c
+  momentum = use_momentum ? solver.m : nothing # not used if no momentum
+  d = use_momentum ? solver.d : solver.g # d aliases g if no momentum
+  p = use_momentum ? solver.p : nothing # not used if no momentum
+  set_iter!(stats, 0)
+  set_objective!(stats, obj(nlp, x))
+
+  grad!(nlp, x, ∇fk)
+  norm_∇fk = norm(∇fk)
+  set_dual_residual!(stats, norm_∇fk)
+
+  solver.α = init_alpha(norm_∇fk, step_backend)
+
+  # Stopping criterion:
+  ϵ = atol + rtol * norm_∇fk
+  optimal = norm_∇fk ≤ ϵ
+  step_param_name = is_r2 ? "σ" : "Δ"
+  if optimal
+    @info("Optimal point found at initial point")
+    if is_r2
+      @info @sprintf "%5s %9s %7s %7s " "iter" "f" "‖∇f‖" step_param_name
+      @info @sprintf "%5d %9.2e %7.1e %7.1e" stats.iter stats.objective norm_∇fk 1 / solver.α
+    else
+      @info @sprintf "%5s %9s %7s %7s " "iter" "f" "‖∇f‖" step_param_name
+      @info @sprintf "%5d %9.2e %7.1e %7.1e" stats.iter stats.objective norm_∇fk solver.α
+    end
+  else
+    if verbose > 0 && mod(stats.iter, verbose) == 0
+      step_param = is_r2 ? 1 / solver.α : solver.α
+      if !use_momentum
+        @info @sprintf "%5s %9s %7s %7s %7s " "iter" "f" "‖∇f‖" step_param_name "ρk"
+        infoline =
+          @sprintf "%5d %9.2e %7.1e %7.1e %7.1e" stats.iter stats.objective norm_∇fk step_param ' '
+      else
+        @info @sprintf "%5s %9s %7s %7s %7s %7s " "iter" "f" "‖∇f‖" step_param_name "ρk" "βmax"
+        infoline =
+          @sprintf "%5d %9.2e %7.1e %7.1e %7.1e %7.1e" stats.iter stats.objective norm_∇fk step_param ' ' 0
+      end
+    end
+  end
+
+  set_status!(
+    stats,
+    get_status(
+      nlp,
+      elapsed_time = stats.elapsed_time,
+      optimal = optimal,
+      max_eval = max_eval,
+      iter = stats.iter,
+      max_iter = max_iter,
+      max_time = max_time,
+    ),
+  )
+
+  callback(nlp, solver, stats)
+
+  done = stats.status != :unknown
+
+  d .= ∇fk
+  norm_d = norm_∇fk
+  βmax = T(0)
+  ρk = T(0)
+  avgβmax = T(0)
+  siter::Int = 0
+  oneT = T(1)
+  mdot∇f = T(0) # dot(momentum, ∇fk)
+  while !done
+    λk = step_mult(solver.α, norm_d, step_backend)
+    c .= x .- λk .* d
+    step_underflow = x == c # step addition underflowed in every dimension; should happen before solver.α == 0
+    ΔTk = ((oneT - βmax) * norm_∇fk^2 + βmax * mdot∇f) * λk # = dot(d, ∇fk) * λk with momentum, ‖∇fk‖² * λk without momentum
+    fck = obj(nlp, c)
+    if fck == -Inf
+      set_status!(stats, :unbounded)
+      break
+    end
+    ρk = (stats.objective - fck) / ΔTk
+    # Update regularization parameters
+    if ρk >= η2
+      solver.α = min(αmax, γ2 * solver.α)
+    elseif ρk < η1
+      solver.α = solver.α * γ1
+      if use_momentum
+        βmax *= γ3
+        d .= ∇fk .* (oneT - βmax) .+ momentum .* βmax
+      end
+    end
+
+    # Acceptance of the new candidate
+    if ρk >= η1
+      x .= c
+      if use_momentum
+        momentum .= ∇fk .* (oneT - β) .+ momentum .* β
+      end
+      set_objective!(stats, fck)
+      grad!(nlp, x, ∇fk)
+      norm_∇fk = norm(∇fk)
+      if use_momentum
+        mdot∇f = dot(momentum, ∇fk)
+        p .= momentum .- ∇fk
+        diff_norm = norm(p)
+        βmax = find_beta(diff_norm, mdot∇f, norm_∇fk, β, θ1, θ2)
+        d .= ∇fk .* (oneT - βmax) .+ momentum .* βmax
+        norm_d = norm(d)
+        avgβmax += βmax
+        siter += 1
+      end
+    end
+
+    set_iter!(stats, stats.iter + 1)
+    set_time!(stats, time() - start_time)
+    set_dual_residual!(stats, norm_∇fk)
+    optimal = norm_∇fk ≤ ϵ
+
+    if verbose > 0 && mod(stats.iter, verbose) == 0
+      @info infoline
+      step_param = is_r2 ? 1 / solver.α : solver.α
+      if !use_momentum
+        infoline =
+          @sprintf "%5d %9.2e %7.1e %7.1e %7.1e" stats.iter stats.objective norm_∇fk step_param ρk
+      else
+        infoline =
+          @sprintf "%5d %9.2e %7.1e %7.1e %7.1e %7.1e" stats.iter stats.objective norm_∇fk step_param ρk βmax
+      end
+    end
+
+    set_status!(
+      stats,
+      get_status(
+        nlp,
+        elapsed_time = stats.elapsed_time,
+        optimal = optimal,
+        max_eval = max_eval,
+        iter = stats.iter,
+        max_iter = max_iter,
+        max_time = max_time,
+      ),
+    )
+
+    callback(nlp, solver, stats)
+
+    step_underflow && set_status!(stats, :small_step)
+    solver.α == 0 && set_status!(stats, :exception) # the step underflow (:small_step) should be detected before α reaches 0
+
+    done = stats.status != :unknown
+  end
+  if use_momentum
+    avgβmax /= siter
+    set_solver_specific!(stats, :avgβmax, avgβmax)
+  end
+  set_solution!(stats, x)
+  return stats
+end
+
+"""
+    find_beta(diff_norm, mdot∇f, norm_∇f, β, θ1, θ2)
+
+Compute the value `βmax` that saturates the contribution of the momentum term to the gradient.
+`βmax` is computed such that the two gradient-relatedness conditions are ensured:
+1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm ≥ θ1 * ‖∇f(xk)‖²
+2. ‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) .+ βmax .* m‖
+with `m` the momentum term, `mdot∇f = ∇f(xk)ᵀm` and `diff_norm = ‖m - ∇f(xk)‖`.
+"""
+function find_beta(diff_norm::T, mdot∇f::T, norm_∇f::T, β::T, θ1::T, θ2::T) where {T}
+  n1 = norm_∇f^2 - mdot∇f
+  β1 = n1 > 0 ? (1 - θ1) * norm_∇f^2 / n1 : β
+  β2 = diff_norm != 0 ? (1 - θ2) * norm_∇f / diff_norm : β
+  return min(β, min(β1, β2))
+end
+
+"""
+    init_alpha(norm_∇fk::T, ::r2_step)
+    init_alpha(norm_∇fk::T, ::tr_step)
+
+Initialize the step size parameter `α`.
+Ensures that the first step is the same for the quadratic regularization and trust-region methods.
+"""
+function init_alpha(norm_∇fk::T, ::r2_step) where {T}
+  1 / 2^round(log2(norm_∇fk + 1))
+end
+
+function init_alpha(norm_∇fk::T, ::tr_step) where {T}
+  norm_∇fk / 2^round(log2(norm_∇fk + 1))
+end
+
+"""
+    step_mult(α::T, norm_∇fk::T, ::r2_step)
+    step_mult(α::T, norm_∇fk::T, ::tr_step)
+
+Compute the step size multiplier: `α` for quadratic regularization (`::r2_step`) and `α/norm_∇fk` for trust region (`::tr_step`).
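+
+For example, with the illustrative values `α = 0.5` and `norm_∇fk = 4.0` (assuming `T = Float64`):
+
+    step_mult(0.5, 4.0, r2_step()) # returns 0.5
+    step_mult(0.5, 4.0, tr_step()) # returns 0.125, so the step `λk .* d` has norm exactly `α`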
+""" +function step_mult(α::T, norm_∇fk::T, ::r2_step) where {T} + α +end + +function step_mult(α::T, norm_∇fk::T, ::tr_step) where {T} + α / norm_∇fk +end \ No newline at end of file diff --git a/test/allocs.jl b/test/allocs.jl index b02b6621..f5768c4a 100644 --- a/test/allocs.jl +++ b/test/allocs.jl @@ -30,12 +30,17 @@ end if Sys.isunix() @testset "Allocation tests" begin - @testset "$symsolver" for symsolver in (:LBFGSSolver, :R2Solver, :TrunkSolver, :TronSolver) + @testset "$symsolver" for symsolver in (:LBFGSSolver, :FoSolver, :FomoSolver, :TrunkSolver, :TronSolver) for model in NLPModelsTest.nlp_problems nlp = eval(Meta.parse(model))() if unconstrained(nlp) || (bound_constrained(nlp) && (symsolver == :TronSolver)) solver = eval(symsolver)(nlp) - stats = GenericExecutionStats(nlp) + if symsolver == :FomoSolver + T = eltype(nlp.meta.x0) + stats = GenericExecutionStats(nlp, solver_specific = Dict(:avgβmax => T(0))) + else + stats = GenericExecutionStats(nlp) + end with_logger(NullLogger()) do SolverCore.solve!(solver, nlp, stats) reset!(solver) diff --git a/test/callback.jl b/test/callback.jl index f43796fd..ddadc799 100644 --- a/test/callback.jl +++ b/test/callback.jl @@ -31,6 +31,11 @@ using ADNLPModels, JSOSolvers, LinearAlgebra, Logging #, Plots tron(nlp, callback = cb) end @test stats.iter == 8 + + stats = with_logger(NullLogger()) do + fomo(nlp, callback = cb) + end + @test stats.iter == 8 end @testset "Test callback for NLS" begin diff --git a/test/consistency.jl b/test/consistency.jl index 94569dca..fb725b5b 100644 --- a/test/consistency.jl +++ b/test/consistency.jl @@ -10,8 +10,9 @@ function consistency() @testset "Consistency" begin args = Pair{Symbol, Number}[:atol => 1e-6, :rtol => 1e-6, :max_eval => 20000, :max_time => 60.0] - @testset "NLP with $mtd" for mtd in [trunk, lbfgs, tron, R2] + @testset "NLP with $mtd" for mtd in [trunk, lbfgs, tron, R2, fomo] with_logger(NullLogger()) do + reset!(unlp) stats = mtd(unlp; args...) @test stats isa GenericExecutionStats @test stats.status == :first_order @@ -27,7 +28,7 @@ function consistency() end end - @testset "Quasi-Newton NLP with $mtd" for mtd in [trunk, lbfgs, tron, R2] + @testset "Quasi-Newton NLP with $mtd" for mtd in [trunk, lbfgs, tron, R2, fomo] with_logger(NullLogger()) do reset!(qnlp) stats = mtd(qnlp; args...) 
diff --git a/test/restart.jl b/test/restart.jl index 02d90902..38765465 100644 --- a/test/restart.jl +++ b/test/restart.jl @@ -1,5 +1,6 @@ @testset "Test restart with a different initial guess: $fun" for (fun, s) in ( - (:R2, :R2Solver), + (:R2, :FoSolver), + (:fomo, :FomoSolver), (:lbfgs, :LBFGSSolver), (:tron, :TronSolver), (:trunk, :TrunkSolver), @@ -43,7 +44,8 @@ end end @testset "Test restart with a different problem: $fun" for (fun, s) in ( - (:R2, :R2Solver), + (:R2, :FoSolver), + (:fomo, :FomoSolver), (:lbfgs, :LBFGSSolver), (:tron, :TronSolver), (:trunk, :TrunkSolver), diff --git a/test/runtests.jl b/test/runtests.jl index de0295ed..bb41eeba 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -18,7 +18,7 @@ using JSOSolvers end @testset "Test iteration limit" begin - @testset "$fun" for fun in (R2, lbfgs, tron, trunk) + @testset "$fun" for fun in (R2, fomo, lbfgs, tron, trunk) f(x) = (x[1] - 1)^2 + 4 * (x[2] - x[1]^2)^2 nlp = ADNLPModel(f, [-1.2; 1.0]) diff --git a/test/test_solvers.jl b/test/test_solvers.jl index cb41e83e..eb9029e1 100644 --- a/test/test_solvers.jl +++ b/test/test_solvers.jl @@ -8,6 +8,8 @@ function tests() ("lbfgs", lbfgs), ("tron", tron), ("R2", R2), + ("fomo_r2", fomo), + ("fomo_tr", (nlp; kwargs...) -> fomo(nlp, step_backend = JSOSolvers.tr_step(); kwargs...)), ] unconstrained_nlp(solver) multiprecision_nlp(solver, :unc)
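
For reference, the new user-facing entry points exercised by these tests can be called directly as below (a usage sketch; the Rosenbrock instance is the one from `test/runtests.jl`, and `R2`, `TR`, `fomo`, and `tr_step` are all exported by `JSOSolvers`):

```julia
using ADNLPModels, JSOSolvers

f(x) = (x[1] - 1)^2 + 4 * (x[2] - x[1]^2)^2
nlp = ADNLPModel(f, [-1.2; 1.0])

stats_r2 = R2(nlp)                               # fo with the r2_step backend
stats_tr = TR(nlp)                               # fo with the tr_step backend
stats_fomo = fomo(nlp, step_backend = tr_step()) # momentum combined with a trust-region step
```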