@@ -20,6 +20,9 @@ mk .= ∇f(xk) .* (1 - βmax) .+ mk .* βmax
20
20
and βmax ∈ [0,β] chosen as to ensure d is gradient-related, i.e., the following 2 conditions are satisfied:
(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk ≥ θ1 * ‖∇f(xk)‖² (1)
‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) + βmax .* mk‖ (2)
In the nonmonotone case, (1) rewrites
(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk + (fm - fk)/μk ≥ θ1 * ‖∇f(xk)‖²,
with fm the largest objective value over the last M successful iterations, and fk = f(xk).
23
26
24
27
# Advanced usage
25
28
@@ -49,6 +52,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i
49
52
- `β = T(0.9) ∈ [0,1)`: target decay rate for the momentum.
50
53
- `θ1 = T(0.1)`: momentum contribution parameter for convergence condition (1).
51
54
- `θ2 = T(eps(T)^(1/3))`: momentum contribution parameter for convergence condition (2).
55
+ - `M = 1` : requires objective decrease over the `M` last iterates (nonmonotone context). `M=1` implies monotone behaviour.
52
56
- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
53
57
- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region.
54
58
@@ -107,28 +111,35 @@ mutable struct FomoSolver{T, V} <: AbstractFirstOrderSolver
107
111
m:: V
108
112
d:: V
109
113
p:: V
114
+ o:: V
110
115
α:: T
111
116
end
112
117
113
# Allocate the workspace of a `FomoSolver` for `nlp`.
# `M` is the length of the objective memory used for the nonmonotone
# decrease condition; `M = 1` gives the classical monotone behaviour.
function FomoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V}
  # Guard against an empty/negative memory: `maximum(o)` over an empty
  # vector would fail obscurely later in `solve!`.
  M ≥ 1 || throw(ArgumentError("M must be ≥ 1, got $M"))
  x = similar(nlp.meta.x0)              # current iterate
  g = similar(nlp.meta.x0)              # gradient buffer
  c = similar(nlp.meta.x0)              # candidate iterate
  m = fill!(similar(nlp.meta.x0), 0)    # momentum term
  d = fill!(similar(nlp.meta.x0), 0)    # search direction
  p = similar(nlp.meta.x0)              # buffer for momentum - ∇f
  # Objective memory, initialised to -Inf so the first objective value
  # always replaces it. NOTE(review): built as a plain `Vector{T}` while
  # the struct field is declared `::V` — presumably fine since M is small,
  # but this forces a conversion when `V` is not `Vector{T}`; confirm.
  o = fill!(Vector{T}(undef, M), -Inf)
  return FomoSolver{T, V}(x, g, c, m, d, p, o, T(0))
end
122
128
123
# Convenience entry point: allocate a `FomoSolver` (with nonmonotone
# memory `M`) and run `solve!` on it. Shares the `FomoSolver` docstring.
@doc (@doc FomoSolver) function fomo(
  nlp::AbstractNLPModel{T, V};
  M::Int = 1,
  kwargs...,
) where {T, V}
  solver = FomoSolver(nlp; M)
  # `:avgβmax` records the average momentum contribution over the run.
  solver_specific = Dict(:avgβmax => T(0.0))
  stats = GenericExecutionStats(nlp; solver_specific = solver_specific)
  return solve!(solver, nlp, stats; kwargs...)
end
129
139
130
140
# Reset the solver state for a fresh solve: clear the momentum buffer and
# the nonmonotone objective memory (-Inf so any new value replaces it).
function SolverCore.reset!(solver::FomoSolver{T}) where {T}
  fill!(solver.m, 0)
  fill!(solver.o, -Inf)
  solver
end
134
145
@@ -163,6 +174,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i
163
174
- `max_eval::Int = -1`: maximum number of evaluation of the objective function.
164
175
- `max_time::Float64 = 30.0`: maximum time limit in seconds.
165
176
- `max_iter::Int = typemax(Int)`: maximum number of iterations.
177
+ - `M = 1` : requires objective decrease over the `M` last iterates (nonmonotone context). `M=1` implies monotone behaviour.
166
178
- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
167
179
- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region.
168
180
# Workspace for the plain first-order solver (no momentum), used by `fo`/`R2`.
mutable struct FoSolver{T, V} <: AbstractFirstOrderSolver
  x::V # current iterate
  g::V # gradient buffer
  c::V # candidate iterate
  o::V # objective memory for the nonmonotone decrease condition (length M)
  α::T # step size / regularization parameter
end
206
219
207
# Allocate the workspace of a `FoSolver` for `nlp`.
# `M` is the length of the objective memory used for the nonmonotone
# decrease condition; `M = 1` gives the classical monotone behaviour.
function FoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V}
  # Same guard as FomoSolver: an empty memory would break `maximum(o)`.
  M ≥ 1 || throw(ArgumentError("M must be ≥ 1, got $M"))
  x = similar(nlp.meta.x0) # current iterate
  g = similar(nlp.meta.x0) # gradient buffer
  c = similar(nlp.meta.x0) # candidate iterate
  # Objective memory, -Inf so the first objective value always replaces it.
  o = fill!(Vector{T}(undef, M), -Inf)
  return FoSolver{T, V}(x, g, c, o, T(0))
end
213
227
214
228
"""
@@ -218,11 +232,12 @@ mutable struct R2Solver{T, V} <: AbstractOptimizationSolver end
218
232
219
233
# `R2Solver` is kept as a deprecated alias of `FoSolver` (monotone, M = 1).
Base.@deprecate R2Solver(nlp::AbstractNLPModel; kwargs...) FoSolver(
  nlp::AbstractNLPModel;
  M = 1,
  kwargs...,
)
224
# Convenience entry point: allocate a `FoSolver` (with nonmonotone memory
# `M`) and run `solve!` with the quadratic-regularization step backend.
# Shares the `FoSolver` docstring.
@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; M::Int = 1, kwargs...) where {T, V}
  solver = FoSolver(nlp; M)
  stats = GenericExecutionStats(nlp)
  return solve!(solver, nlp, stats; step_backend = r2_step(), kwargs...)
end
236
251
end
237
252
238
253
# Reset the solver state for a fresh solve: clear the nonmonotone objective
# memory (-Inf so any new objective value replaces it).
function SolverCore.reset!(solver::FoSolver{T}) where {T}
  fill!(solver.o, -Inf)
  solver
end
241
257
@@ -281,6 +297,11 @@ function SolverCore.solve!(
281
297
set_iter! (stats, 0 )
282
298
f0 = obj (nlp, x)
283
299
set_objective! (stats, f0)
300
+ obj_mem = solver. o
301
+ M = length (obj_mem)
302
+ mem_ind = 0
303
+ obj_mem[mem_ind+ 1 ] = stats. objective
304
+ max_obj_mem = stats. objective
284
305
285
306
grad! (nlp, x, ∇fk)
286
307
norm_∇fk = norm (∇fk)
@@ -346,13 +367,13 @@ function SolverCore.solve!(
346
367
oneT = T (1 )
347
368
mdot∇f = T (0 ) # dot(momentum,∇fk)
348
369
while ! done
349
- λk = step_mult (solver. α, norm_d, step_backend)
350
- c .= x .- λk .* d
370
+ μk = step_mult (solver. α, norm_d, step_backend)
371
+ c .= x .- μk .* d
351
372
step_underflow = x == c # step addition underfow on every dimensions, should happen before solver.α == 0
352
- ΔTk = ((oneT - βmax) * norm_∇fk^ 2 + βmax * mdot∇f) * λk # = dot(d,∇fk) * λk with momentum, ‖∇fk‖²λk without momentum
373
+ ΔTk = ((oneT - βmax) * norm_∇fk^ 2 + βmax * mdot∇f) * μk # = dot(d,∇fk) * μk with momentum, ‖∇fk‖²μk without momentum
353
374
fck = obj (nlp, c)
354
375
unbounded = fck < fmin
355
- ρk = (stats . objective - fck) / ΔTk
376
+ ρk = (max_obj_mem - fck) / (max_obj_mem - stats . objective + ΔTk)
356
377
# Update regularization parameters
357
378
if ρk >= η2
358
379
solver. α = min (αmax, γ2 * solver. α)
@@ -371,13 +392,16 @@ function SolverCore.solve!(
371
392
momentum .= ∇fk .* (oneT - β) .+ momentum .* β
372
393
end
373
394
set_objective! (stats, fck)
395
+ mem_ind = (mem_ind+ 1 ) % M
396
+ obj_mem[mem_ind+ 1 ] = stats. objective
397
+ max_obj_mem = maximum (obj_mem)
398
+
374
399
grad! (nlp, x, ∇fk)
375
400
norm_∇fk = norm (∇fk)
376
401
if use_momentum
377
402
mdot∇f = dot (momentum, ∇fk)
378
403
p .= momentum .- ∇fk
379
- diff_norm = norm (p)
380
- βmax = find_beta (diff_norm, mdot∇f, norm_∇fk, β, θ1, θ2)
404
+ βmax = find_beta (p, mdot∇f, norm_∇fk, μk, stats. objective, max_obj_mem, β, θ1, θ2)
381
405
d .= ∇fk .* (oneT - βmax) .+ momentum .* βmax
382
406
norm_d = norm (d)
383
407
avgβmax += βmax
@@ -432,18 +456,29 @@ function SolverCore.solve!(
432
456
end
433
457
434
458
"""
    find_beta(p, mdot∇f, norm_∇f, μk, fk, max_obj_mem, β, θ1, θ2)

Compute `βmax` which saturates the contribution of the momentum term to the gradient.
`βmax` is computed such that the two gradient-related conditions (the first one is
relaxed in the nonmonotone case) are ensured:
1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm + (max_obj_mem - fk)/μk ≥ θ1 * ‖∇f(xk)‖²
2. ‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) .+ βmax .* m‖
with `m` the momentum term, `p = m .- ∇f(xk)` (precomputed by the caller),
`mdot∇f = ∇f(xk)ᵀm`, `fk` the model value at s = 0, `μk` the step multiplier, and
`max_obj_mem` the largest objective value over the last M successful iterations.
Returns a value in `[0, β]` (the candidate bounds `β1`, `β2` fall back to `β`
when their defining denominators are non-positive / zero).
"""
function find_beta(
  p::V,
  mdot∇f::T,
  norm_∇f::T,
  μk::T,
  fk::T,
  max_obj_mem::T,
  β::T,
  θ1::T,
  θ2::T,
) where {T, V}
  # Condition (1): only binds when the momentum loses alignment with the
  # gradient (n1 > 0); the nonmonotone slack (max_obj_mem - fk)/μk ≥ 0 relaxes it.
  n1 = norm_∇f^2 - mdot∇f
  # Condition (2): bounds the direction norm relative to ‖∇f(xk)‖.
  n2 = norm(p)
  β1 = n1 > 0 ? ((1 - θ1) * norm_∇f^2 - (fk - max_obj_mem) / μk) / n1 : β
  β2 = n2 != 0 ? (1 - θ2) * norm_∇f / n2 : β
  return min(β, min(β1, β2))
end
449
484
0 commit comments