diff --git a/src/fomo.jl b/src/fomo.jl index dc7dd338..0346ff01 100644 --- a/src/fomo.jl +++ b/src/fomo.jl @@ -20,6 +20,9 @@ mk .= ∇f(xk) .* (1 - βmax) .+ mk .* βmax and βmax ∈ [0,β] chosen as to ensure d is gradient-related, i.e., the following 2 conditions are satisfied: (1-βmax) .* ∇f(xk) + βmax .* ∇f(xk)ᵀmk ≥ θ1 * ‖∇f(xk)‖² (1) ‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) *. ∇f(xk) + βmax .* mk‖ (2) +In the nonmonotone case, (1) rewrites +(1-βmax) .* ∇f(xk) + βmax .* ∇f(xk)ᵀmk + (fm - fk)/μk ≥ θ1 * ‖∇f(xk)‖², +with fm the largest objective value over the last M successful iterations, and fk = f(xk). # Advanced usage @@ -49,6 +52,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i - `β = T(0.9) ∈ [0,1)`: target decay rate for the momentum. - `θ1 = T(0.1)`: momentum contribution parameter for convergence condition (1). - `θ2 = T(eps(T)^(1/3))`: momentum contribution parameter for convergence condition (2). +- `M = 1` : requires objective decrease over the `M` last iterates (nonmonotone context). `M=1` implies monotone behaviour. - `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration. - `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region. @@ -107,21 +111,27 @@ mutable struct FomoSolver{T, V} <: AbstractFirstOrderSolver m::V d::V p::V + o::V α::T end -function FomoSolver(nlp::AbstractNLPModel{T, V}) where {T, V} +function FomoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V} x = similar(nlp.meta.x0) g = similar(nlp.meta.x0) c = similar(nlp.meta.x0) m = fill!(similar(nlp.meta.x0), 0) d = fill!(similar(nlp.meta.x0), 0) p = similar(nlp.meta.x0) - return FomoSolver{T, V}(x, g, c, m, d, p, T(0)) + o = fill!(Vector{T}(undef, M), -Inf) + return FomoSolver{T, V}(x, g, c, m, d, p, o, T(0)) end -@doc (@doc FomoSolver) function fomo(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V} - solver = FomoSolver(nlp) +@doc (@doc FomoSolver) function fomo( + nlp::AbstractNLPModel{T, V}; + M::Int = 1, + kwargs..., +) where {T, V} + solver = FomoSolver(nlp; M) solver_specific = Dict(:avgβmax => T(0.0)) stats = GenericExecutionStats(nlp; solver_specific = solver_specific) return solve!(solver, nlp, stats; kwargs...) @@ -129,6 +139,7 @@ end function SolverCore.reset!(solver::FomoSolver{T}) where {T} fill!(solver.m, 0) + fill!(solver.o, -Inf) solver end @@ -163,6 +174,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i - `max_eval::Int = -1`: maximum number of evaluation of the objective function. - `max_time::Float64 = 30.0`: maximum time limit in seconds. - `max_iter::Int = typemax(Int)`: maximum number of iterations. +- `M = 1` : requires objective decrease over the `M` last iterates (nonmonotone context). `M=1` implies monotone behaviour. - `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration. - `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region. @@ -201,14 +213,16 @@ mutable struct FoSolver{T, V} <: AbstractFirstOrderSolver x::V g::V c::V + o::V α::T end -function FoSolver(nlp::AbstractNLPModel{T, V}) where {T, V} +function FoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V} x = similar(nlp.meta.x0) g = similar(nlp.meta.x0) c = similar(nlp.meta.x0) - return FoSolver{T, V}(x, g, c, T(0)) + o = fill!(Vector{T}(undef, M), -Inf) + return FoSolver{T, V}(x, g, c, o, T(0)) end """ @@ -218,11 +232,12 @@ mutable struct R2Solver{T, V} <: AbstractOptimizationSolver end Base.@deprecate R2Solver(nlp::AbstractNLPModel; kwargs...) FoSolver( nlp::AbstractNLPModel; + M = 1, kwargs..., ) -@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V} - solver = FoSolver(nlp) +@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; M::Int = 1, kwargs...) where {T, V} + solver = FoSolver(nlp; M) stats = GenericExecutionStats(nlp) return solve!(solver, nlp, stats; step_backend = r2_step(), kwargs...) end @@ -236,6 +251,7 @@ end end function SolverCore.reset!(solver::FoSolver{T}) where {T} + fill!(solver.o, -Inf) solver end @@ -281,6 +297,11 @@ function SolverCore.solve!( set_iter!(stats, 0) f0 = obj(nlp, x) set_objective!(stats, f0) + obj_mem = solver.o + M = length(obj_mem) + mem_ind = 0 + obj_mem[mem_ind+1] = stats.objective + max_obj_mem = stats.objective grad!(nlp, x, ∇fk) norm_∇fk = norm(∇fk) @@ -346,13 +367,13 @@ function SolverCore.solve!( oneT = T(1) mdot∇f = T(0) # dot(momentum,∇fk) while !done - λk = step_mult(solver.α, norm_d, step_backend) - c .= x .- λk .* d + μk = step_mult(solver.α, norm_d, step_backend) + c .= x .- μk .* d step_underflow = x == c # step addition underfow on every dimensions, should happen before solver.α == 0 - ΔTk = ((oneT - βmax) * norm_∇fk^2 + βmax * mdot∇f) * λk # = dot(d,∇fk) * λk with momentum, ‖∇fk‖²λk without momentum + ΔTk = ((oneT - βmax) * norm_∇fk^2 + βmax * mdot∇f) * μk # = dot(d,∇fk) * μk with momentum, ‖∇fk‖²μk without momentum fck = obj(nlp, c) unbounded = fck < fmin - ρk = (stats.objective - fck) / ΔTk + ρk = (max_obj_mem - fck) / (max_obj_mem - stats.objective + ΔTk) # Update regularization parameters if ρk >= η2 solver.α = min(αmax, γ2 * solver.α) @@ -371,13 +392,16 @@ function SolverCore.solve!( momentum .= ∇fk .* (oneT - β) .+ momentum .* β end set_objective!(stats, fck) + mem_ind = (mem_ind+1) % M + obj_mem[mem_ind+1] = stats.objective + max_obj_mem = maximum(obj_mem) + grad!(nlp, x, ∇fk) norm_∇fk = norm(∇fk) if use_momentum mdot∇f = dot(momentum, ∇fk) p .= momentum .- ∇fk - diff_norm = norm(p) - βmax = find_beta(diff_norm, mdot∇f, norm_∇fk, β, θ1, θ2) + βmax = find_beta(p, mdot∇f, norm_∇fk, μk, stats.objective, max_obj_mem, β, θ1, θ2) d .= ∇fk .* (oneT - βmax) .+ momentum .* βmax norm_d = norm(d) avgβmax += βmax @@ -432,18 +456,29 @@ function SolverCore.solve!( end """ - find_beta(m, mdot∇f, norm_∇f, β, θ1, θ2) + find_beta(m, mdot∇f, norm_∇f, μk, fk, max_obj_mem, β, θ1, θ2) -Compute value `βmax` that saturates the contribution of the momentum term to the gradient. -`βmax` is computed such that the two gradient-related conditions are ensured: -1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm ≥ θ1 * ‖∇f(xk)‖² +Compute βmax which saturates the contribution of the momentum term to the gradient. +`βmax` is computed such that the two gradient-related conditions (first one is relaxed in the nonmonotone case) are ensured: +1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm + (max_obj_mem - fk)/μk ≥ θ1 * ‖∇f(xk)‖² 2. ‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) * ∇f(xk) .+ βmax .* m‖ -with `m` the momentum term and `mdot∇f = ∇f(xk)ᵀm` +with `m` the momentum term and `mdot∇f = ∇f(xk)ᵀm`, `fk` the model at s=0, `max_obj_mem` the largest objective value over the last M successful iterations. """ -function find_beta(diff_norm::T, mdot∇f::T, norm_∇f::T, β::T, θ1::T, θ2::T) where {T} +function find_beta( + p::V, + mdot∇f::T, + norm_∇f::T, + μk::T, + fk::T, + max_obj_mem::T, + β::T, + θ1::T, + θ2::T, +) where {T, V} n1 = norm_∇f^2 - mdot∇f - β1 = n1 > 0 ? (1 - θ1) * norm_∇f^2 / n1 : β - β2 = diff_norm != 0 ? (1 - θ2) * norm_∇f / diff_norm : β + n2 = norm(p) + β1 = n1 > 0 ? ((1 - θ1) * norm_∇f^2 - (fk - max_obj_mem) / μk) / n1 : β + β2 = n2 != 0 ? (1 - θ2) * norm_∇f / n2 : β return min(β, min(β1, β2)) end diff --git a/test/allocs.jl b/test/allocs.jl index f8866f93..b042ef32 100644 --- a/test/allocs.jl +++ b/test/allocs.jl @@ -35,7 +35,11 @@ if Sys.isunix() for model in NLPModelsTest.nlp_problems nlp = eval(Meta.parse(model))() if unconstrained(nlp) || (bound_constrained(nlp) && (symsolver == :TronSolver)) - solver = eval(symsolver)(nlp) + if (symsolver == :FoSolver || symsolver == :FomoSolver) + solver = eval(symsolver)(nlp; M = 2) # nonmonotone configuration allocates extra memory + else + solver = eval(symsolver)(nlp) + end if symsolver == :FomoSolver T = eltype(nlp.meta.x0) stats = GenericExecutionStats(nlp, solver_specific = Dict(:avgβmax => T(0))) diff --git a/test/test_solvers.jl b/test/test_solvers.jl index eb9029e1..dff97532 100644 --- a/test/test_solvers.jl +++ b/test/test_solvers.jl @@ -14,6 +14,14 @@ function tests() unconstrained_nlp(solver) multiprecision_nlp(solver, :unc) end + @testset "$name : nonmonotone configuration" for (name, solver) in [ + ("R2", (nlp; kwargs...) -> R2(nlp, M = 2; kwargs...)), + ("fomo_r2", (nlp; kwargs...) -> fomo(nlp, M = 2; kwargs...)), + ("fomo_tr", (nlp; kwargs...) -> fomo(nlp, M = 2, step_backend = JSOSolvers.tr_step(); kwargs...)), + ] + unconstrained_nlp(solver) + multiprecision_nlp(solver, :unc) + end end @testset "Bound-constrained solvers" begin @testset "$solver" for solver in [tron]