Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fomo nonmonotone extension #267

Merged
merged 18 commits into from
Apr 13, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 65 additions & 27 deletions src/fomo.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ mk .= ∇f(xk) .* (1 - βmax) .+ mk .* βmax
and βmax ∈ [0,β] chosen as to ensure d is gradient-related, i.e., the following 2 conditions are satisfied:
(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk ≥ θ1 * ‖∇f(xk)‖² (1)
‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) + βmax .* mk‖ (2)
In the nonmonotone case, (1) rewrites
(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk + (fm - fk)/μk ≥ θ1 * ‖∇f(xk)‖²,
with fm the greatest objective value over the last M successful iterations, and fk = f(xk).
d-monnet marked this conversation as resolved.
Show resolved Hide resolved

# Advanced usage

Expand Down Expand Up @@ -49,6 +52,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i
- `β = T(0.9) ∈ [0,1)`: target decay rate for the momentum.
- `θ1 = T(0.1)`: momentum contribution parameter for convergence condition (1).
- `θ2 = T(eps(T)^(1/3))`: momentum contribution parameter for convergence condition (2).
- `M::Int = 1`: requires objective decrease over the `M` last iterates (nonmonotone context). `M = 1` implies monotone behaviour.
- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region.

Expand Down Expand Up @@ -107,28 +111,35 @@ mutable struct FomoSolver{T, V} <: AbstractFirstOrderSolver
m::V
d::V
p::V
o::V
α::T
end

function FomoSolver(nlp::AbstractNLPModel{T, V}) where {T, V}
function FomoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V}
x = similar(nlp.meta.x0)
g = similar(nlp.meta.x0)
c = similar(nlp.meta.x0)
m = fill!(similar(nlp.meta.x0), 0)
d = fill!(similar(nlp.meta.x0), 0)
p = similar(nlp.meta.x0)
return FomoSolver{T, V}(x, g, c, m, d, p, T(0))
o = fill!(Vector{T}(undef, M), -Inf)
return FomoSolver{T, V}(x, g, c, m, d, p, o, T(0))
end

@doc (@doc FomoSolver) function fomo(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V}
solver = FomoSolver(nlp)
@doc (@doc FomoSolver) function fomo(
nlp::AbstractNLPModel{T, V};
M::Int = 1,
kwargs...,
) where {T, V}
solver = FomoSolver(nlp; M)
solver_specific = Dict(:avgβmax => T(0.0))
stats = GenericExecutionStats(nlp; solver_specific = solver_specific)
return solve!(solver, nlp, stats; kwargs...)
end

# Reset the state that `solver` carries from one solve to the next so it can be reused.
# The objective memory `solver.o` is refilled with -Inf and the buffer `solver.m`
# (presumably the momentum term, confirm against `solve!`) is zeroed.
# Returns the same `solver` instance.
function SolverCore.reset!(solver::FomoSolver{T}) where {T}
  # The two buffers are independent, so the order of the fills does not matter.
  fill!(solver.o, -Inf)
  fill!(solver.m, 0)
  return solver
end

Expand Down Expand Up @@ -163,6 +174,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i
- `max_eval::Int = -1`: maximum number of evaluation of the objective function.
- `max_time::Float64 = 30.0`: maximum time limit in seconds.
- `max_iter::Int = typemax(Int)`: maximum number of iterations.
- `M::Int = 1`: requires objective decrease over the `M` last iterates (nonmonotone context). `M = 1` implies monotone behaviour.
- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region.

Expand Down Expand Up @@ -201,14 +213,16 @@ mutable struct FoSolver{T, V} <: AbstractFirstOrderSolver
x::V
g::V
c::V
o::V
α::T
end

function FoSolver(nlp::AbstractNLPModel{T, V}) where {T, V}
function FoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V}
x = similar(nlp.meta.x0)
g = similar(nlp.meta.x0)
c = similar(nlp.meta.x0)
return FoSolver{T, V}(x, g, c, T(0))
o = fill!(Vector{T}(undef, M), -Inf)
return FoSolver{T, V}(x, g, c, o, T(0))
end

"""
Expand All @@ -218,11 +232,12 @@ mutable struct R2Solver{T, V} <: AbstractOptimizationSolver end

Base.@deprecate R2Solver(nlp::AbstractNLPModel; kwargs...) FoSolver(
nlp::AbstractNLPModel;
M = 1,
kwargs...,
)

@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; kwargs...) where {T, V}
solver = FoSolver(nlp)
@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; M::Int = 1, kwargs...) where {T, V}
solver = FoSolver(nlp; M)
stats = GenericExecutionStats(nlp)
return solve!(solver, nlp, stats; step_backend = r2_step(), kwargs...)
end
Expand All @@ -236,6 +251,7 @@ end
end

# Reset the state that `solver` carries from one solve to the next so it can be reused.
# Only the objective memory `solver.o` needs clearing; it is refilled with -Inf.
# Returns the same `solver` instance.
function SolverCore.reset!(solver::FoSolver{T}) where {T}
  fill!(solver.o, -Inf)
  return solver
end

Expand Down Expand Up @@ -279,7 +295,13 @@ function SolverCore.solve!(
d = use_momentum ? solver.d : solver.g # g = d if no momentum
p = use_momentum ? solver.p : nothing # not used if no momentum
set_iter!(stats, 0)
set_objective!(stats, obj(nlp, x))
f0 = obj(nlp, x)
set_objective!(stats, f0)
obj_mem = solver.o
M = length(obj_mem)
mem_ind = 0
obj_mem[mem_ind+1] = stats.objective
max_obj_mem = stats.objective

grad!(nlp, x, ∇fk)
norm_∇fk = norm(∇fk)
Expand All @@ -288,6 +310,9 @@ function SolverCore.solve!(
solver.α = init_alpha(norm_∇fk, step_backend)

# Stopping criterion:
fmin = min(-one(T), f0) / eps(T)
unbounded = f0 < fmin

ϵ = atol + rtol * norm_∇fk
optimal = norm_∇fk ≤ ϵ
step_param_name = is_r2 ? "σ" : "Δ"
Expand Down Expand Up @@ -321,6 +346,7 @@ function SolverCore.solve!(
nlp,
elapsed_time = stats.elapsed_time,
optimal = optimal,
unbounded = unbounded,
max_eval = max_eval,
iter = stats.iter,
max_iter = max_iter,
Expand All @@ -341,16 +367,13 @@ function SolverCore.solve!(
oneT = T(1)
mdot∇f = T(0) # dot(momentum,∇fk)
while !done
λk = step_mult(solver.α, norm_d, step_backend)
c .= x .- λk .* d
μk = step_mult(solver.α, norm_d, step_backend)
c .= x .- μk .* d
step_underflow = x == c # step addition underfow on every dimensions, should happen before solver.α == 0
ΔTk = ((oneT - βmax) * norm_∇fk^2 + βmax * mdot∇f) * λk # = dot(d,∇fk) * λk with momentum, ‖∇fk‖²λk without momentum
ΔTk = ((oneT - βmax) * norm_∇fk^2 + βmax * mdot∇f) * μk # = dot(d,∇fk) * μk with momentum, ‖∇fk‖²μk without momentum
fck = obj(nlp, c)
if fck == -Inf
set_status!(stats, :unbounded)
break
end
ρk = (stats.objective - fck) / ΔTk
unbounded = fck < fmin
ρk = (max_obj_mem - fck) / (max_obj_mem - stats.objective + ΔTk)
# Update regularization parameters
if ρk >= η2
solver.α = min(αmax, γ2 * solver.α)
Expand All @@ -369,13 +392,16 @@ function SolverCore.solve!(
momentum .= ∇fk .* (oneT - β) .+ momentum .* β
end
set_objective!(stats, fck)
mem_ind = (mem_ind+1) % M
obj_mem[mem_ind+1] = stats.objective
max_obj_mem = maximum(obj_mem)

grad!(nlp, x, ∇fk)
norm_∇fk = norm(∇fk)
if use_momentum
mdot∇f = dot(momentum, ∇fk)
p .= momentum .- ∇fk
diff_norm = norm(p)
βmax = find_beta(diff_norm, mdot∇f, norm_∇fk, β, θ1, θ2)
βmax = find_beta(p, mdot∇f, norm_∇fk, μk, stats.objective, max_obj_mem, β, θ1, θ2)
d .= ∇fk .* (oneT - βmax) .+ momentum .* βmax
norm_d = norm(d)
avgβmax += βmax
Expand Down Expand Up @@ -406,6 +432,7 @@ function SolverCore.solve!(
nlp,
elapsed_time = stats.elapsed_time,
optimal = optimal,
unbounded = unbounded,
max_eval = max_eval,
iter = stats.iter,
max_iter = max_iter,
Expand All @@ -429,18 +456,29 @@ function SolverCore.solve!(
end

"""
find_beta(m, mdot∇f, norm_∇f, β, θ1, θ2)
find_beta(p, mdot∇f, norm_∇f, μk, fk, max_obj_mem, β, θ1, θ2)

Compute value `βmax` that saturates the contribution of the momentum term to the gradient.
`βmax` is computed such that the two gradient-related conditions are ensured:
1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm ≥ θ1 * ‖∇f(xk)‖²
Compute `βmax` which saturates the contribution of the momentum term to the gradient.
d-monnet marked this conversation as resolved.
Show resolved Hide resolved
`βmax` is computed such that the two gradient-related conditions (first one is relaxed in the nonmonotone case) are ensured:
1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm + (max_obj_mem - fk)/μk ≥ θ1 * ‖∇f(xk)‖²
2. ‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) * ∇f(xk) .+ βmax .* m‖
with `m` the momentum term and `mdot∇f = ∇f(xk)ᵀm`
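with `m` the momentum term (the function receives `p = m .- ∇f(xk)` as its first argument), `mdot∇f = ∇f(xk)ᵀm`, `fk = f(xk)` the objective value at the current iterate (i.e., the model value at s = 0), and `max_obj_mem` the greatest objective value over the last M successful iterations.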
"""
function find_beta(diff_norm::T, mdot∇f::T, norm_∇f::T, β::T, θ1::T, θ2::T) where {T}
function find_beta(
p::V,
mdot∇f::T,
norm_∇f::T,
μk::T,
fk::T,
max_obj_mem::T,
β::T,
θ1::T,
θ2::T,
) where {T, V}
n1 = norm_∇f^2 - mdot∇f
β1 = n1 > 0 ? (1 - θ1) * norm_∇f^2 / n1 : β
β2 = diff_norm != 0 ? (1 - θ2) * norm_∇f / diff_norm : β
n2 = norm(p)
β1 = n1 > 0 ? ((1 - θ1) * norm_∇f^2 - (fk - max_obj_mem) / μk) / n1 : β
β2 = n2 != 0 ? (1 - θ2) * norm_∇f / n2 : β
return min(β, min(β1, β2))
end

Expand Down
5 changes: 5 additions & 0 deletions src/lbfgs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -153,13 +153,16 @@ function SolverCore.solve!(
verbose > 0 && @info log_row(Any[stats.iter, f, ∇fNorm, T, Int])

optimal = ∇fNorm ≤ ϵ
fmin = min(-one(T), f) / eps(T)
unbounded = f < fmin

set_status!(
stats,
get_status(
nlp,
elapsed_time = stats.elapsed_time,
optimal = optimal,
unbounded = unbounded,
max_eval = max_eval,
iter = stats.iter,
max_iter = max_iter,
Expand Down Expand Up @@ -210,13 +213,15 @@ function SolverCore.solve!(
set_time!(stats, time() - start_time)
set_dual_residual!(stats, ∇fNorm)
optimal = ∇fNorm ≤ ϵ
unbounded = f < fmin

set_status!(
stats,
get_status(
nlp,
elapsed_time = stats.elapsed_time,
optimal = optimal,
unbounded = unbounded,
max_eval = max_eval,
iter = stats.iter,
max_iter = max_iter,
Expand Down
5 changes: 5 additions & 0 deletions src/trunk.jl
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,8 @@ function SolverCore.solve!(
set_objective!(stats, f)
set_dual_residual!(stats, ∇fNorm2)
optimal = ∇fNorm2 ≤ ϵ
fmin = min(-one(T), f) / eps(T)
unbounded = f < fmin

verbose > 0 && @info log_header(
[:iter, :f, :dual, :radius, :ratio, :inner, :bk, :cgstatus],
Expand All @@ -210,6 +212,7 @@ function SolverCore.solve!(
nlp,
elapsed_time = stats.elapsed_time,
optimal = optimal,
unbounded = unbounded,
max_eval = max_eval,
iter = stats.iter,
max_iter = max_iter,
Expand Down Expand Up @@ -381,13 +384,15 @@ function SolverCore.solve!(
update!(tr, sNorm)

optimal = ∇fNorm2 ≤ ϵ
unbounded = f < fmin

set_status!(
stats,
get_status(
nlp,
elapsed_time = stats.elapsed_time,
optimal = optimal,
unbounded = unbounded,
max_eval = max_eval,
iter = stats.iter,
max_iter = max_iter,
Expand Down
6 changes: 5 additions & 1 deletion test/allocs.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,11 @@ if Sys.isunix()
for model in NLPModelsTest.nlp_problems
nlp = eval(Meta.parse(model))()
if unconstrained(nlp) || (bound_constrained(nlp) && (symsolver == :TronSolver))
solver = eval(symsolver)(nlp)
if (symsolver == :FoSolver || symsolver == :FomoSolver)
solver = eval(symsolver)(nlp; M = 2) # nonmonotone configuration allocates extra memory
else
solver = eval(symsolver)(nlp)
end
if symsolver == :FomoSolver
T = eltype(nlp.meta.x0)
stats = GenericExecutionStats(nlp, solver_specific = Dict(:avgβmax => T(0)))
Expand Down
13 changes: 13 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ end
end
end

# Each solver is run on an objective with no lower bound and must stop with status
# `:unbounded`, with a final objective value below -1/eps(T).
@testset "Test unbounded below" begin
  @testset "$fun" for fun in (R2, fomo, lbfgs, tron, trunk)
    T = Float64
    # x -> -exp(x[1]) decreases without bound as x[1] grows; start from the origin.
    nlp = ADNLPModel(x -> -exp(x[1]), zeros(T, 1))

    # `fun` is already a callable solver entry point, so it is invoked directly.
    stats = fun(nlp)
    @test stats.status == :unbounded
    @test stats.objective < -one(T) / eps(T)
  end
end

include("restart.jl")
include("callback.jl")
include("consistency.jl")
Expand Down
8 changes: 8 additions & 0 deletions test/test_solvers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@ function tests()
unconstrained_nlp(solver)
multiprecision_nlp(solver, :unc)
end
@testset "$name : nonmonotone configuration" for (name, solver) in [
("R2", (nlp; kwargs...) -> R2(nlp, M = 2; kwargs...)),
("fomo_r2", (nlp; kwargs...) -> fomo(nlp, M = 2; kwargs...)),
("fomo_tr", (nlp; kwargs...) -> fomo(nlp, M = 2, step_backend = JSOSolvers.tr_step(); kwargs...)),
]
unconstrained_nlp(solver)
multiprecision_nlp(solver, :unc)
end
end
@testset "Bound-constrained solvers" begin
@testset "$solver" for solver in [tron]
Expand Down
Loading