@@ -20,6 +20,9 @@ mk .= ∇f(xk) .* (1 - βmax) .+ mk .* βmax
20
20
and βmax ∈ [0,β] chosen as to ensure d is gradient-related, i.e., the following 2 conditions are satisfied:
(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk ≥ θ1 * ‖∇f(xk)‖² (1)
‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) + βmax .* mk‖ (2)
In the nonmonotone case, (1) rewrites
(1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀmk + (fm - fk)/μk ≥ θ1 * ‖∇f(xk)‖²,
with fm the largest objective value over the last M successful iterations, and fk = f(xk).
23
26
24
27
# Advanced usage
25
28
@@ -49,6 +52,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i
49
52
- `β = T(0.9) ∈ [0,1)`: target decay rate for the momentum.
50
53
- `θ1 = T(0.1)`: momentum contribution parameter for convergence condition (1).
51
54
- `θ2 = T(eps(T)^(1/3))`: momentum contribution parameter for convergence condition (2).
55
+ - `M = 1` : requires objective decrease over the `M` last iterates (nonmonotone context). `M=1` implies monotone behaviour.
52
56
- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
53
57
- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region.
54
58
@@ -107,28 +111,35 @@ mutable struct FomoSolver{T, V} <: AbstractFirstOrderSolver
107
111
m:: V
108
112
d:: V
109
113
p:: V
114
+ o:: V
110
115
α:: T
111
116
end
112
117
113
# Allocate the workspace of a `FomoSolver` for `nlp`.
# `M` is the length of the objective memory used for the nonmonotone
# decrease condition; `M = 1` gives the classical monotone behaviour.
function FomoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V}
  # Guard against an empty/negative memory: `maximum(o)` over an empty
  # vector would fail obscurely later in `solve!`.
  M ≥ 1 || throw(ArgumentError("M must be ≥ 1, got $M"))
  x = similar(nlp.meta.x0)              # current iterate
  g = similar(nlp.meta.x0)              # gradient buffer
  c = similar(nlp.meta.x0)              # candidate iterate
  m = fill!(similar(nlp.meta.x0), 0)    # momentum term
  d = fill!(similar(nlp.meta.x0), 0)    # search direction
  p = similar(nlp.meta.x0)              # buffer for momentum - ∇f
  # Objective memory, initialised to -Inf so the first objective value
  # always replaces it. NOTE(review): built as a plain `Vector{T}` while
  # the struct field is declared `::V` — presumably fine since M is small,
  # but this forces a conversion when `V` is not `Vector{T}`; confirm.
  o = fill!(Vector{T}(undef, M), -Inf)
  return FomoSolver{T, V}(x, g, c, m, d, p, o, T(0))
end
122
128
123
# Convenience entry point: allocate a `FomoSolver` (with nonmonotone
# memory `M`) and run `solve!` on it. Shares the `FomoSolver` docstring.
@doc (@doc FomoSolver) function fomo(
  nlp::AbstractNLPModel{T, V};
  M::Int = 1,
  kwargs...,
) where {T, V}
  solver = FomoSolver(nlp; M)
  # `:avgβmax` records the average momentum contribution over the run.
  solver_specific = Dict(:avgβmax => T(0.0))
  stats = GenericExecutionStats(nlp; solver_specific = solver_specific)
  return solve!(solver, nlp, stats; kwargs...)
end
129
139
130
140
# Reset the solver state for a fresh solve: clear the momentum buffer and
# the nonmonotone objective memory (-Inf so any new value replaces it).
function SolverCore.reset!(solver::FomoSolver{T}) where {T}
  fill!(solver.m, 0)
  fill!(solver.o, -Inf)
  solver
end
134
145
@@ -163,6 +174,7 @@ For advanced usage, first define a `FomoSolver` to preallocate the memory used i
163
174
- `max_eval::Int = -1`: maximum number of evaluation of the objective function.
164
175
- `max_time::Float64 = 30.0`: maximum time limit in seconds.
165
176
- `max_iter::Int = typemax(Int)`: maximum number of iterations.
177
+ - `M = 1` : requires objective decrease over the `M` last iterates (nonmonotone context). `M=1` implies monotone behaviour.
166
178
- `verbose::Int = 0`: if > 0, display iteration details every `verbose` iteration.
167
179
- `step_backend = r2_step()`: step computation mode. Options are `r2_step()` for quadratic regulation step and `tr_step()` for first-order trust-region.
168
180
# Workspace for the plain first-order solver (no momentum), used by `fo`/`R2`.
mutable struct FoSolver{T, V} <: AbstractFirstOrderSolver
  x::V # current iterate
  g::V # gradient buffer
  c::V # candidate iterate
  o::V # objective memory for the nonmonotone decrease condition (length M)
  α::T # step size / regularization parameter
end
206
219
207
# Allocate the workspace of a `FoSolver` for `nlp`.
# `M` is the length of the objective memory used for the nonmonotone
# decrease condition; `M = 1` gives the classical monotone behaviour.
function FoSolver(nlp::AbstractNLPModel{T, V}; M::Int = 1) where {T, V}
  # Same guard as FomoSolver: an empty memory would break `maximum(o)`.
  M ≥ 1 || throw(ArgumentError("M must be ≥ 1, got $M"))
  x = similar(nlp.meta.x0) # current iterate
  g = similar(nlp.meta.x0) # gradient buffer
  c = similar(nlp.meta.x0) # candidate iterate
  # Objective memory, -Inf so the first objective value always replaces it.
  o = fill!(Vector{T}(undef, M), -Inf)
  return FoSolver{T, V}(x, g, c, o, T(0))
end
213
227
214
228
"""
@@ -218,11 +232,12 @@ mutable struct R2Solver{T, V} <: AbstractOptimizationSolver end
218
232
219
233
# `R2Solver` is kept as a deprecated alias of `FoSolver` (monotone, M = 1).
Base.@deprecate R2Solver(nlp::AbstractNLPModel; kwargs...) FoSolver(
  nlp::AbstractNLPModel;
  M = 1,
  kwargs...,
)
224
# Convenience entry point: allocate a `FoSolver` (with nonmonotone memory
# `M`) and run `solve!` with the quadratic-regularization step backend.
# Shares the `FoSolver` docstring.
@doc (@doc FoSolver) function fo(nlp::AbstractNLPModel{T, V}; M::Int = 1, kwargs...) where {T, V}
  solver = FoSolver(nlp; M)
  stats = GenericExecutionStats(nlp)
  return solve!(solver, nlp, stats; step_backend = r2_step(), kwargs...)
end
236
251
end
237
252
238
253
# Reset the solver state for a fresh solve: clear the nonmonotone objective
# memory (-Inf so any new objective value replaces it).
function SolverCore.reset!(solver::FoSolver{T}) where {T}
  fill!(solver.o, -Inf)
  solver
end
241
257
@@ -281,6 +297,11 @@ function SolverCore.solve!(
281
297
set_iter! (stats, 0 )
282
298
f0 = obj (nlp, x)
283
299
set_objective! (stats, f0)
300
+ obj_mem = solver. o
301
+ M = length (obj_mem)
302
+ mem_ind = 0
303
+ obj_mem[mem_ind+ 1 ] = stats. objective
304
+ max_obj_mem = stats. objective
284
305
285
306
grad! (nlp, x, ∇fk)
286
307
norm_∇fk = norm (∇fk)
@@ -346,13 +367,13 @@ function SolverCore.solve!(
346
367
oneT = T (1 )
347
368
mdot∇f = T (0 ) # dot(momentum,∇fk)
348
369
while ! done
349
- λk = step_mult (solver. α, norm_d, step_backend)
350
- c .= x .- λk .* d
370
+ μk = step_mult (solver. α, norm_d, step_backend)
371
+ c .= x .- μk .* d
351
372
step_underflow = x == c # step addition underfow on every dimensions, should happen before solver.α == 0
352
- ΔTk = ((oneT - βmax) * norm_∇fk^ 2 + βmax * mdot∇f) * λk # = dot(d,∇fk) * λk with momentum, ‖∇fk‖²λk without momentum
373
+ ΔTk = ((oneT - βmax) * norm_∇fk^ 2 + βmax * mdot∇f) * μk # = dot(d,∇fk) * μk with momentum, ‖∇fk‖²μk without momentum
353
374
fck = obj (nlp, c)
354
375
unbounded = fck < fmin
355
- ρk = (stats . objective - fck) / ΔTk
376
+ ρk = (max_obj_mem - fck) / (max_obj_mem - stats . objective + ΔTk)
356
377
# Update regularization parameters
357
378
if ρk >= η2
358
379
solver. α = min (αmax, γ2 * solver. α)
@@ -371,13 +392,16 @@ function SolverCore.solve!(
371
392
momentum .= ∇fk .* (oneT - β) .+ momentum .* β
372
393
end
373
394
set_objective! (stats, fck)
395
+ mem_ind = (mem_ind+ 1 ) % M
396
+ obj_mem[mem_ind+ 1 ] = stats. objective
397
+ max_obj_mem = maximum (obj_mem)
398
+
374
399
grad! (nlp, x, ∇fk)
375
400
norm_∇fk = norm (∇fk)
376
401
if use_momentum
377
402
mdot∇f = dot (momentum, ∇fk)
378
403
p .= momentum .- ∇fk
379
- diff_norm = norm (p)
380
- βmax = find_beta (diff_norm, mdot∇f, norm_∇fk, β, θ1, θ2)
404
+ βmax = find_beta (p, mdot∇f, norm_∇fk, μk, stats. objective, max_obj_mem, β, θ1, θ2)
381
405
d .= ∇fk .* (oneT - βmax) .+ momentum .* βmax
382
406
norm_d = norm (d)
383
407
avgβmax += βmax
@@ -432,18 +456,29 @@ function SolverCore.solve!(
432
456
end
433
457
434
458
"""
    find_beta(p, mdot∇f, norm_∇f, μk, fk, max_obj_mem, β, θ1, θ2)

Compute `βmax` which saturates the contribution of the momentum term to the gradient.
`βmax` is computed such that the two gradient-related conditions (the first one is
relaxed in the nonmonotone case) are ensured:
1. (1-βmax) * ‖∇f(xk)‖² + βmax * ∇f(xk)ᵀm + (max_obj_mem - fk)/μk ≥ θ1 * ‖∇f(xk)‖²
2. ‖∇f(xk)‖ ≥ θ2 * ‖(1-βmax) .* ∇f(xk) .+ βmax .* m‖
with `m` the momentum term, `p = m .- ∇f(xk)` (precomputed by the caller),
`mdot∇f = ∇f(xk)ᵀm`, `fk` the model value at s = 0, `μk` the step multiplier, and
`max_obj_mem` the largest objective value over the last M successful iterations.
Returns a value in `[0, β]` (the candidate bounds `β1`, `β2` fall back to `β`
when their defining denominators are non-positive / zero).
"""
function find_beta(
  p::V,
  mdot∇f::T,
  norm_∇f::T,
  μk::T,
  fk::T,
  max_obj_mem::T,
  β::T,
  θ1::T,
  θ2::T,
) where {T, V}
  # Condition (1): only binds when the momentum loses alignment with the
  # gradient (n1 > 0); the nonmonotone slack (max_obj_mem - fk)/μk ≥ 0 relaxes it.
  n1 = norm_∇f^2 - mdot∇f
  # Condition (2): bounds the direction norm relative to ‖∇f(xk)‖.
  n2 = norm(p)
  β1 = n1 > 0 ? ((1 - θ1) * norm_∇f^2 - (fk - max_obj_mem) / μk) / n1 : β
  β2 = n2 != 0 ? (1 - θ2) * norm_∇f / n2 : β
  return min(β, min(β1, β2))
end
449
484
0 commit comments