"""
Copyright 2020 The OneFlow Authors. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
import math
import pprint

import oneflow as flow


def add_optimizer_args(parser):
    group = parser.add_argument_group('optimizer parameters',
                                      'entire group applies only to optimizer parameters')
    group.add_argument("--optimizer", type=str, default="sgd", help="sgd, adam, rmsprop")
    group.add_argument("--learning_rate", type=float, default=0.256)
    group.add_argument("--wd", type=float, default=1.0 / 32768, help="weight decay")
    group.add_argument("--momentum", type=float, default=0.875, help="momentum")
    group.add_argument('--lr_decay', type=str, default='cosine',
                       help='cosine, step, polynomial, exponential, None')
    group.add_argument('--lr_decay_rate', type=float, default=0.94,
                       help='exponential learning rate decay rate')
    group.add_argument('--lr_decay_epochs', type=int, default=2,
                       help='exponential learning rate decay every n epochs')
    group.add_argument('--warmup_epochs', type=int, default=5,
                       help='number of epochs to warm up the lr to the scaled large-batch value')
    group.add_argument('--decay_rate', type=float, default=0.9, help='decay rate of RMSProp')
    group.add_argument('--epsilon', type=float, default=1.0, help='epsilon')
    group.add_argument('--gradient_clipping', type=float, default=0.0, help='gradient clipping')
    return parser
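

# A minimal, hypothetical usage sketch for add_optimizer_args: it only attaches
# the optimizer flags to an existing argparse parser. The bare ArgumentParser
# below stands in for the project's config.get_parser(); the argument values are
# illustrative.
#
#     import argparse
#     parser = add_optimizer_args(argparse.ArgumentParser())
#     args = parser.parse_args(["--optimizer", "adam", "--learning_rate", "0.001"])
#     print(args.optimizer, args.learning_rate)  # -> adam 0.001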


def set_up_optimizer(loss, args):
    total_device_num = args.num_nodes * args.gpu_num_per_node
    train_batch_size = total_device_num * args.batch_size_per_device
    batches_per_epoch = math.ceil(args.num_examples / train_batch_size)
    warmup_batches = batches_per_epoch * args.warmup_epochs
    num_train_batches = batches_per_epoch * args.num_epochs
    decay_batches = num_train_batches - warmup_batches
    exponential_decay_batches = batches_per_epoch * args.lr_decay_epochs
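    # Worked example with illustrative numbers (not taken from this repo's
    # defaults): 1 node x 8 GPUs x batch_size_per_device=32 gives
    # train_batch_size = 256; for a dataset of 1,281,167 examples,
    # batches_per_epoch = ceil(1281167 / 256) = 5005, so warmup_epochs=5 yields
    # 25,025 warmup batches and num_epochs=90 leaves
    # decay_batches = 90 * 5005 - 25025 = 425,425 batches for lr decay.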
    # set up warmup strategy
    warmup = flow.optimizer.warmup.linear(warmup_batches, 0) if warmup_batches > 0 else None

    # set up gradient clipping
    grad_clipping = (flow.optimizer.grad_clipping.by_global_norm(args.gradient_clipping)
                     if args.gradient_clipping > 0.0 else None)

    # set up learning rate scheduler
    if args.lr_decay == 'cosine':
        # cosine decay over the post-warmup batches
        lr_scheduler = flow.optimizer.CosineScheduler(
            base_lr=args.learning_rate,
            steps=decay_batches,
            warmup=warmup
        )
    elif args.lr_decay == 'step':
        # step decay: scale the lr at fixed boundaries
        lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
            base_lr=args.learning_rate,
            boundaries=[30, 60, 80],
            scale=[0.1, 0.01, 0.001],
            warmup=warmup
        )
    elif args.lr_decay == 'polynomial':
        # polynomial decay down to end_learning_rate (class name spelling kept as in the original code)
        lr_scheduler = flow.optimizer.PolynomialSchduler(
            base_lr=args.learning_rate,
            steps=decay_batches,
            end_learning_rate=0.00001,
            power=1.0,
            cycle=False,
            warmup=warmup
        )
    elif args.lr_decay == 'exponential':
        # exponential decay every lr_decay_epochs epochs
        lr_scheduler = flow.optimizer.ExponentialScheduler(
            base_lr=args.learning_rate,
            steps=exponential_decay_batches,
            decay_rate=args.lr_decay_rate,
            staircase=False,
            warmup=warmup
        )
    else:
        # no decay: keep the base lr for the whole run
        lr_scheduler = flow.optimizer.PiecewiseScalingScheduler(
            base_lr=args.learning_rate,
            boundaries=[args.num_epochs],
            scale=[1.0],
            warmup=warmup
        )
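    # Taken together: the linear warmup above (start multiplier 0) ramps the lr
    # up to base_lr over warmup_batches, and the scheduler selected here then
    # decays it over the remaining training batches.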
    # set up optimizer
    if args.optimizer == 'sgd':
        print("Optimizer: SGD")
        flow.optimizer.SGD(
            lr_scheduler,
            momentum=args.momentum if args.momentum > 0 else None,
            grad_clipping=grad_clipping
        ).minimize(loss)
    elif args.optimizer == 'adam':
        if 0.0 < args.wd < 1.0:
            # a weight decay in (0, 1) selects decoupled weight decay (AdamW);
            # variables whose names match '_bn-' (batch norm) are excluded from decay
            print("Optimizer: AdamW")
            flow.optimizer.AdamW(
                lr_scheduler=lr_scheduler,
                weight_decay=args.wd,
                weight_decay_excludes='_bn-',
                grad_clipping=grad_clipping,
                epsilon=args.epsilon
            ).minimize(loss)
        else:
            print("Optimizer: Adam")
            flow.optimizer.Adam(
                lr_scheduler=lr_scheduler,
                grad_clipping=grad_clipping,
                epsilon=args.epsilon
            ).minimize(loss)
    elif args.optimizer == 'rmsprop':
        print("Optimizer: RMSProp")
        flow.optimizer.RMSProp(
            lr_scheduler=lr_scheduler,
            decay_rate=args.decay_rate,
            epsilon=args.epsilon
        ).minimize(loss)
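

# A minimal usage sketch (hypothetical, not part of this module): with OneFlow's
# legacy lazy-graph API, set_up_optimizer is called on the loss blob inside a
# training job function. build_model and load_batch below are placeholder names.
#
#     @flow.global_function(type="train")
#     def train_job() -> flow.typing.Numpy:
#         images, labels = load_batch()          # placeholder data loader
#         loss = build_model(images, labels)     # placeholder model returning a loss blob
#         set_up_optimizer(loss, args)
#         return loss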


if __name__ == '__main__':
    import config as configs

    parser = configs.get_parser()
    args = parser.parse_args()
    configs.print_args(args)