Add a utility to standardize learning rates across optimizers
karlhigley committed Sep 27, 2020
1 parent d2b472a commit 61ca507
Showing 2 changed files with 39 additions and 4 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "torch-optim-sparse"
-version = "0.1.1"
+version = "0.1.2"
 description = "Truly sparse optimizers for PyTorch"
 authors = ["Karl Higley <[email protected]>"]

41 changes: 38 additions & 3 deletions torch_optim_sparse/__init__.py
@@ -1,6 +1,41 @@
-__version__ = '0.1.0'
+__version__ = '0.1.2'
 
-from .sparser_sgd import SparserSGD
-from .sparser_sgdw import SparserSGDW
 from .sparser_adam import SparserAdam
 from .sparser_adamw import SparserAdamW
+from .sparser_sgd import SparserSGD
+from .sparser_sgdw import SparserSGDW
+
+
+def convert_lr(eff_lr, momentum=0.0, beta1=0.0, beta2=0.0, batch_size=1):
+    """Calculates what learning rate to use for rough equivalence with plain SGD
+
+    Useful for supplying one set of hyper-parameters to sweep across with multiple optimizers
+    and getting them all to converge with hyper-parameters that are somewhere near the same order
+    of magnitude. Accounts for the effects of optimizer batch size, momentum, and adaptive
+    learning rates in Adam and SGD variants.
+
+    All params except the effective learning rate are optional; only supply the params that are
+    relevant to the optimizer you want to use.
+
+    Args:
+        eff_lr (float): The effective learning rate you want.
+        momentum (float, optional): The SGD momentum coefficient. Defaults to 0.0, but 0.9 is typical.
+        beta1 (float, optional): The Adam first moment coefficient. Defaults to 0.0, but 0.9 is typical.
+        beta2 (float, optional): The Adam second moment coefficient. Defaults to 0.0, but 0.999 is typical.
+        batch_size (int, optional): The number of examples in a mini-batch. Defaults to 1.
+
+    Returns:
+        lr (float): The adjusted learning rate to supply to the optimizer
+    """
+    lr = eff_lr
+
+    if beta1 != 1.0 or beta2 != 1.0:
+        lr = lr * (1 - beta2) / (1 - beta1)
+
+    if momentum != 0.0:
+        lr = lr * (1 - momentum)
+
+    if batch_size > 1:
+        lr = lr * batch_size
+
+    return lr
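
Not part of the commit itself, but as a rough usage sketch of the new helper: a single target effective learning rate can be converted into per-optimizer learning rates before constructing each optimizer. The 0.01 target, the beta values, and the batch size below are illustrative choices, not values taken from the commit.

from torch_optim_sparse import convert_lr

eff_lr = 0.01     # target effective learning rate (illustrative)
batch_size = 128  # illustrative mini-batch size

# SGD with momentum: scale the rate down by (1 - momentum) and up by the batch size.
sgd_lr = convert_lr(eff_lr, momentum=0.9, batch_size=batch_size)              # 0.128

# Adam: additionally scale by (1 - beta2) / (1 - beta1) for the adaptive step.
adam_lr = convert_lr(eff_lr, beta1=0.9, beta2=0.999, batch_size=batch_size)   # 0.0128

# The converted values would then be supplied as each optimizer's lr argument,
# e.g. SparserSGD(model.parameters(), lr=sgd_lr) or SparserAdam(model.parameters(), lr=adam_lr),
# assuming the constructors follow the usual torch.optim convention.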
