Add a utility to standardize learning rates across optimizers
1 parent d2b472a · commit 61ca507
Showing 2 changed files with 39 additions and 4 deletions.
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "torch-optim-sparse"
-version = "0.1.1"
+version = "0.1.2"
 description = "Truly sparse optimizers for PyTorch"
 authors = ["Karl Higley <[email protected]>"]
@@ -1,6 +1,41 @@
-__version__ = '0.1.0'
+__version__ = '0.1.2'
 
-from .sparser_sgd import SparserSGD
-from .sparser_sgdw import SparserSGDW
 from .sparser_adam import SparserAdam
 from .sparser_adamw import SparserAdamW
+from .sparser_sgd import SparserSGD
+from .sparser_sgdw import SparserSGDW
+
+
+def convert_lr(eff_lr, momentum=0.0, beta1=0.0, beta2=0.0, batch_size=1):
+    """Calculates what learning rate to use for rough equivalence with plain SGD
+
+    Useful for supplying one set of hyper-parameters to sweep across with multiple optimizers
+    and getting them all to converge with hyper-parameters that are somewhere near the same order
+    of magnitude. Accounts for the effects of optimizer batch size, momentum, and adaptive
+    learning rates in Adam and SGD variants.
+
+    All params except the effective learning rate are optional; only supply the params that are
+    relevant to the optimizer you want to use.
+
+    Args:
+        eff_lr (float): The effective learning rate you want.
+        momentum (float, optional): The SGD momentum coefficient. Defaults to 0.0, but 0.9 is typical.
+        beta1 (float, optional): The Adam first moment coefficient. Defaults to 0.0, but 0.9 is typical.
+        beta2 (float, optional): The Adam second moment coefficient. Defaults to 0.0, but 0.999 is typical.
+        batch_size (int, optional): The number of examples in a mini-batch. Defaults to 1.
+
+    Returns:
+        lr (float): The adjusted learning rate to supply to the optimizer
+    """
+    lr = eff_lr
+
+    if beta1 != 1.0 or beta2 != 1.0:
+        lr = lr * (1 - beta2) / (1 - beta1)
+
+    if momentum != 0.0:
+        lr = lr * (1 - momentum)
+
+    if batch_size > 1:
+        lr = lr * batch_size
+
+    return lr
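As a quick usage sketch (not part of the commit: the import path is assumed from the package name, and the hyper-parameter values are simply the "typical" ones named in the docstring), the same effective learning rate can be translated for different optimizer configurations:

# Sketch only: the module path "torch_optim_sparse" is an assumption based on the
# package name "torch-optim-sparse".
from torch_optim_sparse import convert_lr

eff_lr = 0.01  # the effective learning rate we want, regardless of optimizer settings

# SGD with momentum 0.9: 0.01 * (1 - 0.9) = 0.001
sgd_lr = convert_lr(eff_lr, momentum=0.9)

# Adam-style settings (beta1=0.9, beta2=0.999) with a batch size of 1024:
# 0.01 * (1 - 0.999) / (1 - 0.9) * 1024 = 0.01 * 0.01 * 1024 = 0.1024
adam_lr = convert_lr(eff_lr, beta1=0.9, beta2=0.999, batch_size=1024)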