Add a utility to standardize learning rates across optimizers
1 parent d2b472a · commit 61ca507
Showing 2 changed files with 39 additions and 4 deletions.
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "torch-optim-sparse"
-version = "0.1.1"
+version = "0.1.2"
 description = "Truly sparse optimizers for PyTorch"
 authors = ["Karl Higley <[email protected]>"]
@@ -1,6 +1,41 @@
-__version__ = '0.1.0'
+__version__ = '0.1.2'
 
-from .sparser_sgd import SparserSGD
-from .sparser_sgdw import SparserSGDW
 from .sparser_adam import SparserAdam
 from .sparser_adamw import SparserAdamW
+from .sparser_sgd import SparserSGD
+from .sparser_sgdw import SparserSGDW
+
+
+def convert_lr(eff_lr, momentum=0.0, beta1=0.0, beta2=0.0, batch_size=1):
+    """Calculates what learning rate to use for rough equivalence with plain SGD
+
+    Useful for supplying one set of hyper-parameters to sweep across with multiple optimizers
+    and getting them all to converge with hyper-parameters that are somewhere near the same order
+    of magnitude. Accounts for the effects of optimizer batch size, momentum, and adaptive
+    learning rates in Adam and SGD variants.
+
+    All params except the effective learning rate are optional; only supply the params that are
+    relevant to the optimizer you want to use.
+
+    Args:
+        eff_lr (float): The effective learning rate you want.
+        momentum (float, optional): The SGD momentum coefficient. Defaults to 0.0, but 0.9 is typical.
+        beta1 (float, optional): The Adam first moment coefficient. Defaults to 0.0, but 0.9 is typical.
+        beta2 (float, optional): The Adam second moment coefficient. Defaults to 0.0, but 0.999 is typical.
+        batch_size (int, optional): The number of examples in a mini-batch. Defaults to 1.
+
+    Returns:
+        lr (float): The adjusted learning rate to supply to the optimizer
+    """
+    lr = eff_lr
+
+    if beta1 != 1.0 or beta2 != 1.0:
+        lr = lr * (1 - beta2) / (1 - beta1)
+
+    if momentum != 0.0:
+        lr = lr * (1 - momentum)
+
+    if batch_size > 1:
+        lr = lr * batch_size
+
+    return lr
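As a quick usage sketch (not part of the commit: the import path is assumed from the package name, and the hyper-parameter values are simply the "typical" ones named in the docstring), the same effective learning rate can be translated for different optimizer configurations:

# Sketch only: the module path "torch_optim_sparse" is an assumption based on the
# package name "torch-optim-sparse".
from torch_optim_sparse import convert_lr

eff_lr = 0.01  # the effective learning rate we want, regardless of optimizer settings

# SGD with momentum 0.9: 0.01 * (1 - 0.9) = 0.001
sgd_lr = convert_lr(eff_lr, momentum=0.9)

# Adam-style settings (beta1=0.9, beta2=0.999) with a batch size of 1024:
# 0.01 * (1 - 0.999) / (1 - 0.9) * 1024 = 0.01 * 0.01 * 1024 = 0.1024
adam_lr = convert_lr(eff_lr, beta1=0.9, beta2=0.999, batch_size=1024)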