Skip to content

Commit a41f00e

Browse files
janeyx99 authored and pytorchmergebot committed
[optim][sgd] group tensors in foreach to maximize perf (pytorch#92338)
Make foreach faster for SGD Pull Request resolved: pytorch#92338 Approved by: https://github.com/albanD
1 parent 98b78aa commit a41f00e

File tree

2 files changed

+56
-43
lines changed

2 files changed

+56
-43
lines changed

Diff for: torch/optim/sgd.py

+39-36
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from torch import Tensor
33
from .optimizer import Optimizer, required, _use_grad_for_differentiable
44
from typing import List, Optional
5+
from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
56

67
__all__ = ['SGD', 'sgd']
78

@@ -271,48 +272,50 @@ def _multi_tensor_sgd(params: List[Tensor],
271272
if len(params) == 0:
272273
return
273274

274-
if has_sparse_grad is None:
275-
has_sparse_grad = any(grad.is_sparse for grad in grads)
275+
grouped_tensors = _group_tensors_by_device_and_dtype([params, grads, momentum_buffer_list], with_indices=True)
276+
for device_params, device_grads, device_momentum_buffer_list, indices in grouped_tensors.values():
277+
device_has_sparse_grad = any(grad.is_sparse for grad in device_grads)
276278

277-
if maximize:
278-
grads = torch._foreach_neg(tuple(grads)) # type: ignore[assignment]
279+
if maximize:
280+
device_grads = torch._foreach_neg(tuple(device_grads)) # type: ignore[assignment]
279281

280-
if weight_decay != 0:
281-
grads = torch._foreach_add(grads, params, alpha=weight_decay)
282+
if weight_decay != 0:
283+
device_grads = torch._foreach_add(device_grads, device_params, alpha=weight_decay)
284+
285+
if momentum != 0:
286+
bufs = []
282287

283-
if momentum != 0:
284-
bufs = []
288+
all_states_with_momentum_buffer = True
289+
for i in range(len(device_momentum_buffer_list)):
290+
if device_momentum_buffer_list[i] is None:
291+
all_states_with_momentum_buffer = False
292+
break
293+
else:
294+
bufs.append(device_momentum_buffer_list[i])
285295

286-
all_states_with_momentum_buffer = True
287-
for i in range(len(momentum_buffer_list)):
288-
if momentum_buffer_list[i] is None:
289-
all_states_with_momentum_buffer = False
290-
break
296+
if all_states_with_momentum_buffer:
297+
torch._foreach_mul_(bufs, momentum)
298+
torch._foreach_add_(bufs, device_grads, alpha=1 - dampening)
291299
else:
292-
bufs.append(momentum_buffer_list[i])
300+
bufs = []
301+
for i in range(len(device_momentum_buffer_list)):
302+
if device_momentum_buffer_list[i] is None:
303+
buf = device_momentum_buffer_list[i] = momentum_buffer_list[indices[i]] = \
304+
torch.clone(device_grads[i]).detach()
305+
else:
306+
buf = device_momentum_buffer_list[i]
307+
buf.mul_(momentum).add_(device_grads[i], alpha=1 - dampening)
293308

294-
if all_states_with_momentum_buffer:
295-
torch._foreach_mul_(bufs, momentum)
296-
torch._foreach_add_(bufs, grads, alpha=1 - dampening)
297-
else:
298-
bufs = []
299-
for i in range(len(momentum_buffer_list)):
300-
if momentum_buffer_list[i] is None:
301-
buf = momentum_buffer_list[i] = torch.clone(grads[i]).detach()
302-
else:
303-
buf = momentum_buffer_list[i]
304-
buf.mul_(momentum).add_(grads[i], alpha=1 - dampening)
309+
bufs.append(buf)
305310

306-
bufs.append(buf)
311+
if nesterov:
312+
torch._foreach_add_(device_grads, bufs, alpha=momentum)
313+
else:
314+
device_grads = bufs
307315

308-
if nesterov:
309-
torch._foreach_add_(grads, bufs, alpha=momentum)
316+
if not device_has_sparse_grad:
317+
torch._foreach_add_(device_params, device_grads, alpha=-lr)
310318
else:
311-
grads = bufs
312-
313-
if not has_sparse_grad:
314-
torch._foreach_add_(params, grads, alpha=-lr)
315-
else:
316-
# foreach APIs dont support sparse
317-
for i in range(len(params)):
318-
params[i].add_(grads[i], alpha=-lr)
319+
# foreach APIs don't support sparse
320+
for i in range(len(device_params)):
321+
device_params[i].add_(device_grads[i], alpha=-lr)

Diff for: torch/utils/_foreach_utils.py

+17-7
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,37 @@
11
from collections import defaultdict
2-
from typing import List, Dict, Tuple
2+
from typing import List, Dict, Tuple, Optional, Union
33

44
import torch
55
from torch import Tensor
66

77

8-
# This util function splits tensors into groups by device and dtype, which is useful before sending
9-
# tensors off to a foreach implementation, which requires tensors to be on one device and dtype.
10-
# Currently, this function is only used in torch.optim.
8+
# _group_tensors_by_device_and_dtype is a util function that splits tensors into groups by device and dtype,
9+
# which is useful before sending tensors off to a foreach implementation, which requires tensors to be on
10+
# one device and dtype.
1111
# If tensorlistlist contains more than one tensorlist, the following assumptions are made BUT NOT verified:
1212
# - tensorlists CAN be None
1313
# - all tensors in the first specified list cannot be None
1414
# - given an index i, all specified tensorlist[i]s match in dtype and device
15+
# with_indices (bool, optional): whether to track previous indices as the last list per dictionary entry.
16+
# It comes in handy if there are Nones or literals in the tensorlists that are getting scattered out.
17+
# Whereas mutating a tensor in the resulting split-up tensorlists WILL propagate changes back to the
18+
# original input tensorlists, changing up Nones/literals WILL NOT propagate, and manual propagation
19+
# may be necessary. Check out torch/optim/sgd.py for an example.
1520
@torch.no_grad()
16-
def _group_tensors_by_device_and_dtype(tensorlistlist: List[List[Tensor]]) -> Dict[Tuple[str, torch.dtype], List[List[Tensor]]]:
21+
def _group_tensors_by_device_and_dtype(tensorlistlist: List[List[Tensor]],
22+
with_indices: Optional[bool] = False) -> \
23+
Dict[Tuple[str, torch.dtype], List[List[Union[Tensor, int]]]]:
1724
assert all([not x or len(x) == len(tensorlistlist[0]) for x in tensorlistlist]), (
1825
"all specified tensorlists must match in length")
19-
per_device_and_dtype_tensors: Dict[Tuple[str, torch.dtype], List[List[Tensor]]] = defaultdict(
20-
lambda: [[] for _ in range(len(tensorlistlist))])
26+
per_device_and_dtype_tensors: Dict[Tuple[str, torch.dtype], List[List[Union[Tensor, int]]]] = defaultdict(
27+
lambda: [[] for _ in range(len(tensorlistlist) + (1 if with_indices else 0))])
2128
for i, t in enumerate(tensorlistlist[0]):
2229
key = (str(t.device), t.dtype)
2330
for j in range(len(tensorlistlist)):
2431
# a tensorlist may be empty/None
2532
if tensorlistlist[j]:
2633
per_device_and_dtype_tensors[key][j].append(tensorlistlist[j][i])
34+
if with_indices:
35+
# tack on previous index
36+
per_device_and_dtype_tensors[key][j + 1].append(i)
2737
return per_device_and_dtype_tensors

0 commit comments

Comments
 (0)