
Commit cab831f

scale_grads with foreach + compile

ghstack-source-id: 081a1a9
Pull Request resolved: #2624
1 parent: a1c77bc

4 files changed, +95 -3 lines changed
recipes/full_finetune_distributed.py

Lines changed: 12 additions & 1 deletion
@@ -913,8 +913,19 @@ def train(self) -> None:
                     torch.distributed.all_reduce(num_tokens)
                     # This will ensure that the logged loss matches what we're optimizing
                     torch.distributed.all_reduce(running_loss)
+
                     # Manually scale the gradients from unnormalized loss by total # of tokens
-                    training.scale_grads(self._model, self.dp_degree / num_tokens)
+                    def scale_grads_fn():
+                        training.scale_grads_(
+                            self._model.parameters(), self.dp_degree / num_tokens
+                        )
+
+                    if self._compile:
+                        training.compile_scale_grads(
+                            scale_grads_fn, verbose=self._is_rank_zero
+                        )()
+                    else:
+                        scale_grads_fn()
                     if self._clip_grad_norm is not None:
                         grad_norm = torch.nn.utils.clip_grad_norm_(
                             self._model.parameters(),
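
The recipe now builds a closure around the in-place scaling call so the same code path can run either eagerly or through torch.compile. A minimal sketch of that pattern outside the recipe is below; the toy nn.Linear model and the placeholder values standing in for the recipe's num_tokens and dp_degree are assumptions, not part of the commit.

    # Illustrative sketch only, not from the commit: the recipe's closure-and-compile
    # pattern applied to a toy model, assuming torchtune with this change installed.
    import torch
    from torch import nn
    from torchtune import training

    model = nn.Linear(16, 16)
    model(torch.randn(4, 16)).sum().backward()  # populate .grad on every parameter

    num_tokens = torch.tensor(128.0)  # stand-in for the all-reduced token count
    dp_degree = 2                     # stand-in for the recipe's data-parallel degree

    def scale_grads_fn():
        # In-place multiply of each gradient by dp_degree / num_tokens
        training.scale_grads_(model.parameters(), dp_degree / num_tokens)

    compile_enabled = True  # mirrors the recipe's self._compile flag
    if compile_enabled:
        # compile_scale_grads returns the compiled callable; the trailing () runs it
        training.compile_scale_grads(scale_grads_fn, verbose=False)()
    else:
        scale_grads_fn()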

torchtune/training/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -12,6 +12,7 @@
     compile_loss,
     compile_model,
     compile_optimizer_step,
+    compile_scale_grads,
 )
 from torchtune.training._distributed import (
     gather_cpu_state_dict,
@@ -29,7 +30,7 @@
     shard_model,
     validate_no_params_on_meta_device,
 )
-from torchtune.training._grad_scaler import scale_grads
+from torchtune.training._grad_scaler import scale_grads, scale_grads_
 from torchtune.training._model_util import disable_dropout
 from torchtune.training._profiler import (
     DEFAULT_PROFILE_DIR,
@@ -140,6 +141,7 @@
     "compile_loss",
     "compile_model",
     "compile_optimizer_step",
+    "compile_scale_grads",
     "NoOpManager",
     "OffloadActivations",
     "FormattedCheckpointFiles",

torchtune/training/_compile.py

Lines changed: 7 additions & 0 deletions
@@ -93,3 +93,10 @@ def compile_optimizer_step(optimizer_step_fn, verbose: bool = True):
     if verbose:
         log.info("Compiling optimizer step function with torch.compile...")
     return torch.compile(optimizer_step_fn, backend=backend)
+
+
+def compile_scale_grads(fn, verbose: bool = True):
+    backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
+    if verbose:
+        log.info("Compiling scale_grads function with torch.compile...")
+    return torch.compile(fn, backend=backend)
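
compile_scale_grads follows the same shape as compile_optimizer_step above: it reads the TORCH_COMPILE_BACKEND environment variable, falling back to inductor, and returns the given function wrapped by torch.compile. A hedged usage sketch follows; the aot_eager backend and the toy model are illustrative choices, not anything prescribed by the commit.

    # Illustrative sketch only: overriding the compile backend via the environment
    # variable that compile_scale_grads reads at call time.
    import os
    os.environ["TORCH_COMPILE_BACKEND"] = "aot_eager"  # example backend choice

    import torch
    from torch import nn
    from torchtune import training

    model = nn.Linear(8, 8)
    model(torch.randn(2, 8)).sum().backward()

    compiled = training.compile_scale_grads(
        lambda: training.scale_grads_(model.parameters(), torch.tensor(0.25)),
        verbose=False,
    )
    compiled()  # every gradient is now multiplied by 0.25 in place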

torchtune/training/_grad_scaler.py

Lines changed: 73 additions & 1 deletion
@@ -4,8 +4,13 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+from collections import defaultdict
+from typing import Optional
+
 import torch
-from torch import nn
+from torch import nn, Tensor
+from torch.nn.utils.clip_grad import _no_grad, _tensor_or_tensors
+from torch.utils._foreach_utils import _device_has_foreach_support, _has_foreach_support
 
 
 def scale_grads(model: nn.Module, scaler: torch.Tensor) -> None:
@@ -29,3 +34,70 @@ def scale_grads(model: nn.Module, scaler: torch.Tensor) -> None:
             scaler = scaler.to(device)
         if p.grad is not None:
             p.grad *= scaler
+
+
+@_no_grad
+def scale_grads_(
+    parameters: _tensor_or_tensors,
+    scaler: torch.Tensor,
+    foreach: Optional[bool] = None,
+) -> None:
+    r"""Scale gradients of iterable parameters.
+
+    This function is equivalent to :func:`torch.mul_` applied to each parameter.
+    Gradients are modified in-place, multiplying by specified scaler.
+
+    Args:
+        parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a
+            single Tensor that will have gradients scaled
+        scaler (Tensor): multiplier to scale gradients
+        foreach (bool): use the faster foreach-based implementation.
+            If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently
+            fall back to the slow implementation for other device types.
+            Default: ``None``
+    Returns:
+        None
+    """
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    else:
+        parameters = list(parameters)
+    _scale_grad_(parameters, scaler, foreach)
+
+
+def _group_tensors_by_device_and_dtype(
+    tensors: list[torch.Tensor],
+) -> dict[tuple[torch.device, torch.dtype], list[Tensor]]:
+    ret = defaultdict(list)
+    for i, tensor in enumerate(tensors):
+        ret[(tensor.device, tensor.dtype)].append(tensor)
+
+    return ret
+
+
+@_no_grad
+def _scale_grad_(
+    parameters: _tensor_or_tensors,
+    scaler: torch.Tensor,
+    foreach: Optional[bool] = None,
+) -> None:
+    if isinstance(parameters, torch.Tensor):
+        parameters = [parameters]
+    grads = [p.grad for p in parameters if p.grad is not None]
+    if len(grads) == 0:
+        return
+    grouped_grads = _group_tensors_by_device_and_dtype(grads)
+
+    for (device, _), device_grads in grouped_grads.items():
+        if (foreach is None and _has_foreach_support(device_grads, device)) or (
+            foreach and _device_has_foreach_support(device)
+        ):
+            torch._foreach_mul_(device_grads, scaler.to(device))
+        elif foreach:
+            raise RuntimeError(
+                f"foreach=True was passed, but can't use the foreach API on {device.type} tensors"
+            )
+        else:
+            scaler_device = scaler.to(device)
+            for g in device_grads:
+                g.mul_(scaler_device)
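
scale_grads_ collects all non-None gradients, groups them by device and dtype, and applies torch._foreach_mul_ where the device supports it, otherwise falling back to a per-tensor mul_. The sketch below (not part of the commit) checks the in-place result against a manual reference on a small CPU model; forcing the slow path with foreach=False is shown as a commented alternative.

    # Illustrative sketch only: verifying scale_grads_ against a manual reference.
    import torch
    from torch import nn
    from torchtune.training import scale_grads_

    model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
    model(torch.randn(3, 4)).sum().backward()

    # Reference gradients scaled outside of scale_grads_
    expected = [p.grad * 0.5 for p in model.parameters()]

    scale_grads_(model.parameters(), torch.tensor(0.5))  # foreach path picked automatically
    # scale_grads_(model.parameters(), torch.tensor(0.5), foreach=False)  # force per-tensor loop

    for p, e in zip(model.parameters(), expected):
        torch.testing.assert_close(p.grad, e)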
