 import contextlib
 import logging
 from collections import deque
+from contextlib import nullcontext
 from dataclasses import dataclass
 from typing import (
     Any,
@@ -319,6 +320,9 @@ def progress(self, dataloader_iter: Iterator[In]) -> Out:
         return output


+_apply_jit_context_default: ContextManager[None] = nullcontext()
+
+
 class TrainPipelineSparseDist(TrainPipeline[In, Out]):
     """
     This pipeline overlaps device transfer, and `ShardedModule.input_dist()` with
@@ -344,6 +348,8 @@ class TrainPipelineSparseDist(TrainPipeline[In, Out]):
         execute_all_batches (bool): executes remaining batches in pipeline after
             exhausting dataloader iterator.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """

     # The PipelinedForward class that is used in _rewrite_model
@@ -362,12 +368,14 @@ def __init__(
         custom_model_fwd: Optional[
             Callable[[Optional[In]], Tuple[torch.Tensor, Out]]
         ] = None,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
     ) -> None:
         self._model = model
         self._optimizer = optimizer
         self._device = device
         self._execute_all_batches = execute_all_batches
         self._apply_jit = apply_jit
+        self._apply_jit_context = apply_jit_context

         if device.type == "cuda":
             # use two data streams to support two concurrent batches
@@ -643,6 +651,7 @@ def _pipeline_model(
             apply_jit=self._apply_jit,
             pipelined_forward=pipelined_forward,
             pipeline_postproc=self._pipeline_postproc,
+            apply_jit_context=self._apply_jit_context,
         )
         # initializes input dist, so we can override input dist forwards
         self.start_sparse_data_dist(batch, context)
@@ -993,6 +1002,8 @@ class TrainPipelineSemiSync(TrainPipelineSparseDist[In, Out]):
         start_batch (int): batch to begin semi-sync training. Typically small period of synchronous training reduces early stage NEX.
         stash_gradients (bool): if True, will store gradients for each parameter to insure true "Semi-Sync"
             training. If False, will update dense optimizer as soon as gradients available (naive "Semi-Sync)
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """

     # The PipelinedForward class that is used in _rewrite_model
@@ -1012,6 +1023,7 @@ def __init__(
             Callable[[Optional[In]], Tuple[torch.Tensor, Out]]
         ] = None,
         strict: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
     ) -> None:
         super().__init__(
             model=model,
@@ -1022,6 +1034,7 @@ def __init__(
             context_type=EmbeddingTrainPipelineContext,
             pipeline_postproc=pipeline_postproc,
             custom_model_fwd=custom_model_fwd,
+            apply_jit_context=apply_jit_context,
         )
         self._start_batch = start_batch
         self._stash_gradients = stash_gradients
@@ -1305,6 +1318,8 @@ class PrefetchTrainPipelineSparseDist(TrainPipelineSparseDist[In, Out]):
         execute_all_batches (bool): executes remaining batches in pipeline after
             exhausting dataloader iterator.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """

     # The PipelinedForward class that is used in _rewrite_model
@@ -1321,6 +1336,7 @@ def __init__(
         custom_model_fwd: Optional[
             Callable[[Optional[In]], Tuple[torch.Tensor, Out]]
         ] = None,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
     ) -> None:
         super().__init__(
             model=model,
@@ -1331,6 +1347,7 @@ def __init__(
             context_type=PrefetchTrainPipelineContext,
             pipeline_postproc=pipeline_postproc,
             custom_model_fwd=custom_model_fwd,
+            apply_jit_context=apply_jit_context,
         )
         self._context = PrefetchTrainPipelineContext(version=0)
         self._prefetch_stream: Optional[torch.Stream] = (
@@ -1462,6 +1479,8 @@ class EvalPipelineSparseDist(TrainPipelineSparseDist[In, Out]):
         device (torch.device): device where device transfer, sparse data dist, and
             forward/backward pass will happen.
         apply_jit (bool): apply torch.jit.script to non-pipelined (unsharded) modules.
+        apply_jit_context (ContextManager): a context manager that will surround the
+            application of the JIT
     """

     # The PipelinedForward class that is used in _rewrite_model
@@ -1473,8 +1492,16 @@ def __init__(
         optimizer: torch.optim.Optimizer,
         device: torch.device,
         apply_jit: bool = False,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
     ) -> None:
-        super().__init__(model, optimizer, device, True, apply_jit)
+        super().__init__(
+            model,
+            optimizer,
+            device,
+            True,
+            apply_jit,
+            apply_jit_context=apply_jit_context,
+        )
         self._batch_loader: Optional[DataLoadingThread[In]] = None

     def __del__(self) -> None:
@@ -1836,6 +1863,7 @@ def __init__(
         custom_model_fwd: Optional[
             Callable[[Optional[In]], Tuple[torch.Tensor, Out]]
         ] = None,
+        apply_jit_context: ContextManager[None] = _apply_jit_context_default,
     ) -> None:
         super().__init__(
             model,
@@ -1846,6 +1874,7 @@ def __init__(
             context_type,
             pipeline_postproc,
             custom_model_fwd,
+            apply_jit_context=apply_jit_context,
         )

         torch._logging.set_logs(compiled_autograd_verbose=True)
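
For reference, a minimal usage sketch of the new argument (not part of the diff above): apply_jit_context lets callers wrap the torch.jit.script step in a context manager of their choosing, and defaults to a shared contextlib.nullcontext(), i.e. a no-op. The import path, helper names, and the model/optimizer wiring below are assumptions for illustration only, not taken from this change.

# Hypothetical sketch: build a TrainPipelineSparseDist whose JIT scripting of
# non-pipelined modules runs with Python warnings suppressed.
import warnings
from contextlib import contextmanager
from typing import Iterator

import torch
from torchrec.distributed.train_pipeline import TrainPipelineSparseDist  # assumed import path


@contextmanager
def suppress_jit_warnings() -> Iterator[None]:
    # The pipeline enters this context around its torch.jit.script call.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        yield


def build_pipeline(
    model: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    device: torch.device,
) -> TrainPipelineSparseDist:
    # model/optimizer are assumed to be a sharded model and its optimizer,
    # set up as usual for TrainPipelineSparseDist.
    return TrainPipelineSparseDist(
        model=model,
        optimizer=optimizer,
        device=device,
        apply_jit=True,
        apply_jit_context=suppress_jit_warnings(),
    )

Leaving apply_jit_context unset preserves the previous behavior, since the default nullcontext() does nothing on enter or exit.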