11
11
Example usage:
12
12
13
13
Buck2 (internal):
14
- buck2 run @fbcode//mode/opt fbcode//torchrec/distributed/benchmark:benchmark_comms --
14
+ buck2 run @fbcode//mode/opt fbcode//torchrec/distributed/benchmark:benchmark_comms -- \
15
+ a2a_single --name=a2a_sync_base-$(hg whereami | cut -c 1-10)
15
16
16
17
OSS (external):
17
- python -m torchrec.distributed.benchmark.benchmark_comms
18
+ python -m torchrec.distributed.benchmark.benchmark_comms \
19
+ a2a_single --name=a2a_sync_base-$(git rev-parse --short HEAD || echo $USER)
18
20
21
+ see README.md for more details
19
22
"""
20
23
21
24
from dataclasses import dataclass
22
25
from typing import Any , Callable , Dict , List , Optional
23
26
24
27
import torch
25
28
import torch .distributed as dist
29
+ import torch .nn .functional as F
26
30
27
31
from torch .autograd .profiler import record_function
28
32
@@ -47,10 +51,34 @@ class AllToAllSingleRunConfig(BenchFuncConfig):
47
51
profile_dir : str = "."
48
52
num_benchmarks : int = 1
49
53
num_profiles : int = 2
50
- num_mul : int = 10
54
+ num_mul : int = 5
51
55
num_concat : int = 100
52
56
53
57
58
def _compute(
    dim: int,
    num_mul: int,
    num_concat: int,
    ctx: MultiProcessContext,
    x: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Produce a rank-tagged tensor via a chain of synthetic GPU work.

    Runs ``num_mul`` rounds of matmul + row-normalize (scaled by 10 to keep
    values from vanishing), squashes through sigmoid into (0, 1), adds
    ``ctx.rank`` so every element's integer part encodes the producing rank,
    then replicates the (1, dim, dim) slab ``num_concat`` times.

    Args:
        dim: side length of the square working tensor.
        num_mul: number of matmul/normalize rounds.
        num_concat: how many copies of the slab to stack along dim 0.
        ctx: process context supplying the device and this process's rank.
        x: optional starting tensor; a fresh random one centered at 0 is
            drawn on ``ctx.device`` when omitted.

    Returns:
        A (num_concat, dim, dim) tensor whose values lie in
        (ctx.rank, ctx.rank + 1).
    """
    out = torch.rand(dim, dim, device=ctx.device) - 0.5 if x is None else x
    for _ in range(num_mul):
        out = F.normalize(out @ out) * 10
    tagged = torch.sigmoid(out).reshape(1, dim, dim) + ctx.rank
    return torch.concat([tagged] * num_concat)
+
73
def _validate(x: torch.Tensor, ctx: MultiProcessContext) -> List[torch.Tensor]:
    """Check that each world-size chunk of ``x`` carries its sender's rank tag.

    Flattens ``x``, truncates values to their integer part (the rank tag
    written by ``_compute``), splits the result into ``ctx.world_size`` equal
    chunks, and verifies chunk ``i`` is uniformly ``i``.

    Args:
        x: tensor received from an all-to-all exchange of rank-tagged data.
        ctx: process context supplying the world size.

    Returns:
        One 0-dim boolean tensor per rank; element ``i`` is True iff chunk
        ``i`` consists entirely of the value ``i``.
    """
    flat = x.to(torch.int).reshape(-1)
    chunk = flat.numel() // ctx.world_size
    results = []
    for rank in range(ctx.world_size):
        segment = flat[rank * chunk : (rank + 1) * chunk]
        results.append(torch.all(segment == rank))
    return results
54
82
# all_to_all_single with sync and single stream
def a2a_sync_base(
    batch_inputs: List[Dict[str, Any]],
    dim: int,
    num_mul: int,
    num_concat: int,
    ctx: MultiProcessContext,
) -> None:
    """Benchmark one iteration of a blocking all_to_all_single exchange.

    Builds a rank-tagged payload, exchanges it across all ranks on the
    default stream, and validates that chunk ``i`` of the received tensor
    carries rank ``i``'s tag.

    Args:
        batch_inputs: supplied by the benchmark harness; unused here.
        dim: side length of the square tensors exchanged.
        num_mul: rounds of matmul/normalize synthetic compute.
        num_concat: replication factor for the exchanged tensor.
        ctx: process context providing device, rank, world_size, and pg.
    """
    with record_function("## pre-comms compute ##"):
        # Payload whose elements' integer parts equal ctx.rank.
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)

    with record_function("## all_to_all_single ##"):
        post_comms = torch.empty_like(pre_comms)
        # NOTE(review): async_op is not passed, so this is the blocking
        # variant; `req` is never used below (kept for symmetry with the
        # async benchmark).
        req = dist.all_to_all_single(output=post_comms, input=pre_comms, group=ctx.pg)

    with record_function("## comms validation ##"):
        # One boolean tensor per rank; chunk i must be uniformly i.
        checks = _validate(post_comms, ctx)

    with record_function("## irrelevant compute ##"):
        # Work that does not depend on the comms result.
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)

    with record_function("## post-comms compute ##"):
        # Feed the first received slice back through the compute chain.
        post_comms = _compute(
            dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms[0]
        )

    with record_function("## assert ##"):
        # all() forces a device-to-host sync on the validation results.
        assert all(checks)
110
+
111
+
112
# all_to_all_single with async and single stream
def a2a_async_base(
    batch_inputs: List[Dict[str, Any]],
    dim: int,
    num_mul: int,
    num_concat: int,
    ctx: MultiProcessContext,
) -> None:
    """Benchmark one iteration of a non-blocking all_to_all_single exchange.

    Same workload as ``a2a_sync_base``, but the collective is issued with
    ``async_op=True`` so the "irrelevant compute" section can overlap with
    the transfer; ``req.wait()`` is called only before the compute that
    consumes the received data.

    Args:
        batch_inputs: supplied by the benchmark harness; unused here.
        dim: side length of the square tensors exchanged.
        num_mul: rounds of matmul/normalize synthetic compute.
        num_concat: replication factor for the exchanged tensor.
        ctx: process context providing device, rank, world_size, and pg.
    """
    with record_function("## pre-comms compute ##"):
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)

    with record_function("## all_to_all_single ##"):
        # use zeros instead of empty to make sure no previous data used
        post_comms = torch.zeros_like(pre_comms)
        req = dist.all_to_all_single(
            output=post_comms,
            input=pre_comms,
            group=ctx.pg,
            async_op=True,
        )

    with record_function("## comms validation ##"):
        # pre-check is performed before comms' done
        # all() will trigger a device-to-host sync to get the result
        # of course you can also make it async by wrapping with Awaitable
        pre_checks = all(_validate(post_comms, ctx))

    with record_function("## irrelevant compute ##"):
        # Independent work that can overlap with the in-flight collective.
        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)

    with record_function(f"## post-comms compute: pre-check-{pre_checks} ##"):
        # assertion fails without wait(), this wait() makes the main cuda stream wait
        # for the comms to finish, so the post-comms compute will be blocked until
        # the comms is done
        req.wait()
        checks = _validate(post_comms, ctx)
        post_comms = _compute(
            dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms[0]
        )

    with record_function("## assert ##"):
        assert all(checks)
@@ -114,6 +170,8 @@ def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig)
114
170
115
171
if arg .name .startswith ("a2a_sync_base" ):
116
172
func = a2a_sync_base
173
+ elif arg .name .startswith ("a2a_async_base" ):
174
+ func = a2a_async_base
117
175
else :
118
176
func = a2a_sync_base
119
177
@@ -128,7 +186,7 @@ def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig)
128
186
},
129
187
func_to_benchmark = func ,
130
188
rank = rank ,
131
- ** arg .benchmark_func_kwargs ()
189
+ ** arg .benchmark_func_kwargs (),
132
190
)
133
191
134
192
if rank == 0 :
0 commit comments