
Commit b4a76e9

Sanket Jayant Purandare authored and xmfan committed
Adding split FSDP Collective Pass
1 parent 2f1452e commit b4a76e9


3 files changed, +81 -1 lines changed


autoparallel/_passes/graph_multiplex.py

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,8 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
 import copy
 
 import torch
autoparallel/_passes/split_fsdp_collectives.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+
+import dataclasses
+
+import torch
+import torch.utils._pytree as pytree
+from torch._functorch._aot_autograd.descriptors import AOTOutput
+from torch._functorch.partitioners import _extract_graph_with_inputs_outputs
+
+
+@dataclasses.dataclass(frozen=True)
+class PrefetchOutput(AOTOutput):
+    pass
+
+
+def split_fsdp_prefetch(
+    gm: torch.fx.GraphModule,
+) -> tuple[torch.fx.GraphModule, torch.fx.GraphModule]:
+    g = gm.graph
+    g_ins = g.find_nodes(op="placeholder")
+    prefetch_g_outs_map = {}
+
+    for g_in in g_ins:
+        n = g_in
+        while True:
+            if len(n.users) != 1:
+                break
+            user = next(iter(n.users))
+            if len(user.all_input_nodes) > 1:
+                break
+            n = user
+        prefetch_g_outs_map[g_in] = n
+
+    prefetch_g_outs = list(prefetch_g_outs_map.values())
+    prefetch_g_outs_descs: list[AOTOutput] = [
+        PrefetchOutput() for _ in range(len(prefetch_g_outs))
+    ]
+
+    prefetch_g = _extract_graph_with_inputs_outputs(
+        g,
+        g_ins,
+        prefetch_g_outs,
+        prefetch_g_outs_descs,
+    )
+
+    g_outs = pytree.arg_tree_leaves(*(n.args for n in g.find_nodes(op="output")))
+    g_outs_descs = pytree.arg_tree_leaves(
+        next(iter(g.find_nodes(op="output"))).meta.get("desc", [None] * len(g_outs))
+    )
+    main_g = _extract_graph_with_inputs_outputs(
+        g,
+        prefetch_g_outs,
+        g_outs,
+        g_outs_descs,
+    )
+    main_gm = torch.fx._lazy_graph_module._make_graph_module(gm, main_g)
+    prefetch_gm = torch.fx._lazy_graph_module._make_graph_module(gm, prefetch_g)
+    return prefetch_gm, main_gm
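
For intuition, here is a minimal, self-contained sketch of the chain-walk heuristic that split_fsdp_prefetch applies above. It is not part of this commit, and the Toy module and all names below are hypothetical: starting from each placeholder, the walk follows nodes that have exactly one user, stopping as soon as that user reads any other input. In an FSDP graph this peels the per-parameter collective chain (e.g. the all-gather feeding compute) into the prefetch subgraph.

import torch
import torch.fx


class Toy(torch.nn.Module):
    def forward(self, x, w):
        w = w.relu()     # single user, single input: chain from w continues
        w = w.sigmoid()  # single user, single input: chain continues
        return x * w     # mul reads two inputs: both chains stop here


gm = torch.fx.symbolic_trace(Toy())
for p in gm.graph.find_nodes(op="placeholder"):
    n = p
    # Same walk as split_fsdp_prefetch: advance while the node has exactly
    # one user and that user consumes no other graph nodes.
    while len(n.users) == 1:
        user = next(iter(n.users))
        if len(user.all_input_nodes) > 1:
            break
        n = user
    print(f"{p.name}: prefetch chain ends at {n.name}")

# Expected output (node names may differ):
#   x: prefetch chain ends at x        (x directly feeds a multi-input node)
#   w: prefetch chain ends at sigmoid  (relu and sigmoid peeled into prefetch)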

examples/example_llama3.py

Lines changed: 15 additions & 1 deletion
@@ -12,6 +12,7 @@
 from torch.testing._internal.distributed.fake_pg import FakeStore
 
 from autoparallel._passes.graph_multiplex import multiplex_fw_bw_graph
+from autoparallel._passes.split_fsdp_collectives import split_fsdp_prefetch
 from autoparallel._testing.models.llama3 import Transformer, TransformerModelArgs
 from autoparallel.api import AutoParallel
 from autoparallel.auto_bucketing import (
@@ -257,9 +258,22 @@ def _pass(graph):
 if multiplex_graph:
     f_gm = autop.fw_module
     b_gm = autop.bw_module
-    multiplexed_gm = multiplex_fw_bw_graph(f_gm, b_gm)
+    print("Original Fwd Graph:")
     print(f_gm.graph)
+    print("Original Bwd Graph:")
     print(b_gm.graph)
+    prefetch_f_gm, main_f_gm = split_fsdp_prefetch(f_gm)
+    print("Main Fwd Graph:")
+    print(main_f_gm.graph)
+    print("Prefetch Fwd Graph:")
+    print(prefetch_f_gm.graph)
+    prefetch_b_gm, main_b_gm = split_fsdp_prefetch(b_gm)
+    print("Main Bwd Graph:")
+    print(main_b_gm.graph)
+    print("Prefetch Bwd Graph:")
+    print(prefetch_b_gm.graph)
+    multiplexed_gm = multiplex_fw_bw_graph(main_f_gm, main_b_gm)
+    print("Multiplexed Graph:")
     print(multiplexed_gm.graph)
 
 # run weight init on our sharded DTensor params
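
One property worth noting, as an observation rather than anything this commit asserts: the prefetch graph consumes the original placeholders, and the main graph consumes exactly the prefetch graph's outputs, so the two extracted modules compose back to the original computation. A hedged sketch, assuming a hypothetical inputs tuple matching the forward graph's placeholders and that each extracted module returns a tuple of its listed outputs:

# Hypothetical sanity check, not in this commit: the split graphs compose.
prefetch_f_gm, main_f_gm = split_fsdp_prefetch(f_gm)
prefetched = prefetch_f_gm(*inputs)  # runs the peeled per-input chains
main_outs = main_f_gm(*prefetched)   # runs the remaining compute
# main_outs should match f_gm(*inputs) up to output container structure.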
