
Commit e3acd4b

Remove stack decomposition and add stack rule (#271)
Using the stack decomposition leads to fewer sharding options. As an example, two S(0) tensors can be successfully stacked together at dimension 0, which should lead to an S(1) output sharding. If we instead keep the decomposition from https://github.com/pytorch/pytorch/blob/ded9bcd61a059bf723e6e84689552962b480ea77/torch/_refs/__init__.py#L4116, which first concatenates at the stack dim and then applies a view, we can't obtain the same sharding option. This is because stack has a stricter set of requirements than cat, and going through the decomposition makes us miss them. Once I removed the decomposition, I found that the propagation rules for stack weren't correctly implemented, so I had to re-implement them. I'm following a much simpler pattern for the propagation rules: enumerate all possible sharding options on a single mesh dimension, then expand to the full mesh afterwards. This makes the implementation much simpler, and I believe it is in line with what @wconstab is doing in his refactoring of the propagation rules.
1 parent c3fd25b commit e3acd4b
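The S(0) → S(1) behavior described above can be checked with plain `torch.stack` on local shards. A small sketch; the shapes and values here are illustrative, not from the commit:

```python
import torch

x = torch.arange(6).reshape(3, 2)
y = torch.arange(6, 12).reshape(3, 2)

# Replicated (full) result of stacking at dim 0: shape (2, 3, 2).
full = torch.stack([x, y], dim=0)

# If each input is sharded along its dim 0 (S(0)), stacking the local
# shards yields the matching shard of the output along dim 1 (S(1)),
# since stack inserts a new dimension in front of the old dim 0:
local = torch.stack([x[:2], y[:2]], dim=0)
assert torch.equal(local, full[:, :2])
```

A cat-then-view decomposition cannot express this mapping directly, which is why removing the decomposition recovers the extra sharding option.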

File tree

2 files changed, +47 −0 lines changed

autoparallel/api.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -63,6 +63,7 @@ def _get_decomp_table():
     decomp_table.pop(torch.ops.aten.native_layer_norm_backward.default)
     decomp_table.pop(torch.ops.aten._softmax_backward_data.default)
     decomp_table.pop(torch.ops.aten._softmax.default)
+    decomp_table.pop(torch.ops.aten.stack.default)

     # decompose addmm to allow for TP on mm
     decomp_table.pop(torch.ops.aten.addmm.default)
```

autoparallel/propagation_rules.py

Lines changed: 46 additions & 0 deletions

```diff
@@ -843,3 +843,49 @@ def scatter_strategy(mesh, op_schema: OpSchema):
     return expand_to_full_mesh_op_strategy(
         mesh, op_schema, single_mesh_dim_strategies, input_index=1
     )
+
+
+@register_opschema_rule(torch.ops.aten.stack.default)
+def stack_strategy(mesh, op_schema: OpSchema):
+    from torch.distributed.tensor._ops._tensor_ops import (
+        PlacementList,
+        TupleStrategy,
+        cast,
+        expand_to_full_mesh_op_strategy,
+        normalize_dim,
+    )
+
+    input_tuple_strategy = op_schema.args_schema[0]
+    assert isinstance(input_tuple_strategy, TupleStrategy)
+
+    num_input_tensor = len(input_tuple_strategy.children)
+    first_input_strategy = input_tuple_strategy.children[0]
+    assert isinstance(first_input_strategy, OpStrategy)
+    common_input_ndim = first_input_strategy.ndim
+
+    dim = cast(int, op_schema.args_schema[1]) if len(op_schema.args_schema) > 1 else 0
+    # normalize the dim to be within the common input ndim
+    dim = normalize_dim(dim, common_input_ndim)
+
+    possible_input_strategies: PlacementList = [Replicate()] + [  # type: ignore[assignment]
+        Shard(i) for i in range(common_input_ndim)
+    ]
+    possible_output_strategies: PlacementList = (
+        [Replicate()]  # type: ignore[assignment]
+        + [Shard(i) for i in range(dim)]
+        + [Shard(i + 1) for i in range(dim, common_input_ndim)]
+    )
+
+    single_mesh_dim_strategies = []
+    for input_strategy, output_strategy in zip(
+        possible_input_strategies, possible_output_strategies
+    ):
+        strategy: PlacementList = [output_strategy] + [
+            input_strategy
+        ] * num_input_tensor
+        single_mesh_dim_strategies.append(strategy)
+
+    s = expand_to_full_mesh_op_strategy(
+        mesh, op_schema, single_mesh_dim_strategies, input_index=1
+    )
+    return s
```
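The enumeration pattern this rule follows can be illustrated without any DTensor types. The helper below is hypothetical (not part of the commit), with strings standing in for `Replicate`/`Shard` placements; like the rule above, it emits one single-mesh-dim strategy row per input placement, output placement first, with input shard dims at or beyond the stack `dim` shifted up by one in the output:

```python
def enumerate_stack_strategies(ndim: int, dim: int, num_inputs: int):
    # "R" stands in for Replicate, f"S({i})" for Shard(i).
    inputs = ["R"] + [f"S({i})" for i in range(ndim)]
    # Stacking inserts a new dimension at `dim`, so input shard dims at or
    # beyond `dim` map to dim + 1 in the output.
    outputs = ["R"] + [f"S({i})" for i in range(dim)] + [
        f"S({i + 1})" for i in range(dim, ndim)
    ]
    # One row per placement: [output, input, input, ...], as in the rule above.
    return [[out] + [inp] * num_inputs for inp, out in zip(inputs, outputs)]

# Two 2-D inputs stacked at dim 0: R -> R, S(0) -> S(1), S(1) -> S(2).
print(enumerate_stack_strategies(2, 0, 2))
```

Expanding these per-mesh-dim rows over the full mesh is then left to `expand_to_full_mesh_op_strategy`, which is what keeps the rule itself short.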
