Commit fbb2de6

pytorchbot committed: 2025-05-29 nightly release (3df4260)
1 parent 7b09c7a commit fbb2de6

File tree

16 files changed: +1566 −569 lines

torchrec/distributed/embeddingbag.py

Lines changed: 8 additions & 8 deletions
@@ -1531,15 +1531,9 @@ def update_shards(
         current_state = self.state_dict()
         # TODO: Save Optimizers
 
-        saved_weights = {}
         # TODO: Saving lookups tensors to CPU to eventually avoid recreating them completely again
-        for i, lookup in enumerate(self._lookups):
-            for attribute, tbe_module in lookup.named_modules():
-                if type(tbe_module) is DenseTableBatchedEmbeddingBagsCodegen:
-                    saved_weights[str(i) + "." + attribute] = tbe_module.weights.cpu()
-                    # Note: lookup.purge should delete tbe_module and weights
-                    # del tbe_module.weights
-                    # del tbe_module
+        # TODO: Ensure lookup tensors are actually being deleted
+        for _, lookup in enumerate(self._lookups):
             # pyre-ignore
             lookup.purge()
 
@@ -1603,6 +1597,12 @@ def update_shards(
             for embedding_configs in self.sharding_type_to_sharding_infos.values()
         ]
 
+        # Reset input dists
+        self._has_uninitialized_input_dist = True
+        self._input_dists: List[nn.Module] = []
+        self._features_order: List[int] = []
+        self._feature_splits: List[int] = []
+
         self._create_lookups()
         self._update_output_dist()

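For context on the added reset: input dists in these sharded modules are built lazily on first forward, so flipping `_has_uninitialized_input_dist` and clearing the cached lists forces them to be rebuilt against the new shard placements. A minimal sketch of that lazy-rebuild pattern (a hypothetical toy class for illustration, not the real `ShardedEmbeddingBagCollection`):

```python
from typing import List

import torch
import torch.nn as nn


class LazyInputDistToy(nn.Module):
    """Toy illustration of the lazy rebuild pattern used by update_shards."""

    def __init__(self) -> None:
        super().__init__()
        self._has_uninitialized_input_dist = True
        self._input_dists: List[nn.Module] = []

    def _create_input_dists(self) -> None:
        # The real code builds one dist module per sharding; Identity stands in here.
        self._input_dists = [nn.Identity()]
        self._has_uninitialized_input_dist = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Rebuilt on the first forward after a reset, exactly like the flag flip above.
        if self._has_uninitialized_input_dist:
            self._create_input_dists()
        return self._input_dists[0](x)
```
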
torchrec/distributed/mc_modules.py

Lines changed: 3 additions & 1 deletion
@@ -671,7 +671,9 @@ def _kjt_list_to_tensor_list(
                     vals.append(feature_split.values() + offset)
                 remapped_ids_ret.append(torch.cat(vals).view(-1, 1))
             else:
-                remapped_ids_ret.append(kjt.values() + self._table_to_offset[tables[0]])
+                remapped_ids_ret.append(
+                    (kjt.values() + self._table_to_offset[tables[0]]).unsqueeze(-1)
+                )
         return remapped_ids_ret
 
     def global_to_local_index(

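The fix aligns the single-table branch with the multi-table branch, which already returns column vectors of shape `(N, 1)` via `view(-1, 1)`. A quick shape check with illustrative stand-in values:

```python
import torch

values = torch.tensor([3, 7, 1])  # stand-in for kjt.values()
offset = 10                       # stand-in for self._table_to_offset[tables[0]]

multi_table = torch.cat([values + offset]).view(-1, 1)  # existing branch: shape (3, 1)
single_table = (values + offset).unsqueeze(-1)          # fixed branch: shape (3, 1)
assert multi_table.shape == single_table.shape == torch.Size([3, 1])
```
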
torchrec/distributed/model_parallel.py

Lines changed: 79 additions & 0 deletions
@@ -35,6 +35,7 @@
 from torchrec.distributed.types import (
     EnumerableShardingSpec,
     ModuleSharder,
+    ParameterSharding,
     ShardedModule,
     ShardingEnv,
     ShardingEnv2D,
@@ -612,6 +613,84 @@ def _reset_parameters(module: nn.Module) -> None:
         if hasattr(m, "reset_parameters"):
             m.reset_parameters()
 
+    def reshard(
+        self,
+        sharded_module_fqn: str,
+        changed_shard_to_params: Dict[str, ParameterSharding],
+    ) -> ShardedModule:
+        """
+        Reshards an already-sharded module in the DMP, given a set of
+        ParameterShardings describing the new placements.
+
+        This method allows you to dynamically change the sharding strategy for a
+        specific module without recreating the entire DMP. It is particularly
+        useful for:
+        1. Adapting to changing requirements during training
+        2. Implementing progressive sharding strategies
+        3. Rebalancing load across devices
+        4. A/B testing different sharding plans
+
+        Args:
+            sharded_module_fqn (str): The path to the sharded module in the DMP,
+                for example "sparse.ebc".
+            changed_shard_to_params (Dict[str, ParameterSharding]): A dictionary
+                mapping parameter names to their new ParameterSharding
+                configurations. Includes only the shards that need to be moved.
+
+        Example:
+            ```
+            # Original sharding plan might have the table sharded across 2 GPUs
+            original_plan = {
+                "table_0": ParameterSharding(
+                    sharding_type="table_wise",
+                    ranks=[0, 1],
+                    sharding_spec=EnumerableShardingSpec(...),
+                )
+            }
+
+            # New sharding plan to shard the table across 4 GPUs
+            new_plan = {
+                "table_0": ParameterSharding(
+                    sharding_type="table_wise",
+                    ranks=[0, 1, 2, 3],
+                    sharding_spec=EnumerableShardingSpec(...),
+                )
+            }
+
+            # Helper function that selects only the delta between the original and new plans
+            changed_sharding_params = output_sharding_plan_delta(original_plan, new_plan)
+
+            # Reshard the module and redistribute the tensors
+            model.reshard("embedding_module", changed_sharding_params)
+            ```
+
+        Notes:
+            - The sharder for the module must implement a `reshard` method
+            - Resharding involves redistributing tensor data across devices, which can be expensive
+            - After resharding, the optimizer state is maintained for the module
+            - The sharding plan is updated to reflect the new configuration
+        """
+        steps = sharded_module_fqn.split(".")
+        sharded_module = self.module
+        for s in steps:
+            sharded_module = getattr(sharded_module, s)
+
+        assert isinstance(sharded_module, ShardedModule)
+        assert changed_shard_to_params is not None
+        sharder_key = sharded_module.unsharded_module_type
+        sharder = self._sharder_map[sharder_key]
+        assert hasattr(
+            sharder, "reshard"
+        ), "reshard is not implemented for this sharder"
+        sharded_module = sharder.reshard(  # pyre-ignore
+            sharded_module,
+            changed_shard_to_params,
+            self._env,
+            self.device,
+        )
+
+        self._optim: CombinedOptimizer = self._init_optim(self._dmp_wrapped_module)
+        self._plan.plan[sharded_module_fqn] = sharded_module.module_sharding_plan
+        return sharded_module
+
 
 class DMPCollection(DistributedModelParallel):
     """

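The module lookup at the top of `reshard` is a plain dotted-attribute walk over the wrapped module tree. A self-contained sketch of just that step (the helper name and toy module tree are hypothetical):

```python
import torch.nn as nn


def get_submodule_by_fqn(root: nn.Module, fqn: str) -> nn.Module:
    # Same traversal reshard() performs: follow each dotted step via getattr.
    module = root
    for step in fqn.split("."):
        module = getattr(module, step)
    return module


# Toy module tree mirroring a "sparse.ebc" path:
model = nn.Module()
model.sparse = nn.Module()
model.sparse.ebc = nn.EmbeddingBag(10, 4)
assert get_submodule_by_fqn(model, "sparse.ebc") is model.sparse.ebc
```
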
torchrec/distributed/planner/tests/test_types.py

Lines changed: 93 additions & 1 deletion
@@ -14,12 +14,17 @@
 import torch
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
 
-from torchrec.distributed.planner.types import Shard, ShardingOption
+from torchrec.distributed.planner.types import (
+    ParameterConstraints,
+    Shard,
+    ShardingOption,
+)
 from torchrec.distributed.types import (
     BoundsCheckMode,
     CacheAlgorithm,
     CacheParams,
     DataType,
+    KeyValueParams,
     ShardingType,
 )
 from torchrec.modules.embedding_configs import EmbeddingBagConfig, EmbeddingConfig
@@ -207,3 +212,90 @@ def test_module_pooled_mch_ec(self) -> None:
             shards=[Shard(size=shard_size, offset=offset) for offset in shard_offsets],
         )
         self.assertEqual(sharding_option.is_pooled, False)
+
+
+class TestParameterConstraintsHash(unittest.TestCase):
+
+    def test_hash_equality(self) -> None:
+        # Create two identical instances
+        pc1 = ParameterConstraints(
+            sharding_types=["type1", "type2"],
+            compute_kernels=["kernel1"],
+            min_partition=4,
+            pooling_factors=[1.0, 2.0],
+            num_poolings=[1.0],
+            batch_sizes=[32],
+            is_weighted=True,
+            cache_params=CacheParams(),
+            enforce_hbm=True,
+            stochastic_rounding=False,
+            bounds_check_mode=BoundsCheckMode(1),
+            feature_names=["feature1", "feature2"],
+            output_dtype=DataType.FP32,
+            device_group="cuda",
+            key_value_params=KeyValueParams(),
+        )
+
+        pc2 = ParameterConstraints(
+            sharding_types=["type1", "type2"],
+            compute_kernels=["kernel1"],
+            min_partition=4,
+            pooling_factors=[1.0, 2.0],
+            num_poolings=[1.0],
+            batch_sizes=[32],
+            is_weighted=True,
+            cache_params=CacheParams(),
+            enforce_hbm=True,
+            stochastic_rounding=False,
+            bounds_check_mode=BoundsCheckMode(1),
+            feature_names=["feature1", "feature2"],
+            output_dtype=DataType.FP32,
+            device_group="cuda",
+            key_value_params=KeyValueParams(),
+        )
+
+        self.assertEqual(
+            hash(pc1), hash(pc2), "Hashes should be equal for identical instances"
+        )
+
+    def test_hash_inequality(self) -> None:
+        # Create two different instances
+        pc1 = ParameterConstraints(
+            sharding_types=["type1"],
+            compute_kernels=["kernel1"],
+            min_partition=4,
+            pooling_factors=[1.0],
+            num_poolings=[1.0],
+            batch_sizes=[32],
+            is_weighted=True,
+            cache_params=CacheParams(),
+            enforce_hbm=True,
+            stochastic_rounding=False,
+            bounds_check_mode=BoundsCheckMode(1),
+            feature_names=["feature1"],
+            output_dtype=DataType.FP32,
+            device_group="cuda",
+            key_value_params=KeyValueParams(),
+        )
+
+        pc2 = ParameterConstraints(
+            sharding_types=["type2"],
+            compute_kernels=["kernel2"],
+            min_partition=8,
+            pooling_factors=[2.0],
+            num_poolings=[2.0],
+            batch_sizes=[64],
+            is_weighted=False,
+            cache_params=CacheParams(),
+            enforce_hbm=False,
+            stochastic_rounding=True,
+            bounds_check_mode=BoundsCheckMode(1),
+            feature_names=["feature2"],
+            output_dtype=DataType.FP16,
+            device_group="cpu",
+            key_value_params=KeyValueParams(),
+        )
+
+        self.assertNotEqual(
+            hash(pc1), hash(pc2), "Hashes should be different for different instances"
+        )

torchrec/distributed/planner/types.py

Lines changed: 21 additions & 0 deletions
@@ -703,6 +703,27 @@ class ParameterConstraints:
     device_group: Optional[str] = None
     key_value_params: Optional[KeyValueParams] = None
 
+    def __hash__(self) -> int:
+        return hash(
+            (
+                tuple(self.sharding_types) if self.sharding_types else None,
+                tuple(self.compute_kernels) if self.compute_kernels else None,
+                self.min_partition,
+                tuple(self.pooling_factors),
+                tuple(self.num_poolings) if self.num_poolings else None,
+                tuple(self.batch_sizes) if self.batch_sizes else None,
+                self.is_weighted,
+                self.cache_params,
+                self.enforce_hbm,
+                self.stochastic_rounding,
+                self.bounds_check_mode,
+                tuple(self.feature_names) if self.feature_names else None,
+                self.output_dtype,
+                self.device_group,
+                self.key_value_params,
+            )
+        )
+
 
 class PlannerErrorType(Enum):
     """

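With an explicit `__hash__`, structurally identical constraints hash alike, so `ParameterConstraints` instances can be used as set members or dict keys, for example to dedupe constraints or key cached planner results. A short sketch, assuming the dataclass defaults and field-based `__eq__` that `ParameterConstraints` appears to have in planner/types.py (the dedupe use case is illustrative, not part of this commit):

```python
from torchrec.distributed.planner.types import ParameterConstraints

pc_a = ParameterConstraints(sharding_types=["table_wise"], feature_names=["f1"])
pc_b = ParameterConstraints(sharding_types=["table_wise"], feature_names=["f1"])

# Field-based hash, per the new __hash__: identical fields -> identical hashes.
assert hash(pc_a) == hash(pc_b)

# With dataclass equality, a set should dedupe the two to one entry.
unique_constraints = {pc_a, pc_b}
assert len(unique_constraints) == 1
```
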
torchrec/distributed/sharding/dynamic_sharding.py

Lines changed: 24 additions & 0 deletions
@@ -7,13 +7,15 @@
 
 # pyre-strict
 
+import copy
 from typing import Any, Callable, Dict, List, Tuple
 
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
 from torch.distributed._shard.sharded_tensor import Shard
 from torchrec.distributed.types import (
+    EmbeddingModuleShardingPlan,
     ParameterSharding,
     ShardedModule,
     ShardedTensor,
@@ -364,3 +366,25 @@ def pad_tensor_to_max_dims(
         mode="constant",
         value=0,
     )
+
+
+# Utils
+def output_sharding_plan_delta(
+    old_plan: EmbeddingModuleShardingPlan, new_plan: EmbeddingModuleShardingPlan
+) -> EmbeddingModuleShardingPlan:
+    """
+    Compute and return a new sharding plan that is the delta
+    between the new and old embedding module plans. Assumes that the old and new
+    plans have the same number of parameters/tables.
+
+    This is useful for Dynamic Sharding, since the Resharding API takes in only
+    the ParameterShardings of the shards that need to be moved.
+    """
+    assert len(old_plan) == len(new_plan)
+    return EmbeddingModuleShardingPlan(
+        {
+            k: copy.deepcopy(v)
+            for k, v in new_plan.items()
+            if v.ranks != old_plan[k].ranks
+        }
+    )

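The delta rule itself is simple: keep only the entries whose `ranks` changed. A minimal, runnable sketch of that rule using stand-in objects (the `FakeSharding` dataclass below is hypothetical, standing in for `ParameterSharding`):

```python
import copy
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class FakeSharding:
    """Stand-in for ParameterSharding; only the ranks field matters here."""
    ranks: Optional[List[int]]


old_plan = {"table_0": FakeSharding(ranks=[0, 1]), "table_1": FakeSharding(ranks=[2])}
new_plan = {"table_0": FakeSharding(ranks=[0, 1, 2, 3]), "table_1": FakeSharding(ranks=[2])}

# Same comprehension as output_sharding_plan_delta: keep only moved tables.
delta = {k: copy.deepcopy(v) for k, v in new_plan.items() if v.ranks != old_plan[k].ranks}
assert set(delta) == {"table_0"}
```
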
torchrec/distributed/sharding_plan.py

Lines changed: 14 additions & 0 deletions
@@ -410,6 +410,20 @@ def _get_parameter_sharding(
     ]
 
 
+def get_sharding_constructor_from_type(
+    sharding_type: ShardingType,
+) -> Callable[..., ParameterShardingGenerator]:
+    sharding_type_to_constructor = {
+        ShardingType.TABLE_WISE: table_wise,
+        ShardingType.ROW_WISE: row_wise,
+        ShardingType.COLUMN_WISE: column_wise,
+        ShardingType.TABLE_ROW_WISE: table_row_wise,
+        ShardingType.GRID_SHARD: grid_shard,
+        ShardingType.DATA_PARALLEL: data_parallel,
+    }
+    return sharding_type_to_constructor[sharding_type]
+
+
 def data_parallel() -> ParameterShardingGenerator:
     """
     Returns a generator of ParameterShardingPlan for `ShardingType::DATA_PARALLEL` for construct_module_sharding_plan.

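This gives callers a programmatic way to go from a `ShardingType` enum value to the matching generator constructor when the type is chosen at runtime. A short usage sketch; the `rank=0` argument follows `table_wise`'s signature elsewhere in this file, but treat the exact arguments as illustrative:

```python
from torchrec.distributed.sharding_plan import get_sharding_constructor_from_type
from torchrec.distributed.types import ShardingType

# Resolve the constructor for a sharding type selected at runtime.
table_wise_constructor = get_sharding_constructor_from_type(ShardingType.TABLE_WISE)

# Build a ParameterShardingGenerator placing the whole table on rank 0.
generator = table_wise_constructor(rank=0)
```
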