@@ -128,7 +128,47 @@ def _shard_column_parallel_linear_lora(
 
 def _shard_qkv_parallel_linear_lora(layer: MergedQKVParallelLinearWithLoRA,
                                     mesh: Mesh) -> None:
-    _shard_base_linear_lora(layer, mesh)
+    # mesh=Mesh(axis_sizes=(1, 2), axis_names=('data', 'model'), axis_types=(Auto, Auto))
+    # NOTE: lora_a_stacked[i] has shape [max_loras, 1, num_out, num_in]
+    sharded_lora_a_tpu = torch.nn.ParameterList()
+    sharded_lora_b_tpu = torch.nn.ParameterList()
+    sharded_lora_bias_tpu = torch.nn.ParameterList()
+
+    assert layer.n_slices > 0, "layer.n_slices should be greater than 0"
+    mesh_lora_b_shape = (1, 1) + (mesh.shape['data'], mesh.shape['model'])
+    mesh_lora_b_axis = ('replica_num_lora', 'replica', 'data', 'model')
+    lora_b_mesh = jax.make_mesh(
+        mesh_lora_b_shape, mesh_lora_b_axis,
+        devices=mesh.devices[0])  # mesh.devices=[[device0, ..device_n]]
+    lora_b_partition_spec = P(None, None, 'model', None)
+    lora_b_sharding = NamedSharding(lora_b_mesh, lora_b_partition_spec)
+
+    mesh_lora_bias_shape = (1, 1) + (mesh.shape['model'], )
+    mesh_lora_bias_axis = ('replica_num_lora', 'replica', 'model')
+    lora_bias_mesh = jax.make_mesh(
+        mesh_lora_bias_shape, mesh_lora_bias_axis,
+        devices=mesh.devices[0])  # mesh.devices=[[device0, ..device_n]]
+    lora_bias_partition_spec = P(None, None, 'model')
+    lora_bias_sharding = NamedSharding(lora_bias_mesh,
+                                       lora_bias_partition_spec)
+
+    for i in range(layer.n_slices):
+        sharded_lora_a_tpu.append(
+            _shard_tensor_to_tpu_replicated(layer.lora_a_stacked[i], mesh))
+
+        sharded_lora_b_tpu.append(
+            _convert_to_torchax_and_shard(layer.lora_b_stacked[i],
+                                          lora_b_sharding))
+
+        if layer.lora_bias_stacked is not None:
+            sharded_lora_bias_tpu.append(
+                _convert_to_torchax_and_shard(layer.lora_bias_stacked[i],
+                                              lora_bias_sharding))
+
+    layer.lora_a_stacked = sharded_lora_a_tpu
+    layer.lora_b_stacked = sharded_lora_b_tpu
+    if layer.lora_bias_stacked is not None:
+        layer.lora_bias_stacked = sharded_lora_bias_tpu
 
 
 def _shard_row_parallel_linear_lora(layer: RowParallelLinearWithLoRA,
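The new `_shard_qkv_parallel_linear_lora` replicates each `lora_a_stacked[i]` across devices and shards each `lora_b_stacked[i]` along its output dimension (dim 2) over the `'model'` mesh axis. Below is a minimal, self-contained sketch of that lora_b sharding using plain JAX arrays; vLLM's torchax helpers (`_convert_to_torchax_and_shard`, `_shard_tensor_to_tpu_replicated`) are not reproduced, and the device count and tensor sizes are illustrative assumptions:

```python
# Sketch: shard a stacked LoRA-B tensor of shape [max_loras, 1, num_out, num_in]
# so that dim 2 (num_out) is split across the 'model' mesh axis, matching
# P(None, None, 'model', None) from the hunk above. The 'data' axis is taken
# to have size 1, as in the mesh comment (axis_sizes=(1, 2)).
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

devices = jax.devices()
n_model = len(devices)  # put every local device on the 'model' axis

# 4-D mesh mirroring (1, 1, data, model) with data == 1.
mesh = Mesh(np.array(devices).reshape(1, 1, 1, n_model),
            ('replica_num_lora', 'replica', 'data', 'model'))

max_loras, num_out, num_in = 4, 8 * n_model, 16
lora_b = jnp.zeros((max_loras, 1, num_out, num_in), dtype=jnp.bfloat16)

lora_b_sharding = NamedSharding(mesh, P(None, None, 'model', None))
lora_b_sharded = jax.device_put(lora_b, lora_b_sharding)
# Each device now holds a [max_loras, 1, num_out // n_model, num_in] shard,
# while lora_a (not shown) would be replicated on every device.
```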
@@ -152,7 +192,7 @@ def _shard_row_parallel_linear_lora(layer: RowParallelLinearWithLoRA,
 def _shard_module_to_tpu(model: torch.nn.Module, mesh: Mesh) -> None:
     for path, module in model.named_modules():
         for module_type, sharding_func in MODULE_TYPE_TO_SHARDING_FUNC:
-            if isinstance(module, module_type):
+            if type(module) is module_type:
                 logger.debug("shard %s with %s", path, sharding_func)
                 sharding_func(module, mesh)
                 break
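The second hunk replaces `isinstance()` with an exact `type()` check when walking `MODULE_TYPE_TO_SHARDING_FUNC`: with `isinstance()`, a LoRA subclass can match a parent class listed earlier in the table and never reach its own, more specific sharding function. A standalone illustration of the difference (the class names below are placeholders, not vLLM's actual layer hierarchy):

```python
# Sketch of why exact type matching matters for a (type -> sharding_func)
# dispatch table; the classes here are stand-ins, not vLLM's layers.
class BaseLinearLoRA: ...
class QKVLinearLoRA(BaseLinearLoRA): ...   # subclass with its own sharding rule

def shard_base(layer, mesh): print("base sharding")
def shard_qkv(layer, mesh): print("qkv sharding")

DISPATCH = [
    (BaseLinearLoRA, shard_base),  # parent listed first
    (QKVLinearLoRA, shard_qkv),
]

layer, mesh = QKVLinearLoRA(), None

# isinstance() matches the parent entry first, so the subclass-specific
# function is never reached:
for cls, fn in DISPATCH:
    if isinstance(layer, cls):
        fn(layer, mesh)  # -> "base sharding"
        break

# type(...) is ... only fires on an exact match, so the subclass gets its own rule:
for cls, fn in DISPATCH:
    if type(layer) is cls:
        fn(layer, mesh)  # -> "qkv sharding"
        break
```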