Commit aa8c235

added some comments
Signed-off-by: Xiongfei Wei <[email protected]>
1 parent 9c3b006 commit aa8c235

2 files changed: +5 -1 lines changed

tests/lora/test_lora.py

Lines changed: 3 additions & 1 deletion

@@ -88,7 +88,9 @@ def test_single_lora_spmd():
     # ensure_model_parallel_initialized(1, 1)

     # num_devices = jax.local_device_count() # why does this line cause hanging.
-    num_devices = 4
+    # To test the SPMD multi-chip case, only num_devices=2 works for this model (Qwen2.5-3B-Instruct),
+    # because it has num_kv_heads=2: https://github.com/vllm-project/tpu_commons/blob/a489e59c5b3a4d5c28e93775d5323970eecd66c9/tpu_commons/layers/jax/attention_interface.py#L275 shards num_kv_heads, and only 2 divides num_kv_heads here.
+    num_devices = 2
     print(f'xw32 using TP={num_devices}')
     llm = setup_vllm(1, num_devices)
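For reference, a minimal sketch of the constraint the new comment describes (not part of the commit; the helper name and candidate list are illustrative): under SPMD tensor parallelism the KV-head axis is sharded across devices, so the TP degree must divide num_kv_heads, which is 2 for Qwen2.5-3B-Instruct.

# Sketch only: illustrates why num_devices=4 fails while num_devices=2 works.
num_kv_heads = 2  # Qwen2.5-3B-Instruct has 2 KV heads

def valid_tp_degrees(num_kv_heads, candidates):
    """Return the TP degrees that split the KV-head axis with no remainder."""
    return [tp for tp in candidates if num_kv_heads % tp == 0]

print(valid_tp_degrees(num_kv_heads, [1, 2, 4, 8]))  # -> [1, 2]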

tpu_commons/models/jax/attention.py

Lines changed: 2 additions & 0 deletions

@@ -21,6 +21,7 @@ def sharded_ragged_paged_attention(
     v_scale: float | None = None,
 ):
     """Shards along KV heads."""
+    # nonspmd (tp=1): q.shape=(16,16,128), k.shape=(16,2,128), kv_cache.shape=(40660,16,2,2,128)
     qkv_spec = P(None, "model", None)
     kv_cache_spec = P(None, None, "model")
     in_specs = (

@@ -86,6 +87,7 @@ def attention(
     md = attention_metadata

     # (T, N, H)
+    # nonspmd (tp=1): q.shape=(16,16,128), k.shape=(16,2,128), kv_cache.shape=(40660,16,2,2,128)
     output, kv_cache = sharded_ragged_paged_attention(
         head_dim_original**-0.5, mesh, attention_chunk_size, q_scale, k_scale,
         v_scale)(
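
To make the sharding comments concrete, here is a hedged sketch (assumes a 2-device mesh and toy array sizes; it is not the tpu_commons implementation) of what these PartitionSpecs do: P(None, "model", None) splits the head axis of q/k/v across the "model" mesh axis, and P(None, None, "model") splits the third axis of the KV cache, with axes not named in the spec left replicated.

# Sketch only: needs at least 2 JAX devices, e.g.
#   XLA_FLAGS=--xla_force_host_platform_device_count=2 python sketch.py
import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

mesh = Mesh(np.array(jax.devices()[:2]), axis_names=("model",))

q = jnp.zeros((16, 16, 128))                # (tokens, num_q_heads, head_dim)
k = jnp.zeros((16, 2, 128))                 # (tokens, num_kv_heads, head_dim)
kv_cache = jnp.zeros((128, 16, 2, 2, 128))  # toy KV cache; axis 2 has size num_kv_heads=2

# qkv_spec: shard axis 1 (the head axis) over the "model" mesh axis.
q = jax.device_put(q, NamedSharding(mesh, P(None, "model", None)))
k = jax.device_put(k, NamedSharding(mesh, P(None, "model", None)))
# kv_cache_spec: shard axis 2 over "model"; trailing axes stay replicated.
kv_cache = jax.device_put(kv_cache, NamedSharding(mesh, P(None, None, "model")))

# Each device now holds 16/2 = 8 query heads and 2/2 = 1 KV head, which is why
# the test above can only use a TP degree that divides num_kv_heads=2.
print(q.sharding, k.sharding, kv_cache.sharding)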
