
Commit 014add7

committed
added some comments
Signed-off-by: Xiongfei Wei <[email protected]>
1 parent 9c3b006 commit 014add7

2 files changed: +3, -1 lines changed


tests/lora/test_lora.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def test_single_lora_spmd():
     # ensure_model_parallel_initialized(1, 1)
 
     # num_devices = jax.local_device_count() # why does this line cause hanging.
-    num_devices = 4
+    num_devices = 8
     print(f'xw32 using TP={num_devices}')
     llm = setup_vllm(1, num_devices)
 
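For context on the commented-out jax.local_device_count() line above, the sketch below only shows the two standard JAX device-count queries that a test like this could use instead of hard-coding num_devices = 8. It is an illustration, not the test's actual setup; setup_vllm and the hang mentioned in the comment are specific to this repository and are not reproduced here.

import jax

# Devices attached to the current host/process.
num_local = jax.local_device_count()
# Devices across the whole JAX runtime (equal to the local count on a single host).
num_global = jax.device_count()
print(f"local_device_count={num_local}, device_count={num_global}")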
tpu_commons/models/jax/attention.py

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,7 @@ def sharded_ragged_paged_attention(
     v_scale: float | None = None,
 ):
     """Shards along KV heads."""
+    # nonspmd(tp=1):q.shape=(16,16,128),k.shape=(16,2,128),kv_cache.shape=(40660,16,2,2,128)
     qkv_spec = P(None, "model", None)
     kv_cache_spec = P(None, None, "model")
     in_specs = (
@@ -86,6 +87,7 @@ def attention(
     md = attention_metadata
 
     # (T, N, H)
+    # nonspmd(tp=1):q.shape=(16,16,128),k.shape=(16,2,128),kv_cache.shape=(40660,16,2,2,128)
     output, kv_cache = sharded_ragged_paged_attention(
         head_dim_original**-0.5, mesh, attention_chunk_size, q_scale, k_scale,
         v_scale)(
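The sketch below illustrates what qkv_spec = P(None, "model", None) expresses for a (T, N, H) activation, per the "Shards along KV heads" docstring: the head axis N is split across a 1-D "model" mesh axis. This is a standalone, assumption-laden illustration rather than code from tpu_commons; the shapes are taken from the nonspmd(tp=1) comment in the hunk, and it assumes the device count divides N=16.

import numpy as np
import jax
import jax.numpy as jnp
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# 1-D tensor-parallel mesh named "model", matching the axis name used in the specs above.
mesh = Mesh(np.array(jax.devices()), ("model",))

# q with the (T, N, H) = (16, 16, 128) shape from the diff comment;
# P(None, "model", None) splits the head axis N across the "model" mesh axis.
q = jnp.zeros((16, 16, 128))
q_sharded = jax.device_put(q, NamedSharding(mesh, P(None, "model", None)))

# Each device holds a (16, 16 // num_devices, 128) slice of q.
print(q_sharded.addressable_shards[0].data.shape)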
