Commit 303ab6f

[KV Cache] Override Number of KV cache blocks (#757)
Signed-off-by: Kyuyeun Kim <[email protected]>
1 parent f38db43 commit 303ab6f

File tree: 2 files changed (+71, -5 lines)

tpu_commons/runner/kv_cache.py

Lines changed: 46 additions & 4 deletions
@@ -1,8 +1,11 @@
-from typing import List
+from typing import Any, List

 import jax
 import jax.numpy as jnp
+import numpy as np
+from jax._src import dtypes
 from jax.sharding import Mesh, NamedSharding, PartitionSpec
+from torchax.ops.mappings import t2j_dtype

 import tpu_commons.kernels.ragged_paged_attention.v3.kernel as rpa
 from tpu_commons.logger import init_logger
@@ -37,11 +40,11 @@ def create_kv_caches(
     cache_dtype: jnp.dtype = DEFAULT_KV_CACHE_DTYPE,
 ) -> List[jax.Array]:
     """
-    Creates the KV caches, a list of arrays, each array is for one attention layer.
+    Creates a list of KV caches, where each array maps to a single attention layer.

     The shape of the KV cache per layer is:
-    (num_blocks, block_size, cdiv(num_kv_heads * 2, packing), packing, head_size).
-    packing = (32 // dtype bits)
+    (num_blocks, block_size, cdiv(num_kv_heads * 2, packing), packing, head_dim)
+    where packing = (32 // dtype bits)

     Args:
         num_blocks: The number of blocks in the KV cache.
@@ -50,6 +53,7 @@ def create_kv_caches(
         head_size: The size of each head in the KV cache.
         mesh: The mesh to shard the KV caches across.
         layer_names: The names of the decoder layers in the model.
+        cache_dtype: The datatype of KV cache.

     Returns:
         A list of KV caches, one per each decoder layer in the model.
@@ -75,3 +79,41 @@ def _allocate() -> jax.Array:
     for _ in layer_names:
         kv_caches.append(sharded_allocate())
     return kv_caches
+
+
+def get_rpa_page_size_bytes(mesh: Mesh, kv_cache_specs: dict[str, Any]) -> int:
+    """
+    Calculate KV cache page size of RPA kernel.
+
+    Args:
+        mesh: The mesh to shard the KV caches across.
+        kv_cache_specs: Dictionary of KV cache specs.
+
+    Returns:
+        KV cache page size in bytes.
+    """
+
+    # Import it here to avoid circular import.
+    from vllm.v1.kv_cache_interface import AttentionSpec
+
+    page_size_bytes_set = set()
+    for kv_cache_spec in kv_cache_specs.values():
+        assert isinstance(kv_cache_spec, AttentionSpec)
+
+        dtype = t2j_dtype(kv_cache_spec.dtype)
+        bits = dtypes.bit_width(dtype)
+
+        kv_cache_shape = get_kv_cache_shape_with_mesh(
+            mesh=mesh,
+            total_num_pages=1,  # Pass 1 to get shape of a single page.
+            page_size=kv_cache_spec.block_size,
+            actual_num_kv_heads=kv_cache_spec.num_kv_heads,
+            actual_head_dim=kv_cache_spec.head_size,
+            kv_dtype=dtype,
+        )
+        page_size_bytes = (bits * np.prod(kv_cache_shape)) // 8
+        page_size_bytes_set.add(page_size_bytes)
+
+    # Ensure that page size is the same for all kv caches.
+    assert len(page_size_bytes_set) == 1
+    return page_size_bytes_set.pop()
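To make the page-size arithmetic concrete: for the shape documented in create_kv_caches, a single page holds block_size * cdiv(num_kv_heads * 2, packing) * packing * head_dim elements, each occupying dtype-bits / 8 bytes. The sketch below is not part of the commit; approx_page_size_bytes is a hypothetical helper that ignores any mesh- or padding-related adjustments get_kv_cache_shape_with_mesh may apply.

import math

def approx_page_size_bytes(block_size: int, num_kv_heads: int, head_dim: int,
                           dtype_bits: int) -> int:
    # packing = number of sub-32-bit values packed into one 32-bit word.
    packing = 32 // dtype_bits
    # Shape of one page: (1, block_size, cdiv(num_kv_heads * 2, packing),
    # packing, head_dim); the factor of 2 covers both K and V.
    num_elements = (block_size * math.ceil(num_kv_heads * 2 / packing) *
                    packing * head_dim)
    return (num_elements * dtype_bits) // 8

# Example: bfloat16 (16 bits) gives packing = 2, so a page with block_size=32,
# 8 KV heads and head_dim=128 is 32 * 8 * 2 * 128 * 2 bytes = 131072 bytes.
print(approx_page_size_bytes(32, 8, 128, 16))  # -> 131072

This also suggests why get_rpa_page_size_bytes asks the kernel's shape helper for a single page (total_num_pages=1) instead of redoing the formula by hand: the helper is the source of truth for the actual layout.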

tpu_commons/worker/tpu_worker_jax.py

Lines changed: 25 additions & 1 deletion
@@ -15,6 +15,7 @@
                               init_distributed_environment)
 from vllm.lora.request import LoRARequest
 from vllm.tasks import SupportedTask
+from vllm.v1.core.kv_cache_utils import get_num_blocks, get_uniform_page_size
 from vllm.v1.core.sched.output import SchedulerOutput
 from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec
 from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
@@ -27,6 +28,7 @@
 from tpu_commons.distributed.utils import (get_host_ip, get_kv_transfer_port,
                                            get_node_id)
 from tpu_commons.logger import init_logger
+from tpu_commons.runner.kv_cache import get_rpa_page_size_bytes
 from tpu_commons.runner.tpu_jax_runner import TPUModelRunner
 from tpu_commons.worker._temporary_vllm_compat import (
     adapt_kv_cache_config_if_needed, adapt_lora_request_if_needed,
@@ -251,7 +253,29 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         # responsible for this translation. When vLLM can be modified, this
         # method should be changed to return `dict[str, AbstractKVCacheSpec]`,
         # and the vLLM side should be updated to handle the translation.
-        return self.model_runner.get_kv_cache_spec()
+        kv_cache_specs = self.model_runner.get_kv_cache_spec()
+
+        # TODO(kyuyeunk): Instead of checking page_size_bytes here, introduce
+        # feature that allows overriding page_size_bytes of KVCacheSpec.
+        vllm_page_size_bytes = get_uniform_page_size(kv_cache_specs)
+        rpa_page_size_bytes = get_rpa_page_size_bytes(self.model_runner.mesh,
+                                                      kv_cache_specs)
+
+        if vllm_page_size_bytes != rpa_page_size_bytes:
+            logger.info(
+                f"KV cache page size calculated by vLLM "
+                f"({vllm_page_size_bytes} Bytes) does not match with actual "
+                f"page size used by RPA kernel ({rpa_page_size_bytes} Bytes). "
+                f"Recalculating number of KV blocks using actual page size.")
+
+            available_memory = self.determine_available_memory()
+            num_blocks = get_num_blocks(self.vllm_config, len(kv_cache_specs),
+                                        available_memory, rpa_page_size_bytes)
+
+            cache_config = self.vllm_config.cache_config
+            cache_config.num_gpu_blocks_override = num_blocks
+
+        return kv_cache_specs

     def initialize_from_config(
         self,
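The worker change only sets cache_config.num_gpu_blocks_override; the block count itself still comes from vLLM's get_num_blocks. As a rough mental model, the sketch below assumes get_num_blocks essentially divides available memory by the per-layer page footprint; the real helper may round or apply further caps, and approx_num_blocks is a hypothetical stand-in, not the vLLM implementation.

def approx_num_blocks(available_memory_bytes: int, num_layers: int,
                      page_size_bytes: int) -> int:
    # Each KV cache block is materialized once per attention layer, so the
    # per-block cost is num_layers * page_size_bytes.
    return available_memory_bytes // (num_layers * page_size_bytes)

# Example: 16 GiB of free HBM, 32 attention layers and 128 KiB RPA pages
# leave room for 16 * 2**30 // (32 * 131072) = 4096 blocks.
print(approx_num_blocks(16 * 2**30, 32, 131072))  # -> 4096

When the two page sizes differ, recomputing the block count with rpa_page_size_bytes keeps the total allocation within the memory the RPA kernel actually consumes, which is the intent of the override described in the log message above.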
