From 9cf2badde51c3f053475b01f26bc67c884a8d1e5 Mon Sep 17 00:00:00 2001 From: gufengc Date: Sat, 23 May 2026 12:38:24 +0800 Subject: [PATCH 1/6] feat(model): add glm 5.1 --- .gitignore | 1 + src/backend/server/static_config.py | 25 ++++---------------- src/parallax/launch.py | 4 ++-- src/parallax/models/deepseek_v32.py | 7 +++++- src/parallax/server/shard_loader.py | 8 +++++++ src/parallax/utils/utils.py | 30 +++++++++++++++--------- src/scheduling/model_info.py | 4 +++- tests/scheduler_tests/test_model_info.py | 23 ++++++++++++++++++ tests/test_shard_loader.py | 18 ++++++++++---- tests/test_static_config.py | 5 ++++ tests/test_utils.py | 25 ++++++++++++++++++++ 11 files changed, 111 insertions(+), 39 deletions(-) create mode 100644 tests/scheduler_tests/test_model_info.py create mode 100644 tests/test_static_config.py diff --git a/.gitignore b/.gitignore index 0fb4e30b..75904bf2 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,4 @@ build/ .cache .vscode/ hosts.json +uv.lock diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 48a7cb79..c4263dd9 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -1,8 +1,7 @@ import concurrent.futures -import json import math -from pathlib import Path +from parallax.utils.utils import load_config_only from parallax_utils.logging_config import get_logger from scheduling.model_info import ModelInfo @@ -33,6 +32,7 @@ "zai-org/GLM-4.7": "mlx-community/GLM-4.7-4bit", "zai-org/GLM-4.7-Flash": "mlx-community/GLM-4.7-Flash-4bit", "zai-org/GLM-4.7-Flash-FP8": "mlx-community/GLM-4.7-Flash-8bit", + "zai-org/GLM-5.1": "mlx-community/GLM-5.1", # Minimax M2 Models "MiniMaxAI/MiniMax-M2.7": "mlx-community/MiniMax-M2.7-4bit", "MiniMaxAI/MiniMax-M2.1": "mlx-community/MiniMax-M2.1-4bit", @@ -100,23 +100,7 @@ def get_model_info(model_name, use_hfcache: bool = False): - def _load_config_only(name: str) -> dict: - local_path = Path(name) - if local_path.exists(): - config_path = local_path / "config.json" - with open(config_path, "r") as f: - return json.load(f) - - # Hugging Face only – download just config.json - from huggingface_hub import hf_hub_download # type: ignore - - config_file = hf_hub_download( - repo_id=name, filename="config.json", local_files_only=use_hfcache - ) - with open(config_file, "r") as f: - return json.load(f) - - config = _load_config_only(model_name) + config = load_config_only(model_name, local_files_only=use_hfcache) quant_method = config.get("quant_method", None) quantization_config = config.get("quantization_config", None) @@ -139,7 +123,7 @@ def _load_config_only(name: str) -> dict: mlx_model_name = MODELS.get(model_name, model_name) if mlx_model_name != model_name: - mlx_config = _load_config_only(mlx_model_name) + mlx_config = load_config_only(mlx_model_name, local_files_only=use_hfcache) mlx_quant_dict = mlx_config.get("quantization_config", None) if mlx_quant_dict and "bits" in mlx_quant_dict: mlx_param_bytes_per_element = mlx_quant_dict["bits"] / 8 @@ -161,6 +145,7 @@ def _load_config_only(name: str) -> dict: head_size=config.get("head_dim", 128), qk_nope_head_dim=config.get("qk_nope_head_dim", None), qk_rope_head_dim=config.get("qk_rope_head_dim", None), + v_head_dim=config.get("v_head_dim", None), hidden_dim=config.get("hidden_size", 0), intermediate_dim=config.get("intermediate_size", 0), num_attention_heads=config.get("num_attention_heads", 0), diff --git a/src/parallax/launch.py b/src/parallax/launch.py index b20f6250..41f50fe4 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -27,7 +27,7 @@ from parallax.server.http_server import launch_http_server, stop_http_server from parallax.server.server_args import parse_args from parallax.utils.shared_state import SharedState -from parallax.utils.utils import fetch_model_from_hf, initialize_nccl_port +from parallax.utils.utils import initialize_nccl_port, load_config_only from parallax_utils.ascii_anime import display_parallax_join from parallax_utils.logging_config import get_logger, set_log_level from parallax_utils.version_check import check_latest_release @@ -119,7 +119,7 @@ def _wait_executors_check_layer_change(shared_state: SharedState, executor_subpr display_parallax_join(args.model_path) check_latest_release() - config = fetch_model_from_hf(args.model_path, local_files_only=args.use_hfcache) + config = load_config_only(args.model_path, local_files_only=args.use_hfcache) if args.start_layer is None: args.start_layer = 0 if args.end_layer is None: diff --git a/src/parallax/models/deepseek_v32.py b/src/parallax/models/deepseek_v32.py index 7f2553eb..1861c3c5 100644 --- a/src/parallax/models/deepseek_v32.py +++ b/src/parallax/models/deepseek_v32.py @@ -102,7 +102,12 @@ def __call__( scores = scores * weights scores = scores.sum(axis=1) if mask is not None: - scores = mx.where(mask, scores, -float("inf")) + if mask.ndim == 4: + mask = mask[:, 0, :, :] + if mask.dtype == mx.bool_: + scores = mx.where(mask, scores, -float("inf")) + else: + scores = scores + mask.astype(scores.dtype) return mx.argpartition(scores, kth=-self.index_topk, axis=-1)[..., -self.index_topk :] diff --git a/src/parallax/server/shard_loader.py b/src/parallax/server/shard_loader.py index 515de3e0..75d09004 100644 --- a/src/parallax/server/shard_loader.py +++ b/src/parallax/server/shard_loader.py @@ -29,6 +29,10 @@ "minimax_m2": "mlx_lm.models.minimax", } +ARCHITECTURE_CLASS_ALIASES = { + "GlmMoeDsaForCausalLM": "DeepseekV32ForCausalLM", +} + class MLXModelLoader: """ @@ -93,6 +97,10 @@ def register_block_class(self): except Exception as e: logger.warning(f"Failed to load model from {model_file}: {e}") + for alias, target in ARCHITECTURE_CLASS_ALIASES.items(): + if target in self.block_class_map: + self.block_class_map[alias] = self.block_class_map[target] + def linear_to_lora_layers( self, model: nn.Module, diff --git a/src/parallax/utils/utils.py b/src/parallax/utils/utils.py index 997a6eea..63c13a76 100644 --- a/src/parallax/utils/utils.py +++ b/src/parallax/utils/utils.py @@ -1,7 +1,9 @@ """Utility functions.""" +import json import random import socket +from pathlib import Path from typing import List import mlx.core as mx @@ -9,9 +11,6 @@ import psutil import torch import zmq -from mlx_lm.utils import _download, load_config - -from parallax.utils.selective_download import download_metadata_only def is_cuda_available(): @@ -281,15 +280,24 @@ def combine_padding_and_causal_masks( return causal_mask + padding_mask_float -def fetch_model_from_hf(name: str, local_files_only: bool = False): - """Fetch model from huggingface and returns model config""" - - if local_files_only: - model_path = download_metadata_only(name, local_files_only=local_files_only) +def load_config_only(name: str, local_files_only: bool = False): + """Load only config.json from a local path or Hugging Face repo.""" + local_path = Path(name) + if local_path.exists(): + config_file = local_path / "config.json" else: - model_path = _download(name) - config = load_config(model_path) - return config + from huggingface_hub import hf_hub_download + + config_file = Path( + hf_hub_download( + repo_id=name, + filename="config.json", + local_files_only=local_files_only, + ) + ) + + with open(config_file, "r") as f: + return json.load(f) def is_port_available(port: int): diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py index a19c645e..1eb08915 100644 --- a/src/scheduling/model_info.py +++ b/src/scheduling/model_info.py @@ -44,6 +44,7 @@ class ModelInfo: qk_nope_head_dim: Optional[int] = None qk_rope_head_dim: Optional[int] = None + v_head_dim: Optional[int] = None head_size_k: int = None head_size_v: int = None @@ -55,7 +56,8 @@ def __init__(self, **kwargs): self.head_size_k = self.qk_nope_head_dim + self.qk_rope_head_dim else: self.head_size_k = self.head_size - self.head_size_v = self.head_size + v_head_dim = getattr(self, "v_head_dim", None) + self.head_size_v = v_head_dim if v_head_dim is not None else self.head_size @property def q_dim(self) -> int: diff --git a/tests/scheduler_tests/test_model_info.py b/tests/scheduler_tests/test_model_info.py new file mode 100644 index 00000000..0e62f7cd --- /dev/null +++ b/tests/scheduler_tests/test_model_info.py @@ -0,0 +1,23 @@ +from scheduling.model_info import ModelInfo + + +def test_model_info_uses_distinct_value_head_dim(): + model_info = ModelInfo( + model_name="zai-org/GLM-5.1", + mlx_model_name="mlx-community/GLM-5.1", + head_size=64, + qk_nope_head_dim=192, + qk_rope_head_dim=64, + v_head_dim=256, + hidden_dim=6144, + intermediate_dim=12288, + num_attention_heads=64, + num_kv_heads=64, + vocab_size=154880, + num_layers=78, + cache_bytes_per_element=2, + ) + + assert model_info.head_size_k == 256 + assert model_info.head_size_v == 256 + assert model_info.per_token_per_layer_kv_size == 2 * 64 * (256 + 256) diff --git a/tests/test_shard_loader.py b/tests/test_shard_loader.py index a5bee274..7f901a9e 100644 --- a/tests/test_shard_loader.py +++ b/tests/test_shard_loader.py @@ -7,7 +7,11 @@ import pytest -from parallax.server.shard_loader import MODEL_CLASS_MAP, MLXModelLoader +from parallax.server.shard_loader import ( + ARCHITECTURE_CLASS_ALIASES, + MODEL_CLASS_MAP, + MLXModelLoader, +) @pytest.mark.skipif(sys.platform != "darwin", reason="MLX tests require macOS") @@ -27,11 +31,16 @@ def test_register_block_class_success(self): assert isinstance(loader.block_class_map, dict) # Check that expected architectures are registered - expected_architectures = ["Qwen2ForCausalLM", "Qwen3ForCausalLM"] + expected_architectures = [ + "Qwen2ForCausalLM", + "Qwen3ForCausalLM", + "GlmMoeDsaForCausalLM", + ] for architecture in expected_architectures: assert architecture in loader.block_class_map assert hasattr(loader.block_class_map[architecture], "get_architecture") - assert loader.block_class_map[architecture].get_architecture() == architecture + target_architecture = ARCHITECTURE_CLASS_ALIASES.get(architecture, architecture) + assert loader.block_class_map[architecture].get_architecture() == target_architecture def test_register_block_class_with_missing_get_architecture(self): """Test registration when EntryClass doesn't have get_architecture method.""" @@ -137,7 +146,8 @@ def test_register_block_class_multiple_models(self): # Each registered architecture should have a valid EntryClass for architecture, entry_class in loader.block_class_map.items(): assert hasattr(entry_class, "get_architecture") - assert entry_class.get_architecture() == architecture + target_architecture = ARCHITECTURE_CLASS_ALIASES.get(architecture, architecture) + assert entry_class.get_architecture() == target_architecture def test_register_block_class_initialization(self): """Test that register_block_class is called during initialization.""" diff --git a/tests/test_static_config.py b/tests/test_static_config.py new file mode 100644 index 00000000..a50ce6b8 --- /dev/null +++ b/tests/test_static_config.py @@ -0,0 +1,5 @@ +from backend.server.static_config import MODELS + + +def test_glm_5_1_uses_mlx_community_model(): + assert MODELS["zai-org/GLM-5.1"] == "mlx-community/GLM-5.1" diff --git a/tests/test_utils.py b/tests/test_utils.py index cfa53aa2..c4459a82 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ from types import SimpleNamespace +from unittest.mock import patch from parallax.utils import utils @@ -31,3 +32,27 @@ def test_get_current_device_prefers_mlx_when_both_backends_report_available(monk monkeypatch.setattr(utils, "is_metal_available", lambda: True) assert utils.get_current_device() == "mlx" + + +def test_load_config_only_reads_local_config(tmp_path): + model_path = tmp_path / "model" + model_path.mkdir() + (model_path / "config.json").write_text('{"num_hidden_layers": 2}') + + config = utils.load_config_only(str(model_path)) + assert config == {"num_hidden_layers": 2} + + +def test_load_config_only_downloads_config_json(tmp_path): + config_path = tmp_path / "config.json" + config_path.write_text('{"num_hidden_layers": 78}') + + with patch("huggingface_hub.hf_hub_download", return_value=str(config_path)) as download: + config = utils.load_config_only("mlx-community/GLM-5.1", local_files_only=True) + + assert config == {"num_hidden_layers": 78} + download.assert_called_once_with( + repo_id="mlx-community/GLM-5.1", + filename="config.json", + local_files_only=True, + ) From 3e633b673006c60fc119832b10b8d4ca3d4f05dd Mon Sep 17 00:00:00 2001 From: gufengc Date: Sat, 23 May 2026 13:34:52 +0800 Subject: [PATCH 2/6] update --- src/parallax/utils/selective_download.py | 3 --- .../parallax_extensions_tests/test_kv_cache_integration.py | 7 ++++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/parallax/utils/selective_download.py b/src/parallax/utils/selective_download.py index 9822b47c..79af4aba 100644 --- a/src/parallax/utils/selective_download.py +++ b/src/parallax/utils/selective_download.py @@ -121,9 +121,6 @@ def selective_model_download( # Check if file already exists in local cache before downloading weight_file_path = model_path / weight_file if weight_file_path.exists(): - logger.debug( - f"Weight file {weight_file} already exists locally, skipping download" - ) continue logger.debug(f"Downloading {weight_file}") diff --git a/tests/parallax_extensions_tests/test_kv_cache_integration.py b/tests/parallax_extensions_tests/test_kv_cache_integration.py index 944aadca..9bafe613 100644 --- a/tests/parallax_extensions_tests/test_kv_cache_integration.py +++ b/tests/parallax_extensions_tests/test_kv_cache_integration.py @@ -25,11 +25,12 @@ def setUp(self): dtype=self.dtype, block_size=self.block_size, cache_memory_fraction=0.5, + num_gpu_blocks=128, ) - # Ensure we have enough blocks for testing - if self.cache_manager.num_gpu_blocks < 100: - pass + # The test only needs 3 blocks; keep the cache bounded so CI does not + # allocate a large fraction of the hosted runner's Metal memory. + assert self.cache_manager.num_gpu_blocks >= 3 def test_prefill_slot_mapping(self): """ From 6548c16786a04f2ec61cea371266a04599506333 Mon Sep 17 00:00:00 2001 From: gufengc Date: Sat, 23 May 2026 14:17:57 +0800 Subject: [PATCH 3/6] update --- pyproject.toml | 3 ++ src/parallax_extensions/ops.py | 89 ++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index dcbcfb16..0bccc6b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,7 +72,10 @@ benchmark = [ dev = [ "black>=24.3", + "cmake>=3.27", + "ninja>=1.11", "ruff>=0.4", + "setuptools>=68", "pytest>=8.2", "pytest-mock>=3.14", "pytest-cov>=5.0", diff --git a/src/parallax_extensions/ops.py b/src/parallax_extensions/ops.py index e5a23902..9e2fe155 100644 --- a/src/parallax_extensions/ops.py +++ b/src/parallax_extensions/ops.py @@ -1,5 +1,9 @@ import importlib +import os +import shutil +import subprocess import sys +import sysconfig from pathlib import Path from types import ModuleType from typing import Optional @@ -34,8 +38,93 @@ def _build_import_error(original_error: Exception) -> ImportError: return ImportError(msg) +def _build_signature() -> str: + try: + from importlib.metadata import version + + mlx_version = version("mlx") + nanobind_version = version("nanobind") + except Exception: + mlx_version = "unknown" + nanobind_version = "unknown" + + return "|".join( + [ + sys.implementation.cache_tag or "", + sys.version.split()[0], + mx.__file__, + mlx_version, + nanobind_version, + ] + ) + + +def _ensure_build_tools() -> None: + missing = [] + try: + import setuptools # noqa: F401 + except Exception: + missing.append("setuptools") + + if shutil.which("cmake") is None: + missing.append("cmake") + if shutil.which("ninja") is None: + missing.append("ninja") + + if missing: + subprocess.run( + [sys.executable, "-m", "pip", "install", *missing], + check=True, + ) + + +def _rebuild_for_github_actions() -> None: + """Build native kernels against the exact Python/MLX used by GitHub macOS CI.""" + if os.environ.get("GITHUB_ACTIONS") != "true" or sys.platform != "darwin": + return + if os.environ.get("PARALLAX_SKIP_CI_EXTENSION_REBUILD") == "1": + return + + package_dir = Path(__file__).resolve().parent + lib_dir = package_dir / "lib" + ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ".so" + expected_ext = lib_dir / f"_ext{ext_suffix}" + stamp = lib_dir / f".ci-build-{sys.implementation.cache_tag or 'python'}" + signature = _build_signature() + + if expected_ext.exists() and stamp.exists() and stamp.read_text() == signature: + return + + _ensure_build_tools() + + env = os.environ.copy() + env["DEBUG"] = "0" + cmake_args = env.get("CMAKE_ARGS", "") + python_arg = f"-DPython_EXECUTABLE={sys.executable}" + env["CMAKE_ARGS"] = f"{python_arg} {cmake_args}".strip() + + log_path = Path("/tmp/parallax_ext_build.log") + with log_path.open("w") as log: + subprocess.run( + [sys.executable, "setup.py", "build_ext", "-j8", "--inplace"], + cwd=package_dir, + env=env, + stdout=log, + stderr=subprocess.STDOUT, + check=True, + ) + + stamp.write_text(signature) + print(f"Rebuilt parallax_extensions native kernels for CI: {expected_ext}") + + def load_extension_module() -> ModuleType: """Load the compiled extension module for the current Python runtime.""" + try: + _rebuild_for_github_actions() + except Exception as exc: # pragma: no cover - GitHub runner dependent + raise _build_import_error(exc) from exc + try: # Python's import machinery selects the matching ABI-tagged binary # (e.g. _ext.cpython-312-*.so) from parallax_extensions/lib. From a78aed832099a03f33ba66737c7eeffc5a3ffc4c Mon Sep 17 00:00:00 2001 From: gufengc Date: Sat, 23 May 2026 15:03:46 +0800 Subject: [PATCH 4/6] update --- src/parallax/models/deepseek_v32.py | 9 +++-- tests/test_deepseek_v32.py | 52 +++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 5 deletions(-) create mode 100644 tests/test_deepseek_v32.py diff --git a/src/parallax/models/deepseek_v32.py b/src/parallax/models/deepseek_v32.py index 1861c3c5..f643e446 100644 --- a/src/parallax/models/deepseek_v32.py +++ b/src/parallax/models/deepseek_v32.py @@ -142,11 +142,10 @@ def __call__( compressed_kv = self.kv_a_proj_with_mqa(x) compressed_kv, k_pe = mx.split(compressed_kv, [self.kv_lora_rank], axis=-1) k_pe = k_pe.reshape(batch, target_len, 1, self.qk_rope_head_dim).transpose(0, 2, 1, 3) - kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv)) - kv = kv.reshape(batch, target_len, self.num_heads, -1) - - k_nope, values = mx.split(kv, [self.qk_nope_head_dim], axis=-1) - k_nope = k_nope.transpose(0, 2, 1, 3) + kv_latent = self.kv_a_layernorm(compressed_kv) + kv_latent = kv_latent[:, None, :, :] + k_nope = self.embed_q(kv_latent, transpose=False) + values = self.unembed_out(kv_latent).transpose(0, 2, 1, 3) key_cache_global, value_cache_global = cache.get_cache() indexer_cache = cache.get_indexer_cache() diff --git a/tests/test_deepseek_v32.py b/tests/test_deepseek_v32.py new file mode 100644 index 00000000..061c0d01 --- /dev/null +++ b/tests/test_deepseek_v32.py @@ -0,0 +1,52 @@ +import sys + +import mlx.core as mx +import pytest + +from parallax.models.deepseek_v32 import ModelArgs, ParallaxDeepSeekV32Attention +from parallax.server.cache.dsa_cache import DeepSeekSparseCache + +pytestmark = pytest.mark.skipif(sys.platform != "darwin", reason="MLX tests require macOS") + + +def _tiny_args(): + return ModelArgs( + hidden_size=16, + num_attention_heads=2, + num_key_value_heads=2, + q_lora_rank=8, + kv_lora_rank=4, + qk_nope_head_dim=2, + qk_rope_head_dim=2, + v_head_dim=4, + index_head_dim=4, + index_n_heads=2, + index_topk=4, + num_hidden_layers=1, + max_position_embeddings=16, + ) + + +def test_attention_decode_forward_uses_glm_style_kv_cache(): + args = _tiny_args() + attention = ParallaxDeepSeekV32Attention(args) + cache = DeepSeekSparseCache( + num_blocks=1, + block_size=8, + num_kv_heads=args.num_key_value_heads, + head_dim=args.qk_nope_head_dim + args.qk_rope_head_dim, + head_dim_v=args.v_head_dim, + dtype=mx.float32, + index_head_dim=args.index_head_dim, + index_n_heads=args.index_n_heads, + ) + + output = attention( + mx.zeros((1, 1, args.hidden_size), dtype=mx.float32), + cache=cache, + block_tables=mx.array([[0]], dtype=mx.int32), + context_lengths=mx.array([1], dtype=mx.int32), + ) + mx.eval(output) + + assert output.shape == (1, 1, args.hidden_size) From 5305dff819593276abaf849edcccce72e5bf67fa Mon Sep 17 00:00:00 2001 From: gufengc Date: Sat, 23 May 2026 15:05:44 +0800 Subject: [PATCH 5/6] update --- src/backend/server/static_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index c4263dd9..e7af2945 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -33,6 +33,7 @@ "zai-org/GLM-4.7-Flash": "mlx-community/GLM-4.7-Flash-4bit", "zai-org/GLM-4.7-Flash-FP8": "mlx-community/GLM-4.7-Flash-8bit", "zai-org/GLM-5.1": "mlx-community/GLM-5.1", + "zai-org/GLM-5.1-FP8": "mlx-community/GLM-5.1", # Minimax M2 Models "MiniMaxAI/MiniMax-M2.7": "mlx-community/MiniMax-M2.7-4bit", "MiniMaxAI/MiniMax-M2.1": "mlx-community/MiniMax-M2.1-4bit", From 657095d147a2f713717e3f72bf55c3b1c6ddf280 Mon Sep 17 00:00:00 2001 From: gufengc Date: Sat, 23 May 2026 15:09:14 +0800 Subject: [PATCH 6/6] update --- .../parallax_extensions_tests/test_kv_cache_integration.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tests/parallax_extensions_tests/test_kv_cache_integration.py b/tests/parallax_extensions_tests/test_kv_cache_integration.py index 9bafe613..944aadca 100644 --- a/tests/parallax_extensions_tests/test_kv_cache_integration.py +++ b/tests/parallax_extensions_tests/test_kv_cache_integration.py @@ -25,12 +25,11 @@ def setUp(self): dtype=self.dtype, block_size=self.block_size, cache_memory_fraction=0.5, - num_gpu_blocks=128, ) - # The test only needs 3 blocks; keep the cache bounded so CI does not - # allocate a large fraction of the hosted runner's Metal memory. - assert self.cache_manager.num_gpu_blocks >= 3 + # Ensure we have enough blocks for testing + if self.cache_manager.num_gpu_blocks < 100: + pass def test_prefill_slot_mapping(self): """