From 9cf2badde51c3f053475b01f26bc67c884a8d1e5 Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 23 May 2026 12:38:24 +0800
Subject: [PATCH 1/6] feat(model): add glm 5.1

---
 .gitignore                               |  1 +
 src/backend/server/static_config.py      | 25 ++++----------------
 src/parallax/launch.py                   |  4 ++--
 src/parallax/models/deepseek_v32.py      |  7 +++++-
 src/parallax/server/shard_loader.py      |  8 +++++++
 src/parallax/utils/utils.py              | 30 +++++++++++++++---------
 src/scheduling/model_info.py             |  4 +++-
 tests/scheduler_tests/test_model_info.py | 23 ++++++++++++++++++
 tests/test_shard_loader.py               | 18 ++++++++++----
 tests/test_static_config.py              |  5 ++++
 tests/test_utils.py                      | 25 ++++++++++++++++++++
 11 files changed, 111 insertions(+), 39 deletions(-)
 create mode 100644 tests/scheduler_tests/test_model_info.py
 create mode 100644 tests/test_static_config.py

diff --git a/.gitignore b/.gitignore
index 0fb4e30b..75904bf2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ build/
 .cache
 .vscode/
 hosts.json
+uv.lock
diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py
index 48a7cb79..c4263dd9 100644
--- a/src/backend/server/static_config.py
+++ b/src/backend/server/static_config.py
@@ -1,8 +1,7 @@
 import concurrent.futures
-import json
 import math
-from pathlib import Path
 
+from parallax.utils.utils import load_config_only
 from parallax_utils.logging_config import get_logger
 from scheduling.model_info import ModelInfo
 
@@ -33,6 +32,7 @@
     "zai-org/GLM-4.7": "mlx-community/GLM-4.7-4bit",
     "zai-org/GLM-4.7-Flash": "mlx-community/GLM-4.7-Flash-4bit",
     "zai-org/GLM-4.7-Flash-FP8": "mlx-community/GLM-4.7-Flash-8bit",
+    "zai-org/GLM-5.1": "mlx-community/GLM-5.1",
     # Minimax M2 Models
     "MiniMaxAI/MiniMax-M2.7": "mlx-community/MiniMax-M2.7-4bit",
     "MiniMaxAI/MiniMax-M2.1": "mlx-community/MiniMax-M2.1-4bit",
@@ -100,23 +100,7 @@
 
 
 def get_model_info(model_name, use_hfcache: bool = False):
-    def _load_config_only(name: str) -> dict:
-        local_path = Path(name)
-        if local_path.exists():
-            config_path = local_path / "config.json"
-            with open(config_path, "r") as f:
-                return json.load(f)
-
-        # Hugging Face only – download just config.json
-        from huggingface_hub import hf_hub_download  # type: ignore
-
-        config_file = hf_hub_download(
-            repo_id=name, filename="config.json", local_files_only=use_hfcache
-        )
-        with open(config_file, "r") as f:
-            return json.load(f)
-
-    config = _load_config_only(model_name)
+    config = load_config_only(model_name, local_files_only=use_hfcache)
 
     quant_method = config.get("quant_method", None)
     quantization_config = config.get("quantization_config", None)
@@ -139,7 +123,7 @@ def _load_config_only(name: str) -> dict:
     mlx_model_name = MODELS.get(model_name, model_name)
 
     if mlx_model_name != model_name:
-        mlx_config = _load_config_only(mlx_model_name)
+        mlx_config = load_config_only(mlx_model_name, local_files_only=use_hfcache)
         mlx_quant_dict = mlx_config.get("quantization_config", None)
         if mlx_quant_dict and "bits" in mlx_quant_dict:
             mlx_param_bytes_per_element = mlx_quant_dict["bits"] / 8
@@ -161,6 +145,7 @@ def _load_config_only(name: str) -> dict:
         head_size=config.get("head_dim", 128),
         qk_nope_head_dim=config.get("qk_nope_head_dim", None),
         qk_rope_head_dim=config.get("qk_rope_head_dim", None),
+        v_head_dim=config.get("v_head_dim", None),
         hidden_dim=config.get("hidden_size", 0),
         intermediate_dim=config.get("intermediate_size", 0),
         num_attention_heads=config.get("num_attention_heads", 0),
diff --git a/src/parallax/launch.py b/src/parallax/launch.py
index b20f6250..41f50fe4 100644
--- a/src/parallax/launch.py
+++ b/src/parallax/launch.py
@@ -27,7 +27,7 @@
 from parallax.server.http_server import launch_http_server, stop_http_server
 from parallax.server.server_args import parse_args
 from parallax.utils.shared_state import SharedState
-from parallax.utils.utils import fetch_model_from_hf, initialize_nccl_port
+from parallax.utils.utils import initialize_nccl_port, load_config_only
 from parallax_utils.ascii_anime import display_parallax_join
 from parallax_utils.logging_config import get_logger, set_log_level
 from parallax_utils.version_check import check_latest_release
@@ -119,7 +119,7 @@ def _wait_executors_check_layer_change(shared_state: SharedState, executor_subpr
                 display_parallax_join(args.model_path)
             check_latest_release()
 
-            config = fetch_model_from_hf(args.model_path, local_files_only=args.use_hfcache)
+            config = load_config_only(args.model_path, local_files_only=args.use_hfcache)
             if args.start_layer is None:
                 args.start_layer = 0
             if args.end_layer is None:
diff --git a/src/parallax/models/deepseek_v32.py b/src/parallax/models/deepseek_v32.py
index 7f2553eb..1861c3c5 100644
--- a/src/parallax/models/deepseek_v32.py
+++ b/src/parallax/models/deepseek_v32.py
@@ -102,7 +102,12 @@ def __call__(
             scores = scores * weights
             scores = scores.sum(axis=1)
             if mask is not None:
-                scores = mx.where(mask, scores, -float("inf"))
+                if mask.ndim == 4:
+                    mask = mask[:, 0, :, :]
+                if mask.dtype == mx.bool_:
+                    scores = mx.where(mask, scores, -float("inf"))
+                else:
+                    scores = scores + mask.astype(scores.dtype)
             return mx.argpartition(scores, kth=-self.index_topk, axis=-1)[..., -self.index_topk :]
 
 
diff --git a/src/parallax/server/shard_loader.py b/src/parallax/server/shard_loader.py
index 515de3e0..75d09004 100644
--- a/src/parallax/server/shard_loader.py
+++ b/src/parallax/server/shard_loader.py
@@ -29,6 +29,10 @@
     "minimax_m2": "mlx_lm.models.minimax",
 }
 
+ARCHITECTURE_CLASS_ALIASES = {
+    "GlmMoeDsaForCausalLM": "DeepseekV32ForCausalLM",
+}
+
 
 class MLXModelLoader:
     """
@@ -93,6 +97,10 @@ def register_block_class(self):
             except Exception as e:
                 logger.warning(f"Failed to load model from {model_file}: {e}")
 
+        for alias, target in ARCHITECTURE_CLASS_ALIASES.items():
+            if target in self.block_class_map:
+                self.block_class_map[alias] = self.block_class_map[target]
+
     def linear_to_lora_layers(
         self,
         model: nn.Module,
diff --git a/src/parallax/utils/utils.py b/src/parallax/utils/utils.py
index 997a6eea..63c13a76 100644
--- a/src/parallax/utils/utils.py
+++ b/src/parallax/utils/utils.py
@@ -1,7 +1,9 @@
 """Utility functions."""
 
+import json
 import random
 import socket
+from pathlib import Path
 from typing import List
 
 import mlx.core as mx
@@ -9,9 +11,6 @@
 import psutil
 import torch
 import zmq
-from mlx_lm.utils import _download, load_config
-
-from parallax.utils.selective_download import download_metadata_only
 
 
 def is_cuda_available():
@@ -281,15 +280,24 @@ def combine_padding_and_causal_masks(
     return causal_mask + padding_mask_float
 
 
-def fetch_model_from_hf(name: str, local_files_only: bool = False):
-    """Fetch model from huggingface and returns model config"""
-
-    if local_files_only:
-        model_path = download_metadata_only(name, local_files_only=local_files_only)
+def load_config_only(name: str, local_files_only: bool = False) -> dict:
+    """Return the parsed config.json of a local model directory or a Hugging Face repo."""
+    local_path = Path(name)
+    if local_path.exists():  # an existing filesystem path wins over a repo id
+        config_file = local_path / "config.json"
     else:
-        model_path = _download(name)
-    config = load_config(model_path)
-    return config
+        from huggingface_hub import hf_hub_download
+
+        config_file = Path(
+            hf_hub_download(
+                repo_id=name,
+                filename="config.json",
+                local_files_only=local_files_only,  # True: use the local HF cache only
+            )
+        )
+
+    with open(config_file, "r", encoding="utf-8") as f:  # JSON is UTF-8 by spec
+        return json.load(f)
 
 
 def is_port_available(port: int):
diff --git a/src/scheduling/model_info.py b/src/scheduling/model_info.py
index a19c645e..1eb08915 100644
--- a/src/scheduling/model_info.py
+++ b/src/scheduling/model_info.py
@@ -44,6 +44,7 @@ class ModelInfo:
 
     qk_nope_head_dim: Optional[int] = None
     qk_rope_head_dim: Optional[int] = None
+    v_head_dim: Optional[int] = None
     head_size_k: int = None
     head_size_v: int = None
 
@@ -55,7 +56,8 @@ def __init__(self, **kwargs):
             self.head_size_k = self.qk_nope_head_dim + self.qk_rope_head_dim
         else:
             self.head_size_k = self.head_size
-        self.head_size_v = self.head_size
+        v_head_dim = getattr(self, "v_head_dim", None)
+        self.head_size_v = v_head_dim if v_head_dim is not None else self.head_size
 
     @property
     def q_dim(self) -> int:
diff --git a/tests/scheduler_tests/test_model_info.py b/tests/scheduler_tests/test_model_info.py
new file mode 100644
index 00000000..0e62f7cd
--- /dev/null
+++ b/tests/scheduler_tests/test_model_info.py
@@ -0,0 +1,23 @@
+from scheduling.model_info import ModelInfo
+
+
+def test_model_info_uses_distinct_value_head_dim():  # v_head_dim must drive head_size_v
+    model_info = ModelInfo(
+        model_name="zai-org/GLM-5.1",
+        mlx_model_name="mlx-community/GLM-5.1",
+        head_size=64,
+        qk_nope_head_dim=192,
+        qk_rope_head_dim=64,
+        v_head_dim=256,  # deliberately different from head_size to exercise the override
+        hidden_dim=6144,
+        intermediate_dim=12288,
+        num_attention_heads=64,
+        num_kv_heads=64,
+        vocab_size=154880,
+        num_layers=78,
+        cache_bytes_per_element=2,
+    )
+
+    assert model_info.head_size_k == 256  # qk_nope_head_dim + qk_rope_head_dim (192 + 64)
+    assert model_info.head_size_v == 256  # taken from v_head_dim, not head_size (64)
+    assert model_info.per_token_per_layer_kv_size == 2 * 64 * (256 + 256)
diff --git a/tests/test_shard_loader.py b/tests/test_shard_loader.py
index a5bee274..7f901a9e 100644
--- a/tests/test_shard_loader.py
+++ b/tests/test_shard_loader.py
@@ -7,7 +7,11 @@
 
 import pytest
 
-from parallax.server.shard_loader import MODEL_CLASS_MAP, MLXModelLoader
+from parallax.server.shard_loader import (
+    ARCHITECTURE_CLASS_ALIASES,
+    MODEL_CLASS_MAP,
+    MLXModelLoader,
+)
 
 
 @pytest.mark.skipif(sys.platform != "darwin", reason="MLX tests require macOS")
@@ -27,11 +31,16 @@ def test_register_block_class_success(self):
         assert isinstance(loader.block_class_map, dict)
 
         # Check that expected architectures are registered
-        expected_architectures = ["Qwen2ForCausalLM", "Qwen3ForCausalLM"]
+        expected_architectures = [
+            "Qwen2ForCausalLM",
+            "Qwen3ForCausalLM",
+            "GlmMoeDsaForCausalLM",
+        ]
         for architecture in expected_architectures:
             assert architecture in loader.block_class_map
             assert hasattr(loader.block_class_map[architecture], "get_architecture")
-            assert loader.block_class_map[architecture].get_architecture() == architecture
+            target_architecture = ARCHITECTURE_CLASS_ALIASES.get(architecture, architecture)
+            assert loader.block_class_map[architecture].get_architecture() == target_architecture
 
     def test_register_block_class_with_missing_get_architecture(self):
         """Test registration when EntryClass doesn't have get_architecture method."""
@@ -137,7 +146,8 @@ def test_register_block_class_multiple_models(self):
         # Each registered architecture should have a valid EntryClass
         for architecture, entry_class in loader.block_class_map.items():
             assert hasattr(entry_class, "get_architecture")
-            assert entry_class.get_architecture() == architecture
+            target_architecture = ARCHITECTURE_CLASS_ALIASES.get(architecture, architecture)
+            assert entry_class.get_architecture() == target_architecture
 
     def test_register_block_class_initialization(self):
         """Test that register_block_class is called during initialization."""
diff --git a/tests/test_static_config.py b/tests/test_static_config.py
new file mode 100644
index 00000000..a50ce6b8
--- /dev/null
+++ b/tests/test_static_config.py
@@ -0,0 +1,5 @@
+from backend.server.static_config import MODELS
+
+
+def test_glm_5_1_uses_mlx_community_model():  # pins the GLM-5.1 entry of MODELS
+    assert MODELS["zai-org/GLM-5.1"] == "mlx-community/GLM-5.1"
diff --git a/tests/test_utils.py b/tests/test_utils.py
index cfa53aa2..c4459a82 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,4 +1,5 @@
 from types import SimpleNamespace
+from unittest.mock import patch
 
 from parallax.utils import utils
 
@@ -31,3 +32,27 @@ def test_get_current_device_prefers_mlx_when_both_backends_report_available(monk
     monkeypatch.setattr(utils, "is_metal_available", lambda: True)
 
     assert utils.get_current_device() == "mlx"
+
+
+def test_load_config_only_reads_local_config(tmp_path):
+    model_path = tmp_path / "model"
+    model_path.mkdir()
+    (model_path / "config.json").write_text('{"num_hidden_layers": 2}')
+
+    config = utils.load_config_only(str(model_path))
+    assert config == {"num_hidden_layers": 2}
+
+
+def test_load_config_only_downloads_config_json(tmp_path):
+    config_path = tmp_path / "config.json"
+    config_path.write_text('{"num_hidden_layers": 78}')
+
+    with patch("huggingface_hub.hf_hub_download", return_value=str(config_path)) as download:
+        config = utils.load_config_only("mlx-community/GLM-5.1", local_files_only=True)
+
+    assert config == {"num_hidden_layers": 78}
+    download.assert_called_once_with(
+        repo_id="mlx-community/GLM-5.1",
+        filename="config.json",
+        local_files_only=True,
+    )

From 3e633b673006c60fc119832b10b8d4ca3d4f05dd Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 23 May 2026 13:34:52 +0800
Subject: [PATCH 2/6] chore: drop cached-weight debug log, cap KV cache test at 128 blocks

---
 src/parallax/utils/selective_download.py                   | 3 ---
 .../parallax_extensions_tests/test_kv_cache_integration.py | 7 ++++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/parallax/utils/selective_download.py b/src/parallax/utils/selective_download.py
index 9822b47c..79af4aba 100644
--- a/src/parallax/utils/selective_download.py
+++ b/src/parallax/utils/selective_download.py
@@ -121,9 +121,6 @@ def selective_model_download(
                     # Check if file already exists in local cache before downloading
                     weight_file_path = model_path / weight_file
                     if weight_file_path.exists():
-                        logger.debug(
-                            f"Weight file {weight_file} already exists locally, skipping download"
-                        )
                         continue
 
                     logger.debug(f"Downloading {weight_file}")
diff --git a/tests/parallax_extensions_tests/test_kv_cache_integration.py b/tests/parallax_extensions_tests/test_kv_cache_integration.py
index 944aadca..9bafe613 100644
--- a/tests/parallax_extensions_tests/test_kv_cache_integration.py
+++ b/tests/parallax_extensions_tests/test_kv_cache_integration.py
@@ -25,11 +25,12 @@ def setUp(self):
             dtype=self.dtype,
             block_size=self.block_size,
             cache_memory_fraction=0.5,
+            num_gpu_blocks=128,
         )
 
-        # Ensure we have enough blocks for testing
-        if self.cache_manager.num_gpu_blocks < 100:
-            pass
+        # The test only needs 3 blocks; keep the cache bounded so CI does not
+        # allocate a large fraction of the hosted runner's Metal memory.
+        assert self.cache_manager.num_gpu_blocks >= 3
 
     def test_prefill_slot_mapping(self):
         """

From 6548c16786a04f2ec61cea371266a04599506333 Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 23 May 2026 14:17:57 +0800
Subject: [PATCH 3/6] build: rebuild native extension for the running Python/MLX on macOS CI

---
 pyproject.toml                 |  3 ++
 src/parallax_extensions/ops.py | 89 ++++++++++++++++++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index dcbcfb16..0bccc6b5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,7 +72,10 @@ benchmark = [
 
 dev = [
   "black>=24.3",
+  "cmake>=3.27",
+  "ninja>=1.11",
   "ruff>=0.4",
+  "setuptools>=68",
   "pytest>=8.2",
   "pytest-mock>=3.14",
   "pytest-cov>=5.0",
diff --git a/src/parallax_extensions/ops.py b/src/parallax_extensions/ops.py
index e5a23902..9e2fe155 100644
--- a/src/parallax_extensions/ops.py
+++ b/src/parallax_extensions/ops.py
@@ -1,5 +1,9 @@
 import importlib
+import os
+import shutil
+import subprocess
 import sys
+import sysconfig
 from pathlib import Path
 from types import ModuleType
 from typing import Optional
@@ -34,8 +38,93 @@ def _build_import_error(original_error: Exception) -> ImportError:
     return ImportError(msg)
 
 
+def _build_signature() -> str:
+    """Return a "|"-joined fingerprint of the running Python, MLX and nanobind builds."""
+    from importlib.metadata import version
+    versions = {}
+    for dist in ("mlx", "nanobind"):
+        try:  # looked up per package so one missing dist cannot mask the other
+            versions[dist] = version(dist)
+        except Exception:  # best-effort: a failed lookup degrades to "unknown"
+            versions[dist] = "unknown"
+
+    return "|".join(
+        [
+            sys.implementation.cache_tag or "",
+            sys.version.split()[0],
+            mx.__file__,
+            versions["mlx"],
+            versions["nanobind"],
+        ]
+    )
+
+
+def _ensure_build_tools() -> None:
+    """Pip-install whichever of setuptools, cmake and ninja is not yet available."""
+    try:
+        import setuptools  # noqa: F401
+    except Exception:
+        absent = ["setuptools"]
+    else:
+        absent = []
+
+    # cmake and ninja are command-line tools, so they are probed on PATH.
+    absent += [tool for tool in ("cmake", "ninja") if shutil.which(tool) is None]
+    if not absent:
+        return
+    subprocess.run(
+        [sys.executable, "-m", "pip", "install", *absent],
+        check=True,
+    )
+
+
+def _rebuild_for_github_actions() -> None:
+    """Build native kernels against the exact Python/MLX used by GitHub macOS CI."""
+    if os.environ.get("GITHUB_ACTIONS") != "true" or sys.platform != "darwin":
+        return  # no-op unless GITHUB_ACTIONS is "true" and the platform is macOS
+    if os.environ.get("PARALLAX_SKIP_CI_EXTENSION_REBUILD") == "1":
+        return  # explicit opt-out via environment variable
+
+    package_dir = Path(__file__).resolve().parent
+    lib_dir = package_dir / "lib"
+    ext_suffix = sysconfig.get_config_var("EXT_SUFFIX") or ".so"
+    expected_ext = lib_dir / f"_ext{ext_suffix}"
+    stamp = lib_dir / f".ci-build-{sys.implementation.cache_tag or 'python'}"
+    signature = _build_signature()
+
+    if expected_ext.exists() and stamp.exists() and stamp.read_text() == signature:
+        return  # binary exists and its stamp matches this runtime: nothing to rebuild
+
+    _ensure_build_tools()
+
+    env = os.environ.copy()
+    env["DEBUG"] = "0"
+    cmake_args = env.get("CMAKE_ARGS", "")
+    python_arg = f"-DPython_EXECUTABLE={sys.executable}"
+    env["CMAKE_ARGS"] = f"{python_arg} {cmake_args}".strip()  # keep caller-supplied args
+
+    log_path = Path("/tmp/parallax_ext_build.log")
+    with log_path.open("w") as log:  # build output goes to the log file, not the console
+        subprocess.run(
+            [sys.executable, "setup.py", "build_ext", "-j8", "--inplace"],
+            cwd=package_dir,
+            env=env,
+            stdout=log,
+            stderr=subprocess.STDOUT,
+            check=True,  # a failed build raises here, before the stamp is written
+        )
+
+    stamp.write_text(signature)
+    print(f"Rebuilt parallax_extensions native kernels for CI: {expected_ext}")
+
+
 def load_extension_module() -> ModuleType:
     """Load the compiled extension module for the current Python runtime."""
+    try:
+        _rebuild_for_github_actions()
+    except Exception as exc:  # pragma: no cover - GitHub runner dependent
+        raise _build_import_error(exc) from exc
+
     try:
         # Python's import machinery selects the matching ABI-tagged binary
         # (e.g. _ext.cpython-312-*.so) from parallax_extensions/lib.

From a78aed832099a03f33ba66737c7eeffc5a3ffc4c Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 23 May 2026 15:03:46 +0800
Subject: [PATCH 4/6] feat(model): compute DSA k_nope/values from the KV latent, add decode test

---
 src/parallax/models/deepseek_v32.py |  9 +++--
 tests/test_deepseek_v32.py          | 52 +++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_deepseek_v32.py

diff --git a/src/parallax/models/deepseek_v32.py b/src/parallax/models/deepseek_v32.py
index 1861c3c5..f643e446 100644
--- a/src/parallax/models/deepseek_v32.py
+++ b/src/parallax/models/deepseek_v32.py
@@ -142,11 +142,10 @@ def __call__(
         compressed_kv = self.kv_a_proj_with_mqa(x)
         compressed_kv, k_pe = mx.split(compressed_kv, [self.kv_lora_rank], axis=-1)
         k_pe = k_pe.reshape(batch, target_len, 1, self.qk_rope_head_dim).transpose(0, 2, 1, 3)
-        kv = self.kv_b_proj(self.kv_a_layernorm(compressed_kv))
-        kv = kv.reshape(batch, target_len, self.num_heads, -1)
-
-        k_nope, values = mx.split(kv, [self.qk_nope_head_dim], axis=-1)
-        k_nope = k_nope.transpose(0, 2, 1, 3)
+        kv_latent = self.kv_a_layernorm(compressed_kv)
+        kv_latent = kv_latent[:, None, :, :]
+        k_nope = self.embed_q(kv_latent, transpose=False)
+        values = self.unembed_out(kv_latent).transpose(0, 2, 1, 3)
         key_cache_global, value_cache_global = cache.get_cache()
         indexer_cache = cache.get_indexer_cache()
 
diff --git a/tests/test_deepseek_v32.py b/tests/test_deepseek_v32.py
new file mode 100644
index 00000000..061c0d01
--- /dev/null
+++ b/tests/test_deepseek_v32.py
@@ -0,0 +1,52 @@
+import sys
+
+import mlx.core as mx
+import pytest
+
+from parallax.models.deepseek_v32 import ModelArgs, ParallaxDeepSeekV32Attention
+from parallax.server.cache.dsa_cache import DeepSeekSparseCache
+
+pytestmark = pytest.mark.skipif(sys.platform != "darwin", reason="MLX tests require macOS")
+
+
+def _tiny_args():  # minimal ModelArgs: one layer, two heads, single-digit head dims
+    return ModelArgs(
+        hidden_size=16,
+        num_attention_heads=2,
+        num_key_value_heads=2,
+        q_lora_rank=8,
+        kv_lora_rank=4,
+        qk_nope_head_dim=2,
+        qk_rope_head_dim=2,
+        v_head_dim=4,  # differs from the 2-wide nope/rope parts of the key
+        index_head_dim=4,
+        index_n_heads=2,
+        index_topk=4,
+        num_hidden_layers=1,
+        max_position_embeddings=16,
+    )
+
+
+def test_attention_decode_forward_uses_glm_style_kv_cache():
+    args = _tiny_args()
+    attention = ParallaxDeepSeekV32Attention(args)
+    cache = DeepSeekSparseCache(
+        num_blocks=1,
+        block_size=8,
+        num_kv_heads=args.num_key_value_heads,
+        head_dim=args.qk_nope_head_dim + args.qk_rope_head_dim,  # key width: nope + rope parts
+        head_dim_v=args.v_head_dim,
+        dtype=mx.float32,
+        index_head_dim=args.index_head_dim,
+        index_n_heads=args.index_n_heads,
+    )
+
+    output = attention(
+        mx.zeros((1, 1, args.hidden_size), dtype=mx.float32),  # (batch, seq, hidden)? confirm
+        cache=cache,
+        block_tables=mx.array([[0]], dtype=mx.int32),
+        context_lengths=mx.array([1], dtype=mx.int32),
+    )
+    mx.eval(output)  # MLX is lazy: force the forward pass to actually execute
+
+    assert output.shape == (1, 1, args.hidden_size)  # same shape as the input tensor

From 5305dff819593276abaf849edcccce72e5bf67fa Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 23 May 2026 15:05:44 +0800
Subject: [PATCH 5/6] feat(model): map zai-org/GLM-5.1-FP8 to mlx-community/GLM-5.1

---
 src/backend/server/static_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py
index c4263dd9..e7af2945 100644
--- a/src/backend/server/static_config.py
+++ b/src/backend/server/static_config.py
@@ -33,6 +33,7 @@
     "zai-org/GLM-4.7-Flash": "mlx-community/GLM-4.7-Flash-4bit",
     "zai-org/GLM-4.7-Flash-FP8": "mlx-community/GLM-4.7-Flash-8bit",
     "zai-org/GLM-5.1": "mlx-community/GLM-5.1",
+    "zai-org/GLM-5.1-FP8": "mlx-community/GLM-5.1",
     # Minimax M2 Models
     "MiniMaxAI/MiniMax-M2.7": "mlx-community/MiniMax-M2.7-4bit",
     "MiniMaxAI/MiniMax-M2.1": "mlx-community/MiniMax-M2.1-4bit",

From 657095d147a2f713717e3f72bf55c3b1c6ddf280 Mon Sep 17 00:00:00 2001
From: gufengc <gufeng@gradient.network>
Date: Sat, 23 May 2026 15:09:14 +0800
Subject: [PATCH 6/6] test: revert KV cache integration setUp to its pre-PATCH-2 form

---
 .../parallax_extensions_tests/test_kv_cache_integration.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tests/parallax_extensions_tests/test_kv_cache_integration.py b/tests/parallax_extensions_tests/test_kv_cache_integration.py
index 9bafe613..944aadca 100644
--- a/tests/parallax_extensions_tests/test_kv_cache_integration.py
+++ b/tests/parallax_extensions_tests/test_kv_cache_integration.py
@@ -25,12 +25,11 @@ def setUp(self):
             dtype=self.dtype,
             block_size=self.block_size,
             cache_memory_fraction=0.5,
-            num_gpu_blocks=128,
         )
 
-        # The test only needs 3 blocks; keep the cache bounded so CI does not
-        # allocate a large fraction of the hosted runner's Metal memory.
-        assert self.cache_manager.num_gpu_blocks >= 3
+        # Ensure we have enough blocks for testing
+        if self.cache_manager.num_gpu_blocks < 100:
+            pass
 
     def test_prefill_slot_mapping(self):
         """