
Commit 452e8fd

yury-tokpanov, Quentin-Anthony, tlrmchlsmth, and DarkLight1337 authored
[MODEL] Add support for Zamba2 models (vllm-project#13185)
Signed-off-by: Yury Tokpanov <[email protected]>
Signed-off-by: Quentin Anthony <[email protected]>
Co-authored-by: Quentin Anthony <[email protected]>
Co-authored-by: Tyler Michael Smith <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
1 parent 8b793f7 commit 452e8fd

File tree: 9 files changed, +1082 / −27 lines


docs/source/models/supported_models.md (+5)

@@ -477,6 +477,11 @@ See [this page](#generative-models) for more information on how to use generative models.
   * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
   * ✅︎
   * ✅︎
+- * `Zamba2ForCausalLM`
+  * Zamba2
+  * `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc.
+  *
+  *
 :::

 :::{note}
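For readers trying out the newly documented models, here is a minimal offline-inference sketch using vLLM's `LLM` API. It assumes a vLLM build that includes this commit and a `transformers` version satisfying the 4.49 pin added in tests/models/registry.py below; the prompt and sampling settings are illustrative only and not part of the change.

```python
# Minimal sketch, not part of this commit: load one of the Zamba2 checkpoints
# listed above and run greedy generation offline.
from vllm import LLM, SamplingParams

llm = LLM(model="Zyphra/Zamba2-1.2B-instruct")  # any listed Zamba2 checkpoint
params = SamplingParams(temperature=0.0, max_tokens=64)

outputs = llm.generate(
    ["Briefly explain what a hybrid Mamba/attention model is."], params)
print(outputs[0].outputs[0].text)
```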

tests/models/decoder_only/language/test_hybrid.py (+29 −22)

@@ -9,7 +9,7 @@
 from ...utils import check_outputs_equal

 # This test is for the hybrid models
-MODELS = ["ai21labs/Jamba-tiny-dev"]
+MODELS = ["ai21labs/Jamba-tiny-dev", "Zyphra/Zamba2-1.2B-instruct"]
 # Bamba at Fp32 is too big for the CI (L4 GPU).
 # MODELS = ["ai21labs/Jamba-tiny-dev", "ibm-ai-platform/Bamba-9B"]

@@ -27,17 +27,19 @@ def test_models(
 ) -> None:

     # numeric error produces different generation
-    if 'Bamba' in model:
+    if "Bamba" in model:
         example_prompts.pop(3)

-    with hf_runner(
-            model,
-            dtype=dtype,
-            model_kwargs={
-                "use_mamba_kernels":
-                False,  # mamba kernels are not installed so HF
-                # don't use them
-            }) as hf_model:
+    model_kwargs = {
+        "use_mamba_kernels": False,  # mamba kernels are not installed so HF
+        # don't use them
+    }
+    if "Zamba2" in model:
+        # Zamba2 HF implementation automatically checks if mamba kernels are
+        # installed
+        model_kwargs = {}
+
+    with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

     with vllm_runner(model, dtype=dtype) as vllm_model:

@@ -112,26 +114,31 @@ def test_mamba_prefill_chunking_with_parallel_sampling(
 def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
                                 model: str, dtype: str,
                                 max_tokens: int) -> None:
-    # numeric error during prefill chucking produces different generation
+    # numeric error during prefill chunking produces different generation
     # compared to w/o prefill chunking for those examples, removed them for now
-    if 'Jamba' in model:
+    if "Jamba" in model:
         example_prompts.pop(7)
         example_prompts.pop(2)
         example_prompts.pop(1)
-    elif 'Bamba' in model:
+    elif "Bamba" in model:
         example_prompts.pop(6)
         example_prompts.pop(3)
         example_prompts.pop(2)
         dtype = "half"  # use a different dtype for Bamba
-
-    with hf_runner(
-            model,
-            dtype=dtype,
-            model_kwargs={
-                "use_mamba_kernels":
-                False,  # mamba kernels are not installed so HF
-                # don't use them
-            }) as hf_model:
+    elif "Zamba2" in model:
+        example_prompts.pop(7)
+        dtype = "half"
+
+    model_kwargs = {
+        "use_mamba_kernels": False,  # mamba kernels are not installed so HF
+        # don't use them
+    }
+    if "Zamba2" in model:
+        # Zamba2 HF implementation automatically checks if mamba kernels are
+        # installed
+        model_kwargs = {}
+
+    with hf_runner(model, dtype=dtype, model_kwargs=model_kwargs) as hf_model:
         non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)

     with vllm_runner(model,
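The change above centralizes the HF-side kwargs: Jamba and Bamba still pass `use_mamba_kernels=False` because the optional mamba kernels are not installed in CI, while the Zamba2 HF implementation detects missing kernels on its own. A standalone sketch of that dispatch (the helper name is ours, not part of the test):

```python
# Hypothetical helper mirroring the model_kwargs selection in the diff above.
def hf_model_kwargs(model: str) -> dict:
    if "Zamba2" in model:
        # Zamba2's HF implementation checks for mamba kernels itself.
        return {}
    # Other hybrid models must explicitly opt out of the (uninstalled) kernels.
    return {"use_mamba_kernels": False}

assert hf_model_kwargs("Zyphra/Zamba2-1.2B-instruct") == {}
assert hf_model_kwargs("ai21labs/Jamba-tiny-dev") == {"use_mamba_kernels": False}
```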

tests/models/registry.py (+2)

@@ -195,6 +195,8 @@ def check_available_online(
     "XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
                                          is_available_online=False,
                                          trust_remote_code=True),
+    "Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct",
+                                         min_transformers_version="4.49"),
     # [Encoder-decoder]
     "BartModel": _HfExamplesInfo("facebook/bart-base"),
     "BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),

vllm/config.py (+14)

@@ -821,6 +821,11 @@ def get_head_size(self) -> int:
         if qk_rope_head_dim and qk_nope_head_dim:
             return qk_rope_head_dim + qk_nope_head_dim

+        if hasattr(self.hf_text_config,
+                   "model_type") and (self.hf_text_config.model_type
+                                      == "zamba2"):
+            return self.hf_text_config.attention_head_dim
+
         if self.is_attention_free:
             return 0

@@ -944,6 +949,15 @@ def get_num_layers_by_block_type(
                                 "cannot determine the num of "
                                 f"{block_type.value} layers")

+        if hasattr(self.hf_text_config,
+                   "model_type") and (self.hf_text_config.model_type
+                                      == "zamba2"):
+            if attn_block_type:
+                return sum(t == "hybrid"
+                           for t in layers_block_type_value[start:end])
+            else:
+                return self.get_num_layers(parallel_config)
+
         return sum(t == block_type.value
                    for t in layers_block_type_value[start:end])
vllm/model_executor/layers/mamba/mamba_mixer2.py (−1)

@@ -245,7 +245,6 @@ def __init__(self,
         assert num_heads % self.tp_size == 0, \
             "Tensor parallel world size must divide num heads."

-
         assert (n_groups % self.tp_size) == 0 or n_groups == 1, \
             (
                 "If tensor parallel world size does not divide num_heads, "

vllm/model_executor/models/bamba.py (−2)

@@ -38,8 +38,6 @@
     make_empty_intermediate_tensors_factory, make_layers,
     maybe_prefix)

-KVCache = Tuple[torch.Tensor, torch.Tensor]
-

 class BambaMLP(nn.Module):

vllm/model_executor/models/jamba.py (−2)

@@ -36,8 +36,6 @@
     make_empty_intermediate_tensors_factory, make_layers,
     maybe_prefix)

-KVCache = Tuple[torch.Tensor, torch.Tensor]
-

 class JambaMoE(nn.Module):

vllm/model_executor/models/registry.py (+1)

@@ -105,6 +105,7 @@
     "SolarForCausalLM": ("solar", "SolarForCausalLM"),
     "TeleChat2ForCausalLM": ("telechat2", "TeleChat2ForCausalLM"),
     "XverseForCausalLM": ("llama", "LlamaForCausalLM"),
+    "Zamba2ForCausalLM": ("zamba2", "Zamba2ForCausalLM"),
     # [Encoder-decoder]
     "BartModel": ("bart", "BartForConditionalGeneration"),
     "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
