Added unit test for non-HF models such as SwiftKV

quic-hemagnih · quic-hemagnih · commit bbaaf6131e25 · 2025-03-31T09:17:37.000Z
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
@@ -91,8 +91,8 @@
 
 # Placeholder for all non-transformer models
 from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
-    LlamaSwiftKVConfig,
-    LlamaSwiftKVForCausalLM,
+    QeffLlamaSwiftKVConfig,
+    QeffLlamaSwiftKVForCausalLM,
 )
 
 from .models.codegen.modeling_codegen import (
@@ -280,7 +280,7 @@
 
 # Map of model type to config class, Modelling class and transformer model architecture class
 MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM, AutoModelForCausalLM],
+    "llama_swiftkv": [QeffLlamaSwiftKVConfig, QeffLlamaSwiftKVForCausalLM, AutoModelForCausalLM],
 }
 
 
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -31,7 +31,7 @@
 )
 
 
-class LlamaSwiftKVConfig(LlamaConfig):
+class QeffLlamaSwiftKVConfig(LlamaConfig):
     """
     Args:
         num_key_value_layers (int, optional):
@@ -59,8 +59,8 @@ def __init__(
         assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
 
-class LlamaSwiftKVAttention(nn.Module):
-    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
+class QeffLlamaSwiftKVAttention(nn.Module):
+    def __init__(self, config: QeffLlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
@@ -139,12 +139,12 @@ def forward(
         return attn_output, past_key_value
 
 
-class LlamaSwiftKVDecoderLayer(nn.Module):
-    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
+class QeffLlamaSwiftKVDecoderLayer(nn.Module):
+    def __init__(self, config: QeffLlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
+        self.self_attn = QeffLlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)
         self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -179,10 +179,10 @@ def forward(
         return hidden_states, past_key_values
 
 
-class LlamaSwiftKVModel(nn.Module):
-    config_class = LlamaSwiftKVConfig
+class QeffLlamaSwiftKVModel(nn.Module):
+    config_class = QeffLlamaSwiftKVConfig
 
-    def __init__(self, config: LlamaSwiftKVConfig):
+    def __init__(self, config: QeffLlamaSwiftKVConfig):
         super().__init__()
         self.vocab_size = config.vocab_size
         self.config = config
@@ -192,7 +192,7 @@ def __init__(self, config: LlamaSwiftKVConfig):
             [
                 QEffLlamaDecoderLayer(config=config, layer_idx=idx)
                 if idx < config.num_key_value_layers
-                else LlamaSwiftKVDecoderLayer(config=config, layer_idx=idx)
+                else QeffLlamaSwiftKVDecoderLayer(config=config, layer_idx=idx)
                 for idx in range(config.num_hidden_layers)
             ]
         )
@@ -391,13 +391,13 @@ def forward(
         return hidden_states, next_cache
 
 
-class LlamaSwiftKVForCausalLM(PreTrainedModel):  #
-    config_class = LlamaSwiftKVConfig
+class QeffLlamaSwiftKVForCausalLM(PreTrainedModel):  #
+    config_class = QeffLlamaSwiftKVConfig
 
-    def __init__(self, config: LlamaSwiftKVConfig):
+    def __init__(self, config: QeffLlamaSwiftKVConfig):
         super().__init__(config=config)
 
-        self.model = LlamaSwiftKVModel(
+        self.model = QeffLlamaSwiftKVModel(
             config=config,
         )
         self.vocab_size = config.vocab_size
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
@@ -9,7 +9,6 @@
 from typing import Optional
 
 import numpy as np
-
 import pytest
 from transformers import AutoModelForCausalLM
 
@@ -23,9 +22,33 @@
 from QEfficient.utils.run_utils import ApiRunner
 
 test_models = [
-    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct",  # SwiftKV model
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "gpt2",
+    "Salesforce/codegen-350M-mono",
+    "microsoft/Phi-3-mini-4k-instruct",
+    "tiiuae/falcon-7b",
+    "Qwen/Qwen2-0.5B",
+    "bigcode/starcoder2-3b",
+    "Felladrin/Minueza-32M-Base",
+    "wtang06/mpt-125m-c4",
+    "hakurei/gpt-j-random-tinier",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "meta-llama/Llama-3.2-1B",
+    "unsloth/gemma-2b",
+    "unsloth/gemma-2-2b",
+    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
+    "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
+    "ibm-granite/granite-20b-code-base",
+    # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
+    "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
+    "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
+    "ibm-granite/granite-3.1-2b-instruct",
+    "ibm-granite/granite-guardian-3.1-2b",
 ]
 
+swiftkv_test_models = [
+    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct",  # SwiftKV model
+]
 spd_test_models = [
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
 ]
@@ -89,15 +112,15 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         Constants.CTX_LEN,
     )
 
-    # pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
+    pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)
     is_tlm = False if num_speculative_tokens is None else True
     qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm)
 
     pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
 
-    # assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
-    #     "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
-    # )
+    assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
+        "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
+    )
 
     onnx_model_path = qeff_model.export()
     ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
@@ -128,18 +151,18 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     config = model_hf.config
     full_batch_size = 4
     fbs_prompts = Constants.INPUT_STR * 4
-    # api_runner = ApiRunner(
-    #     batch_size,
-    #     tokenizer,
-    #     config,
-    #     fbs_prompts,
-    #     Constants.PROMPT_LEN,
-    #     Constants.CTX_LEN,
-    #     full_batch_size,
-    # )
-
-    # pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf)
-    # pytorch_hf_tokens = np.vstack(pytorch_hf_tokens)
+    api_runner = ApiRunner(
+        batch_size,
+        tokenizer,
+        config,
+        fbs_prompts,
+        Constants.PROMPT_LEN,
+        Constants.CTX_LEN,
+        full_batch_size,
+    )
+
+    pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf)
+    pytorch_hf_tokens = np.vstack(pytorch_hf_tokens)
 
     qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=is_tlm)
     onnx_model_path = qeff_model.export()
@@ -156,19 +179,112 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         full_batch_size=full_batch_size,
         num_speculative_tokens=num_speculative_tokens,
     )
-    # exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
-    qeff_model.generate(tokenizer, prompts=fbs_prompts)
-
+    exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
 
-"""
     assert all(
         [
             all(pt_token[:24] == cloud_token[:24])
             for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
         ]
     ), "Tokens don't match for  HF PyTorch model output and Cloud AI 100 output."
     assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
-"""
+
+
+def check_non_hf_kv_vs_ort_vs_ai100(
+    model_name: str,
+    prompt_len: int = Constants.PROMPT_LEN,
+    ctx_len: int = Constants.CTX_LEN,
+    n_layer: int = 1,
+    num_speculative_tokens: Optional[int] = None,
+):
+    """
+    Validate a non-HF model: KV PyTorch output vs ONNXRT vs Cloud AI 100, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``Snowflake/Llama-3.1-SwiftKV-8B-Instruct``
+        :prompt_len (int): Prompt length for the model to compile.
+        :ctx_len (int): Maximum context length to compile the model.
+        :n_layer (int): Number of layers for the Model.
+        :num_speculative_tokens (Optional[int]): Forwarded to compile; the model is built with ``is_tlm=True`` when not None.
+    """
+    replace_transformers_quantizers()
+    model_config = {"model_name": model_name}
+    model_config["n_layer"] = n_layer
+
+    model_hf, _ = load_causal_lm_model(model_config)
+
+    tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=model_name)
+    config = model_hf.config
+    batch_size = len(Constants.INPUT_STR)
+    api_runner = ApiRunner(
+        batch_size,
+        tokenizer,
+        config,
+        Constants.INPUT_STR,
+        Constants.PROMPT_LEN,
+        Constants.CTX_LEN,
+    )
+
+    is_tlm = False if num_speculative_tokens is None else True
+
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, is_tlm=is_tlm)
+    pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)
+
+    onnx_model_path = qeff_model.export()
+    ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=is_tlm)
+
+    assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output."
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    qpc_path = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=14,
+        mxfp6=False,
+        aic_enable_depth_first=False,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+
+    exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
+    cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
+    gen_len = ort_tokens.shape[-1]
+
+    assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
+        "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
+
+    # testing for CB models; no HF PyTorch reference is run here, so the non-CB ORT tokens above are the expected output
+    model_hf, _ = load_causal_lm_model(model_config)
+    full_batch_size = 4
+    fbs_prompts = Constants.INPUT_STR * 4
+
+    qeff_model = QEFFAutoModelForCausalLM(model_hf, continuous_batching=True, is_tlm=is_tlm)
+    onnx_model_path = qeff_model.export()
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    qpc_path = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=14,
+        mxfp6=False,
+        aic_enable_depth_first=False,
+        full_batch_size=full_batch_size,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+
+    exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
+
+    assert all(
+        [
+            all(ort_token[:24] == cloud_token[:24])
+            for ort_token, cloud_token in zip(ort_tokens, exec_info_fbs.generated_ids)
+        ]
+    ), "Tokens don't match for ONNXRT output and Cloud AI 100 output."
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
 
 
 # FIXME: there should be a CB test here
@@ -211,14 +327,28 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     """
     if model_name == "microsoft/Phi-3-mini-4k-instruct":
         n_layer = 2  # test only 2 layer models
-    elif model_name == "Snowflake/Llama-3.1-SwiftKV-8B-Instruct":
-        n_layer = 32
     else:
         n_layer = 1
 
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model_name", swiftkv_test_models)
+def test_non_hf_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+    """
+    Test function to validate non-HF models from ``swiftkv_test_models``: the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``Snowflake/Llama-3.1-SwiftKV-8B-Instruct``
+    """
+    if model_name == "Snowflake/Llama-3.1-SwiftKV-8B-Instruct":
+        n_layer = 32
+    else:
+        n_layer = 2
+
+    check_non_hf_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
+
+
 @pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
 @pytest.mark.on_qaic
 @pytest.mark.parametrize("model_name", spd_test_models)

Original file line number	Diff line number	Diff line change
`@@ -91,8 +91,8 @@`
`91`	`91`
`92`	`92`	`# Placeholder for all non-transformer models`
`93`	`93`	`from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (`
`94`		`- LlamaSwiftKVConfig,`
`95`		`- LlamaSwiftKVForCausalLM,`
	`94`	`+ QeffLlamaSwiftKVConfig,`
	`95`	`+ QeffLlamaSwiftKVForCausalLM,`
`96`	`96`	`)`
`97`	`97`
`98`	`98`	`from .models.codegen.modeling_codegen import (`
`@@ -280,7 +280,7 @@`
`280`	`280`
`281`	`281`	`# Map of model type to config class, Modelling class and transformer model architecture class`
`282`	`282`	`MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {`
`283`		`- "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM, AutoModelForCausalLM],`
	`283`	`+ "llama_swiftkv": [QeffLlamaSwiftKVConfig, QeffLlamaSwiftKVForCausalLM, AutoModelForCausalLM],`
`284`	`284`	`}`
`285`	`285`
`286`	`286`