1 change: 1 addition & 0 deletions QEfficient/base/modeling_qeff.py
@@ -313,6 +313,7 @@ def _compile(
"mdp_ts_json": mdp_ts_json,
"num_speculative_tokens": num_speculative_tokens,
}

compile_hash = hash_dict_params(compile_hash_params)

compile_dir = qpc_path.with_name(qpc_path.name + "-" + compile_hash)
8 changes: 8 additions & 0 deletions QEfficient/transformers/embeddings/embedding_utils.py
@@ -98,6 +98,14 @@ def forward(
return self.pooling_fn(output[0], attention_mask)


def embedding_forward(
self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, **kwargs
):
print("Forward swapped with new one")
output = self.old_forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
return output[0]


def validate_user_pooling_function(user_function):
"""
Validate a user-provided pooling function to ensure it meets the required interface.
12 changes: 7 additions & 5 deletions QEfficient/transformers/models/modeling_auto.py
@@ -161,14 +161,16 @@ class QEFFAutoModel(QEFFTransformersBase):
_pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
_onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]

def __init__(self, model: nn.Module, pooling=None, **kwargs):
def __init__(self, model: nn.Module, **kwargs):
super().__init__(model, **kwargs)

# Make Embedding specific transforms like appending pooling
if pooling:
self.model, _ = PoolingTransform.apply(self.model, pooling)
if kwargs["pooling"]:
self.model, _ = PoolingTransform.apply(self.model, kwargs["pooling"])
# else:
# self.model, _ = EmbeddingTransform.apply(self.model)

self.model.base_model.config.use_cache = True
# self.model.base_model.config.use_cache = True

self.hash_params["qeff_auto_class"] = self.__class__.__name__

@@ -396,7 +398,7 @@ def cloud_ai_100_feature_generate(
outputs = self.qpc_session.run(inputs)
except Exception:
outputs = {
"output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[1]).astype(
"output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[0]).astype(
np.float32
),
}
18 changes: 17 additions & 1 deletion QEfficient/transformers/models/pytorch_transforms.py
@@ -154,7 +154,12 @@

from QEfficient.base.pytorch_transforms import ExternalModuleMapperTransform, ModuleMappingTransform
from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
from QEfficient.transformers.embeddings.embedding_utils import POOLING_MAP, PooledModel, validate_user_pooling_function
from QEfficient.transformers.embeddings.embedding_utils import (
POOLING_MAP,
PooledModel,
embedding_forward,
validate_user_pooling_function,
)
from QEfficient.transformers.models.codegen.modeling_codegen import (
QEffCodeGenAttention,
QeffCodeGenBlock,
@@ -634,3 +639,14 @@ def apply(cls, model: nn.Module, pooling: Union[str, Callable]) -> Tuple[nn.Modu
model = PooledModel(model, pooling_method)
warnings.warn("Pooling is applied to the model.")
return model, transformed


class EmbeddingTransform:
@classmethod
def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]:
transformed = False
model.old_forward = model.forward
model.forward = MethodType(embedding_forward, model)
transformed = True

return model, transformed
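
For reference, here is a minimal standalone sketch of the forward-swap pattern that the new EmbeddingTransform relies on, written against plain transformers rather than QEfficient: the original forward is preserved as old_forward and the bound replacement returns only the first output (the last hidden state). The checkpoint name is only illustrative.

from types import MethodType
from typing import Optional

import torch
from transformers import AutoModel, AutoTokenizer


def embedding_forward(self, input_ids: Optional[torch.Tensor] = None,
                      attention_mask: Optional[torch.Tensor] = None, **kwargs):
    # Delegate to the preserved forward and keep only the first output,
    # i.e. last_hidden_state for standard transformers models.
    output = self.old_forward(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
    return output[0]


model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")  # illustrative checkpoint
model.old_forward = model.forward  # keep the original bound method
model.forward = MethodType(embedding_forward, model)  # bind the replacement to this instance

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
inputs = tokenizer("This is an example sentence", return_tensors="pt")
print(model(**inputs).shape)  # a plain (batch, seq_len, hidden) tensor instead of a ModelOutput

Because nn.Module.__call__ dispatches to self.forward, which resolves to the instance attribute first, the swap takes effect for ordinary model(**inputs) calls.
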
5 changes: 3 additions & 2 deletions QEfficient/utils/constants.py
@@ -17,7 +17,7 @@
ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
ONNX_EXPORT_EXAMPLE_FBS = 4
ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep
ONNX_EXPORT_OPSET = 13
ONNX_EXPORT_OPSET = 18
ONNX_EXPORT_MAX_NUM_IMAGES = 1
ONNX_EXPORT_MAX_IMAGE_TILES = 4
ONNX_EXPORT_IMAGE_WIDTH = 560
@@ -41,6 +41,7 @@
"pretrained_model_name_or_path",
"attn_implementation",
"_attn_implementation",
"pooling",
]

# Minimum value for causal mask
@@ -83,7 +84,7 @@ def get_models_dir():
ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS = 512
ONNX_EXPORT_EXAMPLE_TOP_PS = 0.80
ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99
ONNX_EXPORT_OPSET = 13
ONNX_EXPORT_OPSET = 18

COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]

4 changes: 2 additions & 2 deletions examples/embedding_model.py
@@ -24,12 +24,12 @@ def max_pooling(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor)
sentences = "This is an example sentence"

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
tokenizer = AutoTokenizer.from_pretrained("intfloat/e5-mistral-7b-instruct")


# You can specify the pooling strategy either as a string (e.g., "max") or by passing a custom pooling function.
# If no pooling is specified, the model will return its default output (typically token embeddings).
qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling=max_pooling)
qeff_model = AutoModel.from_pretrained("intfloat/e5-mistral-7b-instruct", num_hidden_layers=1, pooling=max_pooling)
# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2", pooling="max")
# qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

64 changes: 64 additions & 0 deletions tests/transformers/models/llama_org_2l.py
@@ -0,0 +1,64 @@
from transformers import AutoModelForCausalLM

from QEfficient import QEFFAutoModelForCausalLM
from QEfficient.utils.run_utils import ApiRunner
from QEfficient.utils._utils import create_json, load_hf_tokenizer

Check failure on line 5 (GitHub Actions / lint, Ruff F401): tests/transformers/models/llama_org_2l.py:5:37: `QEfficient.utils._utils.create_json` imported but unused
from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers

Check failure on line 6 (GitHub Actions / lint, Ruff F401): tests/transformers/models/llama_org_2l.py:6:53: `QEfficient.transformers.quantizers.auto.replace_transformers_quantizers` imported but unused

from QEfficient.utils import hf_download

Check failure on line 8 (GitHub Actions / lint, Ruff I001): tests/transformers/models/llama_org_2l.py:1:1: Import block is un-sorted or un-formatted

def load_causal_lm_model(model_config):
"""
Function to load model from huggingface and transform to KV model
--------

:model_config: Dict

:return model_hf, params
"""
model_path = hf_download(
repo_id=model_config["model_name"],
ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
)
model_hf = AutoModelForCausalLM.from_pretrained(
model_path,
use_cache=True,
num_hidden_layers=model_config["n_layer"],
attn_implementation="eager",
low_cpu_mem_usage=False,
) # Run models for single layers only
# params = sum(p.numel() for p in model_hf.parameters())
params=""
# model_hf.eval()
return model_hf, params


def check_llama_onnx():
MODEL_ID = "meta-llama/Llama-3.2-1B"
# replace_transformers_quantizers()
tokenizer = load_hf_tokenizer(pretrained_model_name_or_path=MODEL_ID)
# config = AutoConfig.from_pretrained(MODEL_ID)
# config.num_hidden_layers = 1
model_config = {"model_name": MODEL_ID}
model_config["n_layer"] = 2

model_hf, _ = load_causal_lm_model(model_config)
config = model_hf.config

Check failure on line 46 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:46:5: Local variable `config` is assigned to but never used
api_runner = ApiRunner(
1,
tokenizer,
model_hf.config,
"Where is the Thomas J. Watson Research Center located?",
32, #prompt_len
64, #ctx_len

)
pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf)

Check failure on line 56 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:56:5: Local variable `pytorch_hf_tokens` is assigned to but never used
qeff_model = QEFFAutoModelForCausalLM(model_hf)
pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model)

Check failure on line 58 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:58:5: Local variable `pytorch_kv_tokens` is assigned to but never used
onnx_model_path = qeff_model.export()
ort_tokens = api_runner.run_kv_model_on_ort(onnx_model_path, is_tlm=False)

Check failure on line 60 (GitHub Actions / lint, Ruff F841): tests/transformers/models/llama_org_2l.py:60:5: Local variable `ort_tokens` is assigned to but never used

if __name__ == "__main__":
# run()
check_llama_onnx()
11 changes: 9 additions & 2 deletions tests/transformers/models/test_embedding_models.py
@@ -22,9 +22,16 @@
embed_test_models = [
{"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"},
{"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"},
{"model_name": "intfloat/e5-mistral-7b-instruct", "pooling":"cls"},
{"model_name": "sentence-transformers/multi-qa-mpnet-base-cos-v1"},
{"model_name": "intfloat/e5-mistral-7b-instruct", "pooling": "cls"},
{"model_name": "nomic-ai/nomic-embed-text-v1.5", "pooling": "cls"},
{"model_name": "NovaSearch/stella_en_1.5B_v5", "pooling": "cls"},
{"model_name": "ibm-granite/granite-embedding-30m-english", "pooling": "cls"},
{"model_name": "BAAI/bge-reranker-v2-m3", "pooling": "cls"},
{"model_name": "ibm-granite/granite-embedding-107m-multilingual", "pooling": "cls"},
]


def check_embed_pytorch_vs_ort_vs_ai100(
model_name: str,
seq_len: int = Constants.CTX_LEN,
@@ -119,7 +126,7 @@ def test_embed_model_pytorch_vs_onnx_vs_ai100_pooling(model):


@pytest.mark.on_qaic
@pytest.mark.parametrize("model", embed_test_models[:1])
@pytest.mark.parametrize("model", embed_test_models)
def test_embed_model_pytorch_vs_onnx_vs_ai100_multiple_seq_len(model):
"""
Test function to validate output of the Pytorch, ONNX and AI 100 runtime model output with multiple seq_len.