From 89771e8f8ccaa82746f2665ad0da8b04fc955eb5 Mon Sep 17 00:00:00 2001 From: shubhagr-quic Date: Mon, 3 Mar 2025 13:31:49 +0530 Subject: [PATCH 01/20] Removed onnx_defer_loading flag. (#295) Removing the onnx_defer_loading flag, which was originally removed in _[Removed onnx_defer_loading from Immutable Convertor Args. PR: 230]_ but was added back later in _[Mllama(single + dual) + InternVL(single) + Llava (single) PR: 267]_, likely because of rebasing. Signed-off-by: Shubham Agrawal Signed-off-by: Asmita Goswami --- QEfficient/utils/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index a5cc6fda1..6c2bba0c6 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -136,7 +136,6 @@ class QnnConstants: "--float_bitwidth ", "--preserve_io_datatype", "--onnx_skip_simplification", - "--onnx_defer_loading", ] IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [ From b3736a4999c1283d0075b801cdacad3e51943a0e Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Mon, 3 Mar 2025 20:29:36 +0530 Subject: [PATCH 02/20] Code for SDK configs Inclusion (#203) This will create a config JSON file, which contains all the details about compilation and SDK versions. Currently, this code is added in the code block of QEFFAutoModelForCausalLM.compile. The config would look like below: ``` { "huggingface_config": { "vocab_size": 50257, "n_positions": 1024, "n_embd": 768, "n_layer": 12, "n_head": 12, "n_inner": null, "activation_function": "gelu_new", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "chunk_size_feed_forward": 0, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": [ "GPT2LMHeadModel" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "problem_type": null, "_name_or_path": "gpt2", "_commit_hash": "607a30d783dfa663caf39e06633721c8d4cfcd7e", "_attn_implementation_internal": "eager", "transformers_version": null, "model_type": "gpt2", "n_ctx": 1024 }, 
"qpc_config": { "QEff_config": { "pytorch_transforms": [ "AwqToMatmulNbitsTransform", "GPTQToMatmulNbitsTransform", "CustomOpsTransform", "KVCacheTransform" ], "onnx_transforms": [ "FP16ClipTransform", "SplitTensorsTransform" ], "onnx_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/GPT2LMHeadModel.onnx" }, "aic_compiler_config": { "apps_sdk_version": "1.20.0", "compile_dir": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47", "specializtions_file_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/specializations.json", "prefill_seq_len": 32, "ctx_len": 128, "batch_size": 1, "full_batch_size": null, "num_devices": 1, "num_cores": 16, "mxfp6_matmul": false, "mxint8_kv_cache": false, "num_speculative_tokens": null }, "qnn_config": { "enable_qnn": true, "qnn_config_path": "QEfficient/compile/qnn_config.json", "product": "QAIRT", "os": { "Ubuntu": 22.04, "Windows": 11 }, "sdk_flavor": [ "aic" ], "version": "2.31.0", "build_id": "250109072054_3882", "qnn_backend_api_version": "2.18.0", "tensorflow": "2.10.1", "tflite": "2.3.0", "torch": "1.13.1", "onnx": "1.16.1", "onnxruntime": "1.17.1", "onnxsimplifier": "0.4.36", "android-ndk": "r26c", "platform": "AIC.1.20.0.14" } } } ``` Note: The code structure may change. --------- Signed-off-by: Abukhoyer Shaik Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 6 +- QEfficient/peft/auto.py | 4 + QEfficient/peft/lora/auto.py | 4 + .../transformers/models/modeling_auto.py | 24 ++++ QEfficient/utils/__init__.py | 1 + QEfficient/utils/_utils.py | 114 +++++++++++++++++- QEfficient/utils/constants.py | 2 + tests/peft/lora/test_lora_model.py | 4 + tests/peft/test_peft_model.py | 2 + tests/qnn_tests/test_causal_lm_models_qnn.py | 8 +- tests/text_generation/test_text_generation.py | 3 + .../models/test_causal_lm_models.py | 7 +- .../models/test_embedding_models.py | 2 + .../models/test_prefix_caching.py | 2 + .../models/test_speech_seq2seq_models.py | 1 + tests/transformers/spd/test_spd_inference.py | 3 + tests/transformers/test_causal_lm.py | 2 + tests/transformers/test_speech_seq2seq.py | 2 + 18 files changed, 185 insertions(+), 6 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index ec74c57f3..f2b3714fa 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -23,7 +23,7 @@ from QEfficient.base.pytorch_transforms import PytorchTransform from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import constants +from QEfficient.utils import constants, dump_qconfig from QEfficient.utils._utils import load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable @@ -211,6 +211,7 @@ def _export( self.onnx_path = onnx_path return onnx_path + @dump_qconfig def _compile( self, onnx_path: Optional[str] = None, @@ -336,8 +337,10 @@ def _compile( ) self.qpc_path = qpc_path + return qpc_path + @dump_qconfig def _qnn_compile( self, onnx_path: Optional[str] = None, @@ -435,4 +438,5 @@ def _qnn_compile( ) self.qpc_path = qpc_path + return qpc_path diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index 377caa3e7..deb64fae1 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -107,6 +107,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.get_base_model().config.__dict__ + def load_adapter(self, model_id: str, 
adapter_name: str): """Loads a new adapter from huggingface hub or local path diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index c13979968..7f2a5cd84 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -90,6 +90,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.model.config.__dict__ + def download_adapter( self, adapter_model_id: str, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 54b7828c8..5852740b4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -229,6 +229,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. @@ -447,6 +451,10 @@ def model_name(self) -> str: mname = mname[4:] return mname + @property + def get_model_config(self) -> dict: + return self.model.model.vision_model.config.__dict__ + class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): _pytorch_transforms = [ @@ -506,6 +514,10 @@ def model_name(self) -> str: mname = mname[4:] return mname + @property + def get_model_config(self) -> dict: + return self.model.language_model.config.__dict__ + class _QEffAutoModelForImageTextToTextDualQPC: _hf_auto_class = AutoModelForImageTextToText @@ -1128,6 +1140,10 @@ def model_name(self) -> str: mname = mname[4:] return mname + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + class QEFFAutoModelForImageTextToText: """ @@ -1320,6 +1336,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. @@ -1630,6 +1650,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. 
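The `QEfficient/utils/_utils.py` hunks below add the `dump_qconfig` decorator that writes this qconfig.json next to the compiled QPC. As a rough usage sketch (not part of this patch; the "gpt2" model card and the compile arguments are only placeholders, and the standard `QEfficient` import path is assumed), the dumped file can be inspected after compilation like this:

```python
import json
import os

from QEfficient import QEFFAutoModelForCausalLM

# Compile any supported causal LM; _compile() is wrapped by @dump_qconfig,
# so a qconfig.json is written alongside the generated QPC.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model card
qeff_model.compile(num_cores=16)  # placeholder compile options

qconfig_path = os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")
with open(qconfig_path) as f:
    qconfig = json.load(f)

# Keys follow the sample shown in the commit message above.
print(qconfig["qpc_config"]["QEff_config"]["pytorch_transforms"])
```

Because the dump happens inside a decorator around `_compile`/`_qnn_compile`, existing compile call sites stay unchanged and the config file is produced purely as a side effect of compilation.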
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 2506b9233..a7f17e6bc 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -11,6 +11,7 @@ ) from QEfficient.utils._utils import ( # noqa: F401 check_and_assign_cache_dir, + dump_qconfig, get_num_layers_from_config, get_onnx_dir_name, get_padding_shape_from_config, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 8344a053d..ea9044e2c 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -8,16 +8,18 @@ import json import os import subprocess +import xml.etree.ElementTree as ET from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import requests import torch +import yaml from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants +from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants from QEfficient.utils.logging_utils import logger @@ -442,3 +444,113 @@ class IOInfo: def __repr__(self): return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}" + + +def dump_qconfig(func): + def wrapper(self, *args, **kwargs): + result = func(self, *args, **kwargs) + create_and_dump_qconfigs( + self.qpc_path, + self.onnx_path, + self.get_model_config, + [cls.__name__ for cls in self._pytorch_transforms], + [cls.__name__ for cls in self._onnx_transforms], + kwargs.get("specializations"), + kwargs.get("mdp_ts_num_devices", 1), + kwargs.get("num_speculative_tokens"), + **{ + k: v + for k, v in kwargs.items() + if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"] + }, + ) + return result + + return wrapper + + +def create_and_dump_qconfigs( + qpc_path, + onnx_path, + huggingface_config, + pytorch_transforms, + onnx_transforms, + specializations, + mdp_ts_num_devices, + num_speculative_tokens, + **compiler_options, +): + """ + This Method creates a JSON file which contains all the configs for a model. + Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and + many other compilation options. 
+ """ + qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None + enable_qnn = True if "qnn_config" in compiler_options else None + + qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json") + onnx_path = str(onnx_path) + specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json")) + compile_dir = str(os.path.dirname(qpc_path)) + qnn_config_path = ( + (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None + ) + + # Extract QAIC SDK Apps Version from SDK XML file + tree = ET.parse(Constants.SDK_APPS_XML) + root = tree.getroot() + qaic_version = root.find(".//base_version").text + + # Extract QNN SDK details from YAML file if the environment variable is set + qnn_sdk_details = None + qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) + if qnn_sdk_path: + qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML) + with open(qnn_sdk_yaml_path, "r") as file: + qnn_sdk_details = yaml.safe_load(file) + + # Ensure all objects in the configs dictionary are JSON serializable + def make_serializable(obj): + if isinstance(obj, (int, float, str, bool, type(None))): + return obj + elif isinstance(obj, (list, tuple)): + return [make_serializable(item) for item in obj] + elif isinstance(obj, dict): + return {key: make_serializable(value) for key, value in obj.items()} + elif hasattr(obj, "__dict__"): + return make_serializable(vars(obj)) + return str(obj) + + qconfigs = { + "huggingface_config": make_serializable(huggingface_config), + "qpc_config": { + "QEff_config": { + "pytorch_transforms": make_serializable(pytorch_transforms), + "onnx_transforms": make_serializable(onnx_transforms), + "onnx_path": onnx_path, + }, + }, + } + + aic_compiler_config = { + "apps_sdk_version": qaic_version, + "compile_dir": compile_dir, + "specializations_file_path": specializations_file_path, + "specializations": make_serializable(specializations), + "mdp_ts_num_devices": mdp_ts_num_devices, + "num_speculative_tokens": num_speculative_tokens, + **compiler_options, + } + qnn_config = { + "enable_qnn": enable_qnn, + "qnn_config_path": qnn_config_path, + } + # Put AIC or qnn details. + if enable_qnn: + qconfigs["qpc_config"]["qnn_config"] = qnn_config + if qnn_sdk_details: + qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details) + else: + qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config + + create_json(qconfig_file_path, qconfigs) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 6c2bba0c6..3852adcda 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -75,12 +75,14 @@ class Constants: MAX_QPC_LIMIT = 30 MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 + SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. @dataclass class QnnConstants: # QNN PATH to be read from environment variable. 
QNN_SDK_PATH_ENV_VAR_NAME = "QNN_SDK_ROOT" + QNN_SDK_YAML = "sdk.yaml" # QNN Compilation tools QAIRT_CONVERTER = "{}/bin/{}/qairt-converter" diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index 4726fb8c5..69a6282fb 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import os from pathlib import Path from time import perf_counter @@ -225,6 +227,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( # test compile qeff_model.compile(prefill_seq_len=32, ctx_len=64) assert Path(qeff_model.qpc_path).is_dir() + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) # test generate prompts = ["hello!", "hi", "hello, my name is", "hey"] @@ -249,6 +252,7 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap # test compile qeff_model.compile(prefill_seq_len=32, ctx_len=64, full_batch_size=2) assert Path(qeff_model.qpc_path).is_dir() + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) # test generate prompts = ["hello!", "hi", "hello, my name is", "hey"] diff --git a/tests/peft/test_peft_model.py b/tests/peft/test_peft_model.py index 6a9a957b2..c4e331a9d 100644 --- a/tests/peft/test_peft_model.py +++ b/tests/peft/test_peft_model.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os from time import perf_counter import numpy as np @@ -187,3 +188,4 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con end = perf_counter() compile_time_1 = end - start assert compile_time_1 < 0.01 * compile_time_0 + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py index fe906fe7e..65acab157 100644 --- a/tests/qnn_tests/test_causal_lm_models_qnn.py +++ b/tests/qnn_tests/test_causal_lm_models_qnn.py @@ -5,6 +5,8 @@ # # ----------------------------------------------------------------------------- +import os + import numpy as np import pytest from transformers import AutoModelForCausalLM @@ -98,7 +100,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -106,6 +108,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, enable_qnn=True, ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size gen_len = ort_tokens.shape[-1] @@ -136,7 +139,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -145,6 +148,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( full_batch_size=full_batch_size, enable_qnn=True, ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) 
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) assert all( diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py index a1e4265ee..f7d3cd6cb 100644 --- a/tests/text_generation/test_text_generation.py +++ b/tests/text_generation/test_text_generation.py @@ -5,6 +5,8 @@ # # ----------------------------------------------------------------------------- +import os + import pytest from transformers import AutoModelForCausalLM @@ -101,3 +103,4 @@ def test_generate_text_stream( assert cloud_ai_100_output == stream_tokens, ( f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}" ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index a3a855cee..418386780 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os from typing import Optional import numpy as np @@ -127,7 +128,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -141,6 +142,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( "Tokens don't match for ONNXRT output and Cloud AI 100 output." ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) # testing for CB models model_hf, _ = load_causal_lm_model(model_config) @@ -165,7 +167,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -182,6 +184,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) ] ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." 
+ assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) # FIXME: there should be a CB test here diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 1c2d5196c..e681f5093 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os import numpy as np import onnxruntime as ort @@ -77,6 +78,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) @pytest.mark.on_qaic diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 8ef24403c..c787a3c96 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -31,6 +31,7 @@ def test_simple_prefix_caching(model_name): num_cores=14, ) prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) @pytest.mark.on_qaic @@ -61,6 +62,7 @@ def test_simple_prefix_caching_qnn(model_name): qnn_config=qnn_config_json_path, ) prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) os.remove(qnn_config_json_path) diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index af83c9354..99f715863 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -360,6 +360,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_kv_tokens == cloud_ai_100_tokens).all(), ( "Tokens don't match for pytorch output and Cloud AI 100 output." ) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) @pytest.mark.on_qaic diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index a9f197ec3..205f00a00 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os from time import perf_counter from typing import List, Optional @@ -331,3 +332,5 @@ def test_spec_decode_inference( ] # Because we always run for single input and single batch size all_matching = np.array_equal(cloud_ai_100_tokens, generated_ids) assert all_matching, "Tokens don't match for SpD output and vanilla DLM output." 
+ assert os.path.isfile(os.path.join(os.path.dirname(target_model_qpc_path), "qconfig.json")) + assert os.path.isfile(os.path.join(os.path.dirname(draft_model_qpc_path), "qconfig.json")) diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py index 1ceb5a7e0..64376db62 100644 --- a/tests/transformers/test_causal_lm.py +++ b/tests/transformers/test_causal_lm.py @@ -6,6 +6,7 @@ # ---------------------------------------------------------------------------- import copy +import os from time import perf_counter import onnx @@ -170,3 +171,4 @@ def test_causal_lm_compile(config, cb, tmp_cache): end = perf_counter() compile_time = end - start assert compile_time < 2.0 + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) diff --git a/tests/transformers/test_speech_seq2seq.py b/tests/transformers/test_speech_seq2seq.py index a41896010..15d6152e3 100644 --- a/tests/transformers/test_speech_seq2seq.py +++ b/tests/transformers/test_speech_seq2seq.py @@ -6,6 +6,7 @@ # ---------------------------------------------------------------------------- import copy +import os from time import perf_counter import onnx @@ -142,3 +143,4 @@ def test_causal_lm_compile(config, tmp_cache): end = perf_counter() compile_time = end - start assert compile_time < 2.0 + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) From 687d44fad2866eeadc64b78158bb98421f5535e6 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 6 Mar 2025 11:56:27 +0530 Subject: [PATCH 03/20] Docs string added for the Image class and granite models are added in validation page (#303) Signed-off-by: Abukhoyer Shaik Signed-off-by: Asmita Goswami --- .../transformers/models/modeling_auto.py | 62 ++++++++++++++++++- docs/source/quick_start.md | 6 +- docs/source/validate.md | 10 +-- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 5852740b4..07aff78ff 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1147,9 +1147,69 @@ def get_model_config(self) -> dict: class QEFFAutoModelForImageTextToText: """ - A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach + The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub. + While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose. This class supports both single and dual QPC approaches. Attributes: _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models. + + ``Mandatory`` Args: + :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory. + + ``Optional`` Args: + :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True. + + .. code-block:: python + import requests + from PIL import Image + from transformers import AutoProcessor, TextStreamer + + from QEfficient import QEFFAutoModelForImageTextToText + + # Add HuggingFace Token to access the model + HF_TOKEN = "" + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + query = "Describe this image." 
+ image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + + ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc + processor = AutoProcessor.from_pretrained(model_name, token=token) + model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, attn_implementation="eager", kv_offload=False) + + ## STEP - 2 Export & Compile the Model + model.compile( + prefill_seq_len=32, + ctx_len=512, + img_size=560, + num_cores=16, + num_devices=1, + mxfp6_matmul=False, + ) + + ## STEP - 3 Load and process the inputs for Inference + image = Image.open(requests.get(image_url, stream=True).raw) + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + } + ] + input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] + inputs = processor( + text=input_text, + images=image, + return_tensors="pt", + add_special_tokens=False, + padding="max_length", + max_length=prefill_seq_len, + ) + + ## STEP - 4 Run Inference on the compiled model + streamer = TextStreamer(processor.tokenizer) + model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) + """ _hf_auto_class = AutoModelForImageTextToText diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 88093e134..2ccb013e9 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -239,7 +239,7 @@ Use the qualcomm_efficient_converter API to export the KV transformed Model to O generated_qpc_path = qeff_model.compile( num_cores=14, - mxfp6=True, + mxfp6_matmul=True, ) ``` @@ -250,8 +250,8 @@ Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/s ```Python # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach - -qeff_model.generate(prompts=["My name is"]) +tokenizer = AutoTokenizer.from_pretrained(model_name) +qeff_model.generate(prompts=["My name is"],tokenizer=tokenizer) ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. diff --git a/docs/source/validate.md b/docs/source/validate.md index 49acd268d..acd4c11da 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -41,13 +41,15 @@ | Architecture | Model Family | Representative Models | |--------------|--------------|---------------------------------| -| **BertModel** | BERT-based | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)
[BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)
[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)
[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) | +| **BertModel** | BERT-based | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)
[BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)
[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)
[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) | | **LlamaModel** | Llama-based | [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | -| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) | -| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | | **MPNetForMaskedLM** | MPNet | [sentence-transformers/multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1) | -| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | | **MistralModel** | Mistral | [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | +| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | +| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) | +| **RobertaModel** | RoBERTa | [ibm-granite/granite-embedding-30m-english](https://huggingface.co/ibm-granite/granite-embedding-30m-english)
[ibm-granite/granite-embedding-125m-english](https://huggingface.co/ibm-granite/granite-embedding-125m-english) | +| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | +| **XLMRobertaModel** | XLM-RoBERTa | [ibm-granite/granite-embedding-107m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-107m-multilingual)
[ibm-granite/granite-embedding-278m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual) | ## Multimodal Language Models From 260bacb50b8db791f96bba7b3b22e2d2430ffaa7 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 6 Mar 2025 15:36:34 +0530 Subject: [PATCH 04/20] [Bug-Fix :] QEFFAutoModelForCausalLM __repr__() Method Fixed (#307) This is just small fixes done for printing the `QEFFAutoModelForCausalLM`'s instance by changing the `__repr__(self)` method. Signed-off-by: Abukhoyer Shaik Signed-off-by: Asmita Goswami --- QEfficient/transformers/models/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 07aff78ff..a87c39fb4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1325,7 +1325,7 @@ def model_name(self) -> str: return mname def __repr__(self) -> str: - return self.__class__.__name__ + "\n" + self.model.__repr__ + return self.__class__.__name__ + "\n" + self.model.__repr__() @classmethod @with_replaced_quantizers From 691cca4c3eed306ff838c7397b229cfabaae1f6c Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 27 Feb 2025 06:22:43 +0000 Subject: [PATCH 05/20] Enabled VLMs via CLI Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 22 +++-- QEfficient/cloud/infer.py | 86 ++++++++++++++++--- .../transformers/models/modeling_auto.py | 18 +++- 3 files changed, 105 insertions(+), 21 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index d94e02894..bcf5b1575 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -12,13 +12,21 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ +import importlib +from collections import OrderedDict from typing import Any +import transformers.models.auto.modeling_auto as mapping from transformers import AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from QEfficient.base.modeling_qeff import QEFFBaseModel -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + +MODEL_CLASS_MAPPING = OrderedDict( + [ + (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"), + (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"), + ] +) class QEFFCommonLoader: @@ -42,9 +50,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = QEFFAutoModelForCausalLM - else: + model_class = None + for key_tuple, class_name in MODEL_CLASS_MAPPING.items(): + if architecture in key_tuple: + module = importlib.import_module("QEfficient.transformers.models.modeling_auto") + model_class = getattr(module, class_name) + break + if model_class is None: raise NotImplementedError( f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" 
) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 28eaa4d52..c93dde55f 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,6 +10,11 @@ import sys from typing import List, Optional +import requests +from PIL import Image +from transformers import AutoConfig, AutoProcessor, TextStreamer +from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES + from QEfficient.base.common import QEFFCommonLoader from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -36,6 +41,7 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + img_size: Optional[int] = None, **kwargs, ) -> None: """ @@ -65,6 +71,9 @@ def main( :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: + -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1 + -qpc_crc=True -> -qpc-crc .. code-block:: bash @@ -72,11 +81,6 @@ def main( """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) - tokenizer = load_hf_tokenizer( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - ) if "--mxfp6" in sys.argv: if args.mxfp6: @@ -85,6 +89,9 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") + image_path = kwargs.pop("image_path", None) + image_url = kwargs.pop("image_url", None) + qeff_model = QEFFCommonLoader.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=cache_dir, @@ -110,20 +117,70 @@ def main( allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, qnn_config=qnn_config, + img_size=img_size, **kwargs, ) + tokenizer = load_hf_tokenizer( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + ) + ######### # Execute ######### - _ = qeff_model.generate( - tokenizer, - prompts=prompt, - device_id=device_group, - prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, - generation_len=generation_len, - ) + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): + processor = AutoProcessor.from_pretrained(model_name, use_fast=False) + + raw_image = None + if image_url is not None: + raw_image = Image.open(requests.get(image_url, stream=True).raw) + elif image_path is not None: + raw_image = Image.open(image_path) + else: + raise FileNotFoundError( + 'Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"' + ) + + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": prompt[0]}, # Currently accepting only 1 prompt + ], + }, + ] + + # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. 
+ input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + + split_inputs = processor( + text=input_text, + images=raw_image, + return_tensors="pt", + add_special_tokens=False, + ) + streamer = TextStreamer(processor.tokenizer) + _ = qeff_model.generate( + inputs=split_inputs, + streamer=streamer, + device_ids=device_group, + generation_len=generation_len, + ) + else: + _ = qeff_model.generate( + tokenizer, + prompts=prompt, + device_id=device_group, + prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, + generation_len=generation_len, + ) if __name__ == "__main__": @@ -226,10 +283,11 @@ def main( Sample Config: QEfficient/compile/qnn_config.json", ) parser.add_argument( - "qnn_config", + "--qnn_config", nargs="?", type=str, ) + parser.add_argument("--img-size", "--img_size", default=None, type=int, required=False, help="Size of Image") args, compiler_options = parser.parse_known_args() compiler_options_dict = {} diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index a87c39fb4..b127c50a0 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -615,6 +615,8 @@ def compile( ) output_names = self.model.get_output_names(kv_offload=True) + vision_onnx_path = compiler_options.get("vision_onnx_path", None) + lang_onnx_path = compiler_options.get("lang_onnx_path", None) specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -826,7 +828,7 @@ def kv_offload_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - return CloudAI100ExecInfoNew( + exec_info = CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( @@ -834,6 +836,9 @@ def kv_offload_generate( ), ) + print(exec_info) + return exec_info + class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin): _hf_auto_class = AutoModelForImageTextToText @@ -1116,7 +1121,7 @@ def cloud_ai_100_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - return CloudAI100ExecInfoNew( + exec_info = CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( @@ -1124,6 +1129,9 @@ def cloud_ai_100_generate( ), ) + print(exec_info) + return exec_info + @property def model_hash(self) -> str: mhash = hashlib.sha256() @@ -1239,6 +1247,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona if kwargs.get("low_cpu_mem_usage", None): logger.warning("Updating low_cpu_mem_usage=False") + if kwargs.pop("continuous_batching", None): + NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls(model, kv_offload=kv_offload, **kwargs) @@ -1560,6 +1571,9 @@ def compile( decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... 
specializations.append(decode_specialization) + if compiler_options.pop("img_size", None): + logger.warning("img_size is not a valid argument for Text-to-Text Model.") + if enable_qnn: if compiler_options: logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") From ea8555dc857aa05ff9554afcb44c5440e13c4c8a Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 28 Feb 2025 12:18:16 +0000 Subject: [PATCH 06/20] Addressing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 25 ++++++++----------- QEfficient/cloud/infer.py | 18 ++++++------- .../transformers/models/modeling_auto.py | 6 +---- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index bcf5b1575..228d1184e 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -13,7 +13,6 @@ """ import importlib -from collections import OrderedDict from typing import Any import transformers.models.auto.modeling_auto as mapping @@ -21,12 +20,12 @@ from QEfficient.base.modeling_qeff import QEFFBaseModel -MODEL_CLASS_MAPPING = OrderedDict( - [ - (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"), - (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"), - ] -) +MODEL_CLASS_MAPPING = {} +for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForCausalLM" + +for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): + MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForImageTextToText" class QEFFCommonLoader: @@ -50,13 +49,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - model_class = None - for key_tuple, class_name in MODEL_CLASS_MAPPING.items(): - if architecture in key_tuple: - module = importlib.import_module("QEfficient.transformers.models.modeling_auto") - model_class = getattr(module, class_name) - break - if model_class is None: + class_name = MODEL_CLASS_MAPPING.get(architecture) + if class_name: + module = importlib.import_module("QEfficient.transformers.models.modeling_auto") + model_class = getattr(module, class_name) + else: raise NotImplementedError( f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" 
) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c93dde55f..0ac38cb6a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,7 +12,7 @@ import requests from PIL import Image -from transformers import AutoConfig, AutoProcessor, TextStreamer +from transformers import AutoProcessor, TextStreamer from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader @@ -121,16 +121,10 @@ def main( **kwargs, ) - tokenizer = load_hf_tokenizer( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - ) - ######### # Execute ######### - config = AutoConfig.from_pretrained(model_name) + config = qeff_model.model.config architecture = config.architectures[0] if config.architectures else None if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): @@ -166,13 +160,19 @@ def main( add_special_tokens=False, ) streamer = TextStreamer(processor.tokenizer) - _ = qeff_model.generate( + output = qeff_model.generate( inputs=split_inputs, streamer=streamer, device_ids=device_group, generation_len=generation_len, ) + print(output) else: + tokenizer = load_hf_tokenizer( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + ) _ = qeff_model.generate( tokenizer, prompts=prompt, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index b127c50a0..b668ceff5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -835,8 +835,6 @@ def kv_offload_generate( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - - print(exec_info) return exec_info @@ -1128,8 +1126,6 @@ def cloud_ai_100_generate( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - - print(exec_info) return exec_info @property @@ -1572,7 +1568,7 @@ def compile( specializations.append(decode_specialization) if compiler_options.pop("img_size", None): - logger.warning("img_size is not a valid argument for Text-to-Text Model.") + logger.warning(f"Skipping img_size as it is not a valid argument for {self.model.config.architectures[0]}.") if enable_qnn: if compiler_options: From 5ea6f1c6e5dc2e6a03001763c94dc4dc9334d8ab Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 6 Mar 2025 05:55:28 +0000 Subject: [PATCH 07/20] Removed importlib Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index 228d1184e..ee27d7565 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -12,7 +12,6 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. 
""" -import importlib from typing import Any import transformers.models.auto.modeling_auto as mapping @@ -51,7 +50,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> class_name = MODEL_CLASS_MAPPING.get(architecture) if class_name: - module = importlib.import_module("QEfficient.transformers.models.modeling_auto") + module = __import__("QEfficient.transformers.models.modeling_auto") model_class = getattr(module, class_name) else: raise NotImplementedError( From 561142bf6b1ba4866d221c9b8e301a10cff973bb Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Mar 2025 07:25:47 +0000 Subject: [PATCH 08/20] Addressing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 11 +--- QEfficient/cloud/infer.py | 57 ++++++++----------- QEfficient/transformers/modeling_utils.py | 11 ++++ .../transformers/models/modeling_auto.py | 5 -- QEfficient/utils/_utils.py | 2 +- QEfficient/utils/constants.py | 9 +++ 6 files changed, 47 insertions(+), 48 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index ee27d7565..ad699dcfa 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -14,18 +14,11 @@ from typing import Any -import transformers.models.auto.modeling_auto as mapping +from QEfficient.transformers.modeling_utils import model_class_mapping from transformers import AutoConfig from QEfficient.base.modeling_qeff import QEFFBaseModel -MODEL_CLASS_MAPPING = {} -for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForCausalLM" - -for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): - MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForImageTextToText" - class QEFFCommonLoader: """ @@ -48,7 +41,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - class_name = MODEL_CLASS_MAPPING.get(architecture) + class_name = model_class_mapping.get(architecture) if class_name: module = __import__("QEfficient.transformers.models.modeling_auto") model_class = getattr(module, class_name) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ac38cb6a..88a968c8c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, constants from QEfficient.utils.logging_utils import logger @@ -41,7 +41,6 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, - img_size: Optional[int] = None, **kwargs, ) -> None: """ @@ -89,9 +88,6 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - image_path = kwargs.pop("image_path", None) - image_url = kwargs.pop("image_url", None) - qeff_model = QEFFCommonLoader.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=cache_dir, @@ -100,6 +96,16 @@ def main( local_model_dir=local_model_dir, ) + image_path = kwargs.pop("image_path", None) + image_url = kwargs.pop("image_url", None) + + config = qeff_model.model.config + architecture = 
config.architectures[0] if config.architectures else None + if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): + img_size = kwargs.pop("img_size", None) + if img_size or image_path or image_url: + logger.warning(f"Skipping image arguments as they are not valid for {architecture}") + ######### # Compile ######### @@ -117,38 +123,21 @@ def main( allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, qnn_config=qnn_config, - img_size=img_size, **kwargs, ) ######### # Execute ######### - config = qeff_model.model.config - architecture = config.architectures[0] if config.architectures else None - if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): processor = AutoProcessor.from_pretrained(model_name, use_fast=False) - raw_image = None - if image_url is not None: - raw_image = Image.open(requests.get(image_url, stream=True).raw) - elif image_path is not None: - raw_image = Image.open(image_path) - else: - raise FileNotFoundError( - 'Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"' - ) + if not (image_url or image_path): + raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') + raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": prompt[0]}, # Currently accepting only 1 prompt - ], - }, - ] + conversation = constants.Constants.conversation + conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) @@ -277,19 +266,21 @@ def main( "--enable_qnn", "--enable-qnn", action="store_true", + nargs="?", + const=True, + type=str, default=False, help="Enables QNN. 
Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\ If not provided, the default configuration will be used.\ Sample Config: QEfficient/compile/qnn_config.json", ) - parser.add_argument( - "--qnn_config", - nargs="?", - type=str, - ) - parser.add_argument("--img-size", "--img_size", default=None, type=int, required=False, help="Size of Image") args, compiler_options = parser.parse_known_args() + + if isinstance(args.enable_qnn, str): + args.qnn_config = args.enable_qnn + args.enable_qnn = True + compiler_options_dict = {} for i in range(0, len(compiler_options)): if compiler_options[i].startswith("--"): diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index ccad5e020..aad76cb37 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -8,6 +8,8 @@ from collections import namedtuple from typing import Dict, Optional, Tuple, Type +import transformers.models.auto.modeling_auto as mapping + import torch import torch.nn as nn from transformers.models.codegen.modeling_codegen import ( @@ -272,6 +274,15 @@ } +model_class_mapping = { + **{architecture: "QEFFAutoModelForCausalLM" for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()}, + **{ + architecture: "QEFFAutoModelForImageTextToText" + for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() + }, +} + + def _prepare_cross_attention_mask( cross_attention_mask: torch.Tensor, num_vision_tokens: int, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index b668ceff5..4411eab95 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -615,8 +615,6 @@ def compile( ) output_names = self.model.get_output_names(kv_offload=True) - vision_onnx_path = compiler_options.get("vision_onnx_path", None) - lang_onnx_path = compiler_options.get("lang_onnx_path", None) specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -1567,9 +1565,6 @@ def compile( decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... 
specializations.append(decode_specialization) - if compiler_options.pop("img_size", None): - logger.warning(f"Skipping img_size as it is not a valid argument for {self.model.config.architectures[0]}.") - if enable_qnn: if compiler_options: logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index ea9044e2c..d4e1ac4bf 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -504,7 +504,7 @@ def create_and_dump_qconfigs( # Extract QNN SDK details from YAML file if the environment variable is set qnn_sdk_details = None qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) - if qnn_sdk_path: + if enable_qnn and qnn_sdk_path: qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML) with open(qnn_sdk_yaml_path, "r") as file: qnn_sdk_details = yaml.safe_load(file) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 3852adcda..8cf100c93 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -76,6 +76,15 @@ class Constants: MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text"}, + ], + } + ] @dataclass From d9dc7d28773fc670ede62643e394ac06d72b27ea Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Mar 2025 07:31:37 +0000 Subject: [PATCH 09/20] Addressing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 2 +- QEfficient/cloud/infer.py | 2 +- QEfficient/transformers/modeling_utils.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index ad699dcfa..9abbb3e94 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -14,10 +14,10 @@ from typing import Any -from QEfficient.transformers.modeling_utils import model_class_mapping from transformers import AutoConfig from QEfficient.base.modeling_qeff import QEFFBaseModel +from QEfficient.transformers.modeling_utils import model_class_mapping class QEFFCommonLoader: diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 88a968c8c..59b61bd15 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, constants +from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer from QEfficient.utils.logging_utils import logger diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index aad76cb37..1770feeea 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -8,10 +8,9 @@ from collections import namedtuple from typing import Dict, Optional, Tuple, Type -import transformers.models.auto.modeling_auto as mapping - import torch import torch.nn as nn +import transformers.models.auto.modeling_auto as mapping from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, CodeGenBlock, From ca55d42246f8b36d584a5a09bd87d3bf7c78f30d 
Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 2 Apr 2025 07:52:10 +0000 Subject: [PATCH 10/20] Addressed Comments Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 70 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 59b61bd15..09585b066 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,13 +12,48 @@ import requests from PIL import Image -from transformers import AutoProcessor, TextStreamer +from transformers import AutoProcessor, TextStreamer, PreTrainedModel from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer from QEfficient.utils.logging_utils import logger +def execute_vlm_model( + qeff_model: PreTrainedModel, + model_name: str, + image_url: str, + image_path: str, + prompt: Optional[str] = None, #type: ignore + device_group: Optional[List[int]] = None, + generation_len: Optional[int] = None, +): + if not (image_url or image_path): + raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') + raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) + + processor = AutoProcessor.from_pretrained(model_name, use_fast=False) + + conversation = constants.Constants.conversation + conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt + + # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. + input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + + split_inputs = processor( + text=input_text, + images=raw_image, + return_tensors="pt", + add_special_tokens=False, + ) + streamer = TextStreamer(processor.tokenizer) + output = qeff_model.generate( + inputs=split_inputs, + streamer=streamer, + device_ids=device_group, + generation_len=generation_len, + ) + return output def main( model_name: str, @@ -130,32 +165,16 @@ def main( # Execute ######### if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): - processor = AutoProcessor.from_pretrained(model_name, use_fast=False) - - if not (image_url or image_path): - raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') - raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) - - conversation = constants.Constants.conversation - conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt - - # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. 
- input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - - split_inputs = processor( - text=input_text, - images=raw_image, - return_tensors="pt", - add_special_tokens=False, - ) - streamer = TextStreamer(processor.tokenizer) - output = qeff_model.generate( - inputs=split_inputs, - streamer=streamer, - device_ids=device_group, + exec_info = execute_vlm_model( + qeff_model=qeff_model, + model_name=model_name, + prompt=prompt, + image_url=image_url, + image_path=image_path, + device_group=device_group, generation_len=generation_len, ) - print(output) + print(exec_info) else: tokenizer = load_hf_tokenizer( pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), @@ -265,7 +284,6 @@ def main( parser.add_argument( "--enable_qnn", "--enable-qnn", - action="store_true", nargs="?", const=True, type=str, From 7a4d18e8b11f1f84fae9cb7da8e29c556e1c3b4e Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 2 Apr 2025 08:02:46 +0000 Subject: [PATCH 11/20] Ruff check and format Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 09585b066..11916dd8d 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,24 +12,25 @@ import requests from PIL import Image -from transformers import AutoProcessor, TextStreamer, PreTrainedModel +from transformers import AutoProcessor, PreTrainedModel, TextStreamer from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer from QEfficient.utils.logging_utils import logger + def execute_vlm_model( qeff_model: PreTrainedModel, model_name: str, image_url: str, image_path: str, - prompt: Optional[str] = None, #type: ignore + prompt: Optional[str] = None, # type: ignore device_group: Optional[List[int]] = None, generation_len: Optional[int] = None, ): if not (image_url or image_path): - raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') + raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) processor = AutoProcessor.from_pretrained(model_name, use_fast=False) @@ -55,6 +56,7 @@ def execute_vlm_model( ) return output + def main( model_name: str, num_cores: int, From adaee62bdb474898ef8d2990755d1ab5a7c470d4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 14 Apr 2025 13:38:09 +0000 Subject: [PATCH 12/20] Adderssing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 4 ++-- QEfficient/cloud/infer.py | 21 ++++++++++++++------- QEfficient/transformers/modeling_utils.py | 2 +- QEfficient/utils/constants.py | 9 --------- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index 9abbb3e94..7836d5988 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -17,7 +17,7 @@ from transformers import AutoConfig from QEfficient.base.modeling_qeff import QEFFBaseModel -from QEfficient.transformers.modeling_utils import model_class_mapping +from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING class QEFFCommonLoader: @@ -41,7 +41,7 @@ def 
from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - class_name = model_class_mapping.get(architecture) + class_name = MODEL_CLASS_MAPPING.get(architecture) if class_name: module = __import__("QEfficient.transformers.models.modeling_auto") model_class = getattr(module, class_name) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 11916dd8d..0ff8cb3b9 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -35,8 +35,16 @@ def execute_vlm_model( processor = AutoProcessor.from_pretrained(model_name, use_fast=False) - conversation = constants.Constants.conversation - conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt + # Added for QEff version 1.20 supported VLM models (mllama and llava) + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": prompt[0]}, + ], + } + ] # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) @@ -138,9 +146,8 @@ def main( config = qeff_model.model.config architecture = config.architectures[0] if config.architectures else None - if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): - img_size = kwargs.pop("img_size", None) - if img_size or image_path or image_url: + + if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (kwargs.pop("img_size", None) or image_path or image_url): logger.warning(f"Skipping image arguments as they are not valid for {architecture}") ######### @@ -304,7 +311,7 @@ def main( compiler_options_dict = {} for i in range(0, len(compiler_options)): if compiler_options[i].startswith("--"): - key = compiler_options[i].lstrip("-") + key = compiler_options[i].lstrip("-").replace("-", "_") value = ( compiler_options[i + 1] if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 1770feeea..454fcf51d 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -273,7 +273,7 @@ } -model_class_mapping = { +MODEL_CLASS_MAPPING = { **{architecture: "QEFFAutoModelForCausalLM" for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()}, **{ architecture: "QEFFAutoModelForImageTextToText" diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 0f44104e9..c2663594f 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -86,15 +86,6 @@ class Constants: MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. 
- conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text"}, - ], - } - ] @dataclass From f7c84b7fbed0b730dbb9426adccc64b451761d11 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 14 Apr 2025 13:46:28 +0000 Subject: [PATCH 13/20] Ruff format Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ff8cb3b9..8b36910f5 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -147,8 +147,10 @@ def main( config = qeff_model.model.config architecture = config.architectures[0] if config.architectures else None - if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (kwargs.pop("img_size", None) or image_path or image_url): - logger.warning(f"Skipping image arguments as they are not valid for {architecture}") + if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and ( + kwargs.pop("img_size", None) or image_path or image_url + ): + logger.warning(f"Skipping image arguments as they are not valid for {architecture}") ######### # Compile From 893e322af04ee5a3e9bacfa33161f755349af54e Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 17 Apr 2025 08:56:50 +0000 Subject: [PATCH 14/20] Added VLM CLI test and addressed comments Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 15 ++++++++ .../transformers/models/modeling_auto.py | 6 ++-- tests/cloud/test_infer_vlm.py | 34 +++++++++++++++++++ 3 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 tests/cloud/test_infer_vlm.py diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 8b36910f5..f99070f94 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -29,6 +29,21 @@ def execute_vlm_model( device_group: Optional[List[int]] = None, generation_len: Optional[int] = None, ): + """ + This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + ``Mandatory`` Args: + :qeff_model (PreTrainedModel): QEfficient model object. + :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf`` + :num_cores (int): Number of cores to compile model on. + :image_url (str): Image URL to be used for inference. ``Defaults to None.`` + :image_path (str): Image path to be used for inference. ``Defaults to None.`` + ``Optional`` Args: + :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` + :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.`` + :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` + Returns: + :dict: Output from the ``AI_100`` runtime. 
+ """ if not (image_url or image_path): raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ee486811a..3295af67c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -828,14 +828,13 @@ def kv_offload_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - exec_info = CloudAI100ExecInfoNew( + return CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - return exec_info class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin): @@ -1119,14 +1118,13 @@ def cloud_ai_100_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - exec_info = CloudAI100ExecInfoNew( + return CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - return exec_info @property def model_hash(self) -> str: diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py new file mode 100644 index 000000000..435f0fde8 --- /dev/null +++ b/tests/cloud/test_infer_vlm.py @@ -0,0 +1,34 @@ +import pytest + +from QEfficient.cloud.infer import main as infer + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.multimodal +@pytest.mark.usefixtures("clean_up_after_test") +def test_vlm_cli(setup, mocker): + ms = setup + # Taking some values from setup fixture and assigning other's based on model's requirement. + # For example, mxint8 is not required for VLM models, so assigning False. + infer( + model_name="llava-hf/llava-1.5-7b-hf", + num_cores=ms.num_cores, + prompt="Describe the image.", + prompts_txt_file_path=None, + aic_enable_depth_first=ms.aic_enable_depth_first, + mos=ms.mos, + batch_size=1, + full_batch_size=None, + prompt_len=1024, + ctx_len=2048, + generation_len=ms.generation_len, + mxfp6=False, + mxint8=False, + local_model_dir=None, + cache_dir=None, + hf_token=ms.hf_token, + enable_qnn=False, + qnn_config=None, + image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + ) From 9cde30bca0682e3efe84efe379d8bedcf27ef6b2 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 17 Apr 2025 09:17:35 +0000 Subject: [PATCH 15/20] Added Copyrights Signed-off-by: Asmita Goswami --- tests/cloud/test_infer_vlm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py index 435f0fde8..b5e71dc11 100644 --- a/tests/cloud/test_infer_vlm.py +++ b/tests/cloud/test_infer_vlm.py @@ -1,3 +1,10 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + import pytest from QEfficient.cloud.infer import main as infer From 3a1e5b9f58b1e71f8b4d4d5d3cf505917c2ca873 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 17 Apr 2025 09:26:17 +0000 Subject: [PATCH 16/20] Added Copyrights Signed-off-by: Asmita Goswami --- tests/cloud/test_infer_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py index b5e71dc11..d06e09946 100644 --- a/tests/cloud/test_infer_vlm.py +++ b/tests/cloud/test_infer_vlm.py @@ -29,7 +29,7 @@ def test_vlm_cli(setup, mocker): full_batch_size=None, prompt_len=1024, ctx_len=2048, - generation_len=ms.generation_len, + generation_len=20, mxfp6=False, mxint8=False, local_model_dir=None, From d4af07a49aa57915e0b281c286bf64eb509a4dd1 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 21 Apr 2025 09:36:49 +0000 Subject: [PATCH 17/20] Ruff format Signed-off-by: Asmita Goswami --- scripts/Jenkinsfile | 2 +- tests/transformers/spd/test_pld_inference.py | 6 +++--- tests/transformers/spd/test_spd_inference.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index f3f2fc2d8..53fcab1c7 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -69,7 +69,7 @@ pipeline { } stage('CLI Tests') { steps { - timeout(time: 15, unit: 'MINUTES') { + timeout(time: 150, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && diff --git a/tests/transformers/spd/test_pld_inference.py b/tests/transformers/spd/test_pld_inference.py index 88d86a9be..e5d472734 100644 --- a/tests/transformers/spd/test_pld_inference.py +++ b/tests/transformers/spd/test_pld_inference.py @@ -145,9 +145,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int): """ num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len - assert ( - input_len_padded <= ctx_len - ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + assert input_len_padded <= ctx_len, ( + "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + ) return input_len_padded diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index 39dbd95cb..b78afdc38 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -75,9 +75,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int): """ num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len - assert ( - input_len_padded <= ctx_len - ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + assert input_len_padded <= ctx_len, ( + "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + ) return input_len_padded @@ -320,9 +320,9 @@ def test_spec_decode_inference( for prompt, generation in zip(prompts, batch_decode): print(f"{prompt=} {generation=}") # validation check - assert mean_num_accepted_tokens == float( - num_speculative_tokens + 1 - ), f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be 
{num_speculative_tokens + 1}" + assert mean_num_accepted_tokens == float(num_speculative_tokens + 1), ( + f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}" + ) del target_model_session del draft_model_session generated_ids = np.asarray(generated_ids[0]).flatten() From 9441120ad2c0a0ef7de6a76d6c736820941ce7e6 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Apr 2025 09:02:23 +0000 Subject: [PATCH 18/20] Addressed Comments Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 1 - scripts/Jenkinsfile | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f99070f94..13e6e5c73 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -34,7 +34,6 @@ def execute_vlm_model( ``Mandatory`` Args: :qeff_model (PreTrainedModel): QEfficient model object. :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf`` - :num_cores (int): Number of cores to compile model on. :image_url (str): Image URL to be used for inference. ``Defaults to None.`` :image_path (str): Image path to be used for inference. ``Defaults to None.`` ``Optional`` Args: diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 53fcab1c7..24113f9c8 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -69,7 +69,7 @@ pipeline { } stage('CLI Tests') { steps { - timeout(time: 150, unit: 'MINUTES') { + timeout(time: 60, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && From e138828c7c7ba04ee60a1b738cd8a027b58807c7 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Apr 2025 10:16:33 +0000 Subject: [PATCH 19/20] Updated load_hf_processor Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 20 +++++++++++++++++--- QEfficient/utils/__init__.py | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 13e6e5c73..7e1f321d7 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,14 +12,15 @@ import requests from PIL import Image -from transformers import AutoProcessor, PreTrainedModel, TextStreamer +from transformers import PreTrainedModel, TextStreamer from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_hf_processor from QEfficient.utils.logging_utils import logger +# TODO: Remove after adding support for VLM's compile and execute def execute_vlm_model( qeff_model: PreTrainedModel, model_name: str, @@ -27,6 +28,9 @@ def execute_vlm_model( image_path: str, prompt: Optional[str] = None, # type: ignore device_group: Optional[List[int]] = None, + local_model_dir: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, generation_len: Optional[int] = None, ): """ @@ -39,6 +43,9 @@ def execute_vlm_model( ``Optional`` Args: :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. 
``Defaults to None.`` + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` Returns: :dict: Output from the ``AI_100`` runtime. @@ -47,7 +54,11 @@ def execute_vlm_model( raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) - processor = AutoProcessor.from_pretrained(model_name, use_fast=False) + processor = load_hf_processor( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + ) # Added for QEff version 1.20 supported VLM models (mllama and llava) conversation = [ @@ -197,6 +208,9 @@ def main( image_url=image_url, image_path=image_path, device_group=device_group, + local_model_dir=local_model_dir, + cache_dir=cache_dir, + hf_token=hf_token, generation_len=generation_len, ) print(exec_info) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index a7f17e6bc..5bc2d5efa 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -18,6 +18,7 @@ get_qpc_dir_path, hf_download, load_hf_tokenizer, + load_hf_processor, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, From 28fd36196acfe22e92d64c668d4d5cb8ff80491f Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Apr 2025 10:18:08 +0000 Subject: [PATCH 20/20] Ruff check fix Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 2 +- QEfficient/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 7e1f321d7..68be72fa8 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_hf_processor +from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer from QEfficient.utils.logging_utils import logger diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 5bc2d5efa..f6aa3296d 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -17,8 +17,8 @@ get_padding_shape_from_config, get_qpc_dir_path, hf_download, - load_hf_tokenizer, load_hf_processor, + load_hf_tokenizer, login_and_download_hf_lm, onnx_exists, padding_check_and_fix,
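
Editor's note: below is a minimal, self-contained sketch (not part of the patch series) of the CLI behaviour these patches give `QEfficient/cloud/infer.py`: `--enable_qnn` accepts an optional config path (earlier in the series), and PATCH 12 normalises dashes to underscores when collecting leftover `--key value` compiler flags. The example flag name passed on the command line is illustrative only.

```
# Sketch of the "--enable_qnn [CONFIG_FILE]" handling and leftover-flag parsing
# shown in the patches above; assumes nothing beyond the standard library.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_qnn",
    "--enable-qnn",
    nargs="?",
    const=True,
    type=str,
    default=False,
    help="Enable QNN compilation; optionally pass a qnn_config.json path.",
)
args, compiler_options = parser.parse_known_args()

# Normalise "--enable_qnn path/to/qnn_config.json" into (enable_qnn=True, qnn_config=path),
# mirroring the isinstance(str) check added in infer.py.
qnn_config = None
if isinstance(args.enable_qnn, str):
    qnn_config = args.enable_qnn
    args.enable_qnn = True

# Collect unknown "--key value" pairs into a dict, replacing dashes with underscores
# as done in PATCH 12 (e.g. a hypothetical "--some-compiler-flag 1" -> {"some_compiler_flag": "1"}).
compiler_options_dict = {}
for i in range(len(compiler_options)):
    if compiler_options[i].startswith("--"):
        key = compiler_options[i].lstrip("-").replace("-", "_")
        value = (
            compiler_options[i + 1]
            if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
            else True
        )
        compiler_options_dict[key] = value

print(args.enable_qnn, qnn_config, compiler_options_dict)
```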