From 89771e8f8ccaa82746f2665ad0da8b04fc955eb5 Mon Sep 17 00:00:00 2001 From: shubhagr-quic Date: Mon, 3 Mar 2025 13:31:49 +0530 Subject: [PATCH 01/20] Removed onnx_defer_loading flag. (#295) Removing the onnx_defer_loading flag, which was originally removed in _[Removed onnx_defer_loading from Immutable Convertor Args. PR: 230]_ but was added back later in _[Mllama(single + dual) + InternVL(single) + Llava (single) PR: 267]_, likely because of rebasing. Signed-off-by: Shubham Agrawal Signed-off-by: Asmita Goswami --- QEfficient/utils/constants.py | 1 - 1 file changed, 1 deletion(-) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index a5cc6fda1..6c2bba0c6 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -136,7 +136,6 @@ class QnnConstants: "--float_bitwidth ", "--preserve_io_datatype", "--onnx_skip_simplification", - "--onnx_defer_loading", ] IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [ From b3736a4999c1283d0075b801cdacad3e51943a0e Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Mon, 3 Mar 2025 20:29:36 +0530 Subject: [PATCH 02/20] Code for SDK configs Inclusion (#203) This will create a config JSON file, which contains all the details about compilation and SDK versions. Currently, this code is added in the code block of QEFFAutoModelForCausalLM.compile. The config would look like below: ``` { "huggingface_config": { "vocab_size": 50257, "n_positions": 1024, "n_embd": 768, "n_layer": 12, "n_head": 12, "n_inner": null, "activation_function": "gelu_new", "resid_pdrop": 0.1, "embd_pdrop": 0.1, "attn_pdrop": 0.1, "layer_norm_epsilon": 1e-05, "initializer_range": 0.02, "summary_type": "cls_index", "summary_use_proj": true, "summary_activation": null, "summary_first_dropout": 0.1, "summary_proj_to_labels": true, "scale_attn_weights": true, "use_cache": true, "scale_attn_by_inverse_layer_idx": false, "reorder_and_upcast_attn": false, "bos_token_id": 50256, "eos_token_id": 50256, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": null, "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "chunk_size_feed_forward": 0, "is_encoder_decoder": false, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 20, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, "temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "suppress_tokens": null, "begin_suppress_tokens": null, "architectures": [ "GPT2LMHeadModel" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "tokenizer_class": null, "prefix": null, "pad_token_id": null, "sep_token_id": null, "decoder_start_token_id": null, "task_specific_params": { "text-generation": { "do_sample": true, "max_length": 50 } }, "problem_type": null, "_name_or_path": "gpt2", "_commit_hash": "607a30d783dfa663caf39e06633721c8d4cfcd7e", "_attn_implementation_internal": "eager", "transformers_version": null, "model_type": "gpt2", "n_ctx": 1024 }, 
"qpc_config": { "QEff_config": { "pytorch_transforms": [ "AwqToMatmulNbitsTransform", "GPTQToMatmulNbitsTransform", "CustomOpsTransform", "KVCacheTransform" ], "onnx_transforms": [ "FP16ClipTransform", "SplitTensorsTransform" ], "onnx_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/GPT2LMHeadModel.onnx" }, "aic_compiler_config": { "apps_sdk_version": "1.20.0", "compile_dir": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47", "specializtions_file_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/specializations.json", "prefill_seq_len": 32, "ctx_len": 128, "batch_size": 1, "full_batch_size": null, "num_devices": 1, "num_cores": 16, "mxfp6_matmul": false, "mxint8_kv_cache": false, "num_speculative_tokens": null }, "qnn_config": { "enable_qnn": true, "qnn_config_path": "QEfficient/compile/qnn_config.json", "product": "QAIRT", "os": { "Ubuntu": 22.04, "Windows": 11 }, "sdk_flavor": [ "aic" ], "version": "2.31.0", "build_id": "250109072054_3882", "qnn_backend_api_version": "2.18.0", "tensorflow": "2.10.1", "tflite": "2.3.0", "torch": "1.13.1", "onnx": "1.16.1", "onnxruntime": "1.17.1", "onnxsimplifier": "0.4.36", "android-ndk": "r26c", "platform": "AIC.1.20.0.14" } } } ``` Note: The code structure may change. --------- Signed-off-by: Abukhoyer Shaik Signed-off-by: Asmita Goswami --- QEfficient/base/modeling_qeff.py | 6 +- QEfficient/peft/auto.py | 4 + QEfficient/peft/lora/auto.py | 4 + .../transformers/models/modeling_auto.py | 24 ++++ QEfficient/utils/__init__.py | 1 + QEfficient/utils/_utils.py | 114 +++++++++++++++++- QEfficient/utils/constants.py | 2 + tests/peft/lora/test_lora_model.py | 4 + tests/peft/test_peft_model.py | 2 + tests/qnn_tests/test_causal_lm_models_qnn.py | 8 +- tests/text_generation/test_text_generation.py | 3 + .../models/test_causal_lm_models.py | 7 +- .../models/test_embedding_models.py | 2 + .../models/test_prefix_caching.py | 2 + .../models/test_speech_seq2seq_models.py | 1 + tests/transformers/spd/test_spd_inference.py | 3 + tests/transformers/test_causal_lm.py | 2 + tests/transformers/test_speech_seq2seq.py | 2 + 18 files changed, 185 insertions(+), 6 deletions(-) diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index ec74c57f3..f2b3714fa 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -23,7 +23,7 @@ from QEfficient.base.pytorch_transforms import PytorchTransform from QEfficient.compile.qnn_compiler import compile as qnn_compile from QEfficient.generation.cloud_infer import QAICInferenceSession -from QEfficient.utils import constants +from QEfficient.utils import constants, dump_qconfig from QEfficient.utils._utils import load_json from QEfficient.utils.cache import QEFF_HOME, to_hashable @@ -211,6 +211,7 @@ def _export( self.onnx_path = onnx_path return onnx_path + @dump_qconfig def _compile( self, onnx_path: Optional[str] = None, @@ -336,8 +337,10 @@ def _compile( ) self.qpc_path = qpc_path + return qpc_path + @dump_qconfig def _qnn_compile( self, onnx_path: Optional[str] = None, @@ -435,4 +438,5 @@ def _qnn_compile( ) self.qpc_path = qpc_path + return qpc_path diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py index 377caa3e7..deb64fae1 100644 --- a/QEfficient/peft/auto.py +++ b/QEfficient/peft/auto.py @@ -107,6 +107,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.get_base_model().config.__dict__ + def load_adapter(self, model_id: str, 
adapter_name: str): """Loads a new adapter from huggingface hub or local path diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py index c13979968..7f2a5cd84 100644 --- a/QEfficient/peft/lora/auto.py +++ b/QEfficient/peft/lora/auto.py @@ -90,6 +90,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.model.config.__dict__ + def download_adapter( self, adapter_model_id: str, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 54b7828c8..5852740b4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -229,6 +229,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. @@ -447,6 +451,10 @@ def model_name(self) -> str: mname = mname[4:] return mname + @property + def get_model_config(self) -> dict: + return self.model.model.vision_model.config.__dict__ + class QEffCausalLMForTextImageToTextModel(QEFFBaseModel): _pytorch_transforms = [ @@ -506,6 +514,10 @@ def model_name(self) -> str: mname = mname[4:] return mname + @property + def get_model_config(self) -> dict: + return self.model.language_model.config.__dict__ + class _QEffAutoModelForImageTextToTextDualQPC: _hf_auto_class = AutoModelForImageTextToText @@ -1128,6 +1140,10 @@ def model_name(self) -> str: mname = mname[4:] return mname + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + class QEFFAutoModelForImageTextToText: """ @@ -1320,6 +1336,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. @@ -1630,6 +1650,10 @@ def model_hash(self) -> str: mhash = mhash.hexdigest()[:16] return mhash + @property + def get_model_config(self) -> dict: + return self.model.config.__dict__ + def export(self, export_dir: Optional[str] = None) -> str: """ Exports the model to ``ONNX`` format using ``torch.onnx.export``. 
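The `QEfficient/utils/_utils.py` hunks below add the `dump_qconfig` decorator that writes this qconfig.json next to the compiled QPC. As a rough usage sketch (not part of this patch; the "gpt2" model card and the compile arguments are only placeholders, and the standard `QEfficient` import path is assumed), the dumped file can be inspected after compilation like this:

```python
import json
import os

from QEfficient import QEFFAutoModelForCausalLM

# Compile any supported causal LM; _compile() is wrapped by @dump_qconfig,
# so a qconfig.json is written alongside the generated QPC.
qeff_model = QEFFAutoModelForCausalLM.from_pretrained("gpt2")  # placeholder model card
qeff_model.compile(num_cores=16)  # placeholder compile options

qconfig_path = os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")
with open(qconfig_path) as f:
    qconfig = json.load(f)

# Keys follow the sample shown in the commit message above.
print(qconfig["qpc_config"]["QEff_config"]["pytorch_transforms"])
```

Because the dump happens inside a decorator around `_compile`/`_qnn_compile`, existing compile call sites stay unchanged and the config file is produced purely as a side effect of compilation.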
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 2506b9233..a7f17e6bc 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -11,6 +11,7 @@ ) from QEfficient.utils._utils import ( # noqa: F401 check_and_assign_cache_dir, + dump_qconfig, get_num_layers_from_config, get_onnx_dir_name, get_padding_shape_from_config, diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index 8344a053d..ea9044e2c 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -8,16 +8,18 @@ import json import os import subprocess +import xml.etree.ElementTree as ET from dataclasses import dataclass from typing import Any, Dict, List, Optional, Tuple, Union import requests import torch +import yaml from huggingface_hub import login, snapshot_download from requests.exceptions import HTTPError from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast -from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants +from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants from QEfficient.utils.logging_utils import logger @@ -442,3 +444,113 @@ class IOInfo: def __repr__(self): return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}" + + +def dump_qconfig(func): + def wrapper(self, *args, **kwargs): + result = func(self, *args, **kwargs) + create_and_dump_qconfigs( + self.qpc_path, + self.onnx_path, + self.get_model_config, + [cls.__name__ for cls in self._pytorch_transforms], + [cls.__name__ for cls in self._onnx_transforms], + kwargs.get("specializations"), + kwargs.get("mdp_ts_num_devices", 1), + kwargs.get("num_speculative_tokens"), + **{ + k: v + for k, v in kwargs.items() + if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"] + }, + ) + return result + + return wrapper + + +def create_and_dump_qconfigs( + qpc_path, + onnx_path, + huggingface_config, + pytorch_transforms, + onnx_transforms, + specializations, + mdp_ts_num_devices, + num_speculative_tokens, + **compiler_options, +): + """ + This Method creates a JSON file which contains all the configs for a model. + Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and + many other compilation options. 
+ """ + qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None + enable_qnn = True if "qnn_config" in compiler_options else None + + qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json") + onnx_path = str(onnx_path) + specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json")) + compile_dir = str(os.path.dirname(qpc_path)) + qnn_config_path = ( + (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None + ) + + # Extract QAIC SDK Apps Version from SDK XML file + tree = ET.parse(Constants.SDK_APPS_XML) + root = tree.getroot() + qaic_version = root.find(".//base_version").text + + # Extract QNN SDK details from YAML file if the environment variable is set + qnn_sdk_details = None + qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) + if qnn_sdk_path: + qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML) + with open(qnn_sdk_yaml_path, "r") as file: + qnn_sdk_details = yaml.safe_load(file) + + # Ensure all objects in the configs dictionary are JSON serializable + def make_serializable(obj): + if isinstance(obj, (int, float, str, bool, type(None))): + return obj + elif isinstance(obj, (list, tuple)): + return [make_serializable(item) for item in obj] + elif isinstance(obj, dict): + return {key: make_serializable(value) for key, value in obj.items()} + elif hasattr(obj, "__dict__"): + return make_serializable(vars(obj)) + return str(obj) + + qconfigs = { + "huggingface_config": make_serializable(huggingface_config), + "qpc_config": { + "QEff_config": { + "pytorch_transforms": make_serializable(pytorch_transforms), + "onnx_transforms": make_serializable(onnx_transforms), + "onnx_path": onnx_path, + }, + }, + } + + aic_compiler_config = { + "apps_sdk_version": qaic_version, + "compile_dir": compile_dir, + "specializations_file_path": specializations_file_path, + "specializations": make_serializable(specializations), + "mdp_ts_num_devices": mdp_ts_num_devices, + "num_speculative_tokens": num_speculative_tokens, + **compiler_options, + } + qnn_config = { + "enable_qnn": enable_qnn, + "qnn_config_path": qnn_config_path, + } + # Put AIC or qnn details. + if enable_qnn: + qconfigs["qpc_config"]["qnn_config"] = qnn_config + if qnn_sdk_details: + qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details) + else: + qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config + + create_json(qconfig_file_path, qconfigs) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 6c2bba0c6..3852adcda 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -75,12 +75,14 @@ class Constants: MAX_QPC_LIMIT = 30 MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 + SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. @dataclass class QnnConstants: # QNN PATH to be read from environment variable. 
QNN_SDK_PATH_ENV_VAR_NAME = "QNN_SDK_ROOT" + QNN_SDK_YAML = "sdk.yaml" # QNN Compilation tools QAIRT_CONVERTER = "{}/bin/{}/qairt-converter" diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py index 4726fb8c5..69a6282fb 100644 --- a/tests/peft/lora/test_lora_model.py +++ b/tests/peft/lora/test_lora_model.py @@ -4,6 +4,8 @@ # SPDX-License-Identifier: BSD-3-Clause # # ----------------------------------------------------------------------------- + +import os from pathlib import Path from time import perf_counter @@ -225,6 +227,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate( # test compile qeff_model.compile(prefill_seq_len=32, ctx_len=64) assert Path(qeff_model.qpc_path).is_dir() + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) # test generate prompts = ["hello!", "hi", "hello, my name is", "hey"] @@ -249,6 +252,7 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap # test compile qeff_model.compile(prefill_seq_len=32, ctx_len=64, full_batch_size=2) assert Path(qeff_model.qpc_path).is_dir() + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) # test generate prompts = ["hello!", "hi", "hello, my name is", "hey"] diff --git a/tests/peft/test_peft_model.py b/tests/peft/test_peft_model.py index 6a9a957b2..c4e331a9d 100644 --- a/tests/peft/test_peft_model.py +++ b/tests/peft/test_peft_model.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os from time import perf_counter import numpy as np @@ -187,3 +188,4 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con end = perf_counter() compile_time_1 = end - start assert compile_time_1 < 0.01 * compile_time_0 + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py index fe906fe7e..65acab157 100644 --- a/tests/qnn_tests/test_causal_lm_models_qnn.py +++ b/tests/qnn_tests/test_causal_lm_models_qnn.py @@ -5,6 +5,8 @@ # # ----------------------------------------------------------------------------- +import os + import numpy as np import pytest from transformers import AutoModelForCausalLM @@ -98,7 +100,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -106,6 +108,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( aic_enable_depth_first=False, enable_qnn=True, ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR) cloud_ai_100_tokens = exec_info.generated_ids[0] # Because we always run for single input and single batch size gen_len = ort_tokens.shape[-1] @@ -136,7 +139,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -145,6 +148,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( full_batch_size=full_batch_size, enable_qnn=True, ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) 
exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) assert all( diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py index a1e4265ee..f7d3cd6cb 100644 --- a/tests/text_generation/test_text_generation.py +++ b/tests/text_generation/test_text_generation.py @@ -5,6 +5,8 @@ # # ----------------------------------------------------------------------------- +import os + import pytest from transformers import AutoModelForCausalLM @@ -101,3 +103,4 @@ def test_generate_text_stream( assert cloud_ai_100_output == stream_tokens, ( f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}" ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index a3a855cee..418386780 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os from typing import Optional import numpy as np @@ -127,7 +128,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -141,6 +142,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), ( "Tokens don't match for ONNXRT output and Cloud AI 100 output." ) + assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) # testing for CB models model_hf, _ = load_causal_lm_model(model_config) @@ -165,7 +167,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( if not get_available_device_id(): pytest.skip("No available devices to run model on Cloud AI 100") - _ = qeff_model.compile( + qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, num_cores=14, @@ -182,6 +184,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids) ] ), "Tokens don't match for HF PyTorch model output and Cloud AI 100 output." 
+ assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json")) # FIXME: there should be a CB test here diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 1c2d5196c..e681f5093 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os import numpy as np import onnxruntime as ort @@ -77,6 +78,7 @@ def check_embed_pytorch_vs_ort_vs_ai100( mad = np.mean(np.abs(ai100_output - onnx_outputs[0])) print("Mad for onnx and AI 100 output is ", mad) assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}" + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) @pytest.mark.on_qaic diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 8ef24403c..c787a3c96 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -31,6 +31,7 @@ def test_simple_prefix_caching(model_name): num_cores=14, ) prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) @pytest.mark.on_qaic @@ -61,6 +62,7 @@ def test_simple_prefix_caching_qnn(model_name): qnn_config=qnn_config_json_path, ) prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) os.remove(qnn_config_json_path) diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index af83c9354..99f715863 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -360,6 +360,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_kv_tokens == cloud_ai_100_tokens).all(), ( "Tokens don't match for pytorch output and Cloud AI 100 output." ) + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) @pytest.mark.on_qaic diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index a9f197ec3..205f00a00 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import os from time import perf_counter from typing import List, Optional @@ -331,3 +332,5 @@ def test_spec_decode_inference( ] # Because we always run for single input and single batch size all_matching = np.array_equal(cloud_ai_100_tokens, generated_ids) assert all_matching, "Tokens don't match for SpD output and vanilla DLM output." 
+ assert os.path.isfile(os.path.join(os.path.dirname(target_model_qpc_path), "qconfig.json")) + assert os.path.isfile(os.path.join(os.path.dirname(draft_model_qpc_path), "qconfig.json")) diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py index 1ceb5a7e0..64376db62 100644 --- a/tests/transformers/test_causal_lm.py +++ b/tests/transformers/test_causal_lm.py @@ -6,6 +6,7 @@ # ---------------------------------------------------------------------------- import copy +import os from time import perf_counter import onnx @@ -170,3 +171,4 @@ def test_causal_lm_compile(config, cb, tmp_cache): end = perf_counter() compile_time = end - start assert compile_time < 2.0 + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) diff --git a/tests/transformers/test_speech_seq2seq.py b/tests/transformers/test_speech_seq2seq.py index a41896010..15d6152e3 100644 --- a/tests/transformers/test_speech_seq2seq.py +++ b/tests/transformers/test_speech_seq2seq.py @@ -6,6 +6,7 @@ # ---------------------------------------------------------------------------- import copy +import os from time import perf_counter import onnx @@ -142,3 +143,4 @@ def test_causal_lm_compile(config, tmp_cache): end = perf_counter() compile_time = end - start assert compile_time < 2.0 + assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json")) From 687d44fad2866eeadc64b78158bb98421f5535e6 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 6 Mar 2025 11:56:27 +0530 Subject: [PATCH 03/20] Docs string added for the Image class and granite models are added in validation page (#303) Signed-off-by: Abukhoyer Shaik Signed-off-by: Asmita Goswami --- .../transformers/models/modeling_auto.py | 62 ++++++++++++++++++- docs/source/quick_start.md | 6 +- docs/source/validate.md | 10 +-- 3 files changed, 70 insertions(+), 8 deletions(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 5852740b4..07aff78ff 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1147,9 +1147,69 @@ def get_model_config(self) -> dict: class QEFFAutoModelForImageTextToText: """ - A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach + The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub. + While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose. This class supports both single and dual QPC approaches. Attributes: _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models. + + ``Mandatory`` Args: + :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory. + + ``Optional`` Args: + :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True. + + .. code-block:: python + import requests + from PIL import Image + from transformers import AutoProcessor, TextStreamer + + from QEfficient import QEFFAutoModelForImageTextToText + + # Add HuggingFace Token to access the model + HF_TOKEN = "" + model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" + query = "Describe this image." 
+ image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg" + + ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc + processor = AutoProcessor.from_pretrained(model_name, token=token) + model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, attn_implementation="eager", kv_offload=False) + + ## STEP - 2 Export & Compile the Model + model.compile( + prefill_seq_len=32, + ctx_len=512, + img_size=560, + num_cores=16, + num_devices=1, + mxfp6_matmul=False, + ) + + ## STEP - 3 Load and process the inputs for Inference + image = Image.open(requests.get(image_url, stream=True).raw) + messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": query}, + ], + } + ] + input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)] + inputs = processor( + text=input_text, + images=image, + return_tensors="pt", + add_special_tokens=False, + padding="max_length", + max_length=prefill_seq_len, + ) + + ## STEP - 4 Run Inference on the compiled model + streamer = TextStreamer(processor.tokenizer) + model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len) + """ _hf_auto_class = AutoModelForImageTextToText diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 88093e134..2ccb013e9 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -239,7 +239,7 @@ Use the qualcomm_efficient_converter API to export the KV transformed Model to O generated_qpc_path = qeff_model.compile( num_cores=14, - mxfp6=True, + mxfp6_matmul=True, ) ``` @@ -250,8 +250,8 @@ Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/s ```Python # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach - -qeff_model.generate(prompts=["My name is"]) +tokenizer = AutoTokenizer.from_pretrained(model_name) +qeff_model.generate(prompts=["My name is"],tokenizer=tokenizer) ``` End to End demo examples for various models are available in **notebooks** directory. Please check them out. diff --git a/docs/source/validate.md b/docs/source/validate.md index 49acd268d..acd4c11da 100644 --- a/docs/source/validate.md +++ b/docs/source/validate.md @@ -41,13 +41,15 @@ | Architecture | Model Family | Representative Models | |--------------|--------------|---------------------------------| -| **BertModel** | BERT-based | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)
[BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)
[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)
[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) | +| **BertModel** | BERT-based | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)
[BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)
[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5)
[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) | | **LlamaModel** | Llama-based | [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | -| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) | -| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | | **MPNetForMaskedLM** | MPNet | [sentence-transformers/multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1) | -| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | | **MistralModel** | Mistral | [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | +| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) | +| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) | +| **RobertaModel** | RoBERTa | [ibm-granite/granite-embedding-30m-english](https://huggingface.co/ibm-granite/granite-embedding-30m-english)
[ibm-granite/granite-embedding-125m-english](https://huggingface.co/ibm-granite/granite-embedding-125m-english) | +| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) | +| **XLMRobertaModel** | XLM-RoBERTa | [ibm-granite/granite-embedding-107m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-107m-multilingual)
[ibm-granite/granite-embedding-278m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual) | ## Multimodal Language Models From 260bacb50b8db791f96bba7b3b22e2d2430ffaa7 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 6 Mar 2025 15:36:34 +0530 Subject: [PATCH 04/20] [Bug-Fix :] QEFFAutoModelForCausalLM __repr__() Method Fixed (#307) This is just small fixes done for printing the `QEFFAutoModelForCausalLM`'s instance by changing the `__repr__(self)` method. Signed-off-by: Abukhoyer Shaik Signed-off-by: Asmita Goswami --- QEfficient/transformers/models/modeling_auto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index 07aff78ff..a87c39fb4 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -1325,7 +1325,7 @@ def model_name(self) -> str: return mname def __repr__(self) -> str: - return self.__class__.__name__ + "\n" + self.model.__repr__ + return self.__class__.__name__ + "\n" + self.model.__repr__() @classmethod @with_replaced_quantizers From 691cca4c3eed306ff838c7397b229cfabaae1f6c Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 27 Feb 2025 06:22:43 +0000 Subject: [PATCH 05/20] Enabled VLMs via CLI Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 22 +++-- QEfficient/cloud/infer.py | 86 ++++++++++++++++--- .../transformers/models/modeling_auto.py | 18 +++- 3 files changed, 105 insertions(+), 21 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index d94e02894..bcf5b1575 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -12,13 +12,21 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. """ +import importlib +from collections import OrderedDict from typing import Any +import transformers.models.auto.modeling_auto as mapping from transformers import AutoConfig -from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from QEfficient.base.modeling_qeff import QEFFBaseModel -from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM + +MODEL_CLASS_MAPPING = OrderedDict( + [ + (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"), + (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"), + ] +) class QEFFCommonLoader: @@ -42,9 +50,13 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - model_class = QEFFAutoModelForCausalLM - else: + model_class = None + for key_tuple, class_name in MODEL_CLASS_MAPPING.items(): + if architecture in key_tuple: + module = importlib.import_module("QEfficient.transformers.models.modeling_auto") + model_class = getattr(module, class_name) + break + if model_class is None: raise NotImplementedError( f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" 
) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 28eaa4d52..c93dde55f 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -10,6 +10,11 @@ import sys from typing import List, Optional +import requests +from PIL import Image +from transformers import AutoConfig, AutoProcessor, TextStreamer +from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES + from QEfficient.base.common import QEFFCommonLoader from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -36,6 +41,7 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + img_size: Optional[int] = None, **kwargs, ) -> None: """ @@ -65,6 +71,9 @@ def main( :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: + -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1 + -qpc_crc=True -> -qpc-crc .. code-block:: bash @@ -72,11 +81,6 @@ def main( """ cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir) - tokenizer = load_hf_tokenizer( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - ) if "--mxfp6" in sys.argv: if args.mxfp6: @@ -85,6 +89,9 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") + image_path = kwargs.pop("image_path", None) + image_url = kwargs.pop("image_url", None) + qeff_model = QEFFCommonLoader.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=cache_dir, @@ -110,20 +117,70 @@ def main( allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, qnn_config=qnn_config, + img_size=img_size, **kwargs, ) + tokenizer = load_hf_tokenizer( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + ) + ######### # Execute ######### - _ = qeff_model.generate( - tokenizer, - prompts=prompt, - device_id=device_group, - prompt=prompt, - prompts_txt_file_path=prompts_txt_file_path, - generation_len=generation_len, - ) + config = AutoConfig.from_pretrained(model_name) + architecture = config.architectures[0] if config.architectures else None + + if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): + processor = AutoProcessor.from_pretrained(model_name, use_fast=False) + + raw_image = None + if image_url is not None: + raw_image = Image.open(requests.get(image_url, stream=True).raw) + elif image_path is not None: + raw_image = Image.open(image_path) + else: + raise FileNotFoundError( + 'Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"' + ) + + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": prompt[0]}, # Currently accepting only 1 prompt + ], + }, + ] + + # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. 
+ input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + + split_inputs = processor( + text=input_text, + images=raw_image, + return_tensors="pt", + add_special_tokens=False, + ) + streamer = TextStreamer(processor.tokenizer) + _ = qeff_model.generate( + inputs=split_inputs, + streamer=streamer, + device_ids=device_group, + generation_len=generation_len, + ) + else: + _ = qeff_model.generate( + tokenizer, + prompts=prompt, + device_id=device_group, + prompt=prompt, + prompts_txt_file_path=prompts_txt_file_path, + generation_len=generation_len, + ) if __name__ == "__main__": @@ -226,10 +283,11 @@ def main( Sample Config: QEfficient/compile/qnn_config.json", ) parser.add_argument( - "qnn_config", + "--qnn_config", nargs="?", type=str, ) + parser.add_argument("--img-size", "--img_size", default=None, type=int, required=False, help="Size of Image") args, compiler_options = parser.parse_known_args() compiler_options_dict = {} diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index a87c39fb4..b127c50a0 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -615,6 +615,8 @@ def compile( ) output_names = self.model.get_output_names(kv_offload=True) + vision_onnx_path = compiler_options.get("vision_onnx_path", None) + lang_onnx_path = compiler_options.get("lang_onnx_path", None) specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -826,7 +828,7 @@ def kv_offload_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - return CloudAI100ExecInfoNew( + exec_info = CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( @@ -834,6 +836,9 @@ def kv_offload_generate( ), ) + print(exec_info) + return exec_info + class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin): _hf_auto_class = AutoModelForImageTextToText @@ -1116,7 +1121,7 @@ def cloud_ai_100_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - return CloudAI100ExecInfoNew( + exec_info = CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( @@ -1124,6 +1129,9 @@ def cloud_ai_100_generate( ), ) + print(exec_info) + return exec_info + @property def model_hash(self) -> str: mhash = hashlib.sha256() @@ -1239,6 +1247,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optiona if kwargs.get("low_cpu_mem_usage", None): logger.warning("Updating low_cpu_mem_usage=False") + if kwargs.pop("continuous_batching", None): + NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.") + kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False}) model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs) return cls(model, kv_offload=kv_offload, **kwargs) @@ -1560,6 +1571,9 @@ def compile( decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... 
specializations.append(decode_specialization) + if compiler_options.pop("img_size", None): + logger.warning("img_size is not a valid argument for Text-to-Text Model.") + if enable_qnn: if compiler_options: logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") From ea8555dc857aa05ff9554afcb44c5440e13c4c8a Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Fri, 28 Feb 2025 12:18:16 +0000 Subject: [PATCH 06/20] Addressing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 25 ++++++++----------- QEfficient/cloud/infer.py | 18 ++++++------- .../transformers/models/modeling_auto.py | 6 +---- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index bcf5b1575..228d1184e 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -13,7 +13,6 @@ """ import importlib -from collections import OrderedDict from typing import Any import transformers.models.auto.modeling_auto as mapping @@ -21,12 +20,12 @@ from QEfficient.base.modeling_qeff import QEFFBaseModel -MODEL_CLASS_MAPPING = OrderedDict( - [ - (tuple(mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()), "QEFFAutoModelForCausalLM"), - (tuple(mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values()), "QEFFAutoModelForImageTextToText"), - ] -) +MODEL_CLASS_MAPPING = {} +for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForCausalLM" + +for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): + MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForImageTextToText" class QEFFCommonLoader: @@ -50,13 +49,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - model_class = None - for key_tuple, class_name in MODEL_CLASS_MAPPING.items(): - if architecture in key_tuple: - module = importlib.import_module("QEfficient.transformers.models.modeling_auto") - model_class = getattr(module, class_name) - break - if model_class is None: + class_name = MODEL_CLASS_MAPPING.get(architecture) + if class_name: + module = importlib.import_module("QEfficient.transformers.models.modeling_auto") + model_class = getattr(module, class_name) + else: raise NotImplementedError( f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!" 
) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index c93dde55f..0ac38cb6a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,7 +12,7 @@ import requests from PIL import Image -from transformers import AutoConfig, AutoProcessor, TextStreamer +from transformers import AutoProcessor, TextStreamer from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader @@ -121,16 +121,10 @@ def main( **kwargs, ) - tokenizer = load_hf_tokenizer( - pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), - cache_dir=cache_dir, - hf_token=hf_token, - ) - ######### # Execute ######### - config = AutoConfig.from_pretrained(model_name) + config = qeff_model.model.config architecture = config.architectures[0] if config.architectures else None if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): @@ -166,13 +160,19 @@ def main( add_special_tokens=False, ) streamer = TextStreamer(processor.tokenizer) - _ = qeff_model.generate( + output = qeff_model.generate( inputs=split_inputs, streamer=streamer, device_ids=device_group, generation_len=generation_len, ) + print(output) else: + tokenizer = load_hf_tokenizer( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + ) _ = qeff_model.generate( tokenizer, prompts=prompt, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index b127c50a0..b668ceff5 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -835,8 +835,6 @@ def kv_offload_generate( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - - print(exec_info) return exec_info @@ -1128,8 +1126,6 @@ def cloud_ai_100_generate( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - - print(exec_info) return exec_info @property @@ -1572,7 +1568,7 @@ def compile( specializations.append(decode_specialization) if compiler_options.pop("img_size", None): - logger.warning("img_size is not a valid argument for Text-to-Text Model.") + logger.warning(f"Skipping img_size as it is not a valid argument for {self.model.config.architectures[0]}.") if enable_qnn: if compiler_options: From 5ea6f1c6e5dc2e6a03001763c94dc4dc9334d8ab Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 6 Mar 2025 05:55:28 +0000 Subject: [PATCH 07/20] Removed importlib Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index 228d1184e..ee27d7565 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -12,7 +12,6 @@ QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model. 
""" -import importlib from typing import Any import transformers.models.auto.modeling_auto as mapping @@ -51,7 +50,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> class_name = MODEL_CLASS_MAPPING.get(architecture) if class_name: - module = importlib.import_module("QEfficient.transformers.models.modeling_auto") + module = __import__("QEfficient.transformers.models.modeling_auto") model_class = getattr(module, class_name) else: raise NotImplementedError( From 561142bf6b1ba4866d221c9b8e301a10cff973bb Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Mar 2025 07:25:47 +0000 Subject: [PATCH 08/20] Addressing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 11 +--- QEfficient/cloud/infer.py | 57 ++++++++----------- QEfficient/transformers/modeling_utils.py | 11 ++++ .../transformers/models/modeling_auto.py | 5 -- QEfficient/utils/_utils.py | 2 +- QEfficient/utils/constants.py | 9 +++ 6 files changed, 47 insertions(+), 48 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index ee27d7565..ad699dcfa 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -14,18 +14,11 @@ from typing import Any -import transformers.models.auto.modeling_auto as mapping +from QEfficient.transformers.modeling_utils import model_class_mapping from transformers import AutoConfig from QEfficient.base.modeling_qeff import QEFFBaseModel -MODEL_CLASS_MAPPING = {} -for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): - MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForCausalLM" - -for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): - MODEL_CLASS_MAPPING[architecture] = "QEFFAutoModelForImageTextToText" - class QEFFCommonLoader: """ @@ -48,7 +41,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - class_name = MODEL_CLASS_MAPPING.get(architecture) + class_name = model_class_mapping.get(architecture) if class_name: module = __import__("QEfficient.transformers.models.modeling_auto") model_class = getattr(module, class_name) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ac38cb6a..88a968c8c 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, constants from QEfficient.utils.logging_utils import logger @@ -41,7 +41,6 @@ def main( allow_mxint8_mdp_io: bool = False, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, - img_size: Optional[int] = None, **kwargs, ) -> None: """ @@ -89,9 +88,6 @@ def main( if args.mxint8: logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.") - image_path = kwargs.pop("image_path", None) - image_url = kwargs.pop("image_url", None) - qeff_model = QEFFCommonLoader.from_pretrained( pretrained_model_name_or_path=model_name, cache_dir=cache_dir, @@ -100,6 +96,16 @@ def main( local_model_dir=local_model_dir, ) + image_path = kwargs.pop("image_path", None) + image_url = kwargs.pop("image_url", None) + + config = qeff_model.model.config + architecture = 
config.architectures[0] if config.architectures else None + if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): + img_size = kwargs.pop("img_size", None) + if img_size or image_path or image_url: + logger.warning(f"Skipping image arguments as they are not valid for {architecture}") + ######### # Compile ######### @@ -117,38 +123,21 @@ def main( allow_mxint8_mdp_io=allow_mxint8_mdp_io, enable_qnn=enable_qnn, qnn_config=qnn_config, - img_size=img_size, **kwargs, ) ######### # Execute ######### - config = qeff_model.model.config - architecture = config.architectures[0] if config.architectures else None - if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): processor = AutoProcessor.from_pretrained(model_name, use_fast=False) - raw_image = None - if image_url is not None: - raw_image = Image.open(requests.get(image_url, stream=True).raw) - elif image_path is not None: - raw_image = Image.open(image_path) - else: - raise FileNotFoundError( - 'Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"' - ) + if not (image_url or image_path): + raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') + raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) - conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text", "text": prompt[0]}, # Currently accepting only 1 prompt - ], - }, - ] + conversation = constants.Constants.conversation + conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) @@ -277,19 +266,21 @@ def main( "--enable_qnn", "--enable-qnn", action="store_true", + nargs="?", + const=True, + type=str, default=False, help="Enables QNN. 
Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\ If not provided, the default configuration will be used.\ Sample Config: QEfficient/compile/qnn_config.json", ) - parser.add_argument( - "--qnn_config", - nargs="?", - type=str, - ) - parser.add_argument("--img-size", "--img_size", default=None, type=int, required=False, help="Size of Image") args, compiler_options = parser.parse_known_args() + + if isinstance(args.enable_qnn, str): + args.qnn_config = args.enable_qnn + args.enable_qnn = True + compiler_options_dict = {} for i in range(0, len(compiler_options)): if compiler_options[i].startswith("--"): diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index ccad5e020..aad76cb37 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -8,6 +8,8 @@ from collections import namedtuple from typing import Dict, Optional, Tuple, Type +import transformers.models.auto.modeling_auto as mapping + import torch import torch.nn as nn from transformers.models.codegen.modeling_codegen import ( @@ -272,6 +274,15 @@ } +model_class_mapping = { + **{architecture: "QEFFAutoModelForCausalLM" for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()}, + **{ + architecture: "QEFFAutoModelForImageTextToText" + for architecture in mapping.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() + }, +} + + def _prepare_cross_attention_mask( cross_attention_mask: torch.Tensor, num_vision_tokens: int, diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index b668ceff5..4411eab95 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -615,8 +615,6 @@ def compile( ) output_names = self.model.get_output_names(kv_offload=True) - vision_onnx_path = compiler_options.get("vision_onnx_path", None) - lang_onnx_path = compiler_options.get("lang_onnx_path", None) specializations, compiler_options = self.model.get_specializations( batch_size=batch_size, @@ -1567,9 +1565,6 @@ def compile( decode_specialization.update({"num_logits_to_keep": num_speculative_tokens + 1}) if self.is_tlm else ... 
specializations.append(decode_specialization) - if compiler_options.pop("img_size", None): - logger.warning(f"Skipping img_size as it is not a valid argument for {self.model.config.architectures[0]}.") - if enable_qnn: if compiler_options: logger.warning("Extra arguments to QNN compilation are supported via qnn_config.json only") diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py index ea9044e2c..d4e1ac4bf 100644 --- a/QEfficient/utils/_utils.py +++ b/QEfficient/utils/_utils.py @@ -504,7 +504,7 @@ def create_and_dump_qconfigs( # Extract QNN SDK details from YAML file if the environment variable is set qnn_sdk_details = None qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME) - if qnn_sdk_path: + if enable_qnn and qnn_sdk_path: qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML) with open(qnn_sdk_yaml_path, "r") as file: qnn_sdk_details = yaml.safe_load(file) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 3852adcda..8cf100c93 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -76,6 +76,15 @@ class Constants: MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text"}, + ], + } + ] @dataclass From d9dc7d28773fc670ede62643e394ac06d72b27ea Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 10 Mar 2025 07:31:37 +0000 Subject: [PATCH 09/20] Addressing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 2 +- QEfficient/cloud/infer.py | 2 +- QEfficient/transformers/modeling_utils.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index ad699dcfa..9abbb3e94 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -14,10 +14,10 @@ from typing import Any -from QEfficient.transformers.modeling_utils import model_class_mapping from transformers import AutoConfig from QEfficient.base.modeling_qeff import QEFFBaseModel +from QEfficient.transformers.modeling_utils import model_class_mapping class QEFFCommonLoader: diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 88a968c8c..59b61bd15 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, constants +from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer from QEfficient.utils.logging_utils import logger diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index aad76cb37..1770feeea 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -8,10 +8,9 @@ from collections import namedtuple from typing import Dict, Optional, Tuple, Type -import transformers.models.auto.modeling_auto as mapping - import torch import torch.nn as nn +import transformers.models.auto.modeling_auto as mapping from transformers.models.codegen.modeling_codegen import ( CodeGenAttention, CodeGenBlock, From ca55d42246f8b36d584a5a09bd87d3bf7c78f30d 
Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 2 Apr 2025 07:52:10 +0000 Subject: [PATCH 10/20] Addressed Comments Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 70 ++++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 26 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 59b61bd15..09585b066 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,13 +12,48 @@ import requests from PIL import Image -from transformers import AutoProcessor, TextStreamer +from transformers import AutoProcessor, TextStreamer, PreTrainedModel from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer from QEfficient.utils.logging_utils import logger +def execute_vlm_model( + qeff_model: PreTrainedModel, + model_name: str, + image_url: str, + image_path: str, + prompt: Optional[str] = None, #type: ignore + device_group: Optional[List[int]] = None, + generation_len: Optional[int] = None, +): + if not (image_url or image_path): + raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') + raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) + + processor = AutoProcessor.from_pretrained(model_name, use_fast=False) + + conversation = constants.Constants.conversation + conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt + + # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. + input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) + + split_inputs = processor( + text=input_text, + images=raw_image, + return_tensors="pt", + add_special_tokens=False, + ) + streamer = TextStreamer(processor.tokenizer) + output = qeff_model.generate( + inputs=split_inputs, + streamer=streamer, + device_ids=device_group, + generation_len=generation_len, + ) + return output def main( model_name: str, @@ -130,32 +165,16 @@ def main( # Execute ######### if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): - processor = AutoProcessor.from_pretrained(model_name, use_fast=False) - - if not (image_url or image_path): - raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') - raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) - - conversation = constants.Constants.conversation - conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt - - # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. 
- input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) - - split_inputs = processor( - text=input_text, - images=raw_image, - return_tensors="pt", - add_special_tokens=False, - ) - streamer = TextStreamer(processor.tokenizer) - output = qeff_model.generate( - inputs=split_inputs, - streamer=streamer, - device_ids=device_group, + exec_info = execute_vlm_model( + qeff_model=qeff_model, + model_name=model_name, + prompt=prompt, + image_url=image_url, + image_path=image_path, + device_group=device_group, generation_len=generation_len, ) - print(output) + print(exec_info) else: tokenizer = load_hf_tokenizer( pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), @@ -265,7 +284,6 @@ def main( parser.add_argument( "--enable_qnn", "--enable-qnn", - action="store_true", nargs="?", const=True, type=str, From 7a4d18e8b11f1f84fae9cb7da8e29c556e1c3b4e Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Wed, 2 Apr 2025 08:02:46 +0000 Subject: [PATCH 11/20] Ruff check and format Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 09585b066..11916dd8d 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,24 +12,25 @@ import requests from PIL import Image -from transformers import AutoProcessor, TextStreamer, PreTrainedModel +from transformers import AutoProcessor, PreTrainedModel, TextStreamer from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer from QEfficient.utils.logging_utils import logger + def execute_vlm_model( qeff_model: PreTrainedModel, model_name: str, image_url: str, image_path: str, - prompt: Optional[str] = None, #type: ignore + prompt: Optional[str] = None, # type: ignore device_group: Optional[List[int]] = None, generation_len: Optional[int] = None, ): if not (image_url or image_path): - raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') + raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) processor = AutoProcessor.from_pretrained(model_name, use_fast=False) @@ -55,6 +56,7 @@ def execute_vlm_model( ) return output + def main( model_name: str, num_cores: int, From adaee62bdb474898ef8d2990755d1ab5a7c470d4 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 14 Apr 2025 13:38:09 +0000 Subject: [PATCH 12/20] Adderssing comments Signed-off-by: Asmita Goswami --- QEfficient/base/common.py | 4 ++-- QEfficient/cloud/infer.py | 21 ++++++++++++++------- QEfficient/transformers/modeling_utils.py | 2 +- QEfficient/utils/constants.py | 9 --------- 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py index 9abbb3e94..7836d5988 100644 --- a/QEfficient/base/common.py +++ b/QEfficient/base/common.py @@ -17,7 +17,7 @@ from transformers import AutoConfig from QEfficient.base.modeling_qeff import QEFFBaseModel -from QEfficient.transformers.modeling_utils import model_class_mapping +from QEfficient.transformers.modeling_utils import MODEL_CLASS_MAPPING class QEFFCommonLoader: @@ -41,7 +41,7 @@ def 
from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) -> config = AutoConfig.from_pretrained(pretrained_model_name_or_path) architecture = config.architectures[0] if config.architectures else None - class_name = model_class_mapping.get(architecture) + class_name = MODEL_CLASS_MAPPING.get(architecture) if class_name: module = __import__("QEfficient.transformers.models.modeling_auto") model_class = getattr(module, class_name) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 11916dd8d..0ff8cb3b9 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, constants, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer from QEfficient.utils.logging_utils import logger @@ -35,8 +35,16 @@ def execute_vlm_model( processor = AutoProcessor.from_pretrained(model_name, use_fast=False) - conversation = constants.Constants.conversation - conversation[0]["content"][1].update({"text": prompt[0]}) # Currently accepting only 1 prompt + # Added for QEff version 1.20 supported VLM models (mllama and llava) + conversation = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"type": "text", "text": prompt[0]}, + ], + } + ] # Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token ids. input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) @@ -138,9 +146,8 @@ def main( config = qeff_model.model.config architecture = config.architectures[0] if config.architectures else None - if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values(): - img_size = kwargs.pop("img_size", None) - if img_size or image_path or image_url: + + if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (kwargs.pop("img_size", None) or image_path or image_url): logger.warning(f"Skipping image arguments as they are not valid for {architecture}") ######### @@ -304,7 +311,7 @@ def main( compiler_options_dict = {} for i in range(0, len(compiler_options)): if compiler_options[i].startswith("--"): - key = compiler_options[i].lstrip("-") + key = compiler_options[i].lstrip("-").replace("-", "_") value = ( compiler_options[i + 1] if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-") diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py index 1770feeea..454fcf51d 100644 --- a/QEfficient/transformers/modeling_utils.py +++ b/QEfficient/transformers/modeling_utils.py @@ -273,7 +273,7 @@ } -model_class_mapping = { +MODEL_CLASS_MAPPING = { **{architecture: "QEFFAutoModelForCausalLM" for architecture in mapping.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values()}, **{ architecture: "QEFFAutoModelForImageTextToText" diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 0f44104e9..c2663594f 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -86,15 +86,6 @@ class Constants: MAX_RETRIES = 5 # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download NUM_SPECULATIVE_TOKENS = 2 SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml" # This xml file is parsed to find out the SDK version. 
- conversation = [ - { - "role": "user", - "content": [ - {"type": "image"}, - {"type": "text"}, - ], - } - ] @dataclass From f7c84b7fbed0b730dbb9426adccc64b451761d11 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 14 Apr 2025 13:46:28 +0000 Subject: [PATCH 13/20] Ruff format Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 0ff8cb3b9..8b36910f5 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -147,8 +147,10 @@ def main( config = qeff_model.model.config architecture = config.architectures[0] if config.architectures else None - if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (kwargs.pop("img_size", None) or image_path or image_url): - logger.warning(f"Skipping image arguments as they are not valid for {architecture}") + if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and ( + kwargs.pop("img_size", None) or image_path or image_url + ): + logger.warning(f"Skipping image arguments as they are not valid for {architecture}") ######### # Compile From 893e322af04ee5a3e9bacfa33161f755349af54e Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 17 Apr 2025 08:56:50 +0000 Subject: [PATCH 14/20] Added VLM CLI test and addressed comments Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 15 ++++++++ .../transformers/models/modeling_auto.py | 6 ++-- tests/cloud/test_infer_vlm.py | 34 +++++++++++++++++++ 3 files changed, 51 insertions(+), 4 deletions(-) create mode 100644 tests/cloud/test_infer_vlm.py diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 8b36910f5..f99070f94 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -29,6 +29,21 @@ def execute_vlm_model( device_group: Optional[List[int]] = None, generation_len: Optional[int] = None, ): + """ + This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards. + ``Mandatory`` Args: + :qeff_model (PreTrainedModel): QEfficient model object. + :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf`` + :num_cores (int): Number of cores to compile model on. + :image_url (str): Image URL to be used for inference. ``Defaults to None.`` + :image_path (str): Image path to be used for inference. ``Defaults to None.`` + ``Optional`` Args: + :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` + :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.`` + :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` + Returns: + :dict: Output from the ``AI_100`` runtime. 
+ """ if not (image_url or image_path): raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index ee486811a..3295af67c 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -828,14 +828,13 @@ def kv_offload_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - exec_info = CloudAI100ExecInfoNew( + return CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - return exec_info class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin): @@ -1119,14 +1118,13 @@ def cloud_ai_100_generate( total_time = decode_end - prefill_start total_perf = num_token / total_time - exec_info = CloudAI100ExecInfoNew( + return CloudAI100ExecInfoNew( batch_size=batch_size, generated_ids=generated_ids, perf_metrics=PerfMetrics( prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time ), ) - return exec_info @property def model_hash(self) -> str: diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py new file mode 100644 index 000000000..435f0fde8 --- /dev/null +++ b/tests/cloud/test_infer_vlm.py @@ -0,0 +1,34 @@ +import pytest + +from QEfficient.cloud.infer import main as infer + + +@pytest.mark.on_qaic +@pytest.mark.cli +@pytest.mark.multimodal +@pytest.mark.usefixtures("clean_up_after_test") +def test_vlm_cli(setup, mocker): + ms = setup + # Taking some values from setup fixture and assigning other's based on model's requirement. + # For example, mxint8 is not required for VLM models, so assigning False. + infer( + model_name="llava-hf/llava-1.5-7b-hf", + num_cores=ms.num_cores, + prompt="Describe the image.", + prompts_txt_file_path=None, + aic_enable_depth_first=ms.aic_enable_depth_first, + mos=ms.mos, + batch_size=1, + full_batch_size=None, + prompt_len=1024, + ctx_len=2048, + generation_len=ms.generation_len, + mxfp6=False, + mxint8=False, + local_model_dir=None, + cache_dir=None, + hf_token=ms.hf_token, + enable_qnn=False, + qnn_config=None, + image_url="https://i.etsystatic.com/8155076/r/il/0825c2/1594869823/il_fullxfull.1594869823_5x0w.jpg", + ) From 9cde30bca0682e3efe84efe379d8bedcf27ef6b2 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 17 Apr 2025 09:17:35 +0000 Subject: [PATCH 15/20] Added Copyrights Signed-off-by: Asmita Goswami --- tests/cloud/test_infer_vlm.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py index 435f0fde8..b5e71dc11 100644 --- a/tests/cloud/test_infer_vlm.py +++ b/tests/cloud/test_infer_vlm.py @@ -1,3 +1,10 @@ +# ----------------------------------------------------------------------------- +# +# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved. 
+# SPDX-License-Identifier: BSD-3-Clause +# +# ----------------------------------------------------------------------------- + import pytest from QEfficient.cloud.infer import main as infer From 3a1e5b9f58b1e71f8b4d4d5d3cf505917c2ca873 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Thu, 17 Apr 2025 09:26:17 +0000 Subject: [PATCH 16/20] Added Copyrights Signed-off-by: Asmita Goswami --- tests/cloud/test_infer_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/cloud/test_infer_vlm.py b/tests/cloud/test_infer_vlm.py index b5e71dc11..d06e09946 100644 --- a/tests/cloud/test_infer_vlm.py +++ b/tests/cloud/test_infer_vlm.py @@ -29,7 +29,7 @@ def test_vlm_cli(setup, mocker): full_batch_size=None, prompt_len=1024, ctx_len=2048, - generation_len=ms.generation_len, + generation_len=20, mxfp6=False, mxint8=False, local_model_dir=None, From d4af07a49aa57915e0b281c286bf64eb509a4dd1 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Mon, 21 Apr 2025 09:36:49 +0000 Subject: [PATCH 17/20] Ruff format Signed-off-by: Asmita Goswami --- scripts/Jenkinsfile | 2 +- tests/transformers/spd/test_pld_inference.py | 6 +++--- tests/transformers/spd/test_spd_inference.py | 12 ++++++------ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index f3f2fc2d8..53fcab1c7 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -69,7 +69,7 @@ pipeline { } stage('CLI Tests') { steps { - timeout(time: 15, unit: 'MINUTES') { + timeout(time: 150, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && diff --git a/tests/transformers/spd/test_pld_inference.py b/tests/transformers/spd/test_pld_inference.py index 88d86a9be..e5d472734 100644 --- a/tests/transformers/spd/test_pld_inference.py +++ b/tests/transformers/spd/test_pld_inference.py @@ -145,9 +145,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int): """ num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len - assert ( - input_len_padded <= ctx_len - ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + assert input_len_padded <= ctx_len, ( + "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + ) return input_len_padded diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py index 39dbd95cb..b78afdc38 100644 --- a/tests/transformers/spd/test_spd_inference.py +++ b/tests/transformers/spd/test_spd_inference.py @@ -75,9 +75,9 @@ def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int): """ num_chunks = -(input_len // -prefill_seq_len) # ceil divide without float input_len_padded = num_chunks * prefill_seq_len # Convert input_len to a multiple of prefill_seq_len - assert ( - input_len_padded <= ctx_len - ), "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + assert input_len_padded <= ctx_len, ( + "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len" + ) return input_len_padded @@ -320,9 +320,9 @@ def test_spec_decode_inference( for prompt, generation in zip(prompts, batch_decode): print(f"{prompt=} {generation=}") # validation check - assert mean_num_accepted_tokens == float( - num_speculative_tokens + 1 - ), f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be 
{num_speculative_tokens + 1}" + assert mean_num_accepted_tokens == float(num_speculative_tokens + 1), ( + f"mean number of accepted tokens is {mean_num_accepted_tokens} but should be {num_speculative_tokens + 1}" + ) del target_model_session del draft_model_session generated_ids = np.asarray(generated_ids[0]).flatten() From 9441120ad2c0a0ef7de6a76d6c736820941ce7e6 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Apr 2025 09:02:23 +0000 Subject: [PATCH 18/20] Addressed Comments Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 1 - scripts/Jenkinsfile | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index f99070f94..13e6e5c73 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -34,7 +34,6 @@ def execute_vlm_model( ``Mandatory`` Args: :qeff_model (PreTrainedModel): QEfficient model object. :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf`` - :num_cores (int): Number of cores to compile model on. :image_url (str): Image URL to be used for inference. ``Defaults to None.`` :image_path (str): Image path to be used for inference. ``Defaults to None.`` ``Optional`` Args: diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile index 53fcab1c7..24113f9c8 100644 --- a/scripts/Jenkinsfile +++ b/scripts/Jenkinsfile @@ -69,7 +69,7 @@ pipeline { } stage('CLI Tests') { steps { - timeout(time: 150, unit: 'MINUTES') { + timeout(time: 60, unit: 'MINUTES') { sh ''' sudo docker exec ${BUILD_TAG} bash -c " source /qnn_sdk/bin/envsetup.sh && From e138828c7c7ba04ee60a1b738cd8a027b58807c7 Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Apr 2025 10:16:33 +0000 Subject: [PATCH 19/20] Updated load_hf_processor Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 20 +++++++++++++++++--- QEfficient/utils/__init__.py | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 13e6e5c73..7e1f321d7 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -12,14 +12,15 @@ import requests from PIL import Image -from transformers import AutoProcessor, PreTrainedModel, TextStreamer +from transformers import PreTrainedModel, TextStreamer from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer +from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_hf_processor from QEfficient.utils.logging_utils import logger +# TODO: Remove after adding support for VLM's compile and execute def execute_vlm_model( qeff_model: PreTrainedModel, model_name: str, @@ -27,6 +28,9 @@ def execute_vlm_model( image_path: str, prompt: Optional[str] = None, # type: ignore device_group: Optional[List[int]] = None, + local_model_dir: Optional[str] = None, + cache_dir: Optional[str] = None, + hf_token: Optional[str] = None, generation_len: Optional[int] = None, ): """ @@ -39,6 +43,9 @@ def execute_vlm_model( ``Optional`` Args: :prompt (str): Sample prompt for the model text generation. ``Defaults to None.`` :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.`` + :local_model_dir (str): Path to custom model weights and config files. 
``Defaults to None.`` + :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.`` + :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.`` :generation_len (int): Number of tokens to be generated. ``Defaults to None.`` Returns: :dict: Output from the ``AI_100`` runtime. @@ -47,7 +54,11 @@ def execute_vlm_model( raise ValueError('Neither Image URL nor Image Path is found, either provide "image_url" or "image_path"') raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path) - processor = AutoProcessor.from_pretrained(model_name, use_fast=False) + processor = load_hf_processor( + pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name), + cache_dir=cache_dir, + hf_token=hf_token, + ) # Added for QEff version 1.20 supported VLM models (mllama and llava) conversation = [ @@ -197,6 +208,9 @@ def main( image_url=image_url, image_path=image_path, device_group=device_group, + local_model_dir=local_model_dir, + cache_dir=cache_dir, + hf_token=hf_token, generation_len=generation_len, ) print(exec_info) diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index a7f17e6bc..5bc2d5efa 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -18,6 +18,7 @@ get_qpc_dir_path, hf_download, load_hf_tokenizer, + load_hf_processor, login_and_download_hf_lm, onnx_exists, padding_check_and_fix, From 28fd36196acfe22e92d64c668d4d5cb8ff80491f Mon Sep 17 00:00:00 2001 From: Asmita Goswami Date: Tue, 22 Apr 2025 10:18:08 +0000 Subject: [PATCH 20/20] Ruff check fix Signed-off-by: Asmita Goswami --- QEfficient/cloud/infer.py | 2 +- QEfficient/utils/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 7e1f321d7..68be72fa8 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -16,7 +16,7 @@ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES from QEfficient.base.common import QEFFCommonLoader -from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer, load_hf_processor +from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer from QEfficient.utils.logging_utils import logger diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py index 5bc2d5efa..f6aa3296d 100755 --- a/QEfficient/utils/__init__.py +++ b/QEfficient/utils/__init__.py @@ -17,8 +17,8 @@ get_padding_shape_from_config, get_qpc_dir_path, hf_download, - load_hf_tokenizer, load_hf_processor, + load_hf_tokenizer, login_and_download_hf_lm, onnx_exists, padding_check_and_fix,
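
Editor's note: below is a minimal, self-contained sketch (not part of the patch series) of the CLI behaviour these patches give `QEfficient/cloud/infer.py`: `--enable_qnn` accepts an optional config path (earlier in the series), and PATCH 12 normalises dashes to underscores when collecting leftover `--key value` compiler flags. The example flag name passed on the command line is illustrative only.

```
# Sketch of the "--enable_qnn [CONFIG_FILE]" handling and leftover-flag parsing
# shown in the patches above; assumes nothing beyond the standard library.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--enable_qnn",
    "--enable-qnn",
    nargs="?",
    const=True,
    type=str,
    default=False,
    help="Enable QNN compilation; optionally pass a qnn_config.json path.",
)
args, compiler_options = parser.parse_known_args()

# Normalise "--enable_qnn path/to/qnn_config.json" into (enable_qnn=True, qnn_config=path),
# mirroring the isinstance(str) check added in infer.py.
qnn_config = None
if isinstance(args.enable_qnn, str):
    qnn_config = args.enable_qnn
    args.enable_qnn = True

# Collect unknown "--key value" pairs into a dict, replacing dashes with underscores
# as done in PATCH 12 (e.g. a hypothetical "--some-compiler-flag 1" -> {"some_compiler_flag": "1"}).
compiler_options_dict = {}
for i in range(len(compiler_options)):
    if compiler_options[i].startswith("--"):
        key = compiler_options[i].lstrip("-").replace("-", "_")
        value = (
            compiler_options[i + 1]
            if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
            else True
        )
        compiler_options_dict[key] = value

print(args.enable_qnn, qnn_config, compiler_options_dict)
```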