import sys
from typing import List, Optional

+ import requests
+ from PIL import Image
+ from transformers import PreTrainedModel, TextStreamer
+ from transformers.models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+ 
from QEfficient.base.common import QEFFCommonLoader
- from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
+ from QEfficient.utils import check_and_assign_cache_dir, load_hf_processor, load_hf_tokenizer
from QEfficient.utils.logging_utils import logger


+ # TODO: Remove after adding support for VLM compile and execute
+ def execute_vlm_model(
+     qeff_model: PreTrainedModel,
+     model_name: str,
+     image_url: str,
+     image_path: str,
+     prompt: Optional[str] = None,  # type: ignore
+     device_group: Optional[List[int]] = None,
+     local_model_dir: Optional[str] = None,
+     cache_dir: Optional[str] = None,
+     hf_token: Optional[str] = None,
+     generation_len: Optional[int] = None,
+ ):
36+ """
37+ This method generates output by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
38+ ``Mandatory`` Args:
39+ :qeff_model (PreTrainedModel): QEfficient model object.
40+ :model_name (str): Hugging Face Model Card name, Example: ``llava-hf/llava-1.5-7b-hf``
41+ :image_url (str): Image URL to be used for inference. ``Defaults to None.``
42+ :image_path (str): Image path to be used for inference. ``Defaults to None.``
43+ ``Optional`` Args:
44+ :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
45+ :device_group (List[int]): Device Ids to be used for compilation. If ``len(device_group) > 1``, multiple Card setup is enabled. ``Defaults to None.``
46+ :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
47+ :cache_dir (str): Cache dir where downloaded HuggingFace files are stored. ``Defaults to None.``
48+ :hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
49+ :generation_len (int): Number of tokens to be generated. ``Defaults to None.``
50+ Returns:
51+ :dict: Output from the ``AI_100`` runtime.
52+ """
+     if not (image_url or image_path):
+         raise ValueError('Neither image URL nor image path was provided; pass either "image_url" or "image_path"')
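+     # Stream the image over HTTP when a URL is given; otherwise open the local file.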
+     raw_image = Image.open(requests.get(image_url, stream=True).raw) if image_url else Image.open(image_path)
+ 
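+     # The HF processor bundles the tokenizer and image preprocessor for the model.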
+     processor = load_hf_processor(
+         pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+         cache_dir=cache_dir,
+         hf_token=hf_token,
+     )
+ 
+     # Chat-template conversation for the VLM models supported in QEff 1.20 (mllama and llava)
+     conversation = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image"},
+                 {"type": "text", "text": prompt[0]},
+             ],
+         }
+     ]
+ 
+     # Renders the conversation into a prompt string via the model's chat template
+     # (with tokenize=False this returns text, not token ids).
+     input_text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
+ 
+     split_inputs = processor(
+         text=input_text,
+         images=raw_image,
+         return_tensors="pt",
+         add_special_tokens=False,
+     )
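+     # TextStreamer prints decoded tokens to stdout as they are generated.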
+     streamer = TextStreamer(processor.tokenizer)
+     output = qeff_model.generate(
+         inputs=split_inputs,
+         streamer=streamer,
+         device_ids=device_group,
+         generation_len=generation_len,
+     )
+     return output
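+ 
+ # Example call (a sketch with hypothetical image URL and prompt; assumes qeff_model
+ # was loaded via QEFFCommonLoader and compiled as in main() below):
+ #   execute_vlm_model(
+ #       qeff_model=qeff_model,
+ #       model_name="llava-hf/llava-1.5-7b-hf",
+ #       image_url="https://example.com/cat.png",
+ #       image_path=None,
+ #       prompt=["What is in this image?"],
+ #       generation_len=32,
+ #   )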
+ 
+ 
def main(
    model_name: str,
    num_cores: int,
@@ -65,18 +140,16 @@ def main(
        :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
        :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
        :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+         :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+             -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
+             -qpc_crc=True -> -qpc-crc

    .. code-block:: bash

        python -m QEfficient.cloud.infer OPTIONS

    """
    cache_dir = check_and_assign_cache_dir(local_model_dir, cache_dir)
-     tokenizer = load_hf_tokenizer(
-         pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-         cache_dir=cache_dir,
-         hf_token=hf_token,
-     )

81154 if "--mxfp6" in sys .argv :
82155 if args .mxfp6 :
@@ -93,6 +166,17 @@ def main(
        local_model_dir=local_model_dir,
    )

+     image_path = kwargs.pop("image_path", None)
+     image_url = kwargs.pop("image_url", None)
+ 
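+     # Read the architecture from the loaded model's HF config; it selects the execution path below.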
+     config = qeff_model.model.config
+     architecture = config.architectures[0] if config.architectures else None
+ 
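+     # For non-VLM architectures, warn and discard any image-related arguments.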
+     if architecture not in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values() and (
+         kwargs.pop("img_size", None) or image_path or image_url
+     ):
+         logger.warning(f"Skipping image arguments as they are not valid for {architecture}")
+ 
    #########
    # Compile
    #########
@@ -116,14 +200,34 @@ def main(
    #########
    # Execute
    #########
-     _ = qeff_model.generate(
-         tokenizer,
-         prompts=prompt,
-         device_id=device_group,
-         prompt=prompt,
-         prompts_txt_file_path=prompts_txt_file_path,
-         generation_len=generation_len,
-     )
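+     # Dispatch on architecture: image-text-to-text models run through execute_vlm_model,
+     # text-only models through the tokenizer-based generate path.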
+     if architecture in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES.values():
+         exec_info = execute_vlm_model(
+             qeff_model=qeff_model,
+             model_name=model_name,
+             prompt=prompt,
+             image_url=image_url,
+             image_path=image_path,
+             device_group=device_group,
+             local_model_dir=local_model_dir,
+             cache_dir=cache_dir,
+             hf_token=hf_token,
+             generation_len=generation_len,
+         )
+         print(exec_info)
+     else:
+         tokenizer = load_hf_tokenizer(
+             pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
+             cache_dir=cache_dir,
+             hf_token=hf_token,
+         )
+         _ = qeff_model.generate(
+             tokenizer,
+             prompts=prompt,
+             device_id=device_group,
+             prompt=prompt,
+             prompts_txt_file_path=prompts_txt_file_path,
+             generation_len=generation_len,
+         )


if __name__ == "__main__":
@@ -219,23 +323,25 @@ def main(
    parser.add_argument(
        "--enable_qnn",
        "--enable-qnn",
-         action="store_true",
+         nargs="?",
+         const=True,
+         type=str,
        default=False,
        help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
            If not provided, the default configuration will be used.\
            Sample Config: QEfficient/compile/qnn_config.json",
    )
-     parser.add_argument(
-         "qnn_config",
-         nargs="?",
-         type=str,
-     )

    args, compiler_options = parser.parse_known_args()
+ 
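+     # --enable_qnn may be passed bare (argparse then uses const=True) or with a config
+     # path; normalize the string form into args.qnn_config plus a boolean flag.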
+     if isinstance(args.enable_qnn, str):
+         args.qnn_config = args.enable_qnn
+         args.enable_qnn = True
+ 
    compiler_options_dict = {}
    for i in range(0, len(compiler_options)):
        if compiler_options[i].startswith("--"):
-             key = compiler_options[i].lstrip("-")
+             key = compiler_options[i].lstrip("-").replace("-", "_")
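+             # e.g. "--allocator-dealloc-delay" -> kwarg name "allocator_dealloc_delay"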
            value = (
                compiler_options[i + 1]
                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")