def load_adapter(self, model_id: str, adapter_name: str):
    """Loads a new adapter from the HuggingFace hub or a local path
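A minimal usage sketch for the adapter-loading flow above; only the load_adapter(model_id, adapter_name) signature comes from the source shown here, while the class name QEffAutoLoraModelForCausalLM, the base model and the adapter repos are illustrative assumptions.

# Hedged sketch: registering extra LoRA adapters on a base causal LM.
# Only load_adapter(model_id, adapter_name) is taken from the signature above;
# the class name, base model and adapter repos are illustrative assumptions.
from QEfficient import QEffAutoLoraModelForCausalLM

model = QEffAutoLoraModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Each call pulls an adapter from the HuggingFace hub (or a local path)
# and registers it under the given adapter name.
model.load_adapter("predibase/gsm8k", "gsm8k")
model.load_adapter("predibase/magicoder", "magicoder")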
diff --git a/_modules/QEfficient/peft/lora/auto.html b/_modules/QEfficient/peft/lora/auto.html
index 7c4e600f5..994aa253c 100644
--- a/_modules/QEfficient/peft/lora/auto.html
+++ b/_modules/QEfficient/peft/lora/auto.html
@@ -202,6 +202,10 @@
             raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
         self.model = model
         self.config = model.config
-        if self.model_name in self.UNSUPPORTED_MODELS:
-            raise NotImplementedError(f"kv_offload is not yet supported for {self.model.__class__.__name__}")
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model)
@@ -677,6 +691,7 @@
         ):
             self.export()

-        if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6:
-            logger.warning(
-                "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6"
-            )
-
         self.vision_model._compile(
             compile_dir,
             compile_only=True,
             specializations=specializations["vision"],
             convert_to_fp16=True,
-            mxfp6_matmul=False,
+            mxfp6_matmul=mxfp6_matmul,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             custom_io=custom_io_vision,
@@ -759,12 +769,12 @@
             if output_name.endswith("_RetainedState"):
                 custom_io[output_name] = kv_cache_dtype

-        if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul:
-            logger.warning(
-                f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True"
-            )
-
         self._compile(
             onnx_path,
             compile_dir,
@@ -1249,26 +1254,81 @@
 class QEFFAutoModelForImageTextToText:
     """
-    A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach
+    The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub.
+    While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose.
+    This class supports both single and dual QPC approaches.
+
+    Attributes:
+        _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models.
+
+ ``Mandatory`` Args:
+ :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory.
+
+ ``Optional`` Args:
+ :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True.
+
+ .. code-block:: python
+ import requests
+ from PIL import Image
+ from transformers import AutoProcessor, TextStreamer
+
+ from QEfficient import QEFFAutoModelForImageTextToText
+
+ # Add HuggingFace Token to access the model
+ HF_TOKEN = ""
+ model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+ query = "Describe this image."
+ image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+     ## STEP - 1 Load the Processor and Model; set kv_offload=True for dual QPC or False for single QPC
+     processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
+     model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False)
+
+ ## STEP - 2 Export & Compile the Model
+ model.compile(
+ prefill_seq_len=32,
+ ctx_len=512,
+ img_size=560,
+ num_cores=16,
+ num_devices=1,
+ mxfp6_matmul=False,
+ )
+
+ ## STEP - 3 Load and process the inputs for Inference
+ image = Image.open(requests.get(image_url, stream=True).raw)
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": query},
+ ],
+ }
+ ]
+ input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)]
+ inputs = processor(
+ text=input_text,
+ images=image,
+ return_tensors="pt",
+ add_special_tokens=False,
+ padding="max_length",
+         max_length=32,  # must match the prefill_seq_len used at compile time
+ )
+
+ ## STEP - 4 Run Inference on the compiled model
+ streamer = TextStreamer(processor.tokenizer)
+     model.generate(inputs=inputs, streamer=streamer, generation_len=128)
+
"""_hf_auto_class=AutoModelForImageTextToText
-    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs):
-        if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload:
-            # For models with mxfp6 accuracy issue, we will use kv_offload=True by default
-            if kv_offload is None:
-                kv_offload = True
-            else:
-                logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
-        elif kv_offload is None:
-            kv_offload = False
-
+    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs):
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
         else:
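With the change above, kv_offload now defaults to True. A brief hedged sketch of how the flag picks the implementation; the model card and extra kwargs are illustrative, while passing kv_offload through from_pretrained follows the docstring example above.

# Hedged sketch: kv_offload selects the dual vs. single QPC wrapper
# (model card and extra kwargs are illustrative).
from QEfficient import QEFFAutoModelForImageTextToText

# Dual QPC (the new default): vision encoder and language model run as two QPCs.
dual = QEFFAutoModelForImageTextToText.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct", attn_implementation="eager", kv_offload=True
)

# Single QPC: the whole model is compiled into one QPC.
single = QEFFAutoModelForImageTextToText.from_pretrained(
    "meta-llama/Llama-3.2-11B-Vision-Instruct", attn_implementation="eager", kv_offload=False
)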
@@ -1379,7 +1439,7 @@
         ``Optional`` Args:
             :onnx_path (str, optional): Path to pre-exported onnx model.
             :compile_dir (str, optional): Path for saving the qpc generated.
-            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
+            :encoder_ctx_len (int, optional): The maximum context length for the encoder, based on the AutoProcessor output. ``Defaults to the value in the model config; 1500 if not set there``.
+            :ctx_len (int, optional): The maximum length of context to keep for decoding. ``Defaults to 150``.
             :batch_size (int, optional): Batch size. ``Defaults to 1``.
             :num_devices (int): Number of devices the model needs to be compiled for. ``Defaults to 1``.
             :num_cores (int): Number of cores used to compile the model.
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
-            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False``.
+
+            Other args are not yet implemented for AutoModelForSpeechSeq2Seq.
+
+        Returns:
+            :str: Path of the compiled ``qpc`` package.
+        """
-        specializations = self.model.get_specializations(batch_size, encoder_ctx_len, decoder_ctx_len, feature_len)
+        specializations, compiler_options = self.model.get_specializations(
+            batch_size,
+            encoder_ctx_len,
+            ctx_len,
+            **compiler_options,
+        )

-        self._compile(
+        if full_batch_size:
+            logger.warning("Continuous batching is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if kv_cache_batch_size:
+            logger.warning("Prefix caching is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if mxint8_kv_cache:
+            logger.warning("mxint8 cache is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if num_speculative_tokens:
+            logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if enable_qnn or qnn_config:
+            logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        return self._compile(
+            onnx_path,
+            compile_dir,
+            compile_only=True,
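As a reading aid for this hunk, a hedged sketch of the compile call the docstring describes; the class name QEFFAutoModelForSpeechSeq2Seq and the Whisper checkpoint are assumptions, while the argument names and defaults come from the docstring above.

# Hedged sketch of the documented compile flow; class name and checkpoint
# are assumptions, argument names/defaults follow the docstring in this hunk.
from QEfficient import QEFFAutoModelForSpeechSeq2Seq

model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")
qpc_path = model.compile(
    encoder_ctx_len=1500,  # encoder context; 1500 is the documented fallback
    ctx_len=150,           # decode context, per the documented default
    batch_size=1,
    num_devices=1,
    num_cores=16,
    mxfp6_matmul=False,
)
print(qpc_path)  # compile() returns the path of the compiled qpc package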
@@ -1827,7 +1923,6 @@
         ``Mandatory`` Args:
             :processor: autoprocessor to process inputs and decode logits
-            :inputs (np.ndarray): inputs to run the execution.
+            :inputs (torch.Tensor): inputs to run the execution.
             :generation_len (int): length up to which to generate
-            :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
             :device_id (List[int]): Ids of devices for running the qpc. Pass as [0] for a normal model, or [0, 1, 2, 3] for a tensor-sliced model.

         Returns:
             :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
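A matching hedged sketch of the generate call documented above; the checkpoint and the silent dummy audio are purely illustrative, and the parameter names follow the docstring.

# Hedged sketch of generate() as documented above; the dummy audio and the
# checkpoint are illustrative, parameter names follow the docstring.
import numpy as np
from transformers import AutoProcessor
from QEfficient import QEFFAutoModelForSpeechSeq2Seq

processor = AutoProcessor.from_pretrained("openai/whisper-tiny")
model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny")
model.compile(num_cores=16)

# One second of silence at 16 kHz, just to show the call shape.
audio = np.zeros(16000, dtype=np.float32)
inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

output = model.generate(processor=processor, inputs=inputs, generation_len=25)
print(output)  # dict with output from the AI_100 or PyTorch runtime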
@@ -1849,9 +1943,20 @@
 Install the packages required for building documentation:

 pip install -r docs/requirements.txt

 And then, change directory to the docs folder to build the docs.

 cd docs/
+# To build docs specific to the branch
+sphinx-build -M html . build
+# [Optional] To build docs for all the supported branches
+sphinx-multiversion . build