Docs string added for the Image class and granite models are added in validation page (#303)

abukhoy · web-flow · commit 01dffb6c1893 · 2025-03-06T11:56:27.000+05:30
Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -1147,9 +1147,70 @@ def get_model_config(self) -> dict:
 
 class QEFFAutoModelForImageTextToText:
     """
-    A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach
+    The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub.
+    While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose. This class supports both single and dual QPC approaches.
     Attributes:
         _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models.
+
+    ``Mandatory`` Args:
+        :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory.
+
+    ``Optional`` Args:
+        :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True.
+
+    .. code-block:: python
+
+        import requests
+        from PIL import Image
+        from transformers import AutoProcessor, TextStreamer
+
+        from QEfficient import QEFFAutoModelForImageTextToText
+
+        # Add HuggingFace Token to access the model
+        HF_TOKEN = ""
+        model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+        query = "Describe this image."
+        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+        ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc
+        processor = AutoProcessor.from_pretrained(model_name, token=HF_TOKEN)
+        model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=HF_TOKEN, attn_implementation="eager", kv_offload=False)
+
+        ## STEP - 2 Export & Compile the Model
+        model.compile(
+            prefill_seq_len=32,
+            ctx_len=512,
+            img_size=560,
+            num_cores=16,
+            num_devices=1,
+            mxfp6_matmul=False,
+        )
+
+        ## STEP - 3 Load and process the inputs for Inference
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": query},
+                ],
+            }
+        ]
+        input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)]
+        inputs = processor(
+            text=input_text,
+            images=image,
+            return_tensors="pt",
+            add_special_tokens=False,
+            padding="max_length",
+            max_length=32,  # must match the prefill_seq_len passed to compile()
+        )
+
+        ## STEP - 4 Run Inference on the compiled model
+        streamer = TextStreamer(processor.tokenizer)
+        model.generate(inputs=inputs, streamer=streamer, generation_len=128)
+
     """
 
     _hf_auto_class = AutoModelForImageTextToText
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
@@ -239,7 +239,7 @@ Use the qualcomm_efficient_converter API to export the KV transformed Model to O
 
 generated_qpc_path = qeff_model.compile(
     num_cores=14,
-    mxfp6=True,
+    mxfp6_matmul=True,
 )
 ```
 
@@ -250,8 +250,8 @@ Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/s
 ```Python
 # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100
 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach
-
-qeff_model.generate(prompts=["My name is"])
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+qeff_model.generate(prompts=["My name is"], tokenizer=tokenizer)
 ```
 End to End demo examples for various models are available in **notebooks** directory. Please check them out.
 
diff --git a/docs/source/validate.md b/docs/source/validate.md
@@ -41,13 +41,15 @@
 
 | Architecture | Model Family | Representative Models          |
 |--------------|--------------|---------------------------------|
-| **BertModel** | BERT-based   | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)<br> [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)<br>[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <br>[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2)          |
+| **BertModel** | BERT-based   | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)<br> [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)<br>[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <br>[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) |
 | **LlamaModel** | Llama-based  | [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
-| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) |
-| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) |
 | **MPNetForMaskedLM** | MPNet | [sentence-transformers/multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1) |
-| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) |
 | **MistralModel** | Mistral | [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
+| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) |
+| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) |
+| **RobertaModel** | RoBERTa | [ibm-granite/granite-embedding-30m-english](https://huggingface.co/ibm-granite/granite-embedding-30m-english)<br> [ibm-granite/granite-embedding-125m-english](https://huggingface.co/ibm-granite/granite-embedding-125m-english) |
+| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) |
+| **XLMRobertaModel** | XLM-RoBERTa | [ibm-granite/granite-embedding-107m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-107m-multilingual)<br> [ibm-granite/granite-embedding-278m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual) |
 
 ## Multimodal Language Models