From 909189dedcf4c5641048f0a1cd60aeb107afcf99 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 00:39:03 +0530
Subject: [PATCH 001/138] added initial version of SwiftKV for AI 100

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py        |  29 ++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 411 ++++++++++++++++++
 exps/run_swiftkv.py                           |  28 ++
 3 files changed, 468 insertions(+)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
 create mode 100644 exps/run_swiftkv.py

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index a5c375c6e..fe56b197c 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -36,6 +36,35 @@ class QEffDynamicCache(DynamicCache):
 
     """
 
+    def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            position_ids = cache_kwargs.get("position_ids")
+            self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
+            self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
+
+    def read_only(self, layer_idx, cache_kwargs):
+        position_ids = cache_kwargs.get("position_ids")
+        ctx_len = position_ids.shape[-1]
+        ctx_indices = torch.arange(ctx_len)[None, None, ...]
+        gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
+        invalid_mask = ctx_indices > gather_limit
+
+        if torch.onnx.is_in_onnx_export():
+            invalid_idx_value = torch.iinfo(torch.int32).max
+        else:
+            invalid_idx_value = 0
+
+        ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
+        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
+        k_out = CtxGatherFunc.apply(k_out, ctx_indices)
+        v_out = CtxGatherFunc.apply(v_out, ctx_indices)
+        v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
+        return k_out, v_out
+
     def update(
         self,
         key_states: torch.Tensor,
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
new file mode 100644
index 000000000..a33c83d3a
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -0,0 +1,411 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers.cache_utils import Cache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaRMSNorm, repeat_kv
+
+from QEfficient.transformers.cache_utils import QEffDynamicCache
+from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
+from QEfficient.transformers.models.llama.modeling_llama import (
+    QEffLlamaDecoderLayer,
+    QEffLlamaRotaryEmbedding,
+    qeff_apply_rotary_pos_emb,
+)
+
+logger = logging.get_logger(__name__)
+
+
+class LlamaSwiftKVAttention(LlamaAttention):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__(config, layer_idx)
+        self.hidden_size = config.hidden_size
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj_swiftkv = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj_swiftkv = nn.Linear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj_swiftkv = nn.Linear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.rotary_emb = QEffLlamaRotaryEmbedding(config=self.config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask=None,
+    ) -> torch.Tensor:
+        bsz, q_len, _ = hidden_states.size()
+        query, _ = self.q_proj_swiftkv(hidden_states)
+
+        # Reshape the query, key, and value tensors.
+        query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = position_ids.shape[-1]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        key_states, value_states = past_key_value.read_only(self.layer_idx, position_ids=position_ids)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, _ = qeff_apply_rotary_pos_emb(query_states, torch.empty_like(key_states), cos, sin, position_ids)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, past_key_value
+
+
+class LlamaSwiftKVDecoderLayer(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, position_ids: torch.Tensor, past_key_values, causal_mask
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states, past_key_values = self.self_attn(
+            hidden_states=hidden_states,
+            position_ids=position_ids,
+            past_key_value=past_key_values,
+            attention_mask=causal_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, past_key_values
+
+
+class LlamaSwiftKVModel(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        self.embed_tokens = nn.Embedding(
+            self.vocab_size, config.hidden_size, None
+        )  # TODO: Not sure if padding_idx shoudl eb NONE
+        self.layers = torch.nn.ModuleList(
+            [
+                QEffLlamaDecoderLayer(config=config, layer_idx=idx)
+                if idx < config.num_key_value_layers
+                else LlamaSwiftKVDecoderLayer(config=config, layer_idx=idx)
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm_swiftkv = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def _run_swiftkv_layers(
+        self, hidden_states: torch.Tensor, position_ids: torch.Tensor, past_key_values, causal_mask
+    ) -> torch.Tensor:
+        for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
+            layer = self.layers[layer_idx]
+
+            hidden_states, past_key_values = layer(hidden_states, position_ids, past_key_values, causal_mask)
+
+        return hidden_states, past_key_values
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        self.config._attn_implementation = "eager"
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+            else:
+                causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        position_ids: torch.Tensor,
+        past_key_values: List[torch.Tensor],
+    ):
+        inputs_embeds = self.embed_tokens(input_ids)
+
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        use_cache = True
+
+        if use_cache and not isinstance(past_key_values, Cache):
+            if past_key_values is None:
+                past_key_values = QEffDynamicCache()
+            else:
+                past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+        )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(None, inputs_embeds, cache_position, past_key_values, False)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        next_decoder_cache = None
+
+        for layer_idx in range(self.config.num_key_value_layers):
+            layer = self.layers[layer_idx]
+            hidden_states, next_decoder_cache = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=False,
+                use_cache=True,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+        swiftkv_hidden_states = self.norm_swiftkv(hidden_states)
+
+        ####################################
+        ## THE MAGIC OF SWIFT KV BEGINS HERE
+        ####################################
+        for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
+            self_attn = self.layers[layer_idx].self_attn
+            key_states = self_attn.k_proj_swiftkv(swiftkv_hidden_states)
+            value_states = self_attn.v_proj_swiftkv(swiftkv_hidden_states)
+            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+            kv_seq_len = key_states.shape[-2]
+            if past_key_values is not None:
+                if self.layer_idx is None:
+                    raise ValueError(
+                        f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                        "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                        "with a layer index."
+                    )
+                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
+
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            _, key_states = qeff_apply_rotary_pos_emb(
+                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
+            )
+            cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
+            past_key_values.write_only(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        hidden_states, next_decoder_cache = self._run_swiftkv_layers(
+            hidden_states, position_ids, past_key_values, causal_mask
+        )
+        ####################################
+        ## THE MAGIC OF SWIFT KV ENDS HERE
+        ####################################
+
+        next_cache = next_decoder_cache.to_legacy_cache()
+        return hidden_states, next_cache
+
+
+class LlamaSwiftKVForCausalLM(nn.Module):
+    """
+    # packed_modules_mapping = {
+    #     "kv_proj_swiftkv": ["k_proj_swiftkv", "v_proj_swiftkv"],
+    #     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    #     "gate_up_proj": ["gate_proj", "up_proj"],
+    # }
+
+    # # BitandBytes specific attributes
+    # default_bitsandbytes_target_modules = [
+    #     ".gate_proj.",
+    #     ".down_proj.",
+    #     ".up_proj.",
+    #     ".q_proj.",
+    #     ".k_proj.",
+    #     ".v_proj.",
+    #     ".o_proj.",
+    #     ".k_proj_swiftkv.",
+    #     ".v_proj_swiftkv.",
+    # ]
+
+    # # in TP, these weights are partitioned along the column dimension (dim=-1)
+    # column_parallel_weights_modules = [
+    #     ".q_proj_swiftkv.",
+    #     ".down_proj.",
+    #     ".o_proj.",
+    # ]
+    # bitsandbytes_stacked_params_mapping = {
+    #     # shard_name, weight_name, index
+    #     "k_proj_swiftkv": ("kv_proj_swiftkv", 1),
+    #     "v_proj_swiftkv": ("kv_proj_swiftkv", 2),
+    #     "q_proj": ("qkv_proj", 0),
+    #     "k_proj": ("qkv_proj", 1),
+    #     "v_proj": ("qkv_proj", 2),
+    #     "gate_proj": ("gate_up_proj", 0),
+    #     "up_proj": ("gate_up_proj", 1),
+    # }
+    """
+
+    def __init__(self, *, config):
+        super().__init__()
+
+        self.model = LlamaSwiftKVModel(
+            config=config,
+        )
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Optional[Union[List[torch.FloatTensor]]] = None,
+    ):
+        hidden_states, output_past_key_values = self.model(input_ids, position_ids, past_key_values)
+        logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        hidden_states = hidden_states[torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
+        logits = self.lm_head(hidden_states)
+        return logits, output_past_key_values
diff --git a/exps/run_swiftkv.py b/exps/run_swiftkv.py
new file mode 100644
index 000000000..cf180f609
--- /dev/null
+++ b/exps/run_swiftkv.py
@@ -0,0 +1,28 @@
+import json
+import os
+
+from safetensors import safe_open
+
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
+WEIGHTS = "/local/mnt/workspace/open-source/myown/efficient-transformers/cache_dir/swiftkv_model_weights"
+
+
+def load_safetensors(path):
+    state_dict = {}
+    f = safe_open(path, framework="pt", device="cpu")
+    for key in f.keys():
+        tensor = f.get_tensor(key)
+        state_dict[key] = tensor
+    return state_dict
+
+
+config = json.load(open(os.path.join(WEIGHTS, "config.json"), "r"))
+
+config.num_hidden_layers = 1
+
+model = LlamaSwiftKVForCausalLM(config=config)
+state_dict_0 = load_safetensors(os.path.join(WEIGHTS, "model-00001-of-00009.safetensors"))
+
+for k in model.state_dict().keys() - state_dict_0.keys():
+    del state_dict_0[k]

From ef47eb9abde88dbe5303680a7f36529c4ead822a Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:36:22 +0530
Subject: [PATCH 002/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py    | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index a33c83d3a..5b5fcd77f 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -22,7 +22,6 @@
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-import logging
 import math
 from typing import List, Optional, Tuple, Union
 
@@ -30,7 +29,7 @@
 from torch import nn
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaRMSNorm, repeat_kv
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -40,12 +39,10 @@
     qeff_apply_rotary_pos_emb,
 )
 
-logger = logging.get_logger(__name__)
 
-
-class LlamaSwiftKVAttention(LlamaAttention):
+class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config, layer_idx) -> None:
-        super().__init__(config, layer_idx)
+        super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
         self.hidden_size = config.hidden_size
@@ -56,7 +53,7 @@ def __init__(self, config, layer_idx) -> None:
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
         self.is_causal = True
-
+        self.layer_idx = layer_idx
         self.q_proj_swiftkv = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
         self.k_proj_swiftkv = nn.Linear(
             self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias

From f43b345109f06b10cee6bd3e2b6f4e9912649a45 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:39:46 +0530
Subject: [PATCH 003/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 5b5fcd77f..2022d2c9b 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -63,7 +63,7 @@ def __init__(self, config, layer_idx) -> None:
         )
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
 
-        self.rotary_emb = QEffLlamaRotaryEmbedding(config=self.config)
+        self.rotary_emb = QEffLlamaRotaryEmbedding(config=config)
 
     def forward(
         self,

From 5fbc10b2e8a7a05932d1c434e1176a402a67d306 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:46:12 +0530
Subject: [PATCH 004/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 2022d2c9b..4f22e82e0 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -394,6 +394,7 @@ def __init__(self, *, config):
         )
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.config = config
 
     def forward(
         self,

From a6a3727b24c400d73a69f37fc2499fa0769c3514 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:07:57 +0530
Subject: [PATCH 005/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4f22e82e0..24b88746a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -286,7 +286,9 @@ def forward(
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = self._update_causal_mask(None, inputs_embeds, cache_position, past_key_values, False)
+        causal_mask = self._update_causal_mask(
+            None, inputs_embeds, cache_position, position_ids, past_key_values, False
+        )
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers

From 5c094e2607203eb774092ce8209eef300f7d8bda Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:16:52 +0530
Subject: [PATCH 006/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 24b88746a..8eaef4521 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -292,7 +292,7 @@ def forward(
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
         next_decoder_cache = None
 
         for layer_idx in range(self.config.num_key_value_layers):
@@ -305,7 +305,7 @@ def forward(
                 output_attentions=False,
                 use_cache=True,
                 cache_position=cache_position,
-                position_embeddings=position_embeddings,
+                position_embeddings=None,
             )
 
         bsz, q_len, _ = hidden_states.size()

From 52598734770721d1da3ce0e63ec23e88d564034c Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:23:24 +0530
Subject: [PATCH 007/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py   | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 8eaef4521..19887c77e 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -123,6 +123,8 @@ class LlamaSwiftKVDecoderLayer(nn.Module):
     def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.num_key_value_heads = config.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)
@@ -318,8 +320,10 @@ def forward(
             self_attn = self.layers[layer_idx].self_attn
             key_states = self_attn.k_proj_swiftkv(swiftkv_hidden_states)
             value_states = self_attn.v_proj_swiftkv(swiftkv_hidden_states)
-            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            key_states = key_states.view(bsz, q_len, self_attn.num_key_value_heads, self_attn.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, q_len, self_attn.num_key_value_heads, self_attn.head_dim).transpose(
+                1, 2
+            )
 
             kv_seq_len = key_states.shape[-2]
             if past_key_values is not None:
@@ -331,12 +335,12 @@ def forward(
                     )
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
 
-            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
             _, key_states = qeff_apply_rotary_pos_emb(
                 torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
             )
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
-            past_key_values.write_only(key_states, value_states, self.layer_idx, cache_kwargs)
+            past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask

From 39034c87018340ac10ec704a1bb89053019bbe2a Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 12:14:45 +0530
Subject: [PATCH 008/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 19887c77e..20a91ef45 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -124,7 +124,7 @@ def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)

From cd017147785180033d791ce364e7cade581d5700 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 12:33:39 +0530
Subject: [PATCH 009/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 20a91ef45..b4160a312 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -124,7 +124,6 @@ def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)

From a9539bff9476c889ebbc884de08105497f1c304b Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:05:36 +0530
Subject: [PATCH 010/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index b4160a312..4d8bfb754 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -326,13 +326,13 @@ def forward(
 
             kv_seq_len = key_states.shape[-2]
             if past_key_values is not None:
-                if self.layer_idx is None:
+                if self_attn.layer_idx is None:
                     raise ValueError(
-                        f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                        f"The cache structure has changed since version v4.36. If you are using {self_attn.__class__.__name__} "
                         "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                         "with a layer index."
                     )
-                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
+                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
             _, key_states = qeff_apply_rotary_pos_emb(

From 4bafed03bf5ac4e691dacc8ad7121f5b015e7e55 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:08:51 +0530
Subject: [PATCH 011/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4d8bfb754..4015a6c95 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -335,9 +335,7 @@ def forward(
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
-            _, key_states = qeff_apply_rotary_pos_emb(
-                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
-            )
+            _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids)
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 

From c015d636edd83fa2d6d94b5b38df049fa3cf2b44 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:18:16 +0530
Subject: [PATCH 012/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4015a6c95..8ba2ad78e 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -73,7 +73,7 @@ def forward(
         attention_mask=None,
     ) -> torch.Tensor:
         bsz, q_len, _ = hidden_states.size()
-        query, _ = self.q_proj_swiftkv(hidden_states)
+        query = self.q_proj_swiftkv(hidden_states)
 
         # Reshape the query, key, and value tensors.
         query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

From b35cdd497216c2892be2f58ef4f59a3fd1cf41ad Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 19 Dec 2024 16:21:20 +0530
Subject: [PATCH 013/138] all bugfixes in

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 8ba2ad78e..d93d7cb44 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -90,7 +90,11 @@ def forward(
 
         key_states, value_states = past_key_value.read_only(self.layer_idx, position_ids=position_ids)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, _ = qeff_apply_rotary_pos_emb(query_states, torch.empty_like(key_states), cos, sin, position_ids)
+        position_idx = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        position_ids = position_ids[:, position_idx[0]]
+        query_states, _ = qeff_apply_rotary_pos_emb(
+            query_states, torch.empty_like(query_states), cos, sin, position_ids
+        )
 
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
@@ -160,9 +164,7 @@ def __init__(self, config):
         self.vocab_size = config.vocab_size
         self.config = config
 
-        self.embed_tokens = nn.Embedding(
-            self.vocab_size, config.hidden_size, None
-        )  # TODO: Not sure if padding_idx shoudl eb NONE
+        self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, None)
         self.layers = torch.nn.ModuleList(
             [
                 QEffLlamaDecoderLayer(config=config, layer_idx=idx)
@@ -179,9 +181,9 @@ def _run_swiftkv_layers(
     ) -> torch.Tensor:
         for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
             layer = self.layers[layer_idx]
-
             hidden_states, past_key_values = layer(hidden_states, position_ids, past_key_values, causal_mask)
 
+        hidden_states = self.norm(hidden_states)
         return hidden_states, past_key_values
 
     def _update_causal_mask(
@@ -339,15 +341,21 @@ def forward(
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 
+        last_pos_id = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        orig_hidden_states = hidden_states
+        hidden_states = orig_hidden_states[:, last_pos_id[0], :]
+        causal_mask = causal_mask[:, :, last_pos_id[0], :]
+
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask
         )
+        orig_hidden_states[:, last_pos_id[0], :] = hidden_states
         ####################################
         ## THE MAGIC OF SWIFT KV ENDS HERE
         ####################################
 
         next_cache = next_decoder_cache.to_legacy_cache()
-        return hidden_states, next_cache
+        return orig_hidden_states, next_cache
 
 
 class LlamaSwiftKVForCausalLM(nn.Module):

From c5914a51d0c6a574532f55e1e414d62292dfd396 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 19 Dec 2024 16:21:56 +0530
Subject: [PATCH 014/138] added init file

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/llama_swiftkv/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/__init__.py

diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
new file mode 100644
index 000000000..e69de29bb

From 23df77759b91dfa2249701d73f068945f8f081d6 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 9 Jan 2025 16:38:13 +0530
Subject: [PATCH 015/138] all changes except BQA are in with this

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index fe56b197c..2a07d9f10 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -47,8 +47,9 @@ def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
             self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
     def read_only(self, layer_idx, cache_kwargs):
+        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         position_ids = cache_kwargs.get("position_ids")
-        ctx_len = position_ids.shape[-1]
+        ctx_len = k_out.shape[2]
         ctx_indices = torch.arange(ctx_len)[None, None, ...]
         gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
         invalid_mask = ctx_indices > gather_limit
@@ -59,7 +60,7 @@ def read_only(self, layer_idx, cache_kwargs):
             invalid_idx_value = 0
 
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
-        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
+
         k_out = CtxGatherFunc.apply(k_out, ctx_indices)
         v_out = CtxGatherFunc.apply(v_out, ctx_indices)
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)

From f7bad4b8d9d85351537817762295faea4c800954 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Wed, 5 Feb 2025 09:20:06 +0530
Subject: [PATCH 016/138] more updates

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/__init__.py          |  6 ++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 68 +++----------------
 2 files changed, 14 insertions(+), 60 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
index e69de29bb..d259e435a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/__init__.py
+++ b/QEfficient/transformers/models/llama_swiftkv/__init__.py
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index d93d7cb44..365f0b6d2 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,25 +1,13 @@
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# -----------------------------------------------------------------------------
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# -----------------------------------------------------------------------------
+# This file is adapted from vllm implementation by snowflake here: https://github.com/Snowflake-Labs/vllm/blob/swiftkv/vllm/model_executor/models/llama_swiftkv.py
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
 import math
@@ -294,8 +282,6 @@ def forward(
         )
         hidden_states = inputs_embeds
 
-        # create position embeddings to be shared across the decoder layers
-        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
         next_decoder_cache = None
 
         for layer_idx in range(self.config.num_key_value_layers):
@@ -359,44 +345,6 @@ def forward(
 
 
 class LlamaSwiftKVForCausalLM(nn.Module):
-    """
-    # packed_modules_mapping = {
-    #     "kv_proj_swiftkv": ["k_proj_swiftkv", "v_proj_swiftkv"],
-    #     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    #     "gate_up_proj": ["gate_proj", "up_proj"],
-    # }
-
-    # # BitandBytes specific attributes
-    # default_bitsandbytes_target_modules = [
-    #     ".gate_proj.",
-    #     ".down_proj.",
-    #     ".up_proj.",
-    #     ".q_proj.",
-    #     ".k_proj.",
-    #     ".v_proj.",
-    #     ".o_proj.",
-    #     ".k_proj_swiftkv.",
-    #     ".v_proj_swiftkv.",
-    # ]
-
-    # # in TP, these weights are partitioned along the column dimension (dim=-1)
-    # column_parallel_weights_modules = [
-    #     ".q_proj_swiftkv.",
-    #     ".down_proj.",
-    #     ".o_proj.",
-    # ]
-    # bitsandbytes_stacked_params_mapping = {
-    #     # shard_name, weight_name, index
-    #     "k_proj_swiftkv": ("kv_proj_swiftkv", 1),
-    #     "v_proj_swiftkv": ("kv_proj_swiftkv", 2),
-    #     "q_proj": ("qkv_proj", 0),
-    #     "k_proj": ("qkv_proj", 1),
-    #     "v_proj": ("qkv_proj", 2),
-    #     "gate_proj": ("gate_up_proj", 0),
-    #     "up_proj": ("gate_up_proj", 1),
-    # }
-    """
-
     def __init__(self, *, config):
         super().__init__()
 

From 2a37e62be793a2f49be3a2e55bba95489d40fd3c Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 06:17:43 +0000
Subject: [PATCH 017/138] Enabling the SwiftKV model in the QEFF Infra

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py     | 19 ++++++++
 .../llama_swiftkv/config_llama_swiftkv.py     | 45 +++++++++++++++++++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 17 ++++---
 .../transformers/models/modeling_auto.py      |  6 +++
 QEfficient/utils/_utils.py                    |  2 +-
 5 files changed, 82 insertions(+), 7 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index ccad5e020..aec82e8cd 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -153,6 +153,9 @@
     QEffWhisperPositionalEmbedding,
 )
 
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -362,3 +365,19 @@ def _create_causal_mask(
         attention_mask = attention_mask.unsqueeze(1)
 
     return attention_mask
+
+
+# Define a SwiftKV Model card name to Model type dictionary
+# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
+SwiftKVModelCardNameToSwiftKVModelTypeDict: Dict[Type[str], Type[str]] = {
+    # LlamaSwiftKV Model
+    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct": "llama_swiftkv"
+}
+
+# Define a SwiftKV Model type to ConfigClass and ModelArchitecture class dictionary
+# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
+SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
+    # LlamaSwiftKV Model
+    "llama_swiftkv" : [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+}
+
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
new file mode 100644
index 000000000..fa97388de
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -0,0 +1,45 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+
+
+from typing import Optional
+from transformers import LlamaConfig
+
+
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (
+            self.num_hidden_layers - self.num_key_value_layers
+        ) % self.key_value_group_size == 0
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 365f0b6d2..e2bd5a08a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -18,6 +18,7 @@
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
+from transformers.modeling_utils import PreTrainedModel
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -26,10 +27,10 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
 class LlamaSwiftKVAttention(nn.Module):
-    def __init__(self, config, layer_idx) -> None:
+    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
@@ -112,7 +113,7 @@ def forward(
 
 
 class LlamaSwiftKVDecoderLayer(nn.Module):
-    def __init__(self, config, layer_idx) -> None:
+    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
@@ -147,7 +148,9 @@ def forward(
 
 
 class LlamaSwiftKVModel(nn.Module):
-    def __init__(self, config):
+    config_class = LlamaSwiftKVConfig
+
+    def __init__(self, config: LlamaSwiftKVConfig):
         super().__init__()
         self.vocab_size = config.vocab_size
         self.config = config
@@ -344,8 +347,10 @@ def forward(
         return orig_hidden_states, next_cache
 
 
-class LlamaSwiftKVForCausalLM(nn.Module):
-    def __init__(self, *, config):
+class LlamaSwiftKVForCausalLM(PreTrainedModel):
+    config_class = LlamaSwiftKVConfig
+
+    def __init__(self, *, config: LlamaSwiftKVConfig):
         super().__init__()
 
         self.model = LlamaSwiftKVModel(
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index b8b5981cd..c543da036 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,6 +7,7 @@
 
 import hashlib
 import warnings
+
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -51,6 +52,7 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
+from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"]
 
@@ -78,6 +80,10 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
+
+        # Load the SwiftKV model if supported
+        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
+
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 8344a053d..5b205ffbc 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -19,7 +19,7 @@
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
-
+from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
 
 class DownloadRetryLimitExceeded(Exception):
     """

From b280225e170feb0f8ef40b62ff4762cce39e98a7 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 27 Feb 2025 15:16:14 +0530
Subject: [PATCH 018/138] rebased

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py     |  3 +-
 .../llama_swiftkv/config_llama_swiftkv.py     |  6 +-
 .../llama_swiftkv/modeling_llama_swiftkv.py   |  1 +
 .../transformers/models/modeling_auto.py      |  1 -
 QEfficient/utils/_utils.py                    | 78 ++++++++++++++++++-
 5 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index aec82e8cd..42244e288 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -378,6 +378,5 @@ def _create_causal_mask(
 # While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
 SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
     # LlamaSwiftKV Model
-    "llama_swiftkv" : [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
 }
-
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
index fa97388de..77eeb61a3 100644
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -9,8 +9,6 @@
 
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-
-
 from typing import Optional
 from transformers import LlamaConfig
 
@@ -40,6 +38,4 @@ def __init__(
         self.swiftkv = swiftkv
         self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
         self.key_value_group_size = key_value_group_size or 1
-        assert (
-            self.num_hidden_layers - self.num_key_value_layers
-        ) % self.key_value_group_size == 0
\ No newline at end of file
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index e2bd5a08a..4d6888bc7 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -29,6 +29,7 @@
 )
 from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
+
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c543da036..feda125ef 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -80,7 +80,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         # Load the SwiftKV model if supported
         QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 5b205ffbc..e9b58d209 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,6 +8,8 @@
 import json
 import os
 import subprocess
+import sys
+import warnings
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -15,11 +17,21 @@
 import torch
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
+
+from QEfficient.transformers.modeling_utils import (
+    SwiftKVModelCardNameToSwiftKVModelTypeDict,
+    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
+)
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
-from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
+
 
 class DownloadRetryLimitExceeded(Exception):
     """
@@ -442,3 +454,63 @@ class IOInfo:
 
     def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
+
+
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
+
+
+def register_swiftKV_model(model_type, SwiftkvConfigCls, SwiftKVModelCls):
+    """
+    Register the SwiftKV Models
+    ---------------------------------------
+    : model_type: str: name of the swiftKVModel for example llama_swiftkv
+    : SwiftkVConfigCls: SwiftKV Config class for example LlamaSwiftKVConfig
+    : SwiftKVModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
+    """
+
+    # Register the SwiftKV Config class using AutoConfig
+    AutoConfig.register(model_type, SwiftkvConfigCls)
+
+    # Construct the AutoModel class name using SwiftKVModel Class name, this code is written to make things generic
+    swiftKvModelName = SwiftKVModelCls.__name__
+    start_index = swiftKvModelName.find("SwiftKVFor")
+
+    # Calculate the index after "SwiftKVFor"
+    substring_start = start_index + len("SwiftKVFor")
+
+    # Get the substring after "SwiftKVFor"
+    swiftKVModel = swiftKvModelName[substring_start:]
+
+    AutoModelName = "AutoModelFor" + swiftKVModel
+
+    # Convert the string to class name
+    AutoModelClassName = convert_str_to_class(AutoModelName)
+
+    # Register the SwiftKVModel Class and config class using AutoModelClass
+    AutoModelClassName.register(SwiftkvConfigCls, SwiftKVModelCls)
+
+
+def QEFFLoadSwiftKVModels(pretrained_model_name_or_path):
+    """
+    Load the SwiftKV Models
+    ---------------------------------------
+    : pretrained_model_name_or_path: str: name of the swiftKVModel for example Snowflake/Llama-3.1-SwiftKV-8B-Instruct
+    """
+    try:
+        modelType = SwiftKVModelCardNameToSwiftKVModelTypeDict[pretrained_model_name_or_path]
+
+        SwiftKVConfigCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][0]
+        SwiftKVModelArchCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][1]
+
+        register_swiftKV_model(modelType, SwiftKVConfigCls, SwiftKVModelArchCls)
+
+    except KeyError:
+        warnings.warn("Requested SwiftKVModel is currently not supported... stay tuned for future releases", Warning)

From 9f5bca6e7eb38231bb8ef35c62ec727434cc0e56 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 18:45:54 +0000
Subject: [PATCH 019/138] moving registration of non transformer models during
 initialization of QEfficient

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        | 18 ++++-
 QEfficient/transformers/modeling_utils.py     | 76 +++++++++++++++----
 .../models/llama_swiftkv/__init__.py          |  2 +-
 .../llama_swiftkv/config_llama_swiftkv.py     | 41 ----------
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 30 +++++++-
 .../transformers/models/modeling_auto.py      |  3 -
 QEfficient/utils/_utils.py                    | 66 ----------------
 7 files changed, 108 insertions(+), 128 deletions(-)
 delete mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 4deb929c4..e5f9cec78 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -1,12 +1,28 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
 
 from QEfficient.utils.logging_utils import logger
+from transformers import AutoConfig
+from QEfficient.transformers.modeling_utils import (
+    get_model_class_type_from_model_type,
+    get_auto_model_class,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+)
 
+# loop over all the models which are not present in transformers and register them
+for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
+    # Register the config class based on model type
+    AutoConfig.register(key, value[0])
+
+    model_class_type = get_model_class_type_from_model_type(key)
+    AutoModelClassName = get_auto_model_class(model_class_type, value[1])
+
+    # Register the non transformer library Class and config class using AutoModelClass
+    AutoModelClassName.register(value[0], value[1])
 
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 42244e288..9619cb816 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -7,6 +7,7 @@
 
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
+import sys
 
 import torch
 import torch.nn as nn
@@ -86,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 from .models.codegen.modeling_codegen import (
@@ -153,8 +155,11 @@
     QEffWhisperPositionalEmbedding,
 )
 
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVForCausalLM,
+    LlamaSwiftKVConfig
+)
 
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
@@ -274,6 +279,19 @@
     WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration,
 }
 
+# Map of model type to config class and Model architecture class
+# While onboarding new models make sure to add the new model card names to this dictionary.
+MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
+    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+}
+
+# list of sub-strings representing the model type, this is typically taken from llama-swiftkv
+LIST_OF_MODEL_TYPES = {"swiftkv"}
+
+# list of sub-strings used for representing the model Architecture class name, for example LlamaSwiftKVForCausalLM
+MODEL_TYPE_TO_MODEL_CLASS_TYPE = {
+    "swiftkv": "SwiftKVFor"
+}
 
 def _prepare_cross_attention_mask(
     cross_attention_mask: torch.Tensor,
@@ -366,17 +384,47 @@ def _create_causal_mask(
 
     return attention_mask
 
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
 
-# Define a SwiftKV Model card name to Model type dictionary
-# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
-SwiftKVModelCardNameToSwiftKVModelTypeDict: Dict[Type[str], Type[str]] = {
-    # LlamaSwiftKV Model
-    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct": "llama_swiftkv"
-}
 
-# Define a SwiftKV Model type to ConfigClass and ModelArchitecture class dictionary
-# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
-SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
-    # LlamaSwiftKV Model
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
-}
+def get_auto_model_class(model_type, NonTransformerModelCls):
+    """
+    Register the Non Transformer Models like swiftkv
+    ---------------------------------------
+    : model_type: str: name of the Non Transformer model for example llama_swiftkv
+    : NonTransformerModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
+    """
+
+    # Construct the AutoModel class name using NonTransformerModel class e.g. SwiftKVModel Class name, this code is written to make things generic
+    nonTransformerModelClsName = NonTransformerModelCls.__name__
+    start_index = nonTransformerModelClsName.find(model_type)
+
+    # Calculate the index after model_type example "SwiftKVFor"
+    substring_start = start_index + len(model_type)
+
+    # Get the substring after model_type example "SwiftKVFor"
+    nonTransformerModel = nonTransformerModelClsName[substring_start:]
+
+    autoModelName = "AutoModelFor" + nonTransformerModel
+
+    # Convert the string to class name
+    autoModelClassName = convert_str_to_class(autoModelName)
+
+    return autoModelClassName
+
+def get_model_class_type_from_model_type(model_type):
+    for substring in LIST_OF_MODEL_TYPES:
+        if (substring in model_type):
+            model_class_type = substring
+            break
+
+    model_class_name = MODEL_TYPE_TO_MODEL_CLASS_TYPE[model_class_type]
+    return model_class_name
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
index d259e435a..72ba36c8a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/__init__.py
+++ b/QEfficient/transformers/models/llama_swiftkv/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
deleted file mode 100644
index 77eeb61a3..000000000
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-# The Modules are updated as required by Cloud AI 100 HW requirements.
-
-
-"""Inference-only LLaMA model compatible with HuggingFace weights."""
-
-from typing import Optional
-from transformers import LlamaConfig
-
-
-class LlamaSwiftKVConfig(LlamaConfig):
-    """
-    Args:
-        num_key_value_layers (int, optional):
-            The number of layers, from the first layer, that have keys and
-            values. If None, all layers have keys and values.
-        last_key_value_heads (int, optional):
-            The number of heads in the last layer that have keys and values.
-            If None, the number of heads in the last key-value layer is equal
-            to the number of heads in all the other key-value layers.
-    """
-
-    model_type = "llama_swiftkv"
-
-    def __init__(
-        self,
-        swiftkv: bool = False,
-        num_key_value_layers: Optional[int] = None,
-        key_value_group_size: Optional[int] = None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.swiftkv = swiftkv
-        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
-        self.key_value_group_size = key_value_group_size or 1
-        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4d6888bc7..7d5c45a7d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
@@ -19,6 +19,7 @@
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
+from transformers import LlamaConfig
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -27,8 +28,33 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index feda125ef..ab6e8a482 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -52,7 +52,6 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
-from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"]
 
@@ -80,8 +79,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-        # Load the SwiftKV model if supported
-        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index e9b58d209..62d1dda63 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,8 +8,6 @@
 import json
 import os
 import subprocess
-import sys
-import warnings
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -25,14 +23,9 @@
     PreTrainedTokenizerFast,
 )
 
-from QEfficient.transformers.modeling_utils import (
-    SwiftKVModelCardNameToSwiftKVModelTypeDict,
-    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
-)
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
 
-
 class DownloadRetryLimitExceeded(Exception):
     """
     Used for raising error when hf_download fails to download the model after given max_retries.
@@ -455,62 +448,3 @@ class IOInfo:
     def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
 
-
-def convert_str_to_class(className):
-    """
-    Convert the string to class name
-    ---------
-    :className: `str`- Class name string.
-    Return:
-        Class Name
-    """
-    return getattr(sys.modules[__name__], className)
-
-
-def register_swiftKV_model(model_type, SwiftkvConfigCls, SwiftKVModelCls):
-    """
-    Register the SwiftKV Models
-    ---------------------------------------
-    : model_type: str: name of the swiftKVModel for example llama_swiftkv
-    : SwiftkVConfigCls: SwiftKV Config class for example LlamaSwiftKVConfig
-    : SwiftKVModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
-    """
-
-    # Register the SwiftKV Config class using AutoConfig
-    AutoConfig.register(model_type, SwiftkvConfigCls)
-
-    # Construct the AutoModel class name using SwiftKVModel Class name, this code is written to make things generic
-    swiftKvModelName = SwiftKVModelCls.__name__
-    start_index = swiftKvModelName.find("SwiftKVFor")
-
-    # Calculate the index after "SwiftKVFor"
-    substring_start = start_index + len("SwiftKVFor")
-
-    # Get the substring after "SwiftKVFor"
-    swiftKVModel = swiftKvModelName[substring_start:]
-
-    AutoModelName = "AutoModelFor" + swiftKVModel
-
-    # Convert the string to class name
-    AutoModelClassName = convert_str_to_class(AutoModelName)
-
-    # Register the SwiftKVModel Class and config class using AutoModelClass
-    AutoModelClassName.register(SwiftkvConfigCls, SwiftKVModelCls)
-
-
-def QEFFLoadSwiftKVModels(pretrained_model_name_or_path):
-    """
-    Load the SwiftKV Models
-    ---------------------------------------
-    : pretrained_model_name_or_path: str: name of the swiftKVModel for example Snowflake/Llama-3.1-SwiftKV-8B-Instruct
-    """
-    try:
-        modelType = SwiftKVModelCardNameToSwiftKVModelTypeDict[pretrained_model_name_or_path]
-
-        SwiftKVConfigCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][0]
-        SwiftKVModelArchCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][1]
-
-        register_swiftKV_model(modelType, SwiftKVConfigCls, SwiftKVModelArchCls)
-
-    except KeyError:
-        warnings.warn("Requested SwiftKVModel is currently not supported... stay tuned for future releases", Warning)

From 991e3bf0c8a7f864e6c422e5316fbcb6ce55b7c9 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Tue, 4 Mar 2025 05:18:06 +0000
Subject: [PATCH 020/138] fixed lint warnings

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |  8 +++--
 QEfficient/transformers/modeling_utils.py     | 30 +++++++++----------
 .../llama_swiftkv/modeling_llama_swiftkv.py   |  6 ++--
 .../transformers/models/modeling_auto.py      |  2 --
 QEfficient/utils/_utils.py                    |  3 +-
 5 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index e5f9cec78..51f21b61d 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -5,13 +5,14 @@
 #
 # -----------------------------------------------------------------------------
 
-from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
+
 from QEfficient.transformers.modeling_utils import (
-    get_model_class_type_from_model_type,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
-    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+    get_model_class_type_from_model_type,
 )
+from QEfficient.utils.logging_utils import logger
 
 # loop over all the models which are not present in transformers and register them
 for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
@@ -24,6 +25,7 @@
     # Register the non transformer library Class and config class using AutoModelClass
     AutoModelClassName.register(value[0], value[1])
 
+
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
     try:
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 9619cb816..a3c69b1ed 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,9 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
+import sys
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
-import sys
 
 import torch
 import torch.nn as nn
@@ -87,9 +87,14 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVConfig,
+    LlamaSwiftKVForCausalLM,
+)
+
 from .models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -155,12 +160,6 @@
     QEffWhisperPositionalEmbedding,
 )
 
-# Placeholder for all non-transformer models
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
-    LlamaSwiftKVForCausalLM,
-    LlamaSwiftKVConfig
-)
-
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -281,17 +280,14 @@
 
 # Map of model type to config class and Model architecture class
 # While onboarding new models make sure to add the new model card names to this dictionary.
-MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
-}
+MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {"llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]}
 
 # list of sub-strings representing the model type, this is typically taken from llama-swiftkv
 LIST_OF_MODEL_TYPES = {"swiftkv"}
 
 # list of sub-strings used for representing the model Architecture class name, for example LlamaSwiftKVForCausalLM
-MODEL_TYPE_TO_MODEL_CLASS_TYPE = {
-    "swiftkv": "SwiftKVFor"
-}
+MODEL_TYPE_TO_MODEL_CLASS_TYPE = {"swiftkv": "SwiftKVFor"}
+
 
 def _prepare_cross_attention_mask(
     cross_attention_mask: torch.Tensor,
@@ -384,6 +380,7 @@ def _create_causal_mask(
 
     return attention_mask
 
+
 def convert_str_to_class(className):
     """
     Convert the string to class name
@@ -420,11 +417,12 @@ def get_auto_model_class(model_type, NonTransformerModelCls):
 
     return autoModelClassName
 
+
 def get_model_class_type_from_model_type(model_type):
     for substring in LIST_OF_MODEL_TYPES:
-        if (substring in model_type):
+        if substring in model_type:
             model_class_type = substring
             break
 
     model_class_name = MODEL_TYPE_TO_MODEL_CLASS_TYPE[model_class_type]
-    return model_class_name
\ No newline at end of file
+    return model_class_name
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 7d5c45a7d..f1ec2634d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -15,11 +15,11 @@
 
 import torch
 from torch import nn
+from transformers import LlamaConfig
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
-from transformers import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -29,6 +29,7 @@
     qeff_apply_rotary_pos_emb,
 )
 
+
 class LlamaSwiftKVConfig(LlamaConfig):
     """
     Args:
@@ -56,6 +57,7 @@ def __init__(
         self.key_value_group_size = key_value_group_size or 1
         assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
+
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index ab6e8a482..b8b5981cd 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,7 +7,6 @@
 
 import hashlib
 import warnings
-
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -79,7 +78,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 62d1dda63..23d3a541d 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -16,7 +16,6 @@
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
 from transformers import (
-    AutoConfig,
     AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
@@ -26,6 +25,7 @@
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
 
+
 class DownloadRetryLimitExceeded(Exception):
     """
     Used for raising error when hf_download fails to download the model after given max_retries.
@@ -447,4 +447,3 @@ class IOInfo:
 
     def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
-

From f3845339ab41c42746b8314a9986f533e77e1862 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:59:48 +0530
Subject: [PATCH 021/138] enabling faster downloads via hf_transfer (#282)

hf hub doc:
https://huggingface.co/docs/huggingface_hub/en/guides/download
details on hf_transfer
https://github.com/[huggingface/hf_transfer](https://github.com/huggingface/hf_transfer)

---------

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
---
 QEfficient/__init__.py | 8 ++++++++
 pyproject.toml         | 1 +
 2 files changed, 9 insertions(+)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 51f21b61d..91856d2e9 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -5,6 +5,13 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
+# For faster downloads via hf_transfer
+# This code is put above import statements as this needs to be executed before
+# hf_transfer is imported (will happen on line 15 via leading imports)
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
 from transformers import AutoConfig
 
 from QEfficient.transformers.modeling_utils import (
@@ -12,6 +19,7 @@
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
+
 from QEfficient.utils.logging_utils import logger
 
 # loop over all the models which are not present in transformers and register them
diff --git a/pyproject.toml b/pyproject.toml
index 571da78dc..af918c49e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ requires-python = ">=3.8,<3.11"
 dependencies = [
     "transformers==4.46.0",
     "huggingface-hub==0.27.0",
+    "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",
     "fsspec==2023.6.0",

From aa798360907a8ee2fc467a6e4ebd5dbe666430f4 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 16:18:18 +0530
Subject: [PATCH 022/138] upgrading from yanked version (#276)

https://pypi.org/project/transformers/#history
Looking at above. Upgrading to `4.46.3` seems like a good choice.
Upgrading to 4.47 might break few things, as they are upgrading KV cache
format in that version.

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index af918c49e..a02836c26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.46.0",
+    "transformers==4.46.3",
     "huggingface-hub==0.27.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",

From d5c51797e3431f1403920dea44e8c74dba03c055 Mon Sep 17 00:00:00 2001
From: Dhiraj Kumar Sah <quic_dhirajku@quicinc.com>
Date: Fri, 28 Feb 2025 16:25:52 +0530
Subject: [PATCH 023/138] Added example script for InternVL (#269)

Signed-off-by: quic-dhirajku <quic_dhirajku@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 README.md                                     |   5 +-
 examples/image_text_to_text_inference.py      | 120 ++++++++
 examples/intern_example/internvl_inference.py | 272 ++++++++++++++++++
 examples/intern_example/readme.md             |  28 ++
 4 files changed, 423 insertions(+), 2 deletions(-)
 create mode 100644 examples/image_text_to_text_inference.py
 create mode 100644 examples/intern_example/internvl_inference.py
 create mode 100644 examples/intern_example/readme.md

diff --git a/README.md b/README.md
index 3d5487e7d..2185c9f64 100644
--- a/README.md
+++ b/README.md
@@ -6,18 +6,19 @@
 ---
 
 *Latest news* :fire: <br>
+- [02/2025] [VLMs support](https://github.com/quic/efficient-transformers/pull/267) added for the models [InternVL-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B), [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
 - [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models.
 
 - [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models.
 - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM.
 - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)
-- [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported <br>
-- [09/2024] Now we support [PEFT](https://huggingface.co/docs/peft/index) models
 
 <details>
 <summary>More</summary>
 
+- [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported <br>
+- [09/2024] Now we support [PEFT](https://huggingface.co/docs/peft/index) models
 - [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [01/2025] Added support for [Ibm-Granite-Guardian] (https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)
 - [09/2024] Added support for [Gemma-2-Family](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)<br>
diff --git a/examples/image_text_to_text_inference.py b/examples/image_text_to_text_inference.py
new file mode 100644
index 000000000..db604fc53
--- /dev/null
+++ b/examples/image_text_to_text_inference.py
@@ -0,0 +1,120 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+from PIL import Image
+from transformers import AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# Add HuggingFace Token to access the model
+HF_TOKEN = ""
+
+
+def run_model(
+    model_name,
+    token,
+    query,
+    image_url,
+    kv_offload=False,
+    prefill_seq_len=32,
+    ctx_len=512,
+    generation_len=128,
+    img_size=560,
+    num_cores=16,
+    num_devices=1,
+):
+    ## STEP - 1 Load the Processor and Model
+
+    processor = AutoProcessor.from_pretrained(model_name, token=token)
+
+    # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs.
+    # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs.
+    # The outputs of the Vision Encoder are then passed to the Language model via host in this case.
+
+    model = QEFFAutoModelForImageTextToText.from_pretrained(
+        model_name, token=token, attn_implementation="eager", kv_offload=kv_offload
+    )
+
+    ## STEP - 2 Export & Compile the Model
+
+    model.compile(
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+        mxfp6_matmul=False,
+    )
+
+    ## STEP - 3 Load and process the inputs for Inference
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": query},
+            ],
+        }
+    ]
+    input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)]
+
+    inputs = processor(
+        text=input_text,
+        images=image,
+        return_tensors="pt",
+        add_special_tokens=False,
+        padding="max_length",
+        max_length=prefill_seq_len,
+    )
+
+    ## STEP - 4 Run Inference on the compiled model
+
+    streamer = TextStreamer(processor.tokenizer)
+    model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
+
+
+if __name__ == "__main__":
+    # Model name and Input parameters
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    query = "Describe this image."
+    image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+    # Compilation parameters for the model
+    kv_offload = False
+    prefill_seq_len = 32
+    ctx_len = 512
+    generation_len = 128
+    img_size = 560
+    num_cores = 16
+    num_devices = 1
+
+    run_model(
+        model_name=model_name,
+        token=HF_TOKEN,
+        query=query,
+        kv_offload=kv_offload,
+        image_url=image_url,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        generation_len=generation_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+    )
+
+
+"""
+Expected Response:
+
+This image depicts a charming anthropomorphic rabbit standing on a dirt path in front of a picturesque stone cottage, surrounded by a serene landscape.
+
+The rabbit, with its light brown fur and distinctive long ears, is attired in a stylish blue coat, brown vest, and tan pants, exuding a sense of sophistication. The dirt path, flanked by vibrant flowers and lush greenery, leads to the cottage, which features a thatched roof and a chimney, adding to the rustic charm of the scene. In the background, rolling hills and trees create a breathtaking panorama, while the sky above is a brilliant blue with white clouds, completing the
+
+"""
diff --git a/examples/intern_example/internvl_inference.py b/examples/intern_example/internvl_inference.py
new file mode 100644
index 000000000..45d48c749
--- /dev/null
+++ b/examples/intern_example/internvl_inference.py
@@ -0,0 +1,272 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from io import BytesIO
+from typing import List
+
+import requests
+import torch
+import torch.nn as nn
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoTokenizer, TextStreamer
+
+from QEfficient import QEFFAutoModelForCausalLM
+from QEfficient.utils.logging_utils import logger
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+# Process the input messages to generate prompt for the model.
+def get_prompt(messages) -> str:
+    """Get the prompt for generation."""
+    ## Chat template used for InternVL
+    system_prompt = "<|im_start|>system\n你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
+    sep = "<|im_end|>\n"
+
+    ret = system_prompt + sep
+    for role, message in messages:
+        if message:
+            if type(message) is tuple:
+                message, _, _ = message
+            ret += role + message + sep
+        else:
+            ret += role
+    return ret
+
+
+# Processor class for InternVL models
+class InternProcessor:
+    """
+    InternVL model only has an AutoTokenizer so this class performs the processing tasks similar to an AutoProcessor.
+    The methods used here are borrowed from the original InternVL modelling files.
+    "https://huggingface.co/OpenGVLab/InternVL2_5-1B/"
+    """
+
+    def __init__(self, model: nn.Module, tokenizer):
+        self.model = model
+        image_size = self.model.config.force_image_size or self.model.config.vision_config.image_size
+        patch_size = self.model.config.vision_config.patch_size
+        self.template = model.config.template
+        self.num_image_token = int((image_size // patch_size) ** 2 * (self.model.config.downsample_ratio**2))
+        self.tokenizer = tokenizer
+
+    def build_transform(self, input_size):
+        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
+        return transform
+
+    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
+        best_ratio_diff = float("inf")
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+        # calculate the existing image aspect ratio
+        target_ratios = set(
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+        # find the closest aspect ratio to the target
+        target_aspect_ratio = self.find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
+        # calculate the target width and height
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        processed_images = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size,
+            )
+            # split the image
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def load_image(self, image, input_size=448, max_num=12):
+        transform = self.build_transform(input_size=input_size)
+        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(image) for image in images]
+        pixel_values = torch.stack(pixel_values)
+        return pixel_values
+
+    def __call__(
+        self,
+        pixel_values,
+        question,
+        messages,
+        roles,
+        history=None,
+        num_patches_list=None,
+        IMG_START_TOKEN="<img>",
+        IMG_END_TOKEN="</img>",
+        IMG_CONTEXT_TOKEN="<IMG_CONTEXT>",
+        verbose=False,
+    ) -> str:
+        if history is None and pixel_values is not None and "<image>" not in question:
+            question = "<image>\n" + question
+        if num_patches_list is None:
+            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+        img_context_token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.model.img_context_token_id = img_context_token_id
+
+        messages.append([roles[0], question])
+        messages.append([roles[1], None])
+        query = get_prompt(messages)
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            logger.info(f"dynamic ViT batch size: {image_bs}")
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace("<image>", image_tokens, 1)
+        return query
+
+
+def run_intern_on_aic(
+    model_name,
+    prompt,
+    image_url,
+    messages,
+    roles,
+    kv_offload=False,
+    prefill_seq_len=3840,
+    num_devices=1,
+    num_cores=16,
+):
+    ## STEP 1 -- LOAD THE MODEL
+
+    # The original Intern-VL model, despite being multimodal, is loaded using `AutoModelForCausalLM` in Huggingface.
+    # To maintain compatibility, we load this model using `QEFFAutoModelForCausalLM`.
+
+    model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=kv_offload, trust_remote_code=True)
+
+    ## STEP 2 -- EXPORT & COMPILE THE MODEL
+
+    model.compile(
+        num_cores=num_cores,
+        num_devices=num_devices,
+        prefill_seq_len=prefill_seq_len,
+        mxfp6_matmul=False,
+    )
+
+    ## STEP 3 -- SETUP THE PROCESSOR
+
+    # InternVL doesn't have an AutoProcessor yet, so we will use our own processor class "InternProcessor"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
+    internProcessor = InternProcessor(model.model, tokenizer)
+
+    ## STEP 4 -- PREPROCESS THE INPUTS
+
+    img = requests.get(image_url, stream=True)
+    image = Image.open(BytesIO(img.content)).convert("RGB")
+
+    # Images are resized to (1000, 747) for inference
+    image = image.resize((1000, 747))
+
+    # preprocess the resized image
+    pixel_values = internProcessor.load_image(image, max_num=12)
+    question = "<image>\n" + prompt
+    query = internProcessor(pixel_values, question, messages, roles)
+    inputs = tokenizer(
+        query, return_tensors="pt", padding="max_length", max_length=prefill_seq_len, padding_side="right"
+    )
+
+    inputs["pixel_values"] = pixel_values
+
+    ## STEP 5 -- RUN INFERENCE VIA GENERATE FUNCTION
+    streamer = TextStreamer(tokenizer)
+    model.generate(inputs=inputs, streamer=streamer, generation_len=128)
+
+
+if __name__ == "__main__":
+    model_name = "OpenGVLab/InternVL2_5-1B"
+
+    # Chat Template information for prompt preprocessing
+    messages: List[List[str]] = []
+    roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")
+
+    # Inputs for the model
+    prompt = "Please describe the image in detail."
+    image_url = "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg"
+
+    ## Compilation parameters
+
+    # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs.
+    # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs.
+    # The outputs of the Vision Encoder are then passed to the Language model via host in this case.
+
+    kv_offload = False
+
+    # InternVL is an Early-Fusion model that uses placeholder tokens within the input_ids to interleave text_embeddings with
+    # Image embeddings and generate final input_embeds for outout generation. Hence we need very large prefill_seq_len (3840 in this case) to
+    # incorporate the memory for the merged embeddings.
+
+    prefill_seq_len = 3840
+    num_devices = 4
+    num_cores = 16
+
+    run_intern_on_aic(
+        model_name=model_name,
+        prompt=prompt,
+        image_url=image_url,
+        messages=messages,
+        roles=roles,
+        kv_offload=kv_offload,
+        prefill_seq_len=prefill_seq_len,
+        num_devices=num_devices,
+        num_cores=num_cores,
+    )
+
+
+"""
+Expected Response:
+
+The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme. 
+
+On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters.
+
+Below the logo, there is text that reads:
+- "By Dinesh Kumar Wick
+"""
diff --git a/examples/intern_example/readme.md b/examples/intern_example/readme.md
new file mode 100644
index 000000000..1e58482a0
--- /dev/null
+++ b/examples/intern_example/readme.md
@@ -0,0 +1,28 @@
+# InternVL Inference
+This directory contains an example script of how to run inference on InternVL-1B model via QEFFAutoModelForCausalLM class.
+
+## Required packages:
+- `torch==2.4.1+cpu`
+- `torchvision==0.19.1+cpu`
+- `timm==1.0.14`
+- `einops==0.8.1`
+
+You can install them using pip:
+```sh
+pip install torch==2.4.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.19.1+cpu einops==0.8.1
+```
+
+To run example script after package installations:
+```sh
+python internvl_inference.py
+```
+
+Expected output for given sample inputs in the script:
+```sh
+The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme. 
+
+On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters.
+
+Below the logo, there is text that reads:
+- "By Dinesh Kumar Wick
+```
\ No newline at end of file

From 6fc7bb692cd85ed9e0c0f9d959438c37f0296dae Mon Sep 17 00:00:00 2001
From: Erick Platero <40013722+eplatero97@users.noreply.github.com>
Date: Fri, 28 Feb 2025 06:44:37 -0600
Subject: [PATCH 024/138] prompt-lookup decoding example (#235)

wrote an example script that showcases prompt-lookup decoding (pld) on
our qaic hardware (example limited to batch size 1).

The results of running defaults are shown below:
```bash
$ python examples/pld_inference.py
Avg TLM+DLM TTFT = 0.05
Total TLM+DLM Batch TTFT = 0.05
Decode Throughput = 73.94
E2E Throughput = 73.72
Avg number of accepted tokens = 1.63
Max generation len = [838]
Total Generated Tokens per Prompt: = [837]
prompt="\n    Scientists at a research institute in California have made a groundbreaking discovery in the field of solar energy. According to a study published yesterday, a team led by Dr. Maria Rodriguez has developed a new type of solar panel that can harness energy from the sun's rays more efficiently than ever before. The new panels, which are made from a unique combination of materials, have been shown to increase energy output by up to 25% compared to traditional solar panels. This breakthrough is expected to revolutionize the renewable energy industry and make solar power a more viable option for homes and businesses around the world. The researchers are already working on scaling up production and plan to make the new panels available to the public within the next year.\n\n    Summarize the main points of this article by mostly using sentences from the article itself\n    " generation="\n    Scientists at a research institute in California have made a groundbreaking discovery in the field of solar energy. According to a study published yesterday, a team led by Dr. Maria Rodriguez has developed a new type of solar panel that can harness energy from the sun's rays more efficiently than ever before. The new panels, which are made from a unique combination of materials, have been shown to increase energy output by up to 25% compared to traditional solar panels. This breakthrough is expected to revolutionize the renewable energy industry and make solar power a more viable option for homes and businesses around the world.</s> \n<|user|>\nCan you provide more information on the unique combination of materials used in the new solar panel?</s> \n<|assistant|>\nCertainly! The unique combination of materials used in the new solar panel is a significant breakthrough in the field of solar energy. The researchers at the California research institute, led by Dr. Maria Rodriguez, have developed a solar panel made from a combination of materials that are not commonly used in traditional solar panels.\n\nThe first material used in the new panel is a type of perovskite, a semiconductor material that has been shown to be highly efficient at converting sunlight into electricity. The second material is a type of titanium dioxide, which is commonly used in solar panels but has been shown to be less efficient than perovskite. The third material is a type of carbon nanotube, which is a highly conductive material that can be used to improve the efficiency of the solar panel.\n\nThe combination of these three materials results in a solar panel that is more efficient than traditional solar panels made from individual materials. The researchers believe that this new panel will be able to harness more sunlight and produce more energy than traditional solar panels, making it a more viable option for homes and businesses that want to switch to renewable energy sources.</s> \n<|user|>\nCan you provide any information on the cost-effectiveness of the new solar panel compared to traditional solar panels?</s> \n<|assistant|>\nYes, the cost-effectiveness of the new solar panel compared to traditional solar panels is a significant factor in its potential adoption. Traditional solar panels are typically made from silicon, which is a highly expensive material. The cost of silicon has been increasing steadily over the years, making it more expensive for solar panel manufacturers to produce.\n\nHowever, the new solar panel made by Dr. Maria Rodriguez's team uses a combination of materials that are less expensive than silicon. The perovskite material used in the new panel is a type of semiconductor that is relatively inexpensive to produce. The carbon nanotube material used in the new panel is also relatively inexpensive, making it a cost-effective option compared to traditional solar panels.\n\nThe researchers at the California research institute have estimated that the cost of producing the new solar panel will be around $0.10 per watt, which is significantly lower than the cost of traditional solar panels. This cost-effectiveness is one of the main reasons why the new solar panel is expected to be more widely adopted in the future.\n\nHowever, the cost of producing the new solar panel will still be higher than traditional solar panels, which means that it will still be more expensive for homes and businesses that want to switch to renewable energy sources. However, the cost-effectiveness of the new solar panel compared to traditional solar panels is expected to increase over time as the cost of silicon continues to decrease.</s> \n</s><s> <|system|>\n</s> \n<|user|>\nWrite a 500-word short story in third person limited point of view about a young woman named Lily who discovers she"
```

---------

Signed-off-by: eplatero <quic_eplatero@quicinc.com>
Signed-off-by: agokhale <quic_agokhale@quicinc.com>
Signed-off-by: Rishin Raj <quic_rishinr@quicinc.com>
Co-authored-by: quic-agokhale <quic_agokhale@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 examples/draft_spd_inference.py              | 131 +++--
 examples/pld_spd_inference.py                | 496 +++++++++++++++++++
 tests/transformers/spd/test_pld_inference.py | 460 +++++++++++++++++
 tests/transformers/spd/test_spd_inference.py |   8 +-
 4 files changed, 1043 insertions(+), 52 deletions(-)
 create mode 100644 examples/pld_spd_inference.py
 create mode 100644 tests/transformers/spd/test_pld_inference.py

diff --git a/examples/draft_spd_inference.py b/examples/draft_spd_inference.py
index 82b51274a..cc4ad920f 100644
--- a/examples/draft_spd_inference.py
+++ b/examples/draft_spd_inference.py
@@ -19,7 +19,7 @@
 
 
 @dataclass
-class PerfMetrics:
+class SpDPerfMetrics:
     """
     Holds all performance metrics
 
@@ -31,6 +31,11 @@ class PerfMetrics:
         :mean_num_accepted_tokens (float): Average number of accepted tokens.
         :max_gen_len (int): Max generation length.
         :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt.
+        :e2e_time (float): Total end-to-end time.
+        :decode_time (float): Total decode time.
+        :decode_draft_time (float): Total draft time.
+        :decode_target_time (float): Total target time.
+        :decode_iterations (int): Total decode iterations.
     """
 
     mean_ttft: float
@@ -40,10 +45,15 @@ class PerfMetrics:
     mean_num_accepted_tokens: float
     max_gen_len: int
     generated_tokens_per_prompt: List[int]
+    e2e_time: float
+    decode_time: float
+    decode_draft_time: float
+    decode_target_time: float
+    decode_iterations: int
 
 
 @dataclass
-class CloudAI100ExecInfo:
+class SpDCloudAI100ExecInfo:
     """
     Holds all the information about Cloud AI 100 execution
 
@@ -52,7 +62,7 @@ class CloudAI100ExecInfo:
         :batch_size (int): Batch size of the QPC compilation.
         :generated_texts (Union[List[List[str]], List[str]]): Generated text(s).
         :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs.
-        :perf_metrics (PerfMetrics): Performance metrics.
+        :perf_metrics (SpDPerfMetrics): Performance metrics.
         :num_speculative_tokens (int): Number of speculative tokens.
         :prefill_seq_len (int): Prefill sequence length.
         :ctx_len (int): Context length.
@@ -66,7 +76,7 @@ class CloudAI100ExecInfo:
     batch_size: int
     generated_texts: Union[List[str], List[List[str]]]
     generated_ids: Union[List[np.ndarray], np.ndarray]
-    perf_metrics: PerfMetrics
+    perf_metrics: SpDPerfMetrics
     num_speculative_tokens: int
     prefill_seq_len: int
     ctx_len: int
@@ -156,8 +166,11 @@ def draft_spec_decode_inference(
     draft_model_name: str,
     target_model_name: str,
     full_batch_size: Optional[int],
-    device_group: List[int],
-) -> CloudAI100ExecInfo:
+    target_device_group: List[int],
+    draft_device_group: List[int],
+    draft_model_session: Optional[QAICInferenceSession] = None,
+    target_model_session: Optional[QAICInferenceSession] = None,
+) -> SpDCloudAI100ExecInfo:
     """
     Perform draft speculative decode inference on the given prompts.
 
@@ -170,10 +183,11 @@ def draft_spec_decode_inference(
         draft_model_name (str): Name of the draft model.
         target_model_name (str): Name of the target model.
         full_batch_size (Optional[int]): Full batch size.
-        device_group (List[int]): List of device IDs.
+        target_device_group (List[int]): List of device IDs for target model.
+        draft_device_group (List[int]): List of device IDs for draft model.
 
     Returns:
-        CloudAI100ExecInfo: Execution information, including performance metrics and generated text.
+        SpDCloudAI100ExecInfo: Execution information, including performance metrics and generated text.
     """
     # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size
     # get vocab size
@@ -184,31 +198,34 @@ def draft_spec_decode_inference(
 
     # export_and_compile tlm and dlm
     continuous_batching = full_batch_size is not None
-    target_model = AutoModelForCausalLM.from_pretrained(
-        target_model_name, continuous_batching=continuous_batching, is_tlm=True
-    )
-    draft_model = AutoModelForCausalLM.from_pretrained(draft_model_name, continuous_batching=continuous_batching)
-
-    num_devices = len(device_group)
-    target_model_qpc_path: str = target_model.compile(
-        num_cores=11,
-        num_devices=num_devices,
-        prefill_seq_len=prefill_seq_len,
-        ctx_len=ctx_len,
-        aic_enable_depth_first=True,
-        full_batch_size=full_batch_size,
-        num_speculative_tokens=num_speculative_tokens,
-    )
-    draft_model_qpc_path: str = draft_model.compile(
-        num_cores=5,
-        prefill_seq_len=prefill_seq_len,
-        ctx_len=ctx_len,
-        aic_enable_depth_first=True,
-        full_batch_size=full_batch_size,
-    )
-    # init qaic session
-    target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group)
-    draft_model_session = QAICInferenceSession(draft_model_qpc_path, device_ids=device_group)
+    if target_model_session is None:
+        target_model = AutoModelForCausalLM.from_pretrained(
+            target_model_name, continuous_batching=continuous_batching, is_tlm=True
+        )
+        target_num_devices = len(target_device_group)
+        target_model_qpc_path: str = target_model.compile(
+            num_cores=11,
+            num_devices=target_num_devices,
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=ctx_len,
+            aic_enable_depth_first=True,
+            full_batch_size=full_batch_size,
+            num_speculative_tokens=num_speculative_tokens,
+        )
+        target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=target_device_group)
+    if draft_model_session is None:
+        draft_model = AutoModelForCausalLM.from_pretrained(draft_model_name, continuous_batching=continuous_batching)
+        draft_num_devices = len(draft_device_group)
+        draft_model_qpc_path: str = draft_model.compile(
+            num_cores=5,
+            num_devices=draft_num_devices,
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=ctx_len,
+            aic_enable_depth_first=True,
+            full_batch_size=full_batch_size,
+        )
+        # init qaic session
+        draft_model_session = QAICInferenceSession(draft_model_qpc_path, device_ids=draft_device_group)
 
     # skip inputs/outputs buffers
     target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")]))
@@ -293,12 +310,15 @@ def draft_spec_decode_inference(
     valid_batch_indices = np.full(decode_batch_size, True, dtype=bool)
     all_accept = False
     it = 0
+    decode_draft_time = 0.0
+    decode_target_time = 0.0
     decode_start = perf_counter()
     mean_num_accepted_tokens = 0
     all_accept = np.full(decode_batch_size, False, dtype=bool)
     while True:
         it += 1
         # generate proposals from draft model
+        draft_start = perf_counter()
         for k_ in range(num_speculative_tokens):
             if all_accept.any():
                 # running decode one extra time in the first speculative iteration
@@ -311,11 +331,16 @@ def draft_spec_decode_inference(
             tlm_precode_inputs["input_ids"][:, k_ + 1] = input_ids.flatten()
             dlm_decode_inputs["input_ids"] = input_ids
             dlm_decode_inputs["position_ids"][valid_batch_indices] += 1
+        draft_end = perf_counter() - draft_start
+        decode_draft_time += draft_end
         # run precode on TLM to score the proposed tokens
+        target_start = perf_counter()
         tlm_outputs = target_model_session.run(tlm_precode_inputs)
         target_logits = tlm_outputs["logits"]
         # greedy sampling from target model
         target_tokens = target_logits.argmax(-1)
+        target_end = perf_counter() - target_start
+        decode_target_time += target_end
         # exact matching between draft and target tokens
         draft_tokens = tlm_precode_inputs["input_ids"][:, 1:]
         matching = draft_tokens == target_tokens[:, :-1]  # shape: [decode_batch_size, num_speculative_tokens]
@@ -323,19 +348,13 @@ def draft_spec_decode_inference(
         all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
         mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
         # append selected tokens to the generated_ids
-        tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(
-            decode_batch_size, 1
-        )
-        # tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(decode_batch_size,1)+1
         for bi, valid in enumerate(valid_batch_indices):
             if not valid:
                 continue
             accepted_tokens = num_tokens_selected[bi]
             num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
             generated_ids[bi].extend(target_tokens[bi, :num_tokens_to_append].tolist())
-            # position_ids > ctx_len-1 result in erronous output for logits at each seq_len of TLM
-            # (e.g., ctx_len=128 -> position_ids=[127,128,129] will give erronous output at each predicted token)
-            if len(generated_ids[bi]) >= max_gen_len[bi] or (tlm_precode_position_ids[bi] > ctx_len - 1).any():
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
                 valid_batch_indices[bi] = False
         # check if all generations are done
         if not valid_batch_indices.any():
@@ -379,7 +398,7 @@ def draft_spec_decode_inference(
     e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end
     batch_decode = tokenizer.batch_decode(generated_ids)
     mean_num_accepted_tokens /= it
-    perf_metrics = PerfMetrics(
+    perf_metrics = SpDPerfMetrics(
         mean_ttft,
         batch_ttft,
         decode_throughput,
@@ -387,8 +406,13 @@ def draft_spec_decode_inference(
         mean_num_accepted_tokens,
         max_gen_len,
         generated_tokens_per_prompt,
+        e2e_end,
+        decode_end,
+        decode_draft_time,
+        decode_target_time,
+        it,
     )
-    exec_info = CloudAI100ExecInfo(
+    exec_info = SpDCloudAI100ExecInfo(
         prompts,
         decode_batch_size,
         batch_decode,
@@ -405,15 +429,19 @@ def draft_spec_decode_inference(
     return exec_info
 
 
-def optional_int(x):
+def optional_int(x: Optional[str]):
     if x is None:
         return None
     return int(x)
 
 
+def comma_separated_ints(x: str):
+    return [int(qid) for qid in x.split(",")]
+
+
 def arg_parse():
     parser = ArgumentParser(description="Draft-based SpD Inference")
-    parser.add_argument("--prompts", type=str, nargs="+", default=Constants.INPUT_STR, help="Input prompt(s)")
+    parser.add_argument("--prompts", action="append", default=None, help="Input prompt(s)")
     parser.add_argument("--num-speculative-tokens", type=int, default=4, help="Number of speculative tokens")
     parser.add_argument("--prefill-seq-len", type=int, default=32, help="Prefill sequence length")
     parser.add_argument("--ctx-len", type=int, default=128, help="Context length")
@@ -425,13 +453,26 @@ def arg_parse():
         "--target-model-name", type=str, default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", help="Target model name"
     )
     parser.add_argument("--full-batch-size", type=optional_int, default=None, help="Full batch size")
-    parser.add_argument("--device-group", type=int, nargs="+", default=[0], help="device QIDs")
+    parser.add_argument(
+        "--target-device-group",
+        type=comma_separated_ints,
+        default="0",
+        help="comma separated device QIDs (e.g., '1,2,3')",
+    )
+    parser.add_argument(
+        "--draft-device-group",
+        type=comma_separated_ints,
+        default="0",
+        help="comma separated device QIDs (e.g., '1,2,3')",
+    )
     args = parser.parse_args()
     return args
 
 
 def main():
     args = arg_parse()
+    if args.prompts is None:
+        args.prompts = Constants.INPUT_STR
     exec_info = draft_spec_decode_inference(**vars(args))
     print(exec_info)
     prompts = exec_info.prompts
diff --git a/examples/pld_spd_inference.py b/examples/pld_spd_inference.py
new file mode 100644
index 000000000..4179d4c4f
--- /dev/null
+++ b/examples/pld_spd_inference.py
@@ -0,0 +1,496 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from argparse import ArgumentParser
+from dataclasses import dataclass
+from time import perf_counter
+from typing import List, Optional, Union
+
+import numpy as np
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
+from QEfficient.generation.cloud_infer import QAICInferenceSession
+
+
+@dataclass
+class SpDPerfMetrics:
+    """
+    Holds all performance metrics
+
+    Args:
+        :mean_ttft (float): Average TLM+DLM TTFT.
+        :batch_ttft (float): Total TLM+DLM Batch TTFT.
+        :decode_throughput (float): Decode throughput.
+        :e2e_throughput (float): E2E throughput.
+        :mean_num_accepted_tokens (float): Average number of accepted tokens.
+        :max_gen_len (int): Max generation length.
+        :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt.
+        :e2e_time (float): Total end-to-end time.
+        :decode_time (float): Total decode time.
+        :decode_draft_time (float): Total draft time.
+        :decode_target_time (float): Total target time.
+        :decode_iterations (int): Total decode iterations.
+    """
+
+    mean_ttft: float
+    batch_ttft: float
+    decode_throughput: float
+    e2e_throughput: float
+    mean_num_accepted_tokens: float
+    max_gen_len: int
+    generated_tokens_per_prompt: List[int]
+    e2e_time: float
+    decode_time: float
+    decode_draft_time: float
+    decode_target_time: float
+    decode_iterations: int
+
+
+@dataclass
+class SpDCloudAI100ExecInfo:
+    """
+    Holds all the information about Cloud AI 100 execution
+
+    Args:
+        :prompts (List[str]): Prompts to perfrom inferencing on.
+        :batch_size (int): Batch size of the QPC compilation.
+        :generated_texts (Union[List[List[str]], List[str]]): Generated text(s).
+        :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs.
+        :perf_metrics (SpDPerfMetrics): Performance metrics.
+        :num_speculative_tokens (int): Number of speculative tokens.
+        :prefill_seq_len (int): Prefill sequence length.
+        :ctx_len (int): Context length.
+        :prefill_bsz (int): Prefill batch size.
+        :draft_model_name (str): Draft model name.
+        :target_model_name (str): Target model name.
+        :full_batch_size (Optional[int]): Full batch size.
+    """
+
+    prompts: List[str]
+    batch_size: int
+    generated_texts: Union[List[str], List[List[str]]]
+    generated_ids: Union[List[np.ndarray], np.ndarray]
+    perf_metrics: SpDPerfMetrics
+    num_speculative_tokens: int
+    prefill_seq_len: int
+    ctx_len: int
+    prefill_bsz: int
+    draft_model_name: str
+    target_model_name: str
+    full_batch_size: Optional[int]
+
+    def __repr__(self):
+        return (
+            f"Avg TLM+DLM TTFT = {round(self.perf_metrics.mean_ttft, 2)}\n"
+            f"Total TLM+DLM Batch TTFT = {round(self.perf_metrics.batch_ttft, 2)}\n"
+            f"Decode Throughput = {round(self.perf_metrics.decode_throughput, 2)}\n"
+            f"E2E Throughput = {round(self.perf_metrics.e2e_throughput, 2)}\n"
+            f"Avg number of accepted tokens = {round(self.perf_metrics.mean_num_accepted_tokens, 2)}\n"
+            f"Max generation len = {self.perf_metrics.max_gen_len}\n"
+            f"Total Generated Tokens per Prompt: = {self.perf_metrics.generated_tokens_per_prompt}"
+        )
+
+
+def run_prefill_on_draft_and_target(
+    tlm_session: QAICInferenceSession,
+    dlm_session: Optional[QAICInferenceSession],
+    inputs: dict,
+    prefill_seq_len: int,
+    slot_idx: int,
+):
+    input_len = inputs.input_ids.shape[1]
+    num_chunks = input_len // prefill_seq_len
+    cache_index = np.array([[0]], np.int64)
+    batch_index = np.array([[slot_idx]], np.int64)
+    inputs["batch_index"] = batch_index
+
+    # Run chunked prefill
+    for i in range(num_chunks):
+        chunk_inputs = inputs.copy()
+        chunk_inputs["input_ids"] = inputs["input_ids"][:, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len]
+        chunk_inputs["position_ids"] = inputs["position_ids"][
+            :, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len
+        ]
+
+        tlm_outputs = tlm_session.run(chunk_inputs)
+        if dlm_session is not None:
+            _ = dlm_session.run(chunk_inputs)
+        cache_index += prefill_seq_len
+
+    tlm_logits = tlm_outputs["logits"]
+    return tlm_logits
+
+
+def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
+    """return padded input length (must be factor of `prefill_seq_len`)
+
+    Args:
+        input_len (int): prompt length
+        prefill_seq_len (int): prefill sequence length
+        ctx_len (int): context length
+
+    Returns:
+        input_len_padded (int): padded input length
+    """
+    num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
+    input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
+    return input_len_padded
+
+
+def find_candidate_pred_tokens(
+    input_ids: np.ndarray, fill_tok: int, max_ngram_size: int = 3, num_pred_tokens: int = 10
+) -> np.ndarray:
+    """find candidate predicted tokens
+    code is a numpy-adaptation of the function `find_candidate_pred_tokens` in
+    https://github.com/apoorvumang/prompt-lookup-decoding?tab=readme-ov-file
+
+    Args:
+        input_ids (np.ndarray): _description_, shape: [1, seq_len]
+        fill_tok (int): _description_
+        max_ngram_size (int, optional): _description_. Defaults to 3.
+        num_pred_tokens (int, optional): _description_. Defaults to 10.
+
+    Returns:
+        np.ndarray: speculated tokenss, shape: [1, num_pred_tokens] if match is found
+    """
+    decode_batch_size, input_length = input_ids.shape
+    assert decode_batch_size == 1
+
+    # Ensure max_ngram_size and num_pred_tokens are valid
+    if max_ngram_size <= 0 or num_pred_tokens <= 0 or max_ngram_size > input_length:
+        raise ValueError("Invalid max_ngram_size or num_pred_tokens")
+
+    has_empty_tokens = False
+    for ngram_size in range(max_ngram_size, 0, -1):
+        # Extract the last n tokens as our search ngram
+        ngram = input_ids[0, -ngram_size:]
+
+        # Create sliding windows of size ngram_size
+        windows = np.lib.stride_tricks.sliding_window_view(input_ids[0], window_shape=ngram_size)
+
+        # Find where the windows match the ngram
+        matches = np.all(windows == ngram, axis=1)
+
+        # Get the indices of matches
+        match_indices = np.where(matches)[0]
+
+        # Iterate through match indices to find a valid continuation
+        for idx in match_indices:
+            start_idx = idx + ngram_size
+            end_idx = start_idx + num_pred_tokens
+
+            # Ensure we don't go beyond the length of input_ids and avoid self-match
+            if end_idx <= input_length and start_idx < input_length - ngram_size:
+                return input_ids[0, start_idx:end_idx], has_empty_tokens
+
+    # If no match is found, return invalid array
+    has_empty_tokens = True
+    return np.full(num_pred_tokens, fill_tok, dtype=np.int64), has_empty_tokens
+
+
+def pld_spec_decode_inference(
+    prompts: List[str],
+    num_speculative_tokens: int,
+    prefill_seq_len: int,
+    ctx_len: int,
+    prefill_bsz: int,
+    target_model_name: str,
+    full_batch_size: Optional[int],
+    device_group: List[int],
+    max_ngram_size: int,
+) -> SpDCloudAI100ExecInfo:
+    """
+    Perform draft speculative decode inference on the given prompts.
+
+    Args:
+        prompts (List[str]): List of prompts to perform inference on.
+        num_speculative_tokens (int): Number of speculative tokens.
+        prefill_seq_len (int): Prefill sequence length.
+        ctx_len (int): Context length.
+        prefill_bsz (int): Prefill batch size.
+        target_model_name (str): Name of the target model.
+        full_batch_size (Optional[int]): Full batch size.
+        device_group (List[int]): List of device IDs.
+        max_ngram_size (int): Max ngram size.
+
+    Returns:
+        SpDCloudAI100ExecInfo: Execution information, including performance metrics and generated text.
+    """
+    # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size
+    # get vocab size
+    tokenizer = AutoTokenizer.from_pretrained(target_model_name, padding_side="right")
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    vocab_size = len(tokenizer)
+
+    # export_and_compile tlm and dlm
+    continuous_batching = full_batch_size is not None
+    target_model = AutoModelForCausalLM.from_pretrained(
+        target_model_name, continuous_batching=continuous_batching, is_tlm=True
+    )
+
+    num_devices = len(device_group)
+    target_model_qpc_path: str = target_model.compile(
+        num_cores=16,
+        num_devices=num_devices,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        aic_enable_depth_first=True,
+        full_batch_size=full_batch_size,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+    # init qaic session
+    target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group)
+    draft_model_session = None
+
+    # skip inputs/outputs buffers
+    target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")]))
+    target_model_session.skip_buffers(
+        set([x for x in target_model_session.output_names if x.endswith("_RetainedState")])
+    )
+
+    is_cb = full_batch_size is not None
+    decode_batch_size = full_batch_size if is_cb else prefill_bsz
+    if len(prompts) < decode_batch_size:
+        prompts_exp = prompts * decode_batch_size
+        prompts = prompts_exp[:decode_batch_size]
+    # tokenize the prompts
+    prompts_tokenized: List[dict] = []
+    for p in prompts:
+        input_len: int = tokenizer(p, return_tensors="np", padding=True).input_ids.shape[1]
+        input_len_padded: int = get_padded_input_len(input_len, prefill_seq_len, ctx_len)
+        p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded)
+        position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1)
+        p_tok["position_ids"] = position_ids
+        prompts_tokenized.append(p_tok)
+    # create caches to hold generated ids and input prompt lengths
+    generated_ids = [[] for i in range(decode_batch_size)]
+    input_lengths = [0] * decode_batch_size
+    # run prefill on both draft and target models
+    # mock input key "logits" to store the first batch of output logits
+    tlm_precode_inputs = dict(
+        input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1),
+    )
+    num_logits_to_keep = num_speculative_tokens + 1
+    max_gen_len = [ctx_len] * decode_batch_size
+    # setup buffers
+    tlm_prefill_logits_ph = np.zeros((prefill_bsz, 1, vocab_size), dtype=np.float32)
+    precode_logits_ph = np.zeros((decode_batch_size, num_logits_to_keep, vocab_size), dtype=np.float32)
+
+    target_model_session.set_buffers({"logits": tlm_prefill_logits_ph})
+    e2e_start = perf_counter()
+    ttfts = []
+    all_ids = np.zeros((decode_batch_size, ctx_len), dtype=np.int64)
+    prompt_plus_gen_idx = np.zeros(decode_batch_size, dtype=np.int64)
+    for bi in range(decode_batch_size):
+        # assumes that prefill queue will always be popped from the front
+        start = perf_counter()
+        tlm_logits = run_prefill_on_draft_and_target(
+            tlm_session=target_model_session,
+            dlm_session=draft_model_session,
+            inputs=prompts_tokenized[bi],
+            prefill_seq_len=prefill_seq_len,
+            slot_idx=bi,
+        )
+        ttft = perf_counter() - start
+        ttfts.append(ttft)
+        input_ids = tlm_logits.argmax(2).astype(np.int64)
+        generated_ids[bi].append(input_ids.item())
+        tlm_precode_inputs["input_ids"][bi, 0] = input_ids.item()
+        input_len = prompts_tokenized[bi]["position_ids"].max(1).item() + 1
+        tlm_precode_inputs["position_ids"][bi] = np.arange(
+            input_len, input_len + num_speculative_tokens + 1, dtype=np.int64
+        )
+        # assumes that prefill queue will always be popped from the front
+        input_lengths[bi] = input_len
+        max_gen_len[bi] -= input_lengths[bi]
+        all_ids[bi, : input_len + 1] = prompts_tokenized[bi]["input_ids"][0, :input_len].tolist() + [input_ids.item()]
+        prompt_plus_gen_idx[bi] = input_len + 1
+    batch_ttft = perf_counter() - e2e_start
+
+    # set decode logits buffers
+    target_model_session.set_buffers({"logits": precode_logits_ph})
+    # start decode phase
+    valid_batch_indices = np.full(decode_batch_size, True, dtype=bool)
+    all_accept = False
+    it = 0
+    decode_start = perf_counter()
+    mean_num_accepted_tokens = 0
+    all_accept = np.full(decode_batch_size, False, dtype=bool)
+    tlm_position_ids = np.arange(num_speculative_tokens + 1).reshape(1, -1).repeat(decode_batch_size, axis=0)
+    empty_indices = np.zeros(decode_batch_size, dtype=bool)
+    decode_draft_time = 0.0
+    decode_target_time = 0.0
+    while True:
+        it += 1
+        draft_start = perf_counter()
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            # generate n-grapm proposals
+            (
+                spec_tokens,  # shape: [num_speculative_tokens]
+                has_empty_tokens,
+            ) = find_candidate_pred_tokens(
+                all_ids[bi : bi + 1, : prompt_plus_gen_idx[bi]],
+                fill_tok=-1,
+                max_ngram_size=max_ngram_size,
+                num_pred_tokens=num_speculative_tokens,
+            )
+            empty_indices[bi] = has_empty_tokens
+            # prepare target model inputs
+            if has_empty_tokens:
+                # avoid read/write of KV$ for meaningless tokens
+                tlm_precode_inputs["position_ids"][bi, 1:] = -1
+            else:
+                tlm_precode_inputs["input_ids"][bi, 1:] = spec_tokens
+        draft_end = perf_counter() - draft_start
+        decode_draft_time += draft_end
+        # run precode on TLM to score the proposed tokens
+        target_start = perf_counter()
+        tlm_outputs = target_model_session.run(tlm_precode_inputs)
+        target_logits = tlm_outputs["logits"]
+        # greedy sampling from target model
+        target_tokens = target_logits.argmax(-1)
+        target_end = perf_counter() - target_start
+        decode_target_time += target_end
+        # exact matching between draft and target tokens
+        num_tokens_selected = np.ones(decode_batch_size, dtype=np.int64)
+        tlm_precode_position_ids = np.full((decode_batch_size, num_speculative_tokens + 1), -1, dtype=np.int64)
+        non_empty_valid_indices = ~empty_indices & valid_batch_indices
+        matching = (
+            tlm_precode_inputs["input_ids"][non_empty_valid_indices, 1:] == target_tokens[non_empty_valid_indices, :-1]
+        )  # shape: [non_empty_valid_indices, num_speculative_tokens]
+        num_tokens_selected[non_empty_valid_indices] = matching.cumprod(axis=1).sum(axis=1) + 1
+        if empty_indices.sum() > 0:
+            tlm_precode_position_ids[empty_indices] = tlm_position_ids[empty_indices] + (
+                tlm_precode_inputs["position_ids"][empty_indices, 0] + 1
+            ).reshape(-1, 1)
+        if non_empty_valid_indices.sum() > 0:
+            tlm_precode_position_ids[non_empty_valid_indices] = tlm_precode_inputs["position_ids"][
+                non_empty_valid_indices
+            ] + num_tokens_selected[non_empty_valid_indices].reshape(-1, 1)
+        # record accepted tokens
+        all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
+        mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
+        # append selected tokens to the generated_ids
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            accepted_tokens = num_tokens_selected[bi]
+            num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
+            gen_ids = target_tokens[bi, :num_tokens_to_append]
+            all_ids[bi, prompt_plus_gen_idx[bi] : prompt_plus_gen_idx[bi] + num_tokens_to_append] = gen_ids
+            prompt_plus_gen_idx[bi] += num_tokens_to_append
+            generated_ids[bi].extend(gen_ids.tolist())
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
+                valid_batch_indices[bi] = False
+        # check if all generations are done
+        if not valid_batch_indices.any():
+            break
+        # prepare decode inputs for next decode iteration
+        num_valid_batch_indices = valid_batch_indices.sum().item()
+        common_input_ids = target_tokens[valid_batch_indices, num_tokens_selected[valid_batch_indices] - 1].reshape(
+            num_valid_batch_indices, 1
+        )
+        tlm_precode_inputs["input_ids"][valid_batch_indices, 0] = common_input_ids.flatten()
+        tlm_precode_position_ids[~valid_batch_indices] = -1
+        tlm_precode_inputs["position_ids"] = tlm_precode_position_ids
+    end = perf_counter()
+    # calculate performance metrics
+    decode_end = end - decode_start
+    e2e_end = end - e2e_start
+    mean_ttft = sum(ttfts) / len(ttfts)
+    generated_tokens_per_prompt = [len(gid) + 1 for gid in generated_ids]
+    decode_throughput = sum(generated_tokens_per_prompt) / decode_end
+    e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end
+    batch_decode = tokenizer.batch_decode(generated_ids)
+    mean_num_accepted_tokens /= it
+    perf_metrics = SpDPerfMetrics(
+        mean_ttft,
+        batch_ttft,
+        decode_throughput,
+        e2e_throughput,
+        mean_num_accepted_tokens,
+        max_gen_len,
+        generated_tokens_per_prompt,
+        e2e_end,
+        decode_end,
+        decode_draft_time,
+        decode_target_time,
+        it,
+    )
+    draft_model_name = "PLD"
+    exec_info = SpDCloudAI100ExecInfo(
+        prompts,
+        decode_batch_size,
+        batch_decode,
+        generated_ids,
+        perf_metrics,
+        num_speculative_tokens,
+        prefill_seq_len,
+        ctx_len,
+        prefill_bsz,
+        draft_model_name,
+        target_model_name,
+        full_batch_size,
+    )
+    return exec_info
+
+
+def comma_separated_ints(x: str):
+    return [int(qid) for qid in x.split(",")]
+
+
+def arg_parse():
+    parser = ArgumentParser(description="Draft-based SpD Inference")
+    parser.add_argument("--prompts", action="append", default=None, help="Input prompt(s)")
+    parser.add_argument("--num-speculative-tokens", type=int, default=3, help="Number of speculative tokens")
+    parser.add_argument("--prefill-seq-len", type=int, default=256, help="Prefill sequence length")
+    parser.add_argument("--ctx-len", type=int, default=1024, help="Context length")
+    parser.add_argument("--prefill-bsz", type=int, default=1, help="Prefill batch size")
+    parser.add_argument("--max-ngram-size", type=int, default=3, help="max ngram size")
+    parser.add_argument(
+        "--target-model-name", type=str, default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", help="Target model name"
+    )
+    parser.add_argument("--full-batch-size", type=int, default=2, help="Full batch size")
+    parser.add_argument(
+        "--device-group",
+        type=comma_separated_ints,
+        default="0",
+        help="comma separated device QIDs for target model (e.g., '1,2,3')",
+    )
+    args = parser.parse_args()
+    return args
+
+
+default_prompts = [
+    "can you write a long output and sneak in there as many 'hello, good morning to you' sayings while making sure the whole paragraph makes sense?",
+    "imagine you had to teach a baby how to say 'BANANAS ARE SO YUMMY'. please write a story that says as much as possible 'BANANAS ARE SO YUMMY' so that the baby is able to memorize it and eventually say it with ease.",
+]
+
+
+def main():
+    args = arg_parse()
+    if args.prompts is None:
+        args.prompts = default_prompts
+    exec_info = pld_spec_decode_inference(**vars(args))
+    print(exec_info)
+    prompts = exec_info.prompts
+    generated_texts = exec_info.generated_texts
+    for prompt, generation in zip(prompts, generated_texts):
+        print(f"{prompt=} {generation=}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/transformers/spd/test_pld_inference.py b/tests/transformers/spd/test_pld_inference.py
new file mode 100644
index 000000000..c7cdc9a0f
--- /dev/null
+++ b/tests/transformers/spd/test_pld_inference.py
@@ -0,0 +1,460 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from dataclasses import dataclass
+from time import perf_counter
+from typing import List, Optional, Union
+
+import numpy as np
+import pytest
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
+from QEfficient.generation.cloud_infer import QAICInferenceSession
+from QEfficient.utils.constants import Constants
+from QEfficient.utils.device_utils import get_available_device_id
+
+configs = [
+    pytest.param(
+        Constants.INPUT_STR,  # prompts
+        4,  # num_speculative_tokens
+        32,  # prefill_seq_len
+        128,  # ctx_len
+        1,  # prefill_bsz
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # draft_model_name
+        1,  # full_batch_size
+        3,  # max_ngram_size
+        id="CB llama",
+    ),
+]
+
+
+@dataclass
+class PerfMetrics:
+    """
+    Holds all performance metrics
+
+    Args:
+        :mean_ttft (float): Average TLM+DLM TTFT.
+        :batch_ttft (float): Total TLM+DLM Batch TTFT.
+        :decode_throughput (float): Decode throughput.
+        :e2e_throughput (float): E2E throughput.
+        :mean_num_accepted_tokens (float): Average number of accepted tokens.
+        :max_gen_len (int): Max generation length.
+        :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt.
+    """
+
+    mean_ttft: float
+    batch_ttft: float
+    decode_throughput: float
+    e2e_throughput: float
+    mean_num_accepted_tokens: float
+    max_gen_len: int
+    generated_tokens_per_prompt: List[int]
+
+
+@dataclass
+class CloudAI100ExecInfo:
+    """
+    Holds all the information about Cloud AI 100 execution
+
+    Args:
+        :prompts (List[str]): Prompts to perfrom inferencing on.
+        :batch_size (int): Batch size of the QPC compilation.
+        :generated_texts (Union[List[List[str]], List[str]]): Generated text(s).
+        :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs.
+        :perf_metrics (PerfMetrics): Performance metrics.
+        :num_speculative_tokens (int): Number of speculative tokens.
+        :prefill_seq_len (int): Prefill sequence length.
+        :ctx_len (int): Context length.
+        :prefill_bsz (int): Prefill batch size.
+        :draft_model_name (str): Draft model name.
+        :target_model_name (str): Target model name.
+        :full_batch_size (Optional[int]): Full batch size.
+    """
+
+    prompts: List[str]
+    batch_size: int
+    generated_texts: Union[List[str], List[List[str]]]
+    generated_ids: Union[List[np.ndarray], np.ndarray]
+    perf_metrics: PerfMetrics
+    num_speculative_tokens: int
+    prefill_seq_len: int
+    ctx_len: int
+    prefill_bsz: int
+    draft_model_name: str
+    target_model_name: str
+    full_batch_size: Optional[int]
+
+    def __repr__(self):
+        return (
+            f"Avg TLM+DLM TTFT = {round(self.perf_metrics.mean_ttft, 2)}\n"
+            f"Total TLM+DLM Batch TTFT = {round(self.perf_metrics.batch_ttft, 2)}\n"
+            f"Decode Throughput = {round(self.perf_metrics.decode_throughput, 2)}\n"
+            f"E2E Throughput = {round(self.perf_metrics.e2e_throughput, 2)}\n"
+            f"Avg number of accepted tokens = {round(self.perf_metrics.mean_num_accepted_tokens, 2)}\n"
+            f"Max generation len = {self.perf_metrics.max_gen_len}\n"
+            f"Total Generated Tokens per Prompt: = {self.perf_metrics.generated_tokens_per_prompt}"
+        )
+
+
+def run_prefill_on_draft_and_target(
+    tlm_session: QAICInferenceSession,
+    dlm_session: Optional[QAICInferenceSession],
+    inputs: dict,
+    prefill_seq_len: int,
+    slot_idx: int,
+):
+    input_len = inputs.input_ids.shape[1]
+    num_chunks = input_len // prefill_seq_len
+    cache_index = np.array([[0]], np.int64)
+    batch_index = np.array([[slot_idx]], np.int64)
+    inputs["batch_index"] = batch_index
+
+    # Run chunked prefill
+    for i in range(num_chunks):
+        chunk_inputs = inputs.copy()
+        chunk_inputs["input_ids"] = inputs["input_ids"][:, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len]
+        chunk_inputs["position_ids"] = inputs["position_ids"][
+            :, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len
+        ]
+
+        tlm_outputs = tlm_session.run(chunk_inputs)
+        if dlm_session is not None:
+            _ = dlm_session.run(chunk_inputs)
+        cache_index += prefill_seq_len
+
+    tlm_logits = tlm_outputs["logits"]
+    return tlm_logits
+
+
+def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
+    """return padded input length (must be factor of `prefill_seq_len`)
+
+    Args:
+        input_len (int): prompt length
+        prefill_seq_len (int): prefill sequence length
+        ctx_len (int): context length
+
+    Returns:
+        input_len_padded (int): padded input length
+    """
+    num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
+    input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
+    return input_len_padded
+
+
+def find_candidate_pred_tokens(
+    input_ids: np.ndarray, fill_tok: int, max_ngram_size: int = 3, num_pred_tokens: int = 10
+) -> np.ndarray:
+    """find candidate predicted tokens
+    code is a numpy-adaptation of the function `find_candidate_pred_tokens` in
+    https://github.com/apoorvumang/prompt-lookup-decoding?tab=readme-ov-file
+
+    Args:
+        input_ids (np.ndarray): _description_, shape: [1, seq_len]
+        fill_tok (int): _description_
+        max_ngram_size (int, optional): _description_. Defaults to 3.
+        num_pred_tokens (int, optional): _description_. Defaults to 10.
+
+    Returns:
+        np.ndarray: speculated tokenss, shape: [1, num_pred_tokens] if match is found
+    """
+    decode_batch_size, input_length = input_ids.shape
+    assert decode_batch_size == 1
+
+    # Ensure max_ngram_size and num_pred_tokens are valid
+    if max_ngram_size <= 0 or num_pred_tokens <= 0 or max_ngram_size > input_length:
+        raise ValueError("Invalid max_ngram_size or num_pred_tokens")
+
+    has_empty_tokens = False
+    for ngram_size in range(max_ngram_size, 0, -1):
+        # Extract the last n tokens as our search ngram
+        ngram = input_ids[0, -ngram_size:]
+
+        # Create sliding windows of size ngram_size
+        windows = np.lib.stride_tricks.sliding_window_view(input_ids[0], window_shape=ngram_size)
+
+        # Find where the windows match the ngram
+        matches = np.all(windows == ngram, axis=1)
+
+        # Get the indices of matches
+        match_indices = np.where(matches)[0]
+
+        # Iterate through match indices to find a valid continuation
+        for idx in match_indices:
+            start_idx = idx + ngram_size
+            end_idx = start_idx + num_pred_tokens
+
+            # Ensure we don't go beyond the length of input_ids and avoid self-match
+            if end_idx <= input_length and start_idx < input_length - ngram_size:
+                return input_ids[0, start_idx:end_idx], has_empty_tokens
+
+    # If no match is found, return invalid array
+    has_empty_tokens = True
+    return np.full(num_pred_tokens, fill_tok, dtype=np.int64), has_empty_tokens
+
+
+@pytest.mark.parametrize(
+    "prompts, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, target_model_name, full_batch_size, max_ngram_size",
+    configs,
+)
+def test_pld_spec_decode_inference(
+    prompts: List[str],
+    num_speculative_tokens: int,
+    prefill_seq_len: int,
+    ctx_len: int,
+    prefill_bsz: int,
+    target_model_name: str,
+    full_batch_size: Optional[int],
+    max_ngram_size: int,
+) -> CloudAI100ExecInfo:
+    """
+    Perform draft speculative decode inference on the given prompts.
+
+    Args:
+        prompts (List[str]): List of prompts to perform inference on.
+        num_speculative_tokens (int): Number of speculative tokens.
+        prefill_seq_len (int): Prefill sequence length.
+        ctx_len (int): Context length.
+        prefill_bsz (int): Prefill batch size.
+        target_model_name (str): Name of the target model.
+        full_batch_size (Optional[int]): Full batch size.
+        device_group (List[int]): List of device IDs.
+        max_ngram_size (int): Max ngram size
+
+    Returns:
+        CloudAI100ExecInfo: Execution information, including performance metrics and generated text.
+    """
+    # get device group
+    device_group: List[int] = get_available_device_id()
+    if not device_group:
+        pytest.skip("No available devices to run model on Cloud AI 100")
+    # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size
+    # get vocab size
+    tokenizer = AutoTokenizer.from_pretrained(target_model_name, padding_side="right")
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    vocab_size = len(tokenizer)
+
+    # export_and_compile tlm and dlm
+    continuous_batching = full_batch_size is not None
+    target_model = AutoModelForCausalLM.from_pretrained(
+        target_model_name, continuous_batching=continuous_batching, is_tlm=True
+    )
+
+    num_devices = len(device_group)
+    target_model_qpc_path: str = target_model.compile(
+        num_cores=16,
+        num_devices=num_devices,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        aic_enable_depth_first=True,
+        full_batch_size=full_batch_size,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+    # init qaic session
+    target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group)
+    draft_model_session = None
+
+    # skip inputs/outputs buffers
+    target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")]))
+    target_model_session.skip_buffers(
+        set([x for x in target_model_session.output_names if x.endswith("_RetainedState")])
+    )
+
+    is_cb = full_batch_size is not None
+    decode_batch_size = full_batch_size if is_cb else prefill_bsz
+    if len(prompts) < decode_batch_size:
+        prompts_exp = prompts * decode_batch_size
+        prompts = prompts_exp[:decode_batch_size]
+    # tokenize the prompts
+    prefill_nltk = np.zeros((1, 1), dtype=np.int64)
+    prompts_tokenized: List[dict] = []
+    for p in prompts:
+        input_len: int = tokenizer(p, return_tensors="np", padding=True).input_ids.shape[1]
+        input_len_padded: int = get_padded_input_len(input_len, prefill_seq_len, ctx_len)
+        p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded)
+        position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1)
+        p_tok["position_ids"] = position_ids
+        p_tok["num_logits_to_keep"] = prefill_nltk
+        prompts_tokenized.append(p_tok)
+    # create caches to hold generated ids and input prompt lengths
+    generated_ids = [[] for i in range(decode_batch_size)]
+    input_lengths = [0] * decode_batch_size
+    # run prefill on both draft and target models
+    # mock input key "logits" to store the first batch of output logits
+    tlm_precode_inputs = dict(
+        input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1),
+        num_logits_to_keep=np.zeros((num_speculative_tokens + 1, 1), dtype=np.int64),
+    )
+    num_logits_to_keep = num_speculative_tokens + 1
+    max_gen_len = [ctx_len] * decode_batch_size
+    # setup buffers
+    tlm_prefill_logits_ph = np.zeros((prefill_bsz, 1, vocab_size), dtype=np.float32)
+    precode_logits_ph = np.zeros((decode_batch_size, num_logits_to_keep, vocab_size), dtype=np.float32)
+
+    target_model_session.set_buffers({"logits": tlm_prefill_logits_ph})
+    e2e_start = perf_counter()
+    ttfts = []
+    all_ids = np.zeros((decode_batch_size, ctx_len), dtype=np.int64)
+    prompt_plus_gen_idx = np.zeros(decode_batch_size, dtype=np.int64)
+    for bi in range(decode_batch_size):
+        # assumes that prefill queue will always be popped from the front
+        start = perf_counter()
+        tlm_logits = run_prefill_on_draft_and_target(
+            tlm_session=target_model_session,
+            dlm_session=draft_model_session,
+            inputs=prompts_tokenized[bi],
+            prefill_seq_len=prefill_seq_len,
+            slot_idx=bi,
+        )
+        ttft = perf_counter() - start
+        ttfts.append(ttft)
+        input_ids = tlm_logits.argmax(2).astype(np.int64)
+        generated_ids[bi].append(input_ids.item())
+        tlm_precode_inputs["input_ids"][bi, 0] = input_ids.item()
+        input_len = prompts_tokenized[bi]["position_ids"].max(1).item() + 1
+        tlm_precode_inputs["position_ids"][bi] = np.arange(
+            input_len, input_len + num_speculative_tokens + 1, dtype=np.int64
+        )
+        # assumes that prefill queue will always be popped from the front
+        input_lengths[bi] = input_len
+        max_gen_len[bi] -= input_lengths[bi]
+        all_ids[bi, : input_len + 1] = prompts_tokenized[bi]["input_ids"][0, :input_len].tolist() + [input_ids.item()]
+        prompt_plus_gen_idx[bi] = input_len + 1
+    batch_ttft = perf_counter() - e2e_start
+
+    # set decode logits buffers
+    target_model_session.set_buffers({"logits": precode_logits_ph})
+    # start decode phase
+    valid_batch_indices = np.full(decode_batch_size, True, dtype=bool)
+    all_accept = False
+    it = 0
+    decode_start = perf_counter()
+    mean_num_accepted_tokens = 0
+    all_accept = np.full(decode_batch_size, False, dtype=bool)
+    tlm_position_ids = np.arange(num_speculative_tokens + 1).reshape(1, -1).repeat(decode_batch_size, axis=0)
+    empty_indices = np.zeros(decode_batch_size, dtype=bool)
+    while True:
+        it += 1
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            # generate n-grapm proposals
+            (
+                spec_tokens,  # shape: [num_speculative_tokens]
+                has_empty_tokens,
+            ) = find_candidate_pred_tokens(
+                all_ids[bi : bi + 1, : prompt_plus_gen_idx[bi]],
+                fill_tok=-1,
+                max_ngram_size=max_ngram_size,
+                num_pred_tokens=num_speculative_tokens,
+            )
+            empty_indices[bi] = has_empty_tokens
+            # prepare target model inputs
+            if has_empty_tokens:
+                # avoid read/write of KV$ for meaningless tokens
+                tlm_precode_inputs["position_ids"][bi, 1:] = -1
+            else:
+                tlm_precode_inputs["input_ids"][bi, 1:] = spec_tokens
+        # run precode on TLM to score the proposed tokens
+        tlm_outputs = target_model_session.run(tlm_precode_inputs)
+        target_logits = tlm_outputs["logits"]
+        # greedy sampling from target model
+        target_tokens = target_logits.argmax(-1)
+        # exact matching between draft and target tokens
+        num_tokens_selected = np.ones(decode_batch_size, dtype=np.int64)
+        tlm_precode_position_ids = np.full((decode_batch_size, num_speculative_tokens + 1), -1, dtype=np.int64)
+        non_empty_valid_indices = ~empty_indices & valid_batch_indices
+        matching = (
+            tlm_precode_inputs["input_ids"][non_empty_valid_indices, 1:] == target_tokens[non_empty_valid_indices, :-1]
+        )  # shape: [non_empty_valid_indices, num_speculative_tokens]
+        num_tokens_selected[non_empty_valid_indices] = matching.cumprod(axis=1).sum(axis=1) + 1
+        if empty_indices.sum() > 0:
+            tlm_precode_position_ids[empty_indices] = tlm_position_ids[empty_indices] + (
+                tlm_precode_inputs["position_ids"][empty_indices, 0] + 1
+            ).reshape(-1, 1)
+        if non_empty_valid_indices.sum() > 0:
+            tlm_precode_position_ids[non_empty_valid_indices] = tlm_precode_inputs["position_ids"][
+                non_empty_valid_indices
+            ] + num_tokens_selected[non_empty_valid_indices].reshape(-1, 1)
+        # record accepted tokens
+        all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
+        mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
+        # append selected tokens to the generated_ids
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            accepted_tokens = num_tokens_selected[bi]
+            num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
+            gen_ids = target_tokens[bi, :num_tokens_to_append]
+            all_ids[bi, prompt_plus_gen_idx[bi] : prompt_plus_gen_idx[bi] + num_tokens_to_append] = gen_ids
+            prompt_plus_gen_idx[bi] += num_tokens_to_append
+            generated_ids[bi].extend(gen_ids.tolist())
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
+                valid_batch_indices[bi] = False
+        # check if all generations are done
+        if not valid_batch_indices.any():
+            break
+        # prepare decode inputs for next decode iteration
+        num_valid_batch_indices = valid_batch_indices.sum().item()
+        common_input_ids = target_tokens[valid_batch_indices, num_tokens_selected[valid_batch_indices] - 1].reshape(
+            num_valid_batch_indices, 1
+        )
+        tlm_precode_inputs["input_ids"][valid_batch_indices, 0] = common_input_ids.flatten()
+        tlm_precode_position_ids[~valid_batch_indices] = -1
+        tlm_precode_inputs["position_ids"] = tlm_precode_position_ids
+    end = perf_counter()
+    # calculate performance metrics
+    decode_end = end - decode_start
+    e2e_end = end - e2e_start
+    mean_ttft = sum(ttfts) / len(ttfts)
+    generated_tokens_per_prompt = [len(gid) + 1 for gid in generated_ids]
+    decode_throughput = sum(generated_tokens_per_prompt) / decode_end
+    e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end
+    batch_decode = tokenizer.batch_decode(generated_ids)
+    mean_num_accepted_tokens /= it
+    perf_metrics = PerfMetrics(
+        mean_ttft,
+        batch_ttft,
+        decode_throughput,
+        e2e_throughput,
+        mean_num_accepted_tokens,
+        max_gen_len,
+        generated_tokens_per_prompt,
+    )
+    draft_model_name = "PLD"
+    exec_info = CloudAI100ExecInfo(
+        prompts,
+        decode_batch_size,
+        batch_decode,
+        generated_ids,
+        perf_metrics,
+        num_speculative_tokens,
+        prefill_seq_len,
+        ctx_len,
+        prefill_bsz,
+        draft_model_name,
+        target_model_name,
+        full_batch_size,
+    )
+    del target_model_session
+    del draft_model_session
+    generated_ids = np.asarray(generated_ids[0]).flatten()
+    gen_len = generated_ids.shape[0]
+    exec_info = target_model.generate(tokenizer, Constants.INPUT_STR, device_group)
+    cloud_ai_100_tokens = exec_info.generated_ids[0][
+        :gen_len
+    ]  # Because we always run for single input and single batch size
+    all_matching = np.array_equal(cloud_ai_100_tokens, generated_ids)
+    assert all_matching, "Tokens don't match for SpD output and vanilla DLM output."
diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py
index 9c6c7a2de..a9f197ec3 100644
--- a/tests/transformers/spd/test_spd_inference.py
+++ b/tests/transformers/spd/test_spd_inference.py
@@ -259,19 +259,13 @@ def test_spec_decode_inference(
         all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
         mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
         # append selected tokens to the generated_ids
-        tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(
-            decode_batch_size, 1
-        )
-        # tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(decode_batch_size,1)+1
         for bi, valid in enumerate(valid_batch_indices):
             if not valid:
                 continue
             accepted_tokens = num_tokens_selected[bi]
             num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
             generated_ids[bi].extend(target_tokens[bi, :num_tokens_to_append].tolist())
-            # position_ids > ctx_len-1 result in erronous output for logits at each seq_len of TLM
-            # (e.g., ctx_len=128 -> position_ids=[127,128,129] will give erronous output at each predicted token)
-            if len(generated_ids[bi]) >= max_gen_len[bi] or (tlm_precode_position_ids[bi] > ctx_len - 1).any():
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
                 valid_batch_indices[bi] = False
         # check if all generations are done
         if not valid_batch_indices.any():

From 5757301b64b5cd030208b44074594663a2bf427a Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Fri, 28 Feb 2025 19:26:57 +0530
Subject: [PATCH 025/138] New format of Documentation (#240)

New format of Documentation for inference and finetuning.

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Co-authored-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 docs/index.md                            |  17 +---
 docs/source/cli_api.md                   |  12 ++-
 docs/source/introduction.md              |   5 +
 docs/source/ll_api.md                    |  38 -------
 docs/source/{hl_api.md => python_api.md} |  91 +++++++++++++++--
 docs/source/quick_start.md               |  25 +++++
 docs/source/validate.md                  | 124 +++++++++++++----------
 7 files changed, 193 insertions(+), 119 deletions(-)
 delete mode 100644 docs/source/ll_api.md
 rename docs/source/{hl_api.md => python_api.md} (51%)

diff --git a/docs/index.md b/docs/index.md
index 630493854..6b731e936 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -36,27 +36,14 @@ source/upgrade
 ```
 
 ```{toctree}
-:caption: 'Quick start'
+:caption: 'Inference on Cloud AI 100'
 :maxdepth: 4
 
 source/quick_start
-```
-
-```{toctree}
-:caption: 'Command Line Interface Use (CLI)'
-:maxdepth: 2
 source/cli_api
+source/python_api
 ```
 
- 
-```{toctree}
-:caption: 'Python API'
-:maxdepth: 2
-
-source/hl_api
-source/ll_api
-
-```
 
 ```{toctree}
 :caption: 'QAIC Finetune'
diff --git a/docs/source/cli_api.md b/docs/source/cli_api.md
index 603f0141c..a6ec86554 100644
--- a/docs/source/cli_api.md
+++ b/docs/source/cli_api.md
@@ -1,30 +1,32 @@
 
+# Command Line Interface Use (CLI)
+
 ```{NOTE}
 Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group``should be in single quotes e.g.  ``'--device_group [0]'``
 ```
 
 (infer_api)=
-# `QEfficient.cloud.infer`
+## `QEfficient.cloud.infer`
 ```{eval-rst}
 .. automodule:: QEfficient.cloud.infer.main
 ``` 
-# `QEfficient.cloud.execute`
+## `QEfficient.cloud.execute`
 ```{eval-rst}
 .. automodule:: QEfficient.cloud.execute.main
 ```
-# `QEfficient.cloud.compile`
+## `QEfficient.cloud.compile`
 ```{eval-rst}
    .. automodule:: QEfficient.compile.compile_helper.compile
    .. code-block:: bash
     
         python -m QEfficient.cloud.compile OPTIONS
 ```
-# `QEfficient.cloud.export`
+## `QEfficient.cloud.export`
 ```{eval-rst}
    .. automodule:: QEfficient.cloud.export.main
    
 ```
-# `QEfficient.cloud.finetune`
+## `QEfficient.cloud.finetune`
 ```{eval-rst}
    .. automodule:: QEfficient.cloud.finetune.main
    
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
index 772de4efc..d842b40c4 100644
--- a/docs/source/introduction.md
+++ b/docs/source/introduction.md
@@ -23,6 +23,9 @@ For other models, there is comprehensive documentation to inspire upon the chang
 ***Latest news*** : <br>
 
 - [coming soon] Support for more popular [models](models_coming_soon)<br>
+- [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models.
+
+- [01/2025] Added support for [Ibm-Granite](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models.
 - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM.
 - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)
@@ -31,6 +34,8 @@ For other models, there is comprehensive documentation to inspire upon the chang
 <details>
 <summary>More</summary>
 
+- [01/2025] Added support for [Ibm-Granite](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
+- [01/2025] Added support for [Ibm-Granite-Guardian](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)
 - [09/2024] Added support for [Gemma-2-Family](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)<br>
 - [09/2024] Added support for [CodeGemma-Family](https://huggingface.co/collections/google/codegemma-release-66152ac7b683e2667abdee11)
 - [09/2024] Added support for [Gemma-Family](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b)
diff --git a/docs/source/ll_api.md b/docs/source/ll_api.md
deleted file mode 100644
index 8cdb974bc..000000000
--- a/docs/source/ll_api.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# Low Level API
-
-## `convert_to_cloud_kvstyle`
-```{eval-rst}
-.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
-   :members:
-   :show-inheritance:
-   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle
-```
-## `convert_to_cloud_bertstyle`
-```{eval-rst}
-.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
-   :members:
-   :show-inheritance:
-   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle
-```
-
-## `utils`
-
-```{eval-rst}
-.. automodule:: QEfficient.utils.device_utils
-   :members:
-   :show-inheritance:
-```
-
-```{eval-rst}
-.. automodule:: QEfficient.utils.generate_inputs
-   :members:
-   :undoc-members:
-   :show-inheritance:
-```
-
-```{eval-rst}
-.. automodule:: QEfficient.utils.run_utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-```
\ No newline at end of file
diff --git a/docs/source/hl_api.md b/docs/source/python_api.md
similarity index 51%
rename from docs/source/hl_api.md
rename to docs/source/python_api.md
index d5f2e10f7..668861373 100644
--- a/docs/source/hl_api.md
+++ b/docs/source/python_api.md
@@ -1,34 +1,64 @@
+# Python API
+
 **This page give you an overview about the all the APIs that you might need to integrate the `QEfficient` into your python applications.**
 
-# High Level API
+## High Level API
+
+### `QEFFAutoModelForCausalLM`
 
-## `QEFFAutoModelForCausalLM`
 ```{eval-rst}
 .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM
    :member-order: bysource
    :members:
 ``` 
-## `QEFFAutoModel`
+
+(QEFFAutoModel)=
+### `QEFFAutoModel`
+
 ```{eval-rst}
 .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel
    :member-order: bysource
    :members:
 ``` 
-## `QEffAutoPeftModelForCausalLM`
+
+(QEffAutoPeftModelForCausalLM)=
+### `QEffAutoPeftModelForCausalLM`
+
 ```{eval-rst}
 .. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM
    :member-order: bysource
    :members:
 ```
 
-## `QEffAutoLoraModelForCausalLM`
+(QEffAutoLoraModelForCausalLM)=
+### `QEffAutoLoraModelForCausalLM`
+
 ```{eval-rst}
 .. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM
    :member-order: bysource
    :members:
 ```
 
-## `export`
+(QEFFAutoModelForImageTextToText)=
+### `QEFFAutoModelForImageTextToText`
+
+```{eval-rst}
+.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText
+   :member-order: bysource
+   :members:
+```
+
+(QEFFAutoModelForSpeechSeq2Seq)=
+### `QEFFAutoModelForSpeechSeq2Seq`
+
+```{eval-rst}
+.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq
+   :member-order: bysource
+   :members:
+```
+
+### `export`
+
 ```{eval-rst}
 .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
    :members:
@@ -37,7 +67,9 @@
 .. deprecated::
    This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.export instead
 ```
-## `compile`
+
+### `compile`
+
 ```{eval-rst}
 .. automodule:: QEfficient.compile.compile_helper
    :members:
@@ -50,10 +82,53 @@
 .. deprecated::
    This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead
 ```
-## `Execute`
+
+### `Execute`
+
 ```{eval-rst}
 .. automodule:: QEfficient.generation.text_generation_inference
    :members:
    :show-inheritance:
    :exclude-members:  latency_stats_bertstyle,cloud_ai_100_exec_kv_helper
 ```
+## Low Level API
+
+### `convert_to_cloud_kvstyle`
+
+```{eval-rst}
+.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
+   :members:
+   :show-inheritance:
+   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle
+```
+
+### `convert_to_cloud_bertstyle`
+
+```{eval-rst}
+.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
+   :members:
+   :show-inheritance:
+   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle
+```
+
+### `utils`
+
+```{eval-rst}
+.. automodule:: QEfficient.utils.device_utils
+   :members:
+   :show-inheritance:
+```
+
+```{eval-rst}
+.. automodule:: QEfficient.utils.generate_inputs
+   :members:
+   :undoc-members:
+   :show-inheritance:
+```
+
+```{eval-rst}
+.. automodule:: QEfficient.utils.run_utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
+```
\ No newline at end of file
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 55e0746ef..88093e134 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -1,3 +1,4 @@
+# Quick Start
 
 QEfficient Library was designed with one goal:
 
@@ -8,6 +9,30 @@ To achieve this, we have 2 levels of APIs, with different levels of abstraction.
 
 2. Python high level APIs offer more granular control, ideal for when customization is necessary.
 
+## Supported Features
+
+| Feature | Impact |
+| --- | --- |
+| Context Length Specializations (upcoming) | Increases the maximum context length that models can handle, allowing for better performance on tasks requiring long sequences of text. |
+| Swift KV (upcoming) | Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. |
+| Block Attention (in progress) | Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. |
+| [Vision Language Model](QEFFAutoModelForImageTextToText) | Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/image_text_to_text_inference.py) for more **details**. |
+| [Speech Sequence to Sequence Model](QEFFAutoModelForSpeechSeq2Seq) | Provides support for the QEFFAutoModelForSpeechSeq2Seq Facilitates speech-to-text sequence models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/speech_to_text/run_whisper_speech_to_text.py) for more **details**. |
+| Support for FP8 Execution | Enables execution with FP8 precision, significantly improving performance and reducing memory usage for computational tasks. |
+| Prefill caching  | Enhances inference speed by caching key-value pairs for shared prefixes, reducing redundant computations and improving efficiency. |
+|Prompt-Lookup Decoding | Speeds up text generation by using overlapping parts of the input prompt and the generated text, making the process faster without losing quality. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/pld_spd_inference.py) for more **details**.|
+| [PEFT LoRA support](QEffAutoPeftModelForCausalLM) | Enables parameter-efficient fine-tuning using low-rank adaptation techniques, reducing the computational and memory requirements for fine-tuning large models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/peft_models.py) for more **details**. |
+| [QNN support](#qnn-compilation) | Enables compilation using QNN SDK, making Qeff adaptable for various backends in the future. |
+| [Embedding model support](QEFFAutoModel) | Facilitates the generation of vector embeddings for retrieval tasks. |
+| [Speculative Decoding](#draft-based-speculative-decoding) | Accelerates text generation by using a draft model to generate preliminary predictions, which are then verified by the target model, reducing latency and improving efficiency. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/draft_spd_inference.py) for more **details**. |
+| [Finite lorax](QEffAutoLoraModelForCausalLM) | Users can activate multiple LoRA adapters and compile them with the base model. At runtime, they can specify which prompt should use which adapter, enabling mixed adapter usage within the same batch. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/lora_models.py) for more **details**. |
+| Python and CPP Inferencing API support | Provides flexibility while running inference with Qeff and enabling integration with various applications and improving accessibility for developers. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/cpp_execution/text_inference_using_cpp.py) for more **details**.|
+| [Continuous batching](#continuous-batching) | Optimizes throughput and latency by dynamically batching requests, ensuring efficient use of computational resources. |
+| AWQ and GPTQ support | Supports advanced quantization techniques, improving model efficiency and performance on AI 100. |
+| Support serving successive requests in same session | An API that yields tokens as they are generated, facilitating seamless integration with various applications and enhancing accessibility for developers. |
+| Perplexity calculation | A script for computing the perplexity of a model, allowing for the evaluation of model performance and comparison across different models and datasets. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/scripts/perplexity_computation/calculate_perplexity.py) for more **details**. |
+| KV Heads Replication Script| A sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model, running inference with the original model, replicating KV heads, validating changes, and exporting the modified model to ONNX format. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/scripts/replicate_kv_head/replicate_kv_heads.py) for more **details**.|
+
 ## Transformed models and QPC storage
 
 By default, the library exported models and Qaic Program Container (QPC) files, which are compiled and inference-ready model binaries generated by the compiler, are stored in `~/.cache/qeff_cache`. You can customize this storage path using the following environment variables:
diff --git a/docs/source/validate.md b/docs/source/validate.md
index b3327596d..49acd268d 100644
--- a/docs/source/validate.md
+++ b/docs/source/validate.md
@@ -1,59 +1,77 @@
 (validated_models)=
 # Validated Models
-``Note- All validated models support Continuous Batching functionality.``
-| Model Name | Model Support |
-| --- | --- |
-| [CodeGemma-2b](https://huggingface.co/google/codegemma-2b) |✔️ |
-| [CodeGemma-7b](https://huggingface.co/google/codegemma-7b) |✔️ |
-| [CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) |✔️ |
-| [CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) |✔️ |
-| [CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) |✔️ |
-| [Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1) |✔️ |
-| [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|✔️ |
-| [DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)|✔️ |
-| [Falcon-40b](https://huggingface.co/tiiuae/falcon-40b) |✔️ |
-| [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) |✔️ |
-| [GPT2](https://huggingface.co/openai-community/gpt2) |✔️ |
-| [Gemma-2b](https://huggingface.co/google/gemma-2b) |✔️ |
-| [Gemma-7b](https://huggingface.co/google/gemma-7b) |✔️ |
-| [Gemma-2-2b](https://huggingface.co/google/gemma-2-2b) |✔️ |
-| [Gemma-2-9b](https://huggingface.co/google/gemma-2-9b) |✔️ |
-| [Gemma-2-27b](https://huggingface.co/google/gemma-2-27b) |✔️ |
-| [Granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✔️ |
-| [Granite-guardian-3.1-8b](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b) |✔️ |
-| [Granite-20b-code-base](https://huggingface.co/ibm-granite/granite-20b-code-base-8k) | ✔️ |
-| [Granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | ✔️ |
-| [Jais-adapted-7b](https://huggingface.co/inceptionai/jais-adapted-7b) |✔️ |
-| [Jais-adapted-13b-chat](https://huggingface.co/inceptionai/jais-adapted-13b-chat) |✔️ |
-| [Jais-adapted-70b](https://huggingface.co/inceptionai/jais-adapted-70b) |✔️ |
-| [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |✔️ |
-| [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) |✔️ |
-| [Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) |✔️ |
-| [Llama-3-8b](https://huggingface.co/meta-llama/Meta-Llama-3-8B) |✔️ |
-| [Llama-3-70b](https://huggingface.co/meta-llama/Meta-Llama-3-70B) |✔️ |
-| [Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) |✔️ |
-| [Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B) |✔️ |
-| [Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) |✔️ |
-| [Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) |✔️ |
-| [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) |✔️ |
-| [MPT-7b](https://huggingface.co/mosaicml/mpt-7b) |✔️ |
-| [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) |✔️ |
-| [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) |✔️ |
-| [Phi3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |✔️ |
-| [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |✔️ |
-| [Starcoder1-15B](https://huggingface.co/bigcode/starcoder) |✔️ |
-| [Starcoder2-15B](https://huggingface.co/bigcode/starcoder2-15b) |✔️ |
-| [Vicuna-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0) |✔️ |
-| [Vicuna-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3) |✔️ |
-| [Vicuna-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) |✔️ |
+
+## Text-only Language Models
+
+### Text Generation Task
+**QEff Auto Class:** `QEFFAutoModelForCausalLM`
+
+| Architecture            | Model Family       | Representative Models                                                                 | CB Support |
+|-------------------------|--------------------|--------------------------------------------------------------------------------------|------------|
+| **FalconForCausalLM**   | Falcon             | [tiiuae/falcon-40b]((https://huggingface.co/tiiuae/falcon-40b))                                                                    | ✔️          |
+| **GemmaForCausalLM**    | CodeGemma          | [google/codegemma-2b](https://huggingface.co/google/codegemma-2b)<br>[google/codegemma-7b](https://huggingface.co/google/codegemma-7b)                                           | ✔️          |
+|                         | Gemma              | [google/gemma-2b](https://huggingface.co/google/gemma-2b)<br>[google/gemma-7b](https://huggingface.co/google/gemma-7b)<br>[google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b)<br>[google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b)<br>[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)        | ✔️          |
+| **GPTBigCodeForCausalLM** | Starcoder1.5      | [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)                                                                   | ✔️          |
+|                         | Starcoder2         | [bigcode/starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b)                                                              | ✔️          |
+| **GPTJForCausalLM**     | GPT-J              | [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)                                                                 | ✔️          |
+| **GPT2LMHeadModel**     | GPT-2              | [openai-community/gpt2](https://huggingface.co/openai-community/gpt2)                                                               | ✔️          |
+| **GraniteForCausalLM**  | Granite 3.1        | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)<br>[ibm-granite/granite-guardian-3.1-8b](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)          | ✔️          |
+|                         | Granite 20B        | [ibm-granite/granite-20b-code-base-8k](https://huggingface.co/ibm-granite/granite-20b-code-base-8k)<br>[ibm-granite/granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k)    | ✔️          |
+| **InternVLChatModel**   | Intern-VL          | [OpenGVLab/InternVL2_5-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B)                                                            |            |
+| **LlamaForCausalLM**    | CodeLlama          | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf)<br>[codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf)<br>[codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | ✔️          |
+|                         | DeepSeek-R1-Distill-Llama | [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)                                      | ✔️          |
+|                         | InceptionAI-Adapted | [inceptionai/jais-adapted-7b](https://huggingface.co/inceptionai/jais-adapted-7b)<br>[inceptionai/jais-adapted-13b-chat](https://huggingface.co/inceptionai/jais-adapted-13b-chat)<br>[inceptionai/jais-adapted-70b](https://huggingface.co/inceptionai/jais-adapted-70b) | ✔️          |
+|                         | Llama 3.3          | [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)                                                   | ✔️          |
+|                         | Llama 3.2          | [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)<br>[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)                                  | ✔️          |
+|                         | Llama 3.1          | [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)<br>[meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B)                                 | ✔️          |
+|                         | Llama 3            | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)<br>[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)                           | ✔️          |
+|                         | Llama 2            | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)<br>[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)<br>[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | ✔️          |
+|                         | Vicuna             | [lmsys/vicuna-13b-delta-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0)<br>[lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3)<br>[lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5)         | ✔️          |
+| **MistralForCausalLM**  | Mistral            | [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)                                                  | ✔️          |
+| **MixtralForCausalLM**  | Codestral<br>Mixtral | [mistralai/Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1)<br>[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)                        | ✔️          |
+| **MPTForCausalLM**      | MPT                | [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b)                                                                     | ✔️          |
+| **Phi3ForCausalLM**     | Phi-3, Phi-3.5     | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)                                                    | ✔️          |
+| **QwenForCausalLM**     | DeepSeek-R1-Distill-Qwen | [DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)                                                   | ✔️          |
+|                         | Qwen2, Qwen2.5     | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)                                                            | ✔️          |
+
+## Embedding Models
+
+### Text Embedding Task
+**QEff Auto Class:** `QEFFAutoModel`
+
+| Architecture | Model Family | Representative Models          |
+|--------------|--------------|---------------------------------|
+| **BertModel** | BERT-based   | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)<br> [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)<br>[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <br>[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2)          |
+| **LlamaModel** | Llama-based  | [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
+| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) |
+| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) |
+| **MPNetForMaskedLM** | MPNet | [sentence-transformers/multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1) |
+| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) |
+| **MistralModel** | Mistral | [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
+
+## Multimodal Language Models
+
+### Vision-Language Models (Text + Image Generation)
+**QEff Auto Class:** `QEFFAutoModelImageTextToText`
+
+| Architecture                | Model Family | Representative Models                  |
+|-----------------------------|--------------|----------------------------------------|
+| **LlavaForConditionalGeneration** | LLaVA-1.5   | [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)               |
+| **MllamaForConditionalGeneration** | Llama 3.2   | [meta-llama/Llama-3.2-11B-Vision Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)<br>[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) |
+
+### Audio Models
+(Automatic Speech Recognition) - Transcription Task
+**QEff Auto Class:** `QEFFAutoModelForSpeechSeq2Seq`
+
+| Architecture | Model Family | Representative Models                                                                 |
+|--------------|--------------|----------------------------------------------------------------------------------------|
+| **Whisper**  | Whisper      | [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny)<br>[openai/whisper-base](https://huggingface.co/openai/whisper-base)<br>[openai/whisper-small](https://huggingface.co/openai/whisper-small)<br>[openai/whisper-medium](https://huggingface.co/openai/whisper-medium)<br>[openai/whisper-large](https://huggingface.co/openai/whisper-large)<br>[openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) |
 
 (models_coming_soon)=
 # Models Coming Soon
 
-* [Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)
-* [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
-* [Chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
-* [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)
-* [Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B)
-* [Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)
-* [Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)
\ No newline at end of file
+| Architecture            | Model Family | Representative Models                      |
+|-------------------------|--------------|--------------------------------------------|
+| **BaichuanForCausalLM** | Baichuan2    | [baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)             |
+| **CohereForCausalLM**   | Command-R    | [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)             |
+| **DbrxForCausalLM**     | DBRX         | [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)                       |
\ No newline at end of file

From 47577f85b64589161e6fcb6a0659991202e1ddb8 Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Fri, 28 Feb 2025 21:57:37 +0530
Subject: [PATCH 026/138] Removed warning and override of mxfp6 for internal
 use (#277)

compilation fix and enabled mxfp6 for vision encoder

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/modeling_auto.py      | 25 ++-----------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index b8b5981cd..8bca3b94a 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -52,8 +52,6 @@
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
 
-MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"]
-
 
 class QEFFTransformersBase(QEFFBaseModel):
     """
@@ -627,17 +625,12 @@ def compile(
         ):
             self.export()
 
-        if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6:
-            logger.warning(
-                "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6"
-            )
-
         self.vision_model._compile(
             compile_dir,
             compile_only=True,
             specializations=specializations["vision"],
             convert_to_fp16=True,
-            mxfp6_matmul=False,
+            mxfp6_matmul=mxfp6_matmul,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             custom_io=custom_io_vision,
@@ -946,11 +939,6 @@ def compile(
             if output_name.endswith("_RetainedState"):
                 custom_io[output_name] = kv_cache_dtype
 
-        if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul:
-            logger.warning(
-                f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True"
-            )
-
         self._compile(
             onnx_path,
             compile_dir,
@@ -1147,16 +1135,7 @@ class QEFFAutoModelForImageTextToText:
 
     _hf_auto_class = AutoModelForImageTextToText
 
-    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs):
-        if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload:
-            # For models with mxfp6 accuracy issue, we will use kv_offload=True by default
-            if kv_offload is None:
-                kv_offload = True
-            else:
-                logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
-        elif kv_offload is None:
-            kv_offload = False
-
+    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs):
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
         else:

From 87d87812804fa5b22b529f3322a3bf84178ccf4a Mon Sep 17 00:00:00 2001
From: mohiso22 <quic_mohisoni@quicinc.com>
Date: Fri, 28 Feb 2025 22:03:50 +0530
Subject: [PATCH 027/138] Added support of 2qpcs for internvl and llava (#279)

Signed-off-by: Mohit Soni <quic_mohisoni@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/internvl/modeling_internvl.py      | 147 +++++++++++++++---
 .../models/llava/modeling_llava.py            | 147 +++++++++++++++---
 .../models/mllama/modeling_mllama.py          |   3 +
 .../transformers/models/modeling_auto.py      |  17 +-
 .../transformers/models/pytorch_transforms.py |   2 +
 5 files changed, 261 insertions(+), 55 deletions(-)

diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
index 318993dde..c39e7b65d 100644
--- a/QEfficient/transformers/models/internvl/modeling_internvl.py
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -14,9 +14,57 @@
 from QEfficient.utils.logging_utils import logger
 
 
+class QEffInternEncoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, pixel_values):
+        vit_embeds = self.model.extract_feature(pixel_values)
+        return vit_embeds
+
+
+class QEffInternDecoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.config = self.model.language_model.config
+
+    def forward(self, input_ids, vit_embeds, position_ids, past_key_values):
+        # TODO: Check if Hardcoding this is okay, i.e. check if this value is common for all intern models
+        IMG_CONTEXT_TOKEN = 151667
+
+        input_embeds = self.model.language_model.get_input_embeddings()(input_ids)
+        B, N, C = input_embeds.shape
+        image_input_embeds = input_embeds.reshape(B * N, C)
+        image_input_ids = input_ids.reshape(B * N)
+        selected = image_input_ids == IMG_CONTEXT_TOKEN
+        indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
+        indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
+        image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
+        image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
+        inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
+        outputs = self.model.language_model(
+            inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True
+        )
+        return outputs.logits, vit_embeds, outputs.past_key_values
+
+
 class QEffInternVLModel(nn.Module):
+    def get_qeff_vision_encoder(self):
+        return QEffInternEncoderWrapper(self)
+
+    def get_qeff_language_decoder(self):
+        return QEffInternDecoderWrapper(self)
+
     def get_specializations(
-        self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
     ):
         # TODO: check if this should be named num_patches or something else
         num_patches = compiler_options.pop("num_patches", None)
@@ -33,8 +81,18 @@ def get_specializations(
         elif img_size is None:
             img_size = 448
             logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config")
-
-        specializations = [
+        if img_size != 448 and kv_offload:
+            raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")
+        vision = [
+            {
+                "batch_size": batch_size,
+                "num_patches": num_patches,
+                "img_size": img_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+            }
+        ]
+        lang = [
             {
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
@@ -50,46 +108,75 @@ def get_specializations(
                 "img_size": img_size,
             },
         ]
-        return specializations, compiler_options
 
-    def get_onnx_dynamic_axes(
-        self,
-    ):
+        specializations = {}
+
+        if kv_offload:
+            specializations["vision"] = vision
+            specializations["lang"] = lang
+            return specializations, compiler_options
+        else:
+            return lang, compiler_options
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         # Define dynamic axes
-        dynamic_axes = {}
-        dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
+        vision_dynamic_axes = {}
+        lang_dynamic_axes = {}
+        lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
+        lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
+        vision_dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
 
         pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
+                lang_dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
 
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+            dynamic_axes["lang"] = lang_dynamic_axes
+        else:
+            dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes}
         return dynamic_axes
 
-    def get_output_names(
-        self,
-    ):
-        output_names = ["logits", "pixel_values_RetainedState"]
+    def get_output_names(self, kv_offload: bool = False):
+        vision_output_names = ["vit_embeds"]
+        lang_output_names = ["logits"]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                output_names.append(f"past_{kv}.{i}_RetainedState")
+                lang_output_names.append(f"past_{kv}.{i}_RetainedState")
+
+        output_names = {}
+        if kv_offload:
+            lang_output_names.insert(1, "vit_embeds_RetainedState")
+            output_names["vision"] = vision_output_names
+            output_names["lang"] = lang_output_names
+        else:
+            lang_output_names.insert(1, "pixel_values_RetainedState")
+            return lang_output_names
         return output_names
 
     def get_dummy_inputs(self, kv_offload: bool = False):
-        if kv_offload:
-            raise ValueError("kv_offload method not supported for InternVL yet!")
         num_patches = 13
         C = 3
         if vis_cfg := getattr(self.config, "vision_config", None):
             img_size = getattr(vis_cfg, "image_size", 448)
         else:
             img_size = 448
+        if img_size != 448 and kv_offload:
+            raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")
+
+        # Taken from the modeling files of OpenGVLab/InternVL2_5-1B
+        feature_size = int((((self.config.vision_config.hidden_size**0.5) * self.config.downsample_ratio) ** 2))
 
         # Define shapes
         inputs_shapes = {}
         inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
+        inputs_shapes["vit_embeds"] = (
+            num_patches,
+            feature_size,
+            self.language_model.config.hidden_size,
+        )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
             constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
@@ -97,14 +184,16 @@ def get_dummy_inputs(self, kv_offload: bool = False):
         inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
 
         # Define inputs
-        inputs = {}
-        inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
-        inputs["position_ids"] = (
+        vision_inputs = {}
+        lang_inputs = {}
+        vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
+        lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
+        lang_inputs["vit_embeds"] = torch.zeros((inputs_shapes["vit_embeds"]), dtype=torch.float32)
+        lang_inputs["position_ids"] = (
             torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
             .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
-        inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
 
         # Add data for KV
         kv_cache_shape = get_padding_shape_from_config(
@@ -113,10 +202,18 @@ def get_dummy_inputs(self, kv_offload: bool = False):
             seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
-        inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
+        lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+                lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+
+        inputs = {}
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+            inputs["lang"] = lang_inputs
+        else:
+            lang_inputs.pop("vit_embeds")
+            inputs = {**vision_inputs, **lang_inputs}
 
         return inputs
 
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
index 82c934670..93d6f4c3b 100644
--- a/QEfficient/transformers/models/llava/modeling_llava.py
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import torch
+import torch.nn as nn
 import torch.utils.checkpoint
 from transformers.models.llava.modeling_llava import (
     LlavaForConditionalGeneration,
@@ -20,7 +21,57 @@
 CTX_LEN = 1024
 
 
+class QEFFLlavaEncoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, pixel_values):
+        # Image features
+        image_outputs = self.model.vision_tower(pixel_values, output_hidden_states=True)
+        selected_image_feature = image_outputs.hidden_states[self.model.config.vision_feature_layer]
+        vision_feature_select_strategy = self.model.config.vision_feature_select_strategy
+        if vision_feature_select_strategy == "default":
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(f"Unexpected select feature strategy: {self.model.config.vision_feature_select_strategy}")
+        image_features = self.model.multi_modal_projector(selected_image_feature)
+
+        return image_features
+
+
+class QEFFLlavaDecoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.config = self.model.config
+
+    def forward(self, input_ids, image_features, position_ids, past_key_values):
+        inputs_embeds = self.model.get_input_embeddings()(input_ids)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        mask = input_ids == self.model.config.image_token_index
+        indices1 = mask.to(torch.int64).cumsum(1) - 1
+        indices0 = torch.arange(mask.shape[0]).view(-1, 1)
+        image_features_expanded = image_features[indices0, indices1]
+        inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds)
+        outputs = self.model.language_model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+        )
+
+        return outputs.logits, image_features, outputs.past_key_values
+
+
 class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration):
+    def get_qeff_vision_encoder(self):
+        return QEFFLlavaEncoderWrapper(self)
+
+    def get_qeff_language_decoder(self):
+        return QEFFLlavaDecoderWrapper(self)
+
     def forward(self, input_ids, position_ids, pixel_values, past_key_values):
         inputs_embeds = self.get_input_embeddings()(input_ids)
         # Image features
@@ -50,7 +101,7 @@ def forward(self, input_ids, position_ids, pixel_values, past_key_values):
         )
         return outputs.logits, pixel_values, outputs.past_key_values
 
-    def get_dummy_inputs(self, **kwargs):
+    def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
         num_layers = self.config.text_config.num_hidden_layers
         num_key_value_heads = self.config.text_config.num_key_value_heads
         head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads
@@ -58,25 +109,44 @@ def get_dummy_inputs(self, **kwargs):
             img_size = getattr(vis_cfg, "image_size", 336)
         else:
             img_size = 336
-        inputs = {
+        if img_size != 336 and kv_offload:
+            raise NotImplementedError("Image Size other than 336 is not supported for Llava models yet.")
+        vision_inputs = {
+            "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=torch.float32),
+        }
+        lang_inputs = {
             "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
+            "image_features": torch.ones((BS, 576, self.language_model.config.hidden_size), dtype=torch.float32),
             "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
-            "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=torch.float32),
         }
-        inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1)
-        inputs["past_key_values"] = []
+        lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
+        lang_inputs["past_key_values"] = []
         for i in range(num_layers):
-            inputs["past_key_values"].append(
+            lang_inputs["past_key_values"].append(
                 (
                     torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim),
                     torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim),
                 )
             )
-        inputs["position_ids"] = torch.full(inputs["position_ids"].shape, CTX_LEN - 1)
+        lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, CTX_LEN - 1)
+        inputs = {}
+
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+            inputs["lang"] = lang_inputs
+        else:
+            lang_inputs.pop("image_features")
+            inputs = {**vision_inputs, **lang_inputs}
         return inputs
 
     def get_specializations(
-        self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
     ):
         max_num_images = compiler_options.pop("max_num_images", 1)
         prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
@@ -86,8 +156,18 @@ def get_specializations(
         elif img_size is None:
             img_size = 336
             logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config")
-
-        specializations = [
+        if img_size != 336 and kv_offload:
+            raise NotImplementedError("Image Size other than 336 is not supported for Llava models yet.")
+        vision = [
+            {
+                "batch_size": batch_size,
+                "max_num_images": max_num_images,
+                "img_size": img_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+            }
+        ]
+        lang = [
             {
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
@@ -103,32 +183,53 @@ def get_specializations(
                 "img_size": img_size,
             },
         ]
-        return specializations, compiler_options
+        specializations = {}
 
-    def get_onnx_dynamic_axes(
-        self,
-    ):
+        if kv_offload:
+            specializations["vision"] = vision
+            specializations["lang"] = lang
+            return specializations, compiler_options
+        else:
+            return lang, compiler_options
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         # Define dynamic axes
         num_layers = self.config.text_config.num_hidden_layers
 
-        dynamic_axes = {
+        vision_dynamic_axes = {
+            "pixel_values": {0: "batch_size", 2: "img_size", 3: "img_size"},
+        }
+        lang_dynamic_axes = {
             "input_ids": {0: "batch_size", 1: "seq_len"},
             "position_ids": {0: "batch_size", 1: "seq_len"},
-            "pixel_values": {0: "batch_size", 2: "img_size", 3: "img_size"},
         }
         for i in range(num_layers):
-            dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
-            dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
 
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+            dynamic_axes["lang"] = lang_dynamic_axes
+        else:
+            dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes}
         return dynamic_axes
 
-    def get_output_names(
-        self,
-    ):
-        output_names = ["logits", "pixel_values_RetainedState"]
+    def get_output_names(self, kv_offload: bool = False):
+        vision_output_names = ["image_features"]
+        lang_output_names = ["logits"]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                output_names.append(f"past_{kv}.{i}_RetainedState")
+                lang_output_names.append(f"past_{kv}.{i}_RetainedState")
+
+        output_names = {}
+        if kv_offload:
+            lang_output_names.insert(1, "image_features_RetainedState")
+            output_names["vision"] = vision_output_names
+            output_names["lang"] = lang_output_names
+        else:
+            lang_output_names.insert(1, "pixel_values_RetainedState")
+            return lang_output_names
         return output_names
 
     def get_inputs_info(self):
diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py
index 9dcddbdfd..8d2141240 100644
--- a/QEfficient/transformers/models/mllama/modeling_mllama.py
+++ b/QEfficient/transformers/models/mllama/modeling_mllama.py
@@ -1022,6 +1022,9 @@ class QEffMllamaForConditionalGeneration(MllamaForConditionalGeneration):
     def get_qeff_vision_encoder(self):
         return QEffMllamaVisionEncoder(self)
 
+    def get_qeff_language_decoder(self):
+        return self
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 8bca3b94a..54b7828c8 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -390,7 +390,13 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray
 
 
 class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel):
-    _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform]
+    _pytorch_transforms = [
+        AwqToMatmulNbitsTransform,
+        GPTQToMatmulNbitsTransform,
+        CustomOpsTransform,
+        KVCacheTransform,
+        KVCacheModuleMethodMapperTransform,
+    ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(self, model: nn.modules):
@@ -454,6 +460,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
 
     def __init__(self, model):
         super().__init__(model)
+        self.model = model.get_qeff_language_decoder()
 
     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
         return self._export(inputs, output_names, dynamic_axes, export_dir)
@@ -502,7 +509,6 @@ def model_name(self) -> str:
 
 class _QEffAutoModelForImageTextToTextDualQPC:
     _hf_auto_class = AutoModelForImageTextToText
-    UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"]
 
     def __init__(
         self,
@@ -513,8 +519,6 @@ def __init__(
             raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
         self.model = model
         self.config = model.config
-        if self.model_name in self.UNSUPPORTED_MODELS:
-            raise NotImplementedError(f"kv_offload is not yet supported for {self.model.__class__.__name__}")
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model)
 
@@ -640,12 +644,12 @@ def compile(
         custom_io_lang = {}
         # Inputs
         for output_name in output_names["lang"]:
-            if output_name.startswith("past_"):
+            if output_name.endswith("_RetainedState"):
                 custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype
 
         # outputs
         for output_name in output_names["lang"]:
-            if output_name.startswith("past_"):
+            if output_name.endswith("_RetainedState"):
                 custom_io_lang[output_name] = kv_cache_dtype
 
         self.lang_model._compile(
@@ -799,7 +803,6 @@ def kv_offload_generate(
             lang_inputs["input_ids"] = outputs["logits"].argmax(2)
             lang_inputs["position_ids"] += 1
             generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1)
-
             if streamer:
                 streamer.put(lang_inputs["input_ids"][0])
 
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 10f4c448b..8152f0676 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -433,6 +433,8 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
             "get_onnx_dynamic_axes": QEffInternVLModel.get_onnx_dynamic_axes,
             "get_output_names": QEffInternVLModel.get_output_names,
             "get_inputs_info": QEffInternVLModel.get_inputs_info,
+            "get_qeff_vision_encoder": QEffInternVLModel.get_qeff_vision_encoder,
+            "get_qeff_language_decoder": QEffInternVLModel.get_qeff_language_decoder,
         },
         "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
     }

From d1e60b77813cf2ecd7a4d680357c6ce8845f8b97 Mon Sep 17 00:00:00 2001
From: shubhagr-quic <quic_shubhagr@quicinc.com>
Date: Mon, 3 Mar 2025 13:31:49 +0530
Subject: [PATCH 028/138] Removed onnx_defer_loading flag. (#295)

Removing onnx_defer_loading flag which was originally removed in
_[Removed onnx_defer_loading from Immutable Convertor Args. PR: 230]_
but got added back later in _[Mllama(single + dual) + InternVL(single) +
Llava (single) PR: 267]_ maybe becausing of rebasing.

Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/utils/constants.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index a5cc6fda1..6c2bba0c6 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -136,7 +136,6 @@ class QnnConstants:
         "--float_bitwidth ",
         "--preserve_io_datatype",
         "--onnx_skip_simplification",
-        "--onnx_defer_loading",
     ]
 
     IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [

From da1d1dac6f3a3ac387901db948a230afdab975fc Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Date: Mon, 3 Mar 2025 20:29:36 +0530
Subject: [PATCH 029/138] Code for SDK configs Inclusion (#203)

This will create a config JSON file, which contains all the details
about compilation and SDK versions.

Currently, this code is added in the code block of
QEFFAutoModelForCausalLM.compile.

The config would look like below:

```
{
    "huggingface_config": {
        "vocab_size": 50257,
        "n_positions": 1024,
        "n_embd": 768,
        "n_layer": 12,
        "n_head": 12,
        "n_inner": null,
        "activation_function": "gelu_new",
        "resid_pdrop": 0.1,
        "embd_pdrop": 0.1,
        "attn_pdrop": 0.1,
        "layer_norm_epsilon": 1e-05,
        "initializer_range": 0.02,
        "summary_type": "cls_index",
        "summary_use_proj": true,
        "summary_activation": null,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": true,
        "scale_attn_weights": true,
        "use_cache": true,
        "scale_attn_by_inverse_layer_idx": false,
        "reorder_and_upcast_attn": false,
        "bos_token_id": 50256,
        "eos_token_id": 50256,
        "return_dict": true,
        "output_hidden_states": false,
        "output_attentions": false,
        "torchscript": false,
        "torch_dtype": null,
        "use_bfloat16": false,
        "tf_legacy_loss": false,
        "pruned_heads": {},
        "tie_word_embeddings": true,
        "chunk_size_feed_forward": 0,
        "is_encoder_decoder": false,
        "is_decoder": false,
        "cross_attention_hidden_size": null,
        "add_cross_attention": false,
        "tie_encoder_decoder": false,
        "max_length": 20,
        "min_length": 0,
        "do_sample": false,
        "early_stopping": false,
        "num_beams": 1,
        "num_beam_groups": 1,
        "diversity_penalty": 0.0,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 1.0,
        "typical_p": 1.0,
        "repetition_penalty": 1.0,
        "length_penalty": 1.0,
        "no_repeat_ngram_size": 0,
        "encoder_no_repeat_ngram_size": 0,
        "bad_words_ids": null,
        "num_return_sequences": 1,
        "output_scores": false,
        "return_dict_in_generate": false,
        "forced_bos_token_id": null,
        "forced_eos_token_id": null,
        "remove_invalid_values": false,
        "exponential_decay_length_penalty": null,
        "suppress_tokens": null,
        "begin_suppress_tokens": null,
        "architectures": [
            "GPT2LMHeadModel"
        ],
        "finetuning_task": null,
        "id2label": {
            "0": "LABEL_0",
            "1": "LABEL_1"
        },
        "label2id": {
            "LABEL_0": 0,
            "LABEL_1": 1
        },
        "tokenizer_class": null,
        "prefix": null,
        "pad_token_id": null,
        "sep_token_id": null,
        "decoder_start_token_id": null,
        "task_specific_params": {
            "text-generation": {
                "do_sample": true,
                "max_length": 50
            }
        },
        "problem_type": null,
        "_name_or_path": "gpt2",
        "_commit_hash": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
        "_attn_implementation_internal": "eager",
        "transformers_version": null,
        "model_type": "gpt2",
        "n_ctx": 1024
    },
    "qpc_config": {
        "QEff_config": {
            "pytorch_transforms": [
                "AwqToMatmulNbitsTransform",
                "GPTQToMatmulNbitsTransform",
                "CustomOpsTransform",
                "KVCacheTransform"
            ],
            "onnx_transforms": [
                "FP16ClipTransform",
                "SplitTensorsTransform"
            ],
            "onnx_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/GPT2LMHeadModel.onnx"
        },
        "aic_compiler_config": {
            "apps_sdk_version": "1.20.0",
            "compile_dir": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47",
            "specializtions_file_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/specializations.json",
            "prefill_seq_len": 32,
            "ctx_len": 128,
            "batch_size": 1,
            "full_batch_size": null,
            "num_devices": 1,
            "num_cores": 16,
            "mxfp6_matmul": false,
            "mxint8_kv_cache": false,
            "num_speculative_tokens": null
        },
        "qnn_config": {
            "enable_qnn": true,
            "qnn_config_path": "QEfficient/compile/qnn_config.json",
            "product": "QAIRT",
            "os": {
                "Ubuntu": 22.04,
                "Windows": 11
            },
            "sdk_flavor": [
                "aic"
            ],
            "version": "2.31.0",
            "build_id": "250109072054_3882",
            "qnn_backend_api_version": "2.18.0",
            "tensorflow": "2.10.1",
            "tflite": "2.3.0",
            "torch": "1.13.1",
            "onnx": "1.16.1",
            "onnxruntime": "1.17.1",
            "onnxsimplifier": "0.4.36",
            "android-ndk": "r26c",
            "platform": "AIC.1.20.0.14"
        }
    }
}
```

Note: The code structure may change.

---------

Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/base/modeling_qeff.py              |   6 +-
 QEfficient/peft/auto.py                       |   4 +
 QEfficient/peft/lora/auto.py                  |   4 +
 .../transformers/models/modeling_auto.py      |  24 ++++
 QEfficient/utils/__init__.py                  |   1 +
 QEfficient/utils/_utils.py                    | 114 +++++++++++++++++-
 QEfficient/utils/constants.py                 |   2 +
 tests/peft/lora/test_lora_model.py            |   4 +
 tests/peft/test_peft_model.py                 |   2 +
 tests/qnn_tests/test_causal_lm_models_qnn.py  |   8 +-
 tests/text_generation/test_text_generation.py |   3 +
 .../models/test_causal_lm_models.py           |   7 +-
 .../models/test_embedding_models.py           |   2 +
 .../models/test_prefix_caching.py             |   2 +
 .../models/test_speech_seq2seq_models.py      |   1 +
 tests/transformers/spd/test_spd_inference.py  |   3 +
 tests/transformers/test_causal_lm.py          |   2 +
 tests/transformers/test_speech_seq2seq.py     |   2 +
 18 files changed, 185 insertions(+), 6 deletions(-)

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index ec74c57f3..f2b3714fa 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -23,7 +23,7 @@
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.utils import constants
+from QEfficient.utils import constants, dump_qconfig
 from QEfficient.utils._utils import load_json
 from QEfficient.utils.cache import QEFF_HOME, to_hashable
 
@@ -211,6 +211,7 @@ def _export(
         self.onnx_path = onnx_path
         return onnx_path
 
+    @dump_qconfig
     def _compile(
         self,
         onnx_path: Optional[str] = None,
@@ -336,8 +337,10 @@ def _compile(
             )
 
         self.qpc_path = qpc_path
+
         return qpc_path
 
+    @dump_qconfig
     def _qnn_compile(
         self,
         onnx_path: Optional[str] = None,
@@ -435,4 +438,5 @@ def _qnn_compile(
         )
 
         self.qpc_path = qpc_path
+
         return qpc_path
diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py
index 377caa3e7..deb64fae1 100644
--- a/QEfficient/peft/auto.py
+++ b/QEfficient/peft/auto.py
@@ -107,6 +107,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.get_base_model().config.__dict__
+
     def load_adapter(self, model_id: str, adapter_name: str):
         """Loads a new adapter from huggingface hub or local path
 
diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py
index c13979968..7f2a5cd84 100644
--- a/QEfficient/peft/lora/auto.py
+++ b/QEfficient/peft/lora/auto.py
@@ -90,6 +90,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.model.config.__dict__
+
     def download_adapter(
         self,
         adapter_model_id: str,
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 54b7828c8..5852740b4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -229,6 +229,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
@@ -447,6 +451,10 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.model.vision_model.config.__dict__
+
 
 class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
     _pytorch_transforms = [
@@ -506,6 +514,10 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.language_model.config.__dict__
+
 
 class _QEffAutoModelForImageTextToTextDualQPC:
     _hf_auto_class = AutoModelForImageTextToText
@@ -1128,6 +1140,10 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
 
 class QEFFAutoModelForImageTextToText:
     """
@@ -1320,6 +1336,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
@@ -1630,6 +1650,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py
index 2506b9233..a7f17e6bc 100755
--- a/QEfficient/utils/__init__.py
+++ b/QEfficient/utils/__init__.py
@@ -11,6 +11,7 @@
 )
 from QEfficient.utils._utils import (  # noqa: F401
     check_and_assign_cache_dir,
+    dump_qconfig,
     get_num_layers_from_config,
     get_onnx_dir_name,
     get_padding_shape_from_config,
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 23d3a541d..8ba5e2c18 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,11 +8,13 @@
 import json
 import os
 import subprocess
+import xml.etree.ElementTree as ET
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import requests
 import torch
+import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
 from transformers import (
@@ -22,7 +24,7 @@
     PreTrainedTokenizerFast,
 )
 
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
 
 
@@ -447,3 +449,113 @@ class IOInfo:
 
     def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
+
+
+def dump_qconfig(func):
+    def wrapper(self, *args, **kwargs):
+        result = func(self, *args, **kwargs)
+        create_and_dump_qconfigs(
+            self.qpc_path,
+            self.onnx_path,
+            self.get_model_config,
+            [cls.__name__ for cls in self._pytorch_transforms],
+            [cls.__name__ for cls in self._onnx_transforms],
+            kwargs.get("specializations"),
+            kwargs.get("mdp_ts_num_devices", 1),
+            kwargs.get("num_speculative_tokens"),
+            **{
+                k: v
+                for k, v in kwargs.items()
+                if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"]
+            },
+        )
+        return result
+
+    return wrapper
+
+
+def create_and_dump_qconfigs(
+    qpc_path,
+    onnx_path,
+    huggingface_config,
+    pytorch_transforms,
+    onnx_transforms,
+    specializations,
+    mdp_ts_num_devices,
+    num_speculative_tokens,
+    **compiler_options,
+):
+    """
+    This Method creates a JSON file which contains all the configs for a model.
+    Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and
+    many other compilation options.
+    """
+    qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
+    enable_qnn = True if "qnn_config" in compiler_options else None
+
+    qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json")
+    onnx_path = str(onnx_path)
+    specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json"))
+    compile_dir = str(os.path.dirname(qpc_path))
+    qnn_config_path = (
+        (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None
+    )
+
+    # Extract QAIC SDK Apps Version from SDK XML file
+    tree = ET.parse(Constants.SDK_APPS_XML)
+    root = tree.getroot()
+    qaic_version = root.find(".//base_version").text
+
+    # Extract QNN SDK details from YAML file if the environment variable is set
+    qnn_sdk_details = None
+    qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
+    if qnn_sdk_path:
+        qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML)
+        with open(qnn_sdk_yaml_path, "r") as file:
+            qnn_sdk_details = yaml.safe_load(file)
+
+    # Ensure all objects in the configs dictionary are JSON serializable
+    def make_serializable(obj):
+        if isinstance(obj, (int, float, str, bool, type(None))):
+            return obj
+        elif isinstance(obj, (list, tuple)):
+            return [make_serializable(item) for item in obj]
+        elif isinstance(obj, dict):
+            return {key: make_serializable(value) for key, value in obj.items()}
+        elif hasattr(obj, "__dict__"):
+            return make_serializable(vars(obj))
+        return str(obj)
+
+    qconfigs = {
+        "huggingface_config": make_serializable(huggingface_config),
+        "qpc_config": {
+            "QEff_config": {
+                "pytorch_transforms": make_serializable(pytorch_transforms),
+                "onnx_transforms": make_serializable(onnx_transforms),
+                "onnx_path": onnx_path,
+            },
+        },
+    }
+
+    aic_compiler_config = {
+        "apps_sdk_version": qaic_version,
+        "compile_dir": compile_dir,
+        "specializations_file_path": specializations_file_path,
+        "specializations": make_serializable(specializations),
+        "mdp_ts_num_devices": mdp_ts_num_devices,
+        "num_speculative_tokens": num_speculative_tokens,
+        **compiler_options,
+    }
+    qnn_config = {
+        "enable_qnn": enable_qnn,
+        "qnn_config_path": qnn_config_path,
+    }
+    # Put AIC or qnn details.
+    if enable_qnn:
+        qconfigs["qpc_config"]["qnn_config"] = qnn_config
+        if qnn_sdk_details:
+            qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details)
+    else:
+        qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
+
+    create_json(qconfig_file_path, qconfigs)
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index 6c2bba0c6..3852adcda 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -75,12 +75,14 @@ class Constants:
     MAX_QPC_LIMIT = 30
     MAX_RETRIES = 5  # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download
     NUM_SPECULATIVE_TOKENS = 2
+    SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml"  # This xml file is parsed to find out the SDK version.
 
 
 @dataclass
 class QnnConstants:
     # QNN PATH to be read from environment variable.
     QNN_SDK_PATH_ENV_VAR_NAME = "QNN_SDK_ROOT"
+    QNN_SDK_YAML = "sdk.yaml"
 
     # QNN Compilation tools
     QAIRT_CONVERTER = "{}/bin/{}/qairt-converter"
diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py
index 4726fb8c5..69a6282fb 100644
--- a/tests/peft/lora/test_lora_model.py
+++ b/tests/peft/lora/test_lora_model.py
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+import os
 from pathlib import Path
 from time import perf_counter
 
@@ -225,6 +227,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate(
     # test compile
     qeff_model.compile(prefill_seq_len=32, ctx_len=64)
     assert Path(qeff_model.qpc_path).is_dir()
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
     # test generate
     prompts = ["hello!", "hi", "hello, my name is", "hey"]
@@ -249,6 +252,7 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap
     # test compile
     qeff_model.compile(prefill_seq_len=32, ctx_len=64, full_batch_size=2)
     assert Path(qeff_model.qpc_path).is_dir()
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
     # test generate
     prompts = ["hello!", "hi", "hello, my name is", "hey"]
diff --git a/tests/peft/test_peft_model.py b/tests/peft/test_peft_model.py
index 6a9a957b2..c4e331a9d 100644
--- a/tests/peft/test_peft_model.py
+++ b/tests/peft/test_peft_model.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 from time import perf_counter
 
 import numpy as np
@@ -187,3 +188,4 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con
     end = perf_counter()
     compile_time_1 = end - start
     assert compile_time_1 < 0.01 * compile_time_0
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py
index fe906fe7e..65acab157 100644
--- a/tests/qnn_tests/test_causal_lm_models_qnn.py
+++ b/tests/qnn_tests/test_causal_lm_models_qnn.py
@@ -5,6 +5,8 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
 import numpy as np
 import pytest
 from transformers import AutoModelForCausalLM
@@ -98,7 +100,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -106,6 +108,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         aic_enable_depth_first=False,
         enable_qnn=True,
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
     exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
     cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
     gen_len = ort_tokens.shape[-1]
@@ -136,7 +139,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -145,6 +148,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         full_batch_size=full_batch_size,
         enable_qnn=True,
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
     exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
 
     assert all(
diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py
index a1e4265ee..f7d3cd6cb 100644
--- a/tests/text_generation/test_text_generation.py
+++ b/tests/text_generation/test_text_generation.py
@@ -5,6 +5,8 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
 import pytest
 from transformers import AutoModelForCausalLM
 
@@ -101,3 +103,4 @@ def test_generate_text_stream(
     assert cloud_ai_100_output == stream_tokens, (
         f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index a3a855cee..418386780 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 from typing import Optional
 
 import numpy as np
@@ -127,7 +128,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -141,6 +142,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
         "Tokens don't match for ONNXRT output and Cloud AI 100 output."
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
 
     # testing for CB models
     model_hf, _ = load_causal_lm_model(model_config)
@@ -165,7 +167,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -182,6 +184,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
             for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
         ]
     ), "Tokens don't match for  HF PyTorch model output and Cloud AI 100 output."
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
 
 
 # FIXME: there should be a CB test here
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 1c2d5196c..e681f5093 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 
 import numpy as np
 import onnxruntime as ort
@@ -77,6 +78,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
     assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
 @pytest.mark.on_qaic
diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py
index 8ef24403c..c787a3c96 100644
--- a/tests/transformers/models/test_prefix_caching.py
+++ b/tests/transformers/models/test_prefix_caching.py
@@ -31,6 +31,7 @@ def test_simple_prefix_caching(model_name):
         num_cores=14,
     )
     prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
 @pytest.mark.on_qaic
@@ -61,6 +62,7 @@ def test_simple_prefix_caching_qnn(model_name):
         qnn_config=qnn_config_json_path,
     )
     prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
     os.remove(qnn_config_json_path)
 
 
diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py
index af83c9354..99f715863 100644
--- a/tests/transformers/models/test_speech_seq2seq_models.py
+++ b/tests/transformers/models/test_speech_seq2seq_models.py
@@ -360,6 +360,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
     assert (pytorch_kv_tokens == cloud_ai_100_tokens).all(), (
         "Tokens don't match for pytorch output and Cloud AI 100 output."
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
 @pytest.mark.on_qaic
diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py
index a9f197ec3..205f00a00 100644
--- a/tests/transformers/spd/test_spd_inference.py
+++ b/tests/transformers/spd/test_spd_inference.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 from time import perf_counter
 from typing import List, Optional
 
@@ -331,3 +332,5 @@ def test_spec_decode_inference(
     ]  # Because we always run for single input and single batch size
     all_matching = np.array_equal(cloud_ai_100_tokens, generated_ids)
     assert all_matching, "Tokens don't match for SpD output and vanilla DLM output."
+    assert os.path.isfile(os.path.join(os.path.dirname(target_model_qpc_path), "qconfig.json"))
+    assert os.path.isfile(os.path.join(os.path.dirname(draft_model_qpc_path), "qconfig.json"))
diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py
index 1ceb5a7e0..64376db62 100644
--- a/tests/transformers/test_causal_lm.py
+++ b/tests/transformers/test_causal_lm.py
@@ -6,6 +6,7 @@
 # ----------------------------------------------------------------------------
 
 import copy
+import os
 from time import perf_counter
 
 import onnx
@@ -170,3 +171,4 @@ def test_causal_lm_compile(config, cb, tmp_cache):
     end = perf_counter()
     compile_time = end - start
     assert compile_time < 2.0
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
diff --git a/tests/transformers/test_speech_seq2seq.py b/tests/transformers/test_speech_seq2seq.py
index a41896010..15d6152e3 100644
--- a/tests/transformers/test_speech_seq2seq.py
+++ b/tests/transformers/test_speech_seq2seq.py
@@ -6,6 +6,7 @@
 # ----------------------------------------------------------------------------
 
 import copy
+import os
 from time import perf_counter
 
 import onnx
@@ -142,3 +143,4 @@ def test_causal_lm_compile(config, tmp_cache):
     end = perf_counter()
     compile_time = end - start
     assert compile_time < 2.0
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))

From 4b373b8dc7d2cc046302fdfbef336a89cbf16f73 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:36:53 +0000
Subject: [PATCH 030/138] Fixed the compilation errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py                        | 2 +-
 QEfficient/transformers/modeling_utils.py                     | 1 +
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index 2a07d9f10..765a12f9d 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -46,7 +46,7 @@ def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
             self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
             self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
-    def read_only(self, layer_idx, cache_kwargs):
+    def read_only(self, layer_idx, **cache_kwargs):
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         position_ids = cache_kwargs.get("position_ids")
         ctx_len = k_out.shape[2]
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index a3c69b1ed..db0b86c2a 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index f1ec2634d..26931fced 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -379,8 +379,8 @@ def forward(
 class LlamaSwiftKVForCausalLM(PreTrainedModel):
     config_class = LlamaSwiftKVConfig
 
-    def __init__(self, *, config: LlamaSwiftKVConfig):
-        super().__init__()
+    def __init__(self, config: LlamaSwiftKVConfig):
+        super().__init__(config=config)
 
         self.model = LlamaSwiftKVModel(
             config=config,

From 8fbc881467272779e85a70999c0ce889bd525cd8 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:39:38 +0000
Subject: [PATCH 031/138] Fixed the lint error

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 -
 QEfficient/transformers/modeling_utils.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 91856d2e9..bdf0b9393 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,7 +6,6 @@
 # -----------------------------------------------------------------------------
 
 import os
-
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index db0b86c2a..a3c69b1ed 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,7 +87,6 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models

From 40d921a0e4fbc77bd1718ecfa3871a113431be7a Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:44:12 +0000
Subject: [PATCH 032/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 +
 QEfficient/transformers/modeling_utils.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index bdf0b9393..91856d2e9 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import os
+
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index a3c69b1ed..54348c860 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -11,6 +11,8 @@
 
 import torch
 import torch.nn as nn
+import importlib
+
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,
@@ -389,7 +391,8 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    return getattr(sys.modules[__name__], className)
+    module = importlib.import_module("transformers")
+    return getattr(module, className)
 
 
 def get_auto_model_class(model_type, NonTransformerModelCls):

From 7598ec791bdeca575161ff4b06312ecadfd89a68 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:48:14 +0000
Subject: [PATCH 033/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 54348c860..fcb4549d7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,14 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
-import sys
+import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
-import importlib
-
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,

From a822f394848286990b1b7f01db6b144b032e7711 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 11:24:56 +0000
Subject: [PATCH 034/138] Address review comments

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 14 +++++++-------
 QEfficient/transformers/modeling_utils.py |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 91856d2e9..5a1a4c6d8 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -22,16 +22,16 @@
 
 from QEfficient.utils.logging_utils import logger
 
-# loop over all the models which are not present in transformers and register them
-for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
-    # Register the config class based on model type
-    AutoConfig.register(key, value[0])
+# loop over all the model types which are not present in transformers and register them
+for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
+    # Register the model config class based on the model type. This will be first element in the tuple
+    AutoConfig.register(model_type, model_cls[0])
 
-    model_class_type = get_model_class_type_from_model_type(key)
-    AutoModelClassName = get_auto_model_class(model_class_type, value[1])
+    model_class_type = get_model_class_type_from_model_type(model_type)
+    AutoModelClassName = get_auto_model_class(model_class_type, model_cls[1])
 
     # Register the non transformer library Class and config class using AutoModelClass
-    AutoModelClassName.register(value[0], value[1])
+    AutoModelClassName.register(model_cls[0], model_cls[1])
 
 
 def check_qaic_sdk():
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index fcb4549d7..e70542ff7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
@@ -280,6 +279,7 @@
 
 # Map of model type to config class and Model architecture class
 # While onboarding new models make sure to add the new model card names to this dictionary.
+# Developers are expected to follow the naming conventions like ForCausalLM while defining the class names
 MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {"llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]}
 
 # list of sub-strings representing the model type, this is typically taken from llama-swiftkv
@@ -389,7 +389,7 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    module = importlib.import_module("transformers")
+    module = __import__("transformers")
     return getattr(module, className)
 
 

From 4e80fe885614ea2cf8f795cd33b286c457d09520 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 00:39:03 +0530
Subject: [PATCH 035/138] added initial version of SwiftKV for AI 100

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py        |  29 ++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 411 ++++++++++++++++++
 exps/run_swiftkv.py                           |  28 ++
 3 files changed, 468 insertions(+)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
 create mode 100644 exps/run_swiftkv.py

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index a5c375c6e..fe56b197c 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -36,6 +36,35 @@ class QEffDynamicCache(DynamicCache):
 
     """
 
+    def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            position_ids = cache_kwargs.get("position_ids")
+            self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
+            self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
+
+    def read_only(self, layer_idx, cache_kwargs):
+        position_ids = cache_kwargs.get("position_ids")
+        ctx_len = position_ids.shape[-1]
+        ctx_indices = torch.arange(ctx_len)[None, None, ...]
+        gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
+        invalid_mask = ctx_indices > gather_limit
+
+        if torch.onnx.is_in_onnx_export():
+            invalid_idx_value = torch.iinfo(torch.int32).max
+        else:
+            invalid_idx_value = 0
+
+        ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
+        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
+        k_out = CtxGatherFunc.apply(k_out, ctx_indices)
+        v_out = CtxGatherFunc.apply(v_out, ctx_indices)
+        v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
+        return k_out, v_out
+
     def update(
         self,
         key_states: torch.Tensor,
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
new file mode 100644
index 000000000..a33c83d3a
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -0,0 +1,411 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers.cache_utils import Cache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaRMSNorm, repeat_kv
+
+from QEfficient.transformers.cache_utils import QEffDynamicCache
+from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
+from QEfficient.transformers.models.llama.modeling_llama import (
+    QEffLlamaDecoderLayer,
+    QEffLlamaRotaryEmbedding,
+    qeff_apply_rotary_pos_emb,
+)
+
+logger = logging.get_logger(__name__)
+
+
+class LlamaSwiftKVAttention(LlamaAttention):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__(config, layer_idx)
+        self.hidden_size = config.hidden_size
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj_swiftkv = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj_swiftkv = nn.Linear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj_swiftkv = nn.Linear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.rotary_emb = QEffLlamaRotaryEmbedding(config=self.config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask=None,
+    ) -> torch.Tensor:
+        bsz, q_len, _ = hidden_states.size()
+        query, _ = self.q_proj_swiftkv(hidden_states)
+
+        # Reshape the query, key, and value tensors.
+        query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = position_ids.shape[-1]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        key_states, value_states = past_key_value.read_only(self.layer_idx, position_ids=position_ids)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, _ = qeff_apply_rotary_pos_emb(query_states, torch.empty_like(key_states), cos, sin, position_ids)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, past_key_value
+
+
+class LlamaSwiftKVDecoderLayer(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, position_ids: torch.Tensor, past_key_values, causal_mask
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states, past_key_values = self.self_attn(
+            hidden_states=hidden_states,
+            position_ids=position_ids,
+            past_key_value=past_key_values,
+            attention_mask=causal_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, past_key_values
+
+
+class LlamaSwiftKVModel(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        self.embed_tokens = nn.Embedding(
+            self.vocab_size, config.hidden_size, None
+        )  # TODO: Not sure if padding_idx shoudl eb NONE
+        self.layers = torch.nn.ModuleList(
+            [
+                QEffLlamaDecoderLayer(config=config, layer_idx=idx)
+                if idx < config.num_key_value_layers
+                else LlamaSwiftKVDecoderLayer(config=config, layer_idx=idx)
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm_swiftkv = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def _run_swiftkv_layers(
+        self, hidden_states: torch.Tensor, position_ids: torch.Tensor, past_key_values, causal_mask
+    ) -> torch.Tensor:
+        for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
+            layer = self.layers[layer_idx]
+
+            hidden_states, past_key_values = layer(hidden_states, position_ids, past_key_values, causal_mask)
+
+        return hidden_states, past_key_values
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        self.config._attn_implementation = "eager"
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+            else:
+                causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        position_ids: torch.Tensor,
+        past_key_values: List[torch.Tensor],
+    ):
+        inputs_embeds = self.embed_tokens(input_ids)
+
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        use_cache = True
+
+        if use_cache and not isinstance(past_key_values, Cache):
+            if past_key_values is None:
+                past_key_values = QEffDynamicCache()
+            else:
+                past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+        )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(None, inputs_embeds, cache_position, past_key_values, False)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        next_decoder_cache = None
+
+        for layer_idx in range(self.config.num_key_value_layers):
+            layer = self.layers[layer_idx]
+            hidden_states, next_decoder_cache = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=False,
+                use_cache=True,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+        swiftkv_hidden_states = self.norm_swiftkv(hidden_states)
+
+        ####################################
+        ## THE MAGIC OF SWIFT KV BEGINS HERE
+        ####################################
+        for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
+            self_attn = self.layers[layer_idx].self_attn
+            key_states = self_attn.k_proj_swiftkv(swiftkv_hidden_states)
+            value_states = self_attn.v_proj_swiftkv(swiftkv_hidden_states)
+            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+            kv_seq_len = key_states.shape[-2]
+            if past_key_values is not None:
+                if self.layer_idx is None:
+                    raise ValueError(
+                        f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                        "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                        "with a layer index."
+                    )
+                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
+
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            _, key_states = qeff_apply_rotary_pos_emb(
+                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
+            )
+            cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
+            past_key_values.write_only(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        hidden_states, next_decoder_cache = self._run_swiftkv_layers(
+            hidden_states, position_ids, past_key_values, causal_mask
+        )
+        ####################################
+        ## THE MAGIC OF SWIFT KV ENDS HERE
+        ####################################
+
+        next_cache = next_decoder_cache.to_legacy_cache()
+        return hidden_states, next_cache
+
+
+class LlamaSwiftKVForCausalLM(nn.Module):
+    """
+    # packed_modules_mapping = {
+    #     "kv_proj_swiftkv": ["k_proj_swiftkv", "v_proj_swiftkv"],
+    #     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    #     "gate_up_proj": ["gate_proj", "up_proj"],
+    # }
+
+    # # BitandBytes specific attributes
+    # default_bitsandbytes_target_modules = [
+    #     ".gate_proj.",
+    #     ".down_proj.",
+    #     ".up_proj.",
+    #     ".q_proj.",
+    #     ".k_proj.",
+    #     ".v_proj.",
+    #     ".o_proj.",
+    #     ".k_proj_swiftkv.",
+    #     ".v_proj_swiftkv.",
+    # ]
+
+    # # in TP, these weights are partitioned along the column dimension (dim=-1)
+    # column_parallel_weights_modules = [
+    #     ".q_proj_swiftkv.",
+    #     ".down_proj.",
+    #     ".o_proj.",
+    # ]
+    # bitsandbytes_stacked_params_mapping = {
+    #     # shard_name, weight_name, index
+    #     "k_proj_swiftkv": ("kv_proj_swiftkv", 1),
+    #     "v_proj_swiftkv": ("kv_proj_swiftkv", 2),
+    #     "q_proj": ("qkv_proj", 0),
+    #     "k_proj": ("qkv_proj", 1),
+    #     "v_proj": ("qkv_proj", 2),
+    #     "gate_proj": ("gate_up_proj", 0),
+    #     "up_proj": ("gate_up_proj", 1),
+    # }
+    """
+
+    def __init__(self, *, config):
+        super().__init__()
+
+        self.model = LlamaSwiftKVModel(
+            config=config,
+        )
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Optional[Union[List[torch.FloatTensor]]] = None,
+    ):
+        hidden_states, output_past_key_values = self.model(input_ids, position_ids, past_key_values)
+        logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        hidden_states = hidden_states[torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
+        logits = self.lm_head(hidden_states)
+        return logits, output_past_key_values
diff --git a/exps/run_swiftkv.py b/exps/run_swiftkv.py
new file mode 100644
index 000000000..cf180f609
--- /dev/null
+++ b/exps/run_swiftkv.py
@@ -0,0 +1,28 @@
+import json
+import os
+
+from safetensors import safe_open
+
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
+WEIGHTS = "/local/mnt/workspace/open-source/myown/efficient-transformers/cache_dir/swiftkv_model_weights"
+
+
+def load_safetensors(path):
+    state_dict = {}
+    f = safe_open(path, framework="pt", device="cpu")
+    for key in f.keys():
+        tensor = f.get_tensor(key)
+        state_dict[key] = tensor
+    return state_dict
+
+
+config = json.load(open(os.path.join(WEIGHTS, "config.json"), "r"))
+
+config.num_hidden_layers = 1
+
+model = LlamaSwiftKVForCausalLM(config=config)
+state_dict_0 = load_safetensors(os.path.join(WEIGHTS, "model-00001-of-00009.safetensors"))
+
+for k in model.state_dict().keys() - state_dict_0.keys():
+    del state_dict_0[k]

From 4168b33dbd9ba97dd4c908751cd8296d075663ad Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:36:22 +0530
Subject: [PATCH 036/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py    | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index a33c83d3a..5b5fcd77f 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -22,7 +22,6 @@
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-import logging
 import math
 from typing import List, Optional, Tuple, Union
 
@@ -30,7 +29,7 @@
 from torch import nn
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaRMSNorm, repeat_kv
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -40,12 +39,10 @@
     qeff_apply_rotary_pos_emb,
 )
 
-logger = logging.get_logger(__name__)
 
-
-class LlamaSwiftKVAttention(LlamaAttention):
+class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config, layer_idx) -> None:
-        super().__init__(config, layer_idx)
+        super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
         self.hidden_size = config.hidden_size
@@ -56,7 +53,7 @@ def __init__(self, config, layer_idx) -> None:
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
         self.is_causal = True
-
+        self.layer_idx = layer_idx
         self.q_proj_swiftkv = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
         self.k_proj_swiftkv = nn.Linear(
             self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias

From 180a9e77ef3e60eea0ad959703d656f5f3581270 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:39:46 +0530
Subject: [PATCH 037/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 5b5fcd77f..2022d2c9b 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -63,7 +63,7 @@ def __init__(self, config, layer_idx) -> None:
         )
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
 
-        self.rotary_emb = QEffLlamaRotaryEmbedding(config=self.config)
+        self.rotary_emb = QEffLlamaRotaryEmbedding(config=config)
 
     def forward(
         self,

From 2aeded3cc3d6098bec5c57d64fb35b89d5c9c47e Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:46:12 +0530
Subject: [PATCH 038/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 2022d2c9b..4f22e82e0 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -394,6 +394,7 @@ def __init__(self, *, config):
         )
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.config = config
 
     def forward(
         self,

From 95a11a2350c9488c9a65f40638381a1c4adf9217 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:07:57 +0530
Subject: [PATCH 039/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4f22e82e0..24b88746a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -286,7 +286,9 @@ def forward(
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = self._update_causal_mask(None, inputs_embeds, cache_position, past_key_values, False)
+        causal_mask = self._update_causal_mask(
+            None, inputs_embeds, cache_position, position_ids, past_key_values, False
+        )
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers

From 0d8f1da99c1205bb1d3898f9ce4be1e8f266585e Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:16:52 +0530
Subject: [PATCH 040/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 24b88746a..8eaef4521 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -292,7 +292,7 @@ def forward(
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
         next_decoder_cache = None
 
         for layer_idx in range(self.config.num_key_value_layers):
@@ -305,7 +305,7 @@ def forward(
                 output_attentions=False,
                 use_cache=True,
                 cache_position=cache_position,
-                position_embeddings=position_embeddings,
+                position_embeddings=None,
             )
 
         bsz, q_len, _ = hidden_states.size()

From 025017b4a18931087ca9f81d9e8593d7c8a1a14d Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:23:24 +0530
Subject: [PATCH 041/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py   | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 8eaef4521..19887c77e 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -123,6 +123,8 @@ class LlamaSwiftKVDecoderLayer(nn.Module):
     def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.num_key_value_heads = config.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)
@@ -318,8 +320,10 @@ def forward(
             self_attn = self.layers[layer_idx].self_attn
             key_states = self_attn.k_proj_swiftkv(swiftkv_hidden_states)
             value_states = self_attn.v_proj_swiftkv(swiftkv_hidden_states)
-            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            key_states = key_states.view(bsz, q_len, self_attn.num_key_value_heads, self_attn.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, q_len, self_attn.num_key_value_heads, self_attn.head_dim).transpose(
+                1, 2
+            )
 
             kv_seq_len = key_states.shape[-2]
             if past_key_values is not None:
@@ -331,12 +335,12 @@ def forward(
                     )
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
 
-            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
             _, key_states = qeff_apply_rotary_pos_emb(
                 torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
             )
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
-            past_key_values.write_only(key_states, value_states, self.layer_idx, cache_kwargs)
+            past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask

From f559ad965a4888fee761cfce208298093ef78e37 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 12:14:45 +0530
Subject: [PATCH 042/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 19887c77e..20a91ef45 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -124,7 +124,7 @@ def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)

From f4b5d6ed2558319b3e09d320b65f8477fc0f28e0 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 12:33:39 +0530
Subject: [PATCH 043/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 20a91ef45..b4160a312 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -124,7 +124,6 @@ def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)

From f26842a5c946a97e20bbea01beece13b8d620977 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:05:36 +0530
Subject: [PATCH 044/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index b4160a312..4d8bfb754 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -326,13 +326,13 @@ def forward(
 
             kv_seq_len = key_states.shape[-2]
             if past_key_values is not None:
-                if self.layer_idx is None:
+                if self_attn.layer_idx is None:
                     raise ValueError(
-                        f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                        f"The cache structure has changed since version v4.36. If you are using {self_attn.__class__.__name__} "
                         "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                         "with a layer index."
                     )
-                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
+                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
             _, key_states = qeff_apply_rotary_pos_emb(

From 1b9b914a278860901114fe01b079dd3d7a31b342 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:08:51 +0530
Subject: [PATCH 045/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4d8bfb754..4015a6c95 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -335,9 +335,7 @@ def forward(
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
-            _, key_states = qeff_apply_rotary_pos_emb(
-                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
-            )
+            _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids)
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 

From 39b1dd2ff67da0a4ea00b18ab379431a744ac91e Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:18:16 +0530
Subject: [PATCH 046/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4015a6c95..8ba2ad78e 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -73,7 +73,7 @@ def forward(
         attention_mask=None,
     ) -> torch.Tensor:
         bsz, q_len, _ = hidden_states.size()
-        query, _ = self.q_proj_swiftkv(hidden_states)
+        query = self.q_proj_swiftkv(hidden_states)
 
         # Reshape the query, key, and value tensors.
         query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

From 278535554f80e4df5a52992061bfdbf435aa4bb8 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 19 Dec 2024 16:21:20 +0530
Subject: [PATCH 047/138] all bugfixes in

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 8ba2ad78e..d93d7cb44 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -90,7 +90,11 @@ def forward(
 
         key_states, value_states = past_key_value.read_only(self.layer_idx, position_ids=position_ids)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, _ = qeff_apply_rotary_pos_emb(query_states, torch.empty_like(key_states), cos, sin, position_ids)
+        position_idx = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        position_ids = position_ids[:, position_idx[0]]
+        query_states, _ = qeff_apply_rotary_pos_emb(
+            query_states, torch.empty_like(query_states), cos, sin, position_ids
+        )
 
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
@@ -160,9 +164,7 @@ def __init__(self, config):
         self.vocab_size = config.vocab_size
         self.config = config
 
-        self.embed_tokens = nn.Embedding(
-            self.vocab_size, config.hidden_size, None
-        )  # TODO: Not sure if padding_idx shoudl eb NONE
+        self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, None)
         self.layers = torch.nn.ModuleList(
             [
                 QEffLlamaDecoderLayer(config=config, layer_idx=idx)
@@ -179,9 +181,9 @@ def _run_swiftkv_layers(
     ) -> torch.Tensor:
         for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
             layer = self.layers[layer_idx]
-
             hidden_states, past_key_values = layer(hidden_states, position_ids, past_key_values, causal_mask)
 
+        hidden_states = self.norm(hidden_states)
         return hidden_states, past_key_values
 
     def _update_causal_mask(
@@ -339,15 +341,21 @@ def forward(
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 
+        last_pos_id = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        orig_hidden_states = hidden_states
+        hidden_states = orig_hidden_states[:, last_pos_id[0], :]
+        causal_mask = causal_mask[:, :, last_pos_id[0], :]
+
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask
         )
+        orig_hidden_states[:, last_pos_id[0], :] = hidden_states
         ####################################
         ## THE MAGIC OF SWIFT KV ENDS HERE
         ####################################
 
         next_cache = next_decoder_cache.to_legacy_cache()
-        return hidden_states, next_cache
+        return orig_hidden_states, next_cache
 
 
 class LlamaSwiftKVForCausalLM(nn.Module):

From 2b519473861c51adbf0dada9e6afc5fd0f096619 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 19 Dec 2024 16:21:56 +0530
Subject: [PATCH 048/138] added init file

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/llama_swiftkv/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/__init__.py

diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
new file mode 100644
index 000000000..e69de29bb

From 59c30a94e358f40291b3ab1a7469f3f86f78bb28 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 9 Jan 2025 16:38:13 +0530
Subject: [PATCH 049/138] all changes except BQA are in with this

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index fe56b197c..2a07d9f10 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -47,8 +47,9 @@ def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
             self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
     def read_only(self, layer_idx, cache_kwargs):
+        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         position_ids = cache_kwargs.get("position_ids")
-        ctx_len = position_ids.shape[-1]
+        ctx_len = k_out.shape[2]
         ctx_indices = torch.arange(ctx_len)[None, None, ...]
         gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
         invalid_mask = ctx_indices > gather_limit
@@ -59,7 +60,7 @@ def read_only(self, layer_idx, cache_kwargs):
             invalid_idx_value = 0
 
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
-        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
+
         k_out = CtxGatherFunc.apply(k_out, ctx_indices)
         v_out = CtxGatherFunc.apply(v_out, ctx_indices)
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)

From 8cac6b9a6b6faa11c06bfea29cb6cbf4835fe334 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Wed, 5 Feb 2025 09:20:06 +0530
Subject: [PATCH 050/138] more updates

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/__init__.py          |  6 ++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 68 +++----------------
 2 files changed, 14 insertions(+), 60 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
index e69de29bb..d259e435a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/__init__.py
+++ b/QEfficient/transformers/models/llama_swiftkv/__init__.py
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index d93d7cb44..365f0b6d2 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,25 +1,13 @@
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# -----------------------------------------------------------------------------
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# -----------------------------------------------------------------------------
+# This file is adapted from vllm implementation by snowflake here: https://github.com/Snowflake-Labs/vllm/blob/swiftkv/vllm/model_executor/models/llama_swiftkv.py
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
 import math
@@ -294,8 +282,6 @@ def forward(
         )
         hidden_states = inputs_embeds
 
-        # create position embeddings to be shared across the decoder layers
-        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
         next_decoder_cache = None
 
         for layer_idx in range(self.config.num_key_value_layers):
@@ -359,44 +345,6 @@ def forward(
 
 
 class LlamaSwiftKVForCausalLM(nn.Module):
-    """
-    # packed_modules_mapping = {
-    #     "kv_proj_swiftkv": ["k_proj_swiftkv", "v_proj_swiftkv"],
-    #     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    #     "gate_up_proj": ["gate_proj", "up_proj"],
-    # }
-
-    # # BitandBytes specific attributes
-    # default_bitsandbytes_target_modules = [
-    #     ".gate_proj.",
-    #     ".down_proj.",
-    #     ".up_proj.",
-    #     ".q_proj.",
-    #     ".k_proj.",
-    #     ".v_proj.",
-    #     ".o_proj.",
-    #     ".k_proj_swiftkv.",
-    #     ".v_proj_swiftkv.",
-    # ]
-
-    # # in TP, these weights are partitioned along the column dimension (dim=-1)
-    # column_parallel_weights_modules = [
-    #     ".q_proj_swiftkv.",
-    #     ".down_proj.",
-    #     ".o_proj.",
-    # ]
-    # bitsandbytes_stacked_params_mapping = {
-    #     # shard_name, weight_name, index
-    #     "k_proj_swiftkv": ("kv_proj_swiftkv", 1),
-    #     "v_proj_swiftkv": ("kv_proj_swiftkv", 2),
-    #     "q_proj": ("qkv_proj", 0),
-    #     "k_proj": ("qkv_proj", 1),
-    #     "v_proj": ("qkv_proj", 2),
-    #     "gate_proj": ("gate_up_proj", 0),
-    #     "up_proj": ("gate_up_proj", 1),
-    # }
-    """
-
     def __init__(self, *, config):
         super().__init__()
 

From 4e63facdaa528ae550bae9ee2c3986d666c65004 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 06:17:43 +0000
Subject: [PATCH 051/138] Enabling the SwiftKV model in the QEFF Infra

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py     | 19 ++++++++
 .../llama_swiftkv/config_llama_swiftkv.py     | 45 +++++++++++++++++++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 17 ++++---
 .../transformers/models/modeling_auto.py      |  6 +++
 QEfficient/utils/_utils.py                    |  2 +-
 5 files changed, 82 insertions(+), 7 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index ccad5e020..aec82e8cd 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -153,6 +153,9 @@
     QEffWhisperPositionalEmbedding,
 )
 
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -362,3 +365,19 @@ def _create_causal_mask(
         attention_mask = attention_mask.unsqueeze(1)
 
     return attention_mask
+
+
+# Define a SwiftKV Model card name to Model type dictionary
+# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
+SwiftKVModelCardNameToSwiftKVModelTypeDict: Dict[Type[str], Type[str]] = {
+    # LlamaSwiftKV Model
+    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct": "llama_swiftkv"
+}
+
+# Define a SwiftKV Model type to ConfigClass and ModelArchitecture class dictionary
+# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
+SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
+    # LlamaSwiftKV Model
+    "llama_swiftkv" : [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+}
+
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
new file mode 100644
index 000000000..fa97388de
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -0,0 +1,45 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+
+
+from typing import Optional
+from transformers import LlamaConfig
+
+
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (
+            self.num_hidden_layers - self.num_key_value_layers
+        ) % self.key_value_group_size == 0
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 365f0b6d2..e2bd5a08a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -18,6 +18,7 @@
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
+from transformers.modeling_utils import PreTrainedModel
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -26,10 +27,10 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
 class LlamaSwiftKVAttention(nn.Module):
-    def __init__(self, config, layer_idx) -> None:
+    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
@@ -112,7 +113,7 @@ def forward(
 
 
 class LlamaSwiftKVDecoderLayer(nn.Module):
-    def __init__(self, config, layer_idx) -> None:
+    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
@@ -147,7 +148,9 @@ def forward(
 
 
 class LlamaSwiftKVModel(nn.Module):
-    def __init__(self, config):
+    config_class = LlamaSwiftKVConfig
+
+    def __init__(self, config: LlamaSwiftKVConfig):
         super().__init__()
         self.vocab_size = config.vocab_size
         self.config = config
@@ -344,8 +347,10 @@ def forward(
         return orig_hidden_states, next_cache
 
 
-class LlamaSwiftKVForCausalLM(nn.Module):
-    def __init__(self, *, config):
+class LlamaSwiftKVForCausalLM(PreTrainedModel):
+    config_class = LlamaSwiftKVConfig
+
+    def __init__(self, *, config: LlamaSwiftKVConfig):
         super().__init__()
 
         self.model = LlamaSwiftKVModel(
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index a87c39fb4..9d8074a97 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,6 +7,7 @@
 
 import hashlib
 import warnings
+
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -51,6 +52,7 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
+from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -76,6 +78,10 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
+
+        # Load the SwiftKV model if supported
+        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
+
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index ea9044e2c..e3724b90f 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -21,7 +21,7 @@
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
-
+from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
 
 class DownloadRetryLimitExceeded(Exception):
     """

From 78e925728baf979f1b3d67762143fe9d8c4102a2 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 27 Feb 2025 15:16:14 +0530
Subject: [PATCH 052/138] rebased

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py                   | 3 +--
 .../models/llama_swiftkv/config_llama_swiftkv.py            | 6 +-----
 .../models/llama_swiftkv/modeling_llama_swiftkv.py          | 1 +
 QEfficient/transformers/models/modeling_auto.py             | 1 -
 QEfficient/utils/_utils.py                                  | 2 +-
 5 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index aec82e8cd..42244e288 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -378,6 +378,5 @@ def _create_causal_mask(
 # While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
 SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
     # LlamaSwiftKV Model
-    "llama_swiftkv" : [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
 }
-
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
index fa97388de..77eeb61a3 100644
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -9,8 +9,6 @@
 
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-
-
 from typing import Optional
 from transformers import LlamaConfig
 
@@ -40,6 +38,4 @@ def __init__(
         self.swiftkv = swiftkv
         self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
         self.key_value_group_size = key_value_group_size or 1
-        assert (
-            self.num_hidden_layers - self.num_key_value_layers
-        ) % self.key_value_group_size == 0
\ No newline at end of file
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index e2bd5a08a..4d6888bc7 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -29,6 +29,7 @@
 )
 from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
+
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 9d8074a97..18006c6dc 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -78,7 +78,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         # Load the SwiftKV model if supported
         QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index e3724b90f..ea9044e2c 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -21,7 +21,7 @@
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
-from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
+
 
 class DownloadRetryLimitExceeded(Exception):
     """

From d33c22e3c2fcf9c5c71f8c076cff62cd33e1764d Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 18:45:54 +0000
Subject: [PATCH 053/138] moving registration of non transformer models during
 initialization of QEfficient

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |  18 +-
 QEfficient/transformers/modeling_utils.py     |  76 ++++++--
 .../models/llama_swiftkv/__init__.py          |   2 +-
 .../llama_swiftkv/config_llama_swiftkv.py     |  41 -----
 .../llama_swiftkv/modeling_llama_swiftkv.py   |  30 ++-
 .../transformers/models/modeling_auto.py      |   3 -
 QEfficient/utils/_utils.py                    | 172 +++++++-----------
 7 files changed, 174 insertions(+), 168 deletions(-)
 delete mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 47c462979..95f690b9c 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
@@ -13,7 +13,23 @@
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 from QEfficient.utils.logging_utils import logger
+from transformers import AutoConfig
+from QEfficient.transformers.modeling_utils import (
+    get_model_class_type_from_model_type,
+    get_auto_model_class,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+)
 
+# loop over all the models which are not present in transformers and register them
+for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
+    # Register the config class based on model type
+    AutoConfig.register(key, value[0])
+
+    model_class_type = get_model_class_type_from_model_type(key)
+    AutoModelClassName = get_auto_model_class(model_class_type, value[1])
+
+    # Register the non transformer library Class and config class using AutoModelClass
+    AutoModelClassName.register(value[0], value[1])
 
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 42244e288..9619cb816 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -7,6 +7,7 @@
 
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
+import sys
 
 import torch
 import torch.nn as nn
@@ -86,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 from .models.codegen.modeling_codegen import (
@@ -153,8 +155,11 @@
     QEffWhisperPositionalEmbedding,
 )
 
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVForCausalLM,
+    LlamaSwiftKVConfig
+)
 
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
@@ -274,6 +279,19 @@
     WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration,
 }
 
+# Map of model type to config class and Model architecture class
+# While onboarding new models make sure to add the new model card names to this dictionary.
+MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
+    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+}
+
+# list of sub-strings representing the model type, this is typically taken from llama-swiftkv
+LIST_OF_MODEL_TYPES = {"swiftkv"}
+
+# list of sub-strings used for representing the model Architecture class name, for example LlamaSwiftKVForCausalLM
+MODEL_TYPE_TO_MODEL_CLASS_TYPE = {
+    "swiftkv": "SwiftKVFor"
+}
 
 def _prepare_cross_attention_mask(
     cross_attention_mask: torch.Tensor,
@@ -366,17 +384,47 @@ def _create_causal_mask(
 
     return attention_mask
 
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
 
-# Define a SwiftKV Model card name to Model type dictionary
-# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
-SwiftKVModelCardNameToSwiftKVModelTypeDict: Dict[Type[str], Type[str]] = {
-    # LlamaSwiftKV Model
-    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct": "llama_swiftkv"
-}
 
-# Define a SwiftKV Model type to ConfigClass and ModelArchitecture class dictionary
-# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
-SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
-    # LlamaSwiftKV Model
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
-}
+def get_auto_model_class(model_type, NonTransformerModelCls):
+    """
+    Register the Non Transformer Models like swiftkv
+    ---------------------------------------
+    : model_type: str: name of the Non Transformer model for example llama_swiftkv
+    : NonTransformerModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
+    """
+
+    # Construct the AutoModel class name using NonTransformerModel class e.g. SwiftKVModel Class name, this code is written to make things generic
+    nonTransformerModelClsName = NonTransformerModelCls.__name__
+    start_index = nonTransformerModelClsName.find(model_type)
+
+    # Calculate the index after model_type example "SwiftKVFor"
+    substring_start = start_index + len(model_type)
+
+    # Get the substring after model_type example "SwiftKVFor"
+    nonTransformerModel = nonTransformerModelClsName[substring_start:]
+
+    autoModelName = "AutoModelFor" + nonTransformerModel
+
+    # Convert the string to class name
+    autoModelClassName = convert_str_to_class(autoModelName)
+
+    return autoModelClassName
+
+def get_model_class_type_from_model_type(model_type):
+    for substring in LIST_OF_MODEL_TYPES:
+        if (substring in model_type):
+            model_class_type = substring
+            break
+
+    model_class_name = MODEL_TYPE_TO_MODEL_CLASS_TYPE[model_class_type]
+    return model_class_name
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
index d259e435a..72ba36c8a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/__init__.py
+++ b/QEfficient/transformers/models/llama_swiftkv/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
deleted file mode 100644
index 77eeb61a3..000000000
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-# The Modules are updated as required by Cloud AI 100 HW requirements.
-
-
-"""Inference-only LLaMA model compatible with HuggingFace weights."""
-
-from typing import Optional
-from transformers import LlamaConfig
-
-
-class LlamaSwiftKVConfig(LlamaConfig):
-    """
-    Args:
-        num_key_value_layers (int, optional):
-            The number of layers, from the first layer, that have keys and
-            values. If None, all layers have keys and values.
-        last_key_value_heads (int, optional):
-            The number of heads in the last layer that have keys and values.
-            If None, the number of heads in the last key-value layer is equal
-            to the number of heads in all the other key-value layers.
-    """
-
-    model_type = "llama_swiftkv"
-
-    def __init__(
-        self,
-        swiftkv: bool = False,
-        num_key_value_layers: Optional[int] = None,
-        key_value_group_size: Optional[int] = None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.swiftkv = swiftkv
-        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
-        self.key_value_group_size = key_value_group_size or 1
-        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4d6888bc7..7d5c45a7d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
@@ -19,6 +19,7 @@
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
+from transformers import LlamaConfig
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -27,8 +28,33 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 18006c6dc..9d7d48293 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -52,7 +52,6 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
-from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -78,8 +77,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-        # Load the SwiftKV model if supported
-        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index ea9044e2c..281c9f89b 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,7 +8,8 @@
 import json
 import os
 import subprocess
-import xml.etree.ElementTree as ET
+import sys
+import warnings
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -17,12 +18,21 @@
 import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
+
+from QEfficient.transformers.modeling_utils import (
+    SwiftKVModelCardNameToSwiftKVModelTypeDict,
+    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
+)
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
 
-
 class DownloadRetryLimitExceeded(Exception):
     """
     Used for raising error when hf_download fails to download the model after given max_retries.
@@ -446,111 +456,61 @@ def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
 
 
-def dump_qconfig(func):
-    def wrapper(self, *args, **kwargs):
-        result = func(self, *args, **kwargs)
-        create_and_dump_qconfigs(
-            self.qpc_path,
-            self.onnx_path,
-            self.get_model_config,
-            [cls.__name__ for cls in self._pytorch_transforms],
-            [cls.__name__ for cls in self._onnx_transforms],
-            kwargs.get("specializations"),
-            kwargs.get("mdp_ts_num_devices", 1),
-            kwargs.get("num_speculative_tokens"),
-            **{
-                k: v
-                for k, v in kwargs.items()
-                if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"]
-            },
-        )
-        return result
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
 
-    return wrapper
 
+def register_swiftKV_model(model_type, SwiftkvConfigCls, SwiftKVModelCls):
+    """
+    Register the SwiftKV Models
+    ---------------------------------------
+    : model_type: str: name of the swiftKVModel for example llama_swiftkv
+    : SwiftkVConfigCls: SwiftKV Config class for example LlamaSwiftKVConfig
+    : SwiftKVModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
+    """
+
+    # Register the SwiftKV Config class using AutoConfig
+    AutoConfig.register(model_type, SwiftkvConfigCls)
 
-def create_and_dump_qconfigs(
-    qpc_path,
-    onnx_path,
-    huggingface_config,
-    pytorch_transforms,
-    onnx_transforms,
-    specializations,
-    mdp_ts_num_devices,
-    num_speculative_tokens,
-    **compiler_options,
-):
+    # Construct the AutoModel class name using SwiftKVModel Class name, this code is written to make things generic
+    swiftKvModelName = SwiftKVModelCls.__name__
+    start_index = swiftKvModelName.find("SwiftKVFor")
+
+    # Calculate the index after "SwiftKVFor"
+    substring_start = start_index + len("SwiftKVFor")
+
+    # Get the substring after "SwiftKVFor"
+    swiftKVModel = swiftKvModelName[substring_start:]
+
+    AutoModelName = "AutoModelFor" + swiftKVModel
+
+    # Convert the string to class name
+    AutoModelClassName = convert_str_to_class(AutoModelName)
+
+    # Register the SwiftKVModel Class and config class using AutoModelClass
+    AutoModelClassName.register(SwiftkvConfigCls, SwiftKVModelCls)
+
+
+def QEFFLoadSwiftKVModels(pretrained_model_name_or_path):
     """
-    This Method creates a JSON file which contains all the configs for a model.
-    Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and
-    many other compilation options.
+    Load the SwiftKV Models
+    ---------------------------------------
+    : pretrained_model_name_or_path: str: name of the swiftKVModel for example Snowflake/Llama-3.1-SwiftKV-8B-Instruct
     """
-    qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
-    enable_qnn = True if "qnn_config" in compiler_options else None
-
-    qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json")
-    onnx_path = str(onnx_path)
-    specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json"))
-    compile_dir = str(os.path.dirname(qpc_path))
-    qnn_config_path = (
-        (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None
-    )
+    try:
+        modelType = SwiftKVModelCardNameToSwiftKVModelTypeDict[pretrained_model_name_or_path]
 
-    # Extract QAIC SDK Apps Version from SDK XML file
-    tree = ET.parse(Constants.SDK_APPS_XML)
-    root = tree.getroot()
-    qaic_version = root.find(".//base_version").text
-
-    # Extract QNN SDK details from YAML file if the environment variable is set
-    qnn_sdk_details = None
-    qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
-    if qnn_sdk_path:
-        qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML)
-        with open(qnn_sdk_yaml_path, "r") as file:
-            qnn_sdk_details = yaml.safe_load(file)
-
-    # Ensure all objects in the configs dictionary are JSON serializable
-    def make_serializable(obj):
-        if isinstance(obj, (int, float, str, bool, type(None))):
-            return obj
-        elif isinstance(obj, (list, tuple)):
-            return [make_serializable(item) for item in obj]
-        elif isinstance(obj, dict):
-            return {key: make_serializable(value) for key, value in obj.items()}
-        elif hasattr(obj, "__dict__"):
-            return make_serializable(vars(obj))
-        return str(obj)
-
-    qconfigs = {
-        "huggingface_config": make_serializable(huggingface_config),
-        "qpc_config": {
-            "QEff_config": {
-                "pytorch_transforms": make_serializable(pytorch_transforms),
-                "onnx_transforms": make_serializable(onnx_transforms),
-                "onnx_path": onnx_path,
-            },
-        },
-    }
-
-    aic_compiler_config = {
-        "apps_sdk_version": qaic_version,
-        "compile_dir": compile_dir,
-        "specializations_file_path": specializations_file_path,
-        "specializations": make_serializable(specializations),
-        "mdp_ts_num_devices": mdp_ts_num_devices,
-        "num_speculative_tokens": num_speculative_tokens,
-        **compiler_options,
-    }
-    qnn_config = {
-        "enable_qnn": enable_qnn,
-        "qnn_config_path": qnn_config_path,
-    }
-    # Put AIC or qnn details.
-    if enable_qnn:
-        qconfigs["qpc_config"]["qnn_config"] = qnn_config
-        if qnn_sdk_details:
-            qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details)
-    else:
-        qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
+        SwiftKVConfigCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][0]
+        SwiftKVModelArchCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][1]
+
+        register_swiftKV_model(modelType, SwiftKVConfigCls, SwiftKVModelArchCls)
 
-    create_json(qconfig_file_path, qconfigs)
+    except KeyError:
+        warnings.warn("Requested SwiftKVModel is currently not supported... stay tuned for future releases", Warning)

From ca2870f5cb957b34ab3f17ec0565b989305bf6bf Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Tue, 4 Mar 2025 05:18:06 +0000
Subject: [PATCH 054/138] fixed lint warnings

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |   7 +-
 QEfficient/transformers/modeling_utils.py     |  30 ++-
 .../llama_swiftkv/modeling_llama_swiftkv.py   |   6 +-
 .../transformers/models/modeling_auto.py      |   2 -
 QEfficient/utils/_utils.py                    | 172 +++++++++++-------
 5 files changed, 129 insertions(+), 88 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 95f690b9c..cad29d450 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -14,11 +14,13 @@
 
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
+
 from QEfficient.transformers.modeling_utils import (
-    get_model_class_type_from_model_type,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
-    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+    get_model_class_type_from_model_type,
 )
+from QEfficient.utils.logging_utils import logger
 
 # loop over all the models which are not present in transformers and register them
 for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
@@ -31,6 +33,7 @@
     # Register the non transformer library Class and config class using AutoModelClass
     AutoModelClassName.register(value[0], value[1])
 
+
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
     try:
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 9619cb816..a3c69b1ed 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,9 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
+import sys
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
-import sys
 
 import torch
 import torch.nn as nn
@@ -87,9 +87,14 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVConfig,
+    LlamaSwiftKVForCausalLM,
+)
+
 from .models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -155,12 +160,6 @@
     QEffWhisperPositionalEmbedding,
 )
 
-# Placeholder for all non-transformer models
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
-    LlamaSwiftKVForCausalLM,
-    LlamaSwiftKVConfig
-)
-
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -281,17 +280,14 @@
 
 # Map of model type to config class and Model architecture class
 # While onboarding new models make sure to add the new model card names to this dictionary.
-MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
-}
+MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {"llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]}
 
 # list of sub-strings representing the model type, this is typically taken from llama-swiftkv
 LIST_OF_MODEL_TYPES = {"swiftkv"}
 
 # list of sub-strings used for representing the model Architecture class name, for example LlamaSwiftKVForCausalLM
-MODEL_TYPE_TO_MODEL_CLASS_TYPE = {
-    "swiftkv": "SwiftKVFor"
-}
+MODEL_TYPE_TO_MODEL_CLASS_TYPE = {"swiftkv": "SwiftKVFor"}
+
 
 def _prepare_cross_attention_mask(
     cross_attention_mask: torch.Tensor,
@@ -384,6 +380,7 @@ def _create_causal_mask(
 
     return attention_mask
 
+
 def convert_str_to_class(className):
     """
     Convert the string to class name
@@ -420,11 +417,12 @@ def get_auto_model_class(model_type, NonTransformerModelCls):
 
     return autoModelClassName
 
+
 def get_model_class_type_from_model_type(model_type):
     for substring in LIST_OF_MODEL_TYPES:
-        if (substring in model_type):
+        if substring in model_type:
             model_class_type = substring
             break
 
     model_class_name = MODEL_TYPE_TO_MODEL_CLASS_TYPE[model_class_type]
-    return model_class_name
\ No newline at end of file
+    return model_class_name
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 7d5c45a7d..f1ec2634d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -15,11 +15,11 @@
 
 import torch
 from torch import nn
+from transformers import LlamaConfig
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
-from transformers import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -29,6 +29,7 @@
     qeff_apply_rotary_pos_emb,
 )
 
+
 class LlamaSwiftKVConfig(LlamaConfig):
     """
     Args:
@@ -56,6 +57,7 @@ def __init__(
         self.key_value_group_size = key_value_group_size or 1
         assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
+
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 9d7d48293..a87c39fb4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,7 +7,6 @@
 
 import hashlib
 import warnings
-
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -77,7 +76,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 281c9f89b..ea9044e2c 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,8 +8,7 @@
 import json
 import os
 import subprocess
-import sys
-import warnings
+import xml.etree.ElementTree as ET
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -18,21 +17,12 @@
 import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import (
-    AutoConfig,
-    AutoProcessor,
-    AutoTokenizer,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-)
-
-from QEfficient.transformers.modeling_utils import (
-    SwiftKVModelCardNameToSwiftKVModelTypeDict,
-    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
-)
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
 
+
 class DownloadRetryLimitExceeded(Exception):
     """
     Used for raising error when hf_download fails to download the model after given max_retries.
@@ -456,61 +446,111 @@ def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
 
 
-def convert_str_to_class(className):
-    """
-    Convert the string to class name
-    ---------
-    :className: `str`- Class name string.
-    Return:
-        Class Name
-    """
-    return getattr(sys.modules[__name__], className)
-
-
-def register_swiftKV_model(model_type, SwiftkvConfigCls, SwiftKVModelCls):
-    """
-    Register the SwiftKV Models
-    ---------------------------------------
-    : model_type: str: name of the swiftKVModel for example llama_swiftkv
-    : SwiftkVConfigCls: SwiftKV Config class for example LlamaSwiftKVConfig
-    : SwiftKVModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
-    """
-
-    # Register the SwiftKV Config class using AutoConfig
-    AutoConfig.register(model_type, SwiftkvConfigCls)
-
-    # Construct the AutoModel class name using SwiftKVModel Class name, this code is written to make things generic
-    swiftKvModelName = SwiftKVModelCls.__name__
-    start_index = swiftKvModelName.find("SwiftKVFor")
-
-    # Calculate the index after "SwiftKVFor"
-    substring_start = start_index + len("SwiftKVFor")
-
-    # Get the substring after "SwiftKVFor"
-    swiftKVModel = swiftKvModelName[substring_start:]
-
-    AutoModelName = "AutoModelFor" + swiftKVModel
-
-    # Convert the string to class name
-    AutoModelClassName = convert_str_to_class(AutoModelName)
+def dump_qconfig(func):
+    def wrapper(self, *args, **kwargs):
+        result = func(self, *args, **kwargs)
+        create_and_dump_qconfigs(
+            self.qpc_path,
+            self.onnx_path,
+            self.get_model_config,
+            [cls.__name__ for cls in self._pytorch_transforms],
+            [cls.__name__ for cls in self._onnx_transforms],
+            kwargs.get("specializations"),
+            kwargs.get("mdp_ts_num_devices", 1),
+            kwargs.get("num_speculative_tokens"),
+            **{
+                k: v
+                for k, v in kwargs.items()
+                if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"]
+            },
+        )
+        return result
 
-    # Register the SwiftKVModel Class and config class using AutoModelClass
-    AutoModelClassName.register(SwiftkvConfigCls, SwiftKVModelCls)
+    return wrapper
 
 
-def QEFFLoadSwiftKVModels(pretrained_model_name_or_path):
+def create_and_dump_qconfigs(
+    qpc_path,
+    onnx_path,
+    huggingface_config,
+    pytorch_transforms,
+    onnx_transforms,
+    specializations,
+    mdp_ts_num_devices,
+    num_speculative_tokens,
+    **compiler_options,
+):
     """
-    Load the SwiftKV Models
-    ---------------------------------------
-    : pretrained_model_name_or_path: str: name of the swiftKVModel for example Snowflake/Llama-3.1-SwiftKV-8B-Instruct
+    This Method creates a JSON file which contains all the configs for a model.
+    Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and
+    many other compilation options.
     """
-    try:
-        modelType = SwiftKVModelCardNameToSwiftKVModelTypeDict[pretrained_model_name_or_path]
-
-        SwiftKVConfigCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][0]
-        SwiftKVModelArchCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][1]
+    qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
+    enable_qnn = True if "qnn_config" in compiler_options else None
+
+    qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json")
+    onnx_path = str(onnx_path)
+    specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json"))
+    compile_dir = str(os.path.dirname(qpc_path))
+    qnn_config_path = (
+        (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None
+    )
 
-        register_swiftKV_model(modelType, SwiftKVConfigCls, SwiftKVModelArchCls)
+    # Extract QAIC SDK Apps Version from SDK XML file
+    tree = ET.parse(Constants.SDK_APPS_XML)
+    root = tree.getroot()
+    qaic_version = root.find(".//base_version").text
+
+    # Extract QNN SDK details from YAML file if the environment variable is set
+    qnn_sdk_details = None
+    qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
+    if qnn_sdk_path:
+        qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML)
+        with open(qnn_sdk_yaml_path, "r") as file:
+            qnn_sdk_details = yaml.safe_load(file)
+
+    # Ensure all objects in the configs dictionary are JSON serializable
+    def make_serializable(obj):
+        if isinstance(obj, (int, float, str, bool, type(None))):
+            return obj
+        elif isinstance(obj, (list, tuple)):
+            return [make_serializable(item) for item in obj]
+        elif isinstance(obj, dict):
+            return {key: make_serializable(value) for key, value in obj.items()}
+        elif hasattr(obj, "__dict__"):
+            return make_serializable(vars(obj))
+        return str(obj)
+
+    qconfigs = {
+        "huggingface_config": make_serializable(huggingface_config),
+        "qpc_config": {
+            "QEff_config": {
+                "pytorch_transforms": make_serializable(pytorch_transforms),
+                "onnx_transforms": make_serializable(onnx_transforms),
+                "onnx_path": onnx_path,
+            },
+        },
+    }
+
+    aic_compiler_config = {
+        "apps_sdk_version": qaic_version,
+        "compile_dir": compile_dir,
+        "specializations_file_path": specializations_file_path,
+        "specializations": make_serializable(specializations),
+        "mdp_ts_num_devices": mdp_ts_num_devices,
+        "num_speculative_tokens": num_speculative_tokens,
+        **compiler_options,
+    }
+    qnn_config = {
+        "enable_qnn": enable_qnn,
+        "qnn_config_path": qnn_config_path,
+    }
+    # Put AIC or qnn details.
+    if enable_qnn:
+        qconfigs["qpc_config"]["qnn_config"] = qnn_config
+        if qnn_sdk_details:
+            qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details)
+    else:
+        qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
 
-    except KeyError:
-        warnings.warn("Requested SwiftKVModel is currently not supported... stay tuned for future releases", Warning)
+    create_json(qconfig_file_path, qconfigs)

From 6665a3ad6f87453e65003b42083ab4d8401c8bf1 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:59:48 +0530
Subject: [PATCH 055/138] enabling faster downloads via hf_transfer (#282)

hf hub doc:
https://huggingface.co/docs/huggingface_hub/en/guides/download
details on hf_transfer
https://github.com/[huggingface/hf_transfer](https://github.com/huggingface/hf_transfer)

---------

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
---
 QEfficient/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index cad29d450..cf622f2cd 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -14,13 +14,11 @@
 
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
-
 from QEfficient.transformers.modeling_utils import (
     MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
-from QEfficient.utils.logging_utils import logger
 
 # loop over all the models which are not present in transformers and register them
 for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():

From 01d9a87f644ea994119e38b5b4e5dc93a046f4bd Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:36:53 +0000
Subject: [PATCH 056/138] Fixed the compilation errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py                        | 2 +-
 QEfficient/transformers/modeling_utils.py                     | 1 +
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index 2a07d9f10..765a12f9d 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -46,7 +46,7 @@ def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
             self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
             self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
-    def read_only(self, layer_idx, cache_kwargs):
+    def read_only(self, layer_idx, **cache_kwargs):
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         position_ids = cache_kwargs.get("position_ids")
         ctx_len = k_out.shape[2]
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index a3c69b1ed..db0b86c2a 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index f1ec2634d..26931fced 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -379,8 +379,8 @@ def forward(
 class LlamaSwiftKVForCausalLM(PreTrainedModel):
     config_class = LlamaSwiftKVConfig
 
-    def __init__(self, *, config: LlamaSwiftKVConfig):
-        super().__init__()
+    def __init__(self, config: LlamaSwiftKVConfig):
+        super().__init__(config=config)
 
         self.model = LlamaSwiftKVModel(
             config=config,

From 5be5afa1fb67ae0e0bbc923fc624a8d7e7eff6c6 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:39:38 +0000
Subject: [PATCH 057/138] Fixed the lint error

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 -
 QEfficient/transformers/modeling_utils.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index cf622f2cd..e60362c34 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,7 +6,6 @@
 # -----------------------------------------------------------------------------
 
 import os
-
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index db0b86c2a..a3c69b1ed 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,7 +87,6 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models

From 68e92ab81e2fbe00bb08dc5c7102fca8dc766695 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:44:12 +0000
Subject: [PATCH 058/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 +
 QEfficient/transformers/modeling_utils.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index e60362c34..cf622f2cd 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import os
+
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index a3c69b1ed..54348c860 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -11,6 +11,8 @@
 
 import torch
 import torch.nn as nn
+import importlib
+
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,
@@ -389,7 +391,8 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    return getattr(sys.modules[__name__], className)
+    module = importlib.import_module("transformers")
+    return getattr(module, className)
 
 
 def get_auto_model_class(model_type, NonTransformerModelCls):

From aff64aba8894417386aa4837010ec74f4098b3f0 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:48:14 +0000
Subject: [PATCH 059/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 54348c860..fcb4549d7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,14 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
-import sys
+import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
-import importlib
-
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,

From 6fa6c9ab130ccda32e9645b81b525666e861cca9 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 11:24:56 +0000
Subject: [PATCH 060/138] Address review comments

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 16 +++++++++-------
 QEfficient/transformers/modeling_utils.py |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index cf622f2cd..53a3a4fef 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -20,16 +20,18 @@
     get_model_class_type_from_model_type,
 )
 
-# loop over all the models which are not present in transformers and register them
-for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
-    # Register the config class based on model type
-    AutoConfig.register(key, value[0])
+from QEfficient.utils.logging_utils import logger
+
+# loop over all the model types which are not present in transformers and register them
+for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
+    # Register the model config class based on the model type. This will be first element in the tuple
+    AutoConfig.register(model_type, model_cls[0])
 
-    model_class_type = get_model_class_type_from_model_type(key)
-    AutoModelClassName = get_auto_model_class(model_class_type, value[1])
+    model_class_type = get_model_class_type_from_model_type(model_type)
+    AutoModelClassName = get_auto_model_class(model_class_type, model_cls[1])
 
     # Register the non transformer library Class and config class using AutoModelClass
-    AutoModelClassName.register(value[0], value[1])
+    AutoModelClassName.register(model_cls[0], model_cls[1])
 
 
 def check_qaic_sdk():
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index fcb4549d7..e70542ff7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
@@ -280,6 +279,7 @@
 
 # Map of model type to config class and Model architecture class
 # While onboarding new models make sure to add the new model card names to this dictionary.
+# Developers are expected to follow the naming conventions like ForCausalLM while defining the class names
 MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {"llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]}
 
 # list of sub-strings representing the model type, this is typically taken from llama-swiftkv
@@ -389,7 +389,7 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    module = importlib.import_module("transformers")
+    module = __import__("transformers")
     return getattr(module, className)
 
 

From 0ef8f6198a1d7ba94c3b51c891c6d0f59adca9bc Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 14:18:20 +0000
Subject: [PATCH 061/138] Rebased and fixed the lint errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 53a3a4fef..60aba0d74 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -12,14 +12,13 @@
 # hf_transfer is imported (will happen on line 15 via leading imports)
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
-from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
+
 from QEfficient.transformers.modeling_utils import (
     MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
-
 from QEfficient.utils.logging_utils import logger
 
 # loop over all the model types which are not present in transformers and register them

From 5217976d045b4745127b87f58656ade68bce49ca Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 14:27:24 +0000
Subject: [PATCH 062/138] Fix the lint errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 5a1a4c6d8..60aba0d74 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -19,7 +19,6 @@
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
-
 from QEfficient.utils.logging_utils import logger
 
 # loop over all the model types which are not present in transformers and register them

From faef011f81402782682999906d07fd8bdd262f84 Mon Sep 17 00:00:00 2001
From: Mamta Singh <168400541+quic-mamta@users.noreply.github.com>
Date: Wed, 22 Jan 2025 16:52:32 +0530
Subject: [PATCH 063/138] [QEff. Finetune] : Use login_and_download_hf_lm in
 finetuning path (#232)

Use login_and_download_hf_lm function in finetuning path

---------

Signed-off-by: Mamta Singh <quic_mamtsing@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/cloud/finetune.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/QEfficient/cloud/finetune.py b/QEfficient/cloud/finetune.py
index eadab0d9e..ea4527ddf 100644
--- a/QEfficient/cloud/finetune.py
+++ b/QEfficient/cloud/finetune.py
@@ -30,6 +30,7 @@
     get_preprocessed_dataset,
 )
 from QEfficient.finetune.utils.train_utils import get_longest_seq_length, print_model_size, train
+from QEfficient.utils._utils import login_and_download_hf_lm
 
 try:
     import torch_qaic  # noqa: F401
@@ -76,8 +77,9 @@ def main(**kwargs):
 
     # Load the pre-trained model and setup its configuration
     # config = AutoConfig.from_pretrained(train_config.model_name)
+    pretrained_model_path = login_and_download_hf_lm(train_config.model_name)
     model = AutoModelForCausalLM.from_pretrained(
-        train_config.model_name,
+        pretrained_model_path,
         use_cache=False,
         attn_implementation="sdpa",
         torch_dtype=torch.float16,

From 680a25b04e31a28708c2953fbe791ee6989da57e Mon Sep 17 00:00:00 2001
From: shubhagr-quic <quic_shubhagr@quicinc.com>
Date: Fri, 24 Jan 2025 18:38:41 +0530
Subject: [PATCH 064/138] Installing python package rich to resolve QNN tests
 failure. (#241)

Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 scripts/Jenkinsfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index 0d802b83f..bbfb38fd2 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -90,6 +90,7 @@ pipeline {
                     cd /efficient-transformers &&
                     . preflight_qeff/bin/activate &&
                     mkdir -p $PWD/Qnn_cli &&
+                    pip install rich &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Qnn_cli &&
                     pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml &&
@@ -107,6 +108,7 @@ pipeline {
                     source /qnn_sdk/bin/envcheck -c &&
                     cd /efficient-transformers &&
                     . preflight_qeff/bin/activate &&
+                    pip install rich &&
                     mkdir -p $PWD/Qnn_non_cli &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Qnn_non_cli &&

From b5cb9b4d8fbd4720522fce29e16379551898fb53 Mon Sep 17 00:00:00 2001
From: shubhagr-quic <quic_shubhagr@quicinc.com>
Date: Mon, 27 Jan 2025 18:15:56 +0530
Subject: [PATCH 065/138] Removed onnx_defer_loading from Immutable Convertor
 Args. (#230)

Code Fix to solve
https://jira-dc.qualcomm.com/jira/browse/QRANIUMSW-52023

Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/utils/constants.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index ab861a788..cc64df4bd 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -124,7 +124,6 @@ class QnnConstants:
         "--float_bitwidth ",
         "--preserve_io_datatype",
         "--onnx_skip_simplification",
-        "--onnx_defer_loading",
     ]
 
     IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [

From 892f2a7c7703f52eb2f6d4092a5e618240c96ef4 Mon Sep 17 00:00:00 2001
From: asmigosw <quic_asmigosw@quicinc.com>
Date: Mon, 27 Jan 2025 21:29:22 +0530
Subject: [PATCH 066/138] Porting hf_token fix (#246)

Porting hf_token fix on mainline

Signed-off-by: Asmita Goswami <quic_asmigosw@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/base/common.py                        | 1 +
 QEfficient/exporter/export_hf_to_cloud_ai_100.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py
index 192294738..ce6b1cdc2 100644
--- a/QEfficient/base/common.py
+++ b/QEfficient/base/common.py
@@ -80,6 +80,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         """
         if not os.path.isdir(pretrained_model_name_or_path):
             pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs)
+        kwargs.pop("hf_token", None)
         model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path)
         qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type]
         if not issubclass(qeff_auto_model_class, QEFFBaseModel):
diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
index c13bb9536..276faf94c 100644
--- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py
+++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -411,7 +411,7 @@ def qualcomm_efficient_converter(
         if model_kv
         else QEFFCommonLoader.from_pretrained(
             pretrained_model_name_or_path=(local_model_dir if local_model_dir else model_name),
-            token=hf_token,
+            hf_token=hf_token,
             cache_dir=cache_dir,
             full_batch_size=full_batch_size,
         )

From d6d9a777069e188c71a1ba141931809a3501b368 Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Date: Fri, 31 Jan 2025 09:05:07 +0530
Subject: [PATCH 067/138] [Attention output Reshape] : Issue fixed (#243)

This PR is created for fixing the issue in reshaping the attention
output in the modelling file.

Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/llama/modeling_llama.py          | 2 +-
 QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py  | 2 +-
 QEfficient/transformers/models/phi/modeling_phi.py              | 2 +-
 QEfficient/transformers/models/phi3/modeling_phi3.py            | 2 +-
 QEfficient/transformers/models/qwen2/modeling_qwen2.py          | 2 +-
 .../transformers/models/starcoder2/modeling_starcoder2.py       | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/QEfficient/transformers/models/llama/modeling_llama.py b/QEfficient/transformers/models/llama/modeling_llama.py
index c70764ecd..0973127c0 100644
--- a/QEfficient/transformers/models/llama/modeling_llama.py
+++ b/QEfficient/transformers/models/llama/modeling_llama.py
@@ -238,7 +238,7 @@ def forward(
 
         attn_output = attn_output.transpose(1, 2).contiguous()
 
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
 
         if self.config.pretraining_tp > 1:
             attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
diff --git a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
index 79031b02e..fd1a29c36 100644
--- a/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
+++ b/QEfficient/transformers/models/mixtral_moe/modeling_mixtral.py
@@ -199,7 +199,7 @@ def forward(
             )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
 
         attn_output = self.o_proj(attn_output)
 
diff --git a/QEfficient/transformers/models/phi/modeling_phi.py b/QEfficient/transformers/models/phi/modeling_phi.py
index 718b4f5ac..413569425 100644
--- a/QEfficient/transformers/models/phi/modeling_phi.py
+++ b/QEfficient/transformers/models/phi/modeling_phi.py
@@ -131,7 +131,7 @@ def forward(
             )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
 
         attn_output = self.dense(attn_output)
 
diff --git a/QEfficient/transformers/models/phi3/modeling_phi3.py b/QEfficient/transformers/models/phi3/modeling_phi3.py
index 26147ef2e..062dd0acc 100644
--- a/QEfficient/transformers/models/phi3/modeling_phi3.py
+++ b/QEfficient/transformers/models/phi3/modeling_phi3.py
@@ -199,7 +199,7 @@ def forward(
             )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
 
         attn_output = self.o_proj(attn_output)
 
diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
index 1eba0e2e6..a8562ca1f 100644
--- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py
+++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
@@ -116,7 +116,7 @@ def forward(
             )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
 
         attn_output = self.o_proj(attn_output)
 
diff --git a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
index 8803f4c2c..cbcd55a5f 100644
--- a/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
+++ b/QEfficient/transformers/models/starcoder2/modeling_starcoder2.py
@@ -120,7 +120,7 @@ def forward(
             )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = attn_output.reshape(bsz, q_len, -1)
 
         attn_output = self.o_proj(attn_output)
         attn_output = nn.functional.dropout(attn_output, p=self.residual_dropout, training=self.training)

From 7cc1b3484db8b78ef49c23a77b24099fa387dd72 Mon Sep 17 00:00:00 2001
From: Swati Allabadi <quic_sallabad@quicinc.com>
Date: Tue, 11 Feb 2025 14:51:31 +0530
Subject: [PATCH 068/138] [QEff. Finetune] Stop fine tuning when loss has
 converged (#257)

Fine tuning will stop once the loss becomes close to convergence_loss
(whose default value is <=1e-4) for few consecutive iterations i.e.
convergence_counter (by default 5).
Corrected the url of grammar dataset in finetune.md

Signed-off-by: Swati Allabadi <quic_sallabad@quicinc.com>
Co-authored-by: Swati Allabadi <quic-swatia@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/finetune/configs/training.py  |  4 +++
 QEfficient/finetune/utils/train_utils.py | 45 +++++++++++++++++++++++-
 docs/source/finetune.md                  |  2 +-
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/QEfficient/finetune/configs/training.py b/QEfficient/finetune/configs/training.py
index 41ffa3fb3..0257a63ed 100644
--- a/QEfficient/finetune/configs/training.py
+++ b/QEfficient/finetune/configs/training.py
@@ -39,6 +39,10 @@ class train_config:
     intermediate_step_save: int = 1000
     batching_strategy: str = "packing"
     enable_sorting_for_ddp: bool = True
+    convergence_counter: int = 5  # its value should be >= 1, stop fine tuning when loss <= convergence_loss (defined below) for #convergence_counter steps
+    convergence_loss: float = (
+        1e-4  # if loss value is <= convergence_loss for #convergence_counter consecutive steps, fine tuning stops
+    )
 
     # TODO: vbaddi: Uncomment post adding qaic to Pytorch Profiler
     # flop_counter: bool = False # Enable flop counter to measure model throughput, can not be used with pytorch profiler at the same time.
diff --git a/QEfficient/finetune/utils/train_utils.py b/QEfficient/finetune/utils/train_utils.py
index fe5493978..d8e2799f4 100644
--- a/QEfficient/finetune/utils/train_utils.py
+++ b/QEfficient/finetune/utils/train_utils.py
@@ -94,8 +94,24 @@ def train(
     if train_config.grad_scaler:
         scaler = GradScaler()
 
+    loss_0_counter = torch.tensor([0]).to(device)
+
+    if train_config.enable_ddp:
+        dist.broadcast(loss_0_counter, src=0)
+
     # Start the training loop
     for epoch in range(train_config.num_epochs):
+        if loss_0_counter.item() == train_config.convergence_counter:
+            if train_config.enable_ddp:
+                print(
+                    f"Not proceeding with epoch {epoch + 1} on device {local_rank} since loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps."
+                )
+                break
+            else:
+                print(
+                    f"Not proceeding with epoch {epoch + 1} since loss value has been <= {train_config.convergence_loss}  for last {loss_0_counter.item()} steps."
+                )
+                break
         print(f"Starting epoch {epoch + 1}/{train_config.num_epochs}")
         print(f"train_config.max_train_step: {train_config.max_train_step}")
         # stop when the maximum number of training steps is reached
@@ -148,6 +164,18 @@ def train(
             total_loss += loss.detach().float()
             # Accumalate graidents
             loss = loss / train_config.gradient_accumulation_steps
+            if train_config.enable_ddp:
+                if local_rank == 0:
+                    if loss <= train_config.convergence_loss:
+                        loss_0_counter += 1
+                    else:
+                        loss_0_counter = torch.tensor([0]).to(device)
+                dist.broadcast(loss_0_counter, src=0)
+            else:
+                if loss <= train_config.convergence_loss:
+                    loss_0_counter += 1
+                else:
+                    loss_0_counter = torch.tensor([0]).to(device)
 
             if train_config.enable_ddp:
                 if local_rank == 0:
@@ -197,12 +225,27 @@ def train(
                     val_step_perplexity,
                     val_prep,
                 )
+            if train_config.enable_ddp:
+                if loss_0_counter.item() == train_config.convergence_counter:
+                    print(
+                        f"Loss value has been <= {train_config.convergence_loss} for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning on device {local_rank}."
+                    )
+                    break
+            else:
+                if loss_0_counter.item() == train_config.convergence_counter:
+                    print(
+                        f"Loss value has been  <= {train_config.convergence_loss}  for last {loss_0_counter.item()} steps. Hence, stopping the fine tuning."
+                    )
+                    break
 
         pbar.close()
         epoch_end_time = time.perf_counter() - epoch_start_time
         epoch_times.append(epoch_end_time)
 
-        train_epoch_loss = total_loss / len(train_dataloader)
+        if loss_0_counter.item() == train_config.convergence_counter:
+            train_epoch_loss = total_loss / step
+        else:
+            train_epoch_loss = total_loss / len(train_dataloader)
         train_perplexity = torch.exp(train_epoch_loss)
 
         train_prep.append(float(train_perplexity))
diff --git a/docs/source/finetune.md b/docs/source/finetune.md
index 2b585a0ba..40df4401c 100644
--- a/docs/source/finetune.md
+++ b/docs/source/finetune.md
@@ -34,7 +34,7 @@ To download the Alpaca dataset, visit this [link](https://raw.githubusercontent.
 wget -c https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/refs/heads/main/alpaca_data.json -P dataset/
 ```
 
-To download the grammar dataset, visit this [link](https://github.com/meta-llama/llama-recipes/blob/main/src/llama_recipes/datasets/grammar_dataset/grammar_dataset_process.ipynb). Download the dataset and place it under the **datasets_grammar** directory. Make sure to update the training configuration accordingly.
+To download the grammar dataset, visit this [link](https://github.com/meta-llama/llama-cookbook/blob/main/src/llama_cookbook/datasets/grammar_dataset/grammar_dataset_process.ipynb). Download the dataset and place it under the **datasets_grammar** directory. Make sure to update the training configuration accordingly.
 
 
 ## Usage

From 963986b0b1fcee8bb7af3d62ca5b23b6024a491d Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 14 Feb 2025 16:01:49 +0530
Subject: [PATCH 069/138] Mllama(single + dual) + InternVL(single) + Llava
 (single) (#267)

Adding generalized infrastructure to support VLMs with Dual/single QPC
approaches

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Rishin Raj <quic_rishinr@quicinc.com>
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Co-authored-by: Rishin Raj <quic_rishinr@quicinc.com>
Co-authored-by: Amit Raj <quic_amitraj@quicinc.com>
Co-authored-by: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Co-authored-by: asmigosw <asmigosw@qti.qualcomm.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |    8 +-
 QEfficient/base/__init__.py                   |    6 +-
 QEfficient/base/modeling_qeff.py              |    2 +
 QEfficient/base/pytorch_transforms.py         |   25 +-
 .../generation/text_generation_inference.py   |   21 +-
 QEfficient/transformers/modeling_utils.py     |   95 +-
 .../transformers/models/internvl/__init__.py  |    6 +
 .../models/internvl/modeling_internvl.py      |  176 ++
 .../transformers/models/llava/__init__.py     |    6 +
 .../models/llava/modeling_llava.py            |  138 ++
 .../models/mllama/modeling_mllama.py          |  933 ++++++++---
 .../transformers/models/modeling_auto.py      | 1432 +++++++++++++----
 .../transformers/models/pytorch_transforms.py |   62 +-
 .../models/qwen2/modeling_qwen2.py            |  152 +-
 QEfficient/utils/_utils.py                    |   24 +
 QEfficient/utils/constants.py                 |   13 +
 pyproject.toml                                |    3 +-
 17 files changed, 2582 insertions(+), 520 deletions(-)
 create mode 100644 QEfficient/transformers/models/internvl/__init__.py
 create mode 100644 QEfficient/transformers/models/internvl/modeling_internvl.py
 create mode 100644 QEfficient/transformers/models/llava/__init__.py
 create mode 100644 QEfficient/transformers/models/llava/modeling_llava.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 1bc06ccf4..0481ace3e 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -25,7 +25,12 @@ def check_qaic_sdk():
 # Conditionally import QAIC-related modules if the SDK is installed
 __version__ = "0.0.1.dev0"
 if QAIC_INSTALLED:
-    from QEfficient.base import QEFFAutoModel, QEFFAutoModelForCausalLM, QEFFCommonLoader
+    from QEfficient.base import (
+        QEFFAutoModel,
+        QEFFAutoModelForCausalLM,
+        QEFFAutoModelForImageTextToText,
+        QEFFCommonLoader,
+    )
     from QEfficient.compile.compile_helper import compile
     from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
     from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
@@ -43,6 +48,7 @@ def check_qaic_sdk():
         "QEFFAutoModel",
         "QEFFAutoModelForCausalLM",
         "QEffAutoPeftModelForCausalLM",
+        "QEFFAutoModelForImageTextToText",
         "QEFFCommonLoader",
     ]
 
diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py
index 86cff11c1..4344cac53 100644
--- a/QEfficient/base/__init__.py
+++ b/QEfficient/base/__init__.py
@@ -6,4 +6,8 @@
 # -----------------------------------------------------------------------------
 
 from QEfficient.base.common import QEFFCommonLoader  # noqa: F401
-from QEfficient.transformers.models.modeling_auto import QEFFAutoModel, QEFFAutoModelForCausalLM  # noqa: F401
+from QEfficient.transformers.models.modeling_auto import (  # noqa: F401
+    QEFFAutoModel,
+    QEFFAutoModelForCausalLM,
+    QEFFAutoModelForImageTextToText,
+)
diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index 2760cf52f..c3a1b6d16 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -175,6 +175,7 @@ def _export(
             }
             if onnx_transform_kwargs is not None:
                 transform_kwargs.update(onnx_transform_kwargs)
+
             for transform in self._onnx_transforms:
                 model, transformed = transform.apply(model, **transform_kwargs)
             model.metadata_props.append(
@@ -187,6 +188,7 @@ def _export(
 
         except Exception as e:
             logger.error(f"ONNX export (or) ONNXTransforms failed: {e}")
+
             raise e
 
         finally:
diff --git a/QEfficient/base/pytorch_transforms.py b/QEfficient/base/pytorch_transforms.py
index 6e21d11b2..abd19ed35 100644
--- a/QEfficient/base/pytorch_transforms.py
+++ b/QEfficient/base/pytorch_transforms.py
@@ -4,7 +4,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # ----------------------------------------------------------------------------
-from typing import Dict, Tuple, Type
+from types import MethodType
+from typing import Callable, Dict, Tuple, Type
 
 from torch import nn
 
@@ -87,3 +88,25 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
     @classmethod
     def mutate(cls, original_module: nn.Module, parent_module: nn.Module):
         raise NotImplementedError("Please implement your own method by inheriting this class")
+
+
+class ModuleMethodMapperTransform(PytorchTransform):
+    """
+    Serves as base class for any transform that want to map a particular method of a class to a new method implementation.
+    """
+
+    _match_class_replace_method: Dict[nn.Module, Dict[str, Callable]]
+    _match_string_replace_method: Dict[str, Dict[str, Callable]]
+
+    @classmethod
+    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
+        transformed = False
+        for module in model.modules():
+            if (repl_method_map := cls._match_class_replace_method.get(type(module))) or (
+                repl_method_map := cls._match_string_replace_method.get(module.__class__.__name__)
+            ):
+                for orig_method_name, mapped_method in repl_method_map.items():
+                    setattr(module, orig_method_name, MethodType(mapped_method, module))
+                    transformed = True
+
+        return model, transformed
diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py
index 54b6f057e..d77188914 100755
--- a/QEfficient/generation/text_generation_inference.py
+++ b/QEfficient/generation/text_generation_inference.py
@@ -57,10 +57,23 @@ class CloudAI100ExecInfo:
     perf_metrics: PerfMetrics
 
     def __repr__(self):
-        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)}\
-        \nDecode token/sec is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)}\
-        \nTotal token/sec is= {round(self.perf_metrics.total_perf * self.batch_size, 2)}\
-        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)}"
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
+        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} tokens/sec\
+        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} tokens/sec\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} tokens/sec"
+
+
+@dataclass
+class CloudAI100ExecInfoNew:
+    batch_size: int
+    generated_ids: Union[List[np.ndarray], np.ndarray]
+    perf_metrics: PerfMetrics
+
+    def __repr__(self):
+        return f"Average Prefill time a.k.a TTFT is= {round(self.perf_metrics.prefill_time, 2)} sec\
+        \nDecode is= {round(self.perf_metrics.decode_perf * self.batch_size, 2)} token/sec\
+        \nTotal is= {round(self.perf_metrics.total_perf * self.batch_size, 2)} token/sec\
+        \nTotal (E2E) inference time is= {round(self.perf_metrics.total_time, 2)} sec"
 
 
 io_files = []
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index f749cc0c3..1f172fa54 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -6,8 +6,9 @@
 # -----------------------------------------------------------------------------
 
 from collections import namedtuple
-from typing import Dict, Type
+from typing import Dict, Optional, Tuple, Type
 
+import torch
 import torch.nn as nn
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
@@ -242,3 +243,95 @@
     GPTBigCodeBlock: QEffGPTBigCodeBlock,
     GPTBigCodeModel: QEffGPTBigCodeModel,
 }
+
+
+def _prepare_cross_attention_mask(
+    cross_attention_mask: torch.Tensor,
+    num_vision_tokens: int,
+    dtype: str,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # reshape so it can be used by attn module
+    batch_size, text_total_length, *_ = cross_attention_mask.shape
+    cross_attention_mask = cross_attention_mask.repeat_interleave(num_vision_tokens, dim=3)
+    cross_attention_mask = cross_attention_mask.view(batch_size, text_total_length, -1)
+    cross_attention_mask = cross_attention_mask.unsqueeze(1)
+
+    # invert the mask
+    inverted_cross_attn_mask = (1.0 - cross_attention_mask).to(dtype)
+    cross_attention_mask = inverted_cross_attn_mask.masked_fill(
+        inverted_cross_attn_mask.to(torch.bool), torch.tensor(-10000.0, dtype=torch.float32)
+    )
+
+    # apply full-row bias, which return 4D tensor of shape [B, H, S1, 1] where value is 0 if the a full row in cross attn mask's
+    # last dimension contains negative infinity values, otherwise it's 1
+    negative_inf_value = torch.tensor(-10000.0, dtype=torch.float32)
+    full_text_row_masked_out_mask = (
+        (cross_attention_mask != negative_inf_value).any(dim=-1).type_as(cross_attention_mask)[..., None]
+    )
+    cross_attention_mask *= full_text_row_masked_out_mask
+
+    return cross_attention_mask, full_text_row_masked_out_mask
+
+
+def _prepare_aspect_ratio_attention_mask(
+    aspect_ratio_mask: torch.Tensor,
+    num_patches: int,
+    target_length: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    # Expand aspect ratio mask to target_length
+    batch_size, max_num_tiles = aspect_ratio_mask.shape
+    attention_mask = aspect_ratio_mask.view(batch_size, max_num_tiles, 1, 1).to(dtype)
+    attention_mask = attention_mask.repeat(1, 1, target_length, 1)
+
+    # Mask padding patches
+    pad_patches = target_length - num_patches
+    attention_mask[:, :, -pad_patches:] = 0
+
+    # Invert the mask (0 -> 1, 1 -> 0)
+    attention_mask = 1 - attention_mask
+
+    # Reshape to 2D and create 4D attention mask
+    # (batch_size, 1, max_num_tiles * target_length, max_num_tiles * target_length)
+    attention_mask = attention_mask.reshape(batch_size, max_num_tiles * target_length, 1)
+    attention_mask = attention_mask @ attention_mask.transpose(-1, -2) * torch.tensor(-10000.0, dtype=torch.float32)
+    attention_mask = attention_mask.unsqueeze(1)
+
+    return attention_mask
+
+
+def _create_causal_mask(
+    position_ids,
+    target_length,
+    sliding_window: Optional[int] = None,
+):
+    """
+    A utility attention mask class that allows one to:
+        - Create a causal 4d mask
+        - Create a causal 4d mask with sliding window
+    """
+    if sliding_window is not None:
+        query_indices = position_ids.unsqueeze(-1)
+        kv_indices = torch.arange(target_length).view(1, -1)
+        # --- Rolling buffer ---
+        pos_max = position_ids.max(1, keepdim=True).values
+        kv_start = (pos_max // target_length) * target_length
+        kv_indices_high = kv_indices + kv_start
+        kv_indices_low = torch.where(kv_indices_high < target_length, kv_indices, kv_indices_high - target_length)
+        kv_indices = torch.where(kv_indices_high > pos_max, kv_indices_low, kv_indices_high)
+        kv_indices = kv_indices.unsqueeze(1)
+        # ------
+        causal_mask = kv_indices > query_indices
+        attention_mask = causal_mask
+
+        window_indices = query_indices - sliding_window + 1
+        window_mask = kv_indices < window_indices
+        attention_mask = attention_mask | window_mask
+        attention_mask = attention_mask.unsqueeze(1)
+    else:
+        query_indices = position_ids.unsqueeze(-1)
+        kv_indices = torch.arange(target_length).view(1, 1, -1)
+        attention_mask = kv_indices > query_indices
+        attention_mask = attention_mask.unsqueeze(1)
+
+    return attention_mask
diff --git a/QEfficient/transformers/models/internvl/__init__.py b/QEfficient/transformers/models/internvl/__init__.py
new file mode 100644
index 000000000..72ba36c8a
--- /dev/null
+++ b/QEfficient/transformers/models/internvl/__init__.py
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
new file mode 100644
index 000000000..35304d945
--- /dev/null
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -0,0 +1,176 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from QEfficient.utils import constants
+from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config
+from QEfficient.utils.logging_utils import logger
+
+
+class QEffInternVLModel(nn.Module):
+    def get_specializations(
+        self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
+    ):
+        # TODO: check if this should be named num_patches or something else
+        num_patches = compiler_options.pop("num_patches", None)
+        if num_patches is None:
+            logger.warning(
+                "User should pass `num_patches` to compile API to fix the dynamic axes `pixel_values`, you can get more info by calling get_inputs_info function!, Since its not found setting its value to 13"
+            )
+            num_patches = 13
+
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else 3840  # 4096-256
+        ctx_len = ctx_len if ctx_len else 4096
+        if img_size is None and hasattr(self.config.vision_config, "image_size"):
+            img_size = getattr(self.config.vision_config, "image_size")
+        elif img_size is None:
+            img_size = 448
+            logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config")
+
+        return [
+            {
+                "batch_size": batch_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+                "num_patches": num_patches,
+                "img_size": img_size,
+            },
+            {
+                "batch_size": batch_size,
+                "seq_len": "1",
+                "ctx_len": ctx_len,
+                "num_patches": num_patches,
+                "img_size": img_size,
+            },
+        ]
+
+    def get_onnx_dynamic_axes(
+        self,
+    ):
+        # Define dynamic axes
+        dynamic_axes = {}
+        dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
+        dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
+        dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
+
+        pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
+        for i in range(self.language_model.config.num_hidden_layers):
+            for kv in ["key", "value"]:
+                dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
+
+        return dynamic_axes
+
+    def get_output_names(
+        self,
+    ):
+        output_names = ["logits", "pixel_values_RetainedState"]
+        for i in range(self.language_model.config.num_hidden_layers):
+            for kv in ["key", "value"]:
+                output_names.append(f"past_{kv}.{i}_RetainedState")
+        return output_names
+
+    def get_dummy_inputs(self, kv_offload: bool = False):
+        if kv_offload:
+            raise ValueError("kv_offload method not supported for InternVL yet!")
+        num_patches = 13
+        C = 3
+        if vis_cfg := getattr(self.config, "vision_config", None):
+            img_size = getattr(vis_cfg, "image_size", 448)
+        else:
+            img_size = 448
+
+        # Define shapes
+        inputs_shapes = {}
+        inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
+        inputs_shapes["position_ids"] = (
+            constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
+            constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
+        )
+        inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
+
+        # Define inputs
+        inputs = {}
+        inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
+        inputs["position_ids"] = (
+            torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
+            .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
+            .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
+        )
+        inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
+
+        # Add data for KV
+        kv_cache_shape = get_padding_shape_from_config(
+            config=self.language_model.config,
+            batch_size=constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
+            seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
+        )
+
+        inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
+        for i in range(self.language_model.config.num_hidden_layers):
+            for kv in ["key", "value"]:
+                inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+
+        return inputs
+
+    def forward(self, input_ids, pixel_values, position_ids, past_key_values):
+        # TODO: Check if Hardcoding this is okay, i.e. check if this value is common for all intern models
+        IMG_CONTEXT_TOKEN = 151667
+
+        input_embeds = self.language_model.get_input_embeddings()(input_ids)
+        vit_embeds = self.extract_feature(pixel_values)
+        B, N, C = input_embeds.shape
+        image_input_embeds = input_embeds.reshape(B * N, C)
+        image_input_ids = input_ids.reshape(B * N)
+        selected = image_input_ids == IMG_CONTEXT_TOKEN
+        indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
+        indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
+        image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
+        image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
+        inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
+        outputs = self.language_model(
+            inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True
+        )
+        return outputs.logits, pixel_values, outputs.past_key_values
+
+    def get_inputs_info(self):
+        return [
+            IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
+            IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
+            IOInfo(name="pixel_values", datatype=torch.float32, shape=("num_patches", 3, "img_size", "img_size")),
+        ]
+
+
+class QEffInternVisionEmbeddings(nn.Module):
+    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, channel, width, height]
+        batch_size, _, height, width = patch_embeds.shape
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+
+        pos_embed = self.position_embedding[:, 1:, :]
+        target_dtype = pos_embed.dtype
+        pos_embed = (
+            pos_embed.float()
+            .reshape(1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1)
+            .permute(0, 3, 1, 2)
+        )
+        pos_embed = (
+            F.interpolate(pos_embed, size=(height, width), mode="bilinear", align_corners=False)
+            .reshape(1, -1, height * width)
+            .permute(0, 2, 1)
+            .to(target_dtype)
+        )
+
+        position_embedding = torch.cat([self.position_embedding[:, :1, :], pos_embed], dim=1)
+
+        embeddings = embeddings + position_embedding.to(target_dtype)
+        return embeddings
diff --git a/QEfficient/transformers/models/llava/__init__.py b/QEfficient/transformers/models/llava/__init__.py
new file mode 100644
index 000000000..d259e435a
--- /dev/null
+++ b/QEfficient/transformers/models/llava/__init__.py
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
new file mode 100644
index 000000000..847eb9028
--- /dev/null
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -0,0 +1,138 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import torch
+import torch.utils.checkpoint
+from transformers.models.llava.modeling_llava import (
+    LlavaForConditionalGeneration,
+)
+
+from QEfficient.utils._utils import IOInfo
+from QEfficient.utils.logging_utils import logger
+
+BS = 1
+NUM_CHANNEL = 3
+SEQ_LEN = 592
+CTX_LEN = 1024
+
+
+class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration):
+    def forward(self, input_ids, position_ids, pixel_values, past_key_values):
+        inputs_embeds = self.get_input_embeddings()(input_ids)
+        # Image features
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+        selected_image_feature = image_outputs.hidden_states[self.config.vision_feature_layer]
+        vision_feature_select_strategy = self.config.vision_feature_select_strategy
+        if vision_feature_select_strategy == "default":
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+
+        mask = input_ids == self.config.image_token_index
+        indices1 = mask.to(torch.int64).cumsum(1) - 1
+        indices0 = torch.arange(mask.shape[0]).view(-1, 1)
+        image_features_expanded = image_features[indices0, indices1]
+        image_inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds)
+        # *where to skip image encoder for decode*
+        inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), inputs_embeds, image_inputs_embeds)
+        outputs = self.language_model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+        )
+        return outputs.logits, pixel_values, outputs.past_key_values
+
+    def get_dummy_inputs(self, **kwargs):
+        num_layers = self.config.text_config.num_hidden_layers
+        num_key_value_heads = self.config.text_config.num_key_value_heads
+        head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads
+        if vis_cfg := getattr(self.config, "vision_config", None):
+            img_size = getattr(vis_cfg, "image_size", 336)
+        else:
+            img_size = 336
+        inputs = {
+            "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
+            "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
+            "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=torch.float32),
+        }
+        inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1)
+        inputs["past_key_values"] = []
+        for i in range(num_layers):
+            inputs["past_key_values"].append(
+                (
+                    torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim),
+                    torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim),
+                )
+            )
+        inputs["position_ids"] = torch.full(inputs["position_ids"].shape, CTX_LEN - 1)
+        return inputs
+
+    def get_specializations(
+        self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
+    ):
+        max_num_images = compiler_options.pop("max_num_images", 1)
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
+        ctx_len = ctx_len if ctx_len else CTX_LEN
+        if img_size is None and hasattr(self.config.vision_config, "image_size"):
+            img_size = getattr(self.config.vision_config, "image_size")
+        elif img_size is None:
+            img_size = 336
+            logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config")
+
+        return [
+            {
+                "batch_size": batch_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+                "max_num_images": max_num_images,
+                "img_size": img_size,
+            },
+            {
+                "batch_size": batch_size,
+                "seq_len": "1",
+                "ctx_len": ctx_len,
+                "max_num_images": max_num_images,
+                "img_size": img_size,
+            },
+        ]
+
+    def get_onnx_dynamic_axes(
+        self,
+    ):
+        # Define dynamic axes
+        num_layers = self.config.text_config.num_hidden_layers
+
+        dynamic_axes = {
+            "input_ids": {0: "batch_size", 1: "seq_len"},
+            "position_ids": {0: "batch_size", 1: "seq_len"},
+            "pixel_values": {0: "batch_size", 2: "img_size", 3: "img_size"},
+        }
+        for i in range(num_layers):
+            dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+
+        return dynamic_axes
+
+    def get_output_names(
+        self,
+    ):
+        output_names = ["logits", "pixel_values_RetainedState"]
+        for i in range(self.language_model.config.num_hidden_layers):
+            for kv in ["key", "value"]:
+                output_names.append(f"past_{kv}.{i}_RetainedState")
+        return output_names
+
+    def get_inputs_info(self):
+        return [
+            IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
+            IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
+            IOInfo(name="pixel_values", datatype=torch.float32, shape=("batch_size", 3, "img_size", "img_size")),
+        ]
diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py
index e2f551415..610c7be30 100644
--- a/QEfficient/transformers/models/mllama/modeling_mllama.py
+++ b/QEfficient/transformers/models/mllama/modeling_mllama.py
@@ -11,12 +11,14 @@
 from typing import List, Optional, Tuple, Union
 
 import torch
+import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.modeling_outputs import (
+    BaseModelOutput,
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
 )
@@ -25,93 +27,28 @@
     MllamaConfig,
     MllamaCrossAttentionDecoderLayer,
     MllamaForCausalLM,
+    MllamaForConditionalGeneration,
     MllamaRotaryEmbedding,
     MllamaSelfAttentionDecoderLayer,
     MllamaTextCrossAttention,
     MllamaTextModel,
     MllamaTextSelfAttention,
+    MllamaVisionModel,
     logger,
     repeat_kv,
     rotate_half,
 )
 
-from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
-
-
-class QEffMllamaRotaryEmbedding(MllamaRotaryEmbedding):
-    """
-    Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
-    The only differences are:
-    - Add static sin/cos computations.
-    """
-
-    def __init__(
-        self,
-        dim=None,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
-        rope_type="default",
-        config: Optional[MllamaConfig] = None,
-    ):
-        super(MllamaRotaryEmbedding, self).__init__()  # Initialize nn.Module
-        # TODO (joao): remove the `if` below, only used for BC
-        self.rope_kwargs = {}
-        if config is None:
-            logger.warning_once(
-                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
-                "`config` argument. All other arguments will be removed in v4.45"
-            )
-            self.rope_kwargs = {
-                "rope_type": rope_type,
-                "factor": scaling_factor,
-                "dim": dim,
-                "base": base,
-                "max_position_embeddings": max_position_embeddings,
-            }
-            self.rope_type = rope_type
-            self.max_seq_len_cached = max_position_embeddings
-            self.original_max_seq_len = max_position_embeddings
-        else:
-            # BC: "rope_type" was originally "type"
-            if config.rope_scaling is not None:
-                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-            else:
-                self.rope_type = "default"
-            self.max_seq_len_cached = config.max_position_embeddings
-            self.original_max_seq_len = config.max_position_embeddings
-
-        self.config = config
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-
-        # Build here to make `torch.jit.trace` work.
-        self._set_cos_sin_cache(
-            seq_len=self.original_max_seq_len, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
-
-        freqs = torch.outer(t, self.inv_freq)
-
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        # x: [bs, num_attention_heads, seq_len, head_size]
-        if seq_len > self.max_seq_len_cached:
-            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+from QEfficient.transformers.modeling_utils import (
+    _create_causal_mask,
+    _prepare_aspect_ratio_attention_mask,
+    _prepare_cross_attention_mask,
+)
+from QEfficient.utils import constants
+from QEfficient.utils._utils import IOInfo
 
-        return (
-            self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
-            self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
-        )
+MAX_NUM_IMG = 1
+NUM_CHANNEL = 3
 
 
 def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
@@ -145,6 +82,85 @@ def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     return q_embed.to(q.dtype), k_embed.to(k.dtype)
 
 
+class QEffMllamaTextCrossAttentionSingleQPC(MllamaTextCrossAttention):
+    """
+    Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
+    The only differences are:
+        - add new args cache idx for the kv retention
+    """
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        batch_index: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        use_cache: bool = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        query_states = self.q_norm(query_states)
+
+        # elif past_key_value is not None:
+        # Fetch old cache
+        key_states_old = past_key_value.key_cache[self.layer_idx]
+        value_states_old = past_key_value.value_cache[self.layer_idx]
+
+        # if cross_attention_states is not None:
+        # Compute new KV states
+        key_states = self.k_proj(cross_attention_states)
+        value_states = self.v_proj(cross_attention_states)
+        key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        # Out-of-place Scatter new into old
+        # out-of-place is important so the original tensor is not affected,
+        # otherwise leads to same operations in both graphs
+        indices = (torch.arange(bsz),)
+        key_states_new = torch.index_put(key_states_old, indices, key_states)
+        value_states_new = torch.index_put(value_states_old, indices, value_states)
+
+        # Select old or new image KV states based on q_len
+        key_states = torch.where(q_len == 1, key_states_old, key_states_new)
+        value_states = torch.where(q_len == 1, value_states_old, value_states_new)
+
+        # Update the image cache
+        past_key_value.key_cache[self.layer_idx] = key_states
+        past_key_value.value_cache[self.layer_idx] = value_states
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        key_states = self.k_norm(key_states)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+            # attn_weights = torch.where(
+            #     attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights
+            # )
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
 class QEffMllamaTextSelfAttention(MllamaTextSelfAttention):
     """
     Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
@@ -200,7 +216,12 @@ def forward(
 
         if past_key_value is not None:
             # sin and cos are specific to RoPE models; cache_position needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "batch_index": batch_index, "position_ids": position_ids}
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "batch_index": batch_index,
+                "position_ids": position_ids,
+            }
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         key_states = repeat_kv(key_states, self.num_key_value_groups)
@@ -227,74 +248,6 @@ def forward(
         return attn_output, attn_weights, past_key_value
 
 
-class QEffMllamaTextCrossAttention(MllamaTextCrossAttention):
-    """
-    Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
-    The only differences are:
-        - add new args cache idx for the kv retention
-    """
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        cross_attention_states: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        batch_index: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-        use_cache: bool = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-        bsz, q_len, _ = hidden_states.size()
-        query_states = self.q_proj(hidden_states)
-        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
-        query_states = self.q_norm(query_states)
-
-        if cross_attention_states is not None:
-            key_states = self.k_proj(cross_attention_states)
-            value_states = self.v_proj(cross_attention_states)
-            key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-            value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-            key_states = repeat_kv(key_states, self.num_key_value_groups)
-            value_states = repeat_kv(value_states, self.num_key_value_groups)
-
-            key_states = self.k_norm(key_states)
-            if past_key_value is not None:
-                # if we have a new image + new tokens, we only computed key_states on that new image
-                # we still update the cross key states, past_image, new_image. And use it!
-                key_states, value_states = past_key_value.update(
-                    key_states, value_states, self.layer_idx, {"batch_index": batch_index, "position_ids": position_ids}
-                )
-        elif cache_position[0] != 0:
-            key_states, value_states = (
-                past_key_value.key_cache[self.layer_idx],
-                past_key_value.value_cache[self.layer_idx],
-            )
-        else:
-            raise ValueError(
-                "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
-            )
-
-        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
-
-        if attention_mask is not None:  # no matter the length, we just slice it
-            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
-
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
-        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-        attn_output = torch.matmul(attn_weights, value_states)
-        attn_output = attn_output.transpose(1, 2).contiguous()
-        attn_output = attn_output.reshape(bsz, q_len, -1)
-        attn_output = self.o_proj(attn_output)
-
-        if not output_attentions:
-            attn_weights = None
-
-        return attn_output, attn_weights, past_key_value
-
-
 class QEffMllamaSelfAttentionDecoderLayer(MllamaSelfAttentionDecoderLayer):
     """
     Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
@@ -374,6 +327,79 @@ def forward(
         return outputs
 
 
+class QEffMllamaTextCrossAttentionTwoQPC(MllamaTextCrossAttention):
+    """
+    Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
+    The only differences are:
+        - add new args cache idx for the kv retention
+    """
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        batch_index: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        use_cache: bool = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        bsz, q_len, _ = hidden_states.size()
+        query_states = self.q_proj(hidden_states)
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        query_states = self.q_norm(query_states)
+
+        if cross_attention_states is not None:
+            key_states = self.k_proj(cross_attention_states)
+            value_states = self.v_proj(cross_attention_states)
+            key_states = key_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, -1, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            if past_key_value is not None:
+                # if we have a new image + new tokens, we only computed key_states on that new image
+                # we still update the cross key states, past_image, new_image. And use it!
+                key_states, value_states = past_key_value.update(
+                    key_states,
+                    value_states,
+                    self.layer_idx,
+                    {"batch_index": batch_index, "position_ids": position_ids},
+                )
+        elif past_key_value is not None:
+            key_states, value_states = (
+                past_key_value.key_cache[self.layer_idx],
+                past_key_value.value_cache[self.layer_idx],
+            )
+        else:
+            raise ValueError(
+                "Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
+            )
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        key_states = self.k_norm(key_states)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
 class QEffMllamaCrossAttentionDecoderLayer(MllamaCrossAttentionDecoderLayer):
     """
     Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
@@ -428,22 +454,227 @@ def forward(
         return outputs
 
 
-class QEffMllamaTextModel(MllamaTextModel):
+class QEffMllamaRotaryEmbedding(MllamaRotaryEmbedding):
     """
     Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
     The only differences are:
-        - add new args cache idx for the kv retention
+    - Add static sin/cos computations.
     """
 
-    # def __init__(self, config: MllamaTextConfig):
-    #     super().__init__(config)
-    #     self.config = config
-    #     self.__qeff_init__()
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[MllamaConfig] = None,
+    ):
+        super(MllamaRotaryEmbedding, self).__init__()  # Initialize nn.Module
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:
+            logger.warning_once(
+                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.45"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            self.max_seq_len_cached = max_position_embeddings
+            self.original_max_seq_len = max_position_embeddings
+        else:
+            # BC: "rope_type" was originally "type"
+            if config.rope_scaling is not None:
+                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+            else:
+                self.rope_type = "default"
+            self.max_seq_len_cached = config.max_position_embeddings
+            self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=self.original_max_seq_len,
+            device=self.inv_freq.device,
+            dtype=torch.get_default_dtype(),
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
+            self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
+        )
 
-    # def __qeff_init__(self):
-    #     self.layers = nn.ModuleList(
-    #         [MllamaSelfAttentionDecoderLayer(self.config, layer_idx) for layer_idx in range(self.config.num_hidden_layers)]
-    #     )
+
+class QEffMllamaVisionModel(MllamaVisionModel):
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        aspect_ratio_ids: torch.Tensor,
+        aspect_ratio_mask: torch.Tensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_concurrent_media, num_tiles, num_channels, height, width = pixel_values.shape
+
+        pixel_values = pixel_values.reshape(batch_size * num_concurrent_media * num_tiles, num_channels, height, width)
+        aspect_ratio_ids = aspect_ratio_ids.reshape(batch_size * num_concurrent_media, -1)
+
+        # Patch embedding
+        patch_embeds = self.patch_embedding(pixel_values.to(self.dtype).to(self.device))
+        hidden_state = patch_embeds.flatten(2).transpose(1, 2)
+
+        # Tile embeddings
+        _, num_patches, dim = hidden_state.shape
+        hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, num_tiles, -1, dim)
+        hidden_state = self.pre_tile_positional_embedding(hidden_state, aspect_ratio_ids)
+
+        # Add cls token
+        hidden_state = hidden_state.reshape(batch_size * num_concurrent_media * num_tiles, num_patches, dim)
+        hidden_state = self.apply_class_embedding(hidden_state)
+        num_patches += 1
+
+        # Position embeddings
+        hidden_state = hidden_state.reshape(batch_size * num_concurrent_media, num_tiles, num_patches, dim)
+        hidden_state = self.gated_positional_embedding(hidden_state, aspect_ratio_ids)
+
+        hidden_state = self.layernorm_pre(hidden_state)
+
+        # Compute the number of tokens to pad
+        num_padding_patches = (8 - (hidden_state.shape[-2] % 8)) % 8
+        # Compute padding tuple for pad function
+        padding = (
+            0,
+            0,
+            0,
+            num_padding_patches,
+        )  # (pad_left, pad_right, pad_left for dim -2, pad_right for dim -2)
+        # Pad the tensor
+        hidden_state = F.pad(hidden_state, padding, mode="constant", value=0)
+        slice_index = -num_padding_patches if num_padding_patches > 0 else None
+
+        # Prepare attention mask
+        attention_mask = aspect_ratio_mask.reshape(batch_size * num_concurrent_media, -1)
+        attention_mask = _prepare_aspect_ratio_attention_mask(
+            aspect_ratio_mask=attention_mask,
+            num_patches=self.num_patches,
+            target_length=hidden_state.shape[2],
+            dtype=self.dtype,
+        )
+
+        # Apply encoder
+        hidden_state = hidden_state.view(batch_size * num_concurrent_media, -1, dim)
+        output = self.transformer(
+            hidden_state,
+            attention_mask=attention_mask,
+            output_hidden_states=True,
+            output_attentions=output_attentions,
+        )
+        hidden_state = output[0]
+
+        hidden_state = self.layernorm_post(hidden_state)
+
+        # Apply global encoder
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, dim
+        )
+        hidden_state = self.post_tile_positional_embedding(hidden_state, aspect_ratio_ids)
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles * (num_patches + num_padding_patches), dim
+        )
+        global_output = self.global_transformer(
+            hidden_state,
+            attention_mask=attention_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+        )
+        hidden_state = global_output[0]
+
+        # Remove padding form hidden state
+        hidden_state = hidden_state.reshape(
+            batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, dim
+        )
+        hidden_state = hidden_state[:, :, :slice_index]
+        hidden_state = hidden_state.reshape(batch_size, num_concurrent_media, num_tiles, num_patches, dim)
+
+        # Collect intermediate layer outputs from encoder output
+        all_intermediate_hidden_states = output[1]
+        intermediate_hidden_states = torch.stack(all_intermediate_hidden_states, dim=-1)
+        intermediate_hidden_states = intermediate_hidden_states[..., self.intermediate_layers_indices]
+
+        # Remove padding from intermediate hidden states
+        intermediate_hidden_states = intermediate_hidden_states.reshape(
+            batch_size * num_concurrent_media, num_tiles, num_patches + num_padding_patches, -1
+        )
+        intermediate_hidden_states = intermediate_hidden_states[:, :, :slice_index]
+        intermediate_hidden_states = intermediate_hidden_states.reshape(
+            batch_size, num_concurrent_media, num_tiles, num_patches, -1
+        )
+
+        # Concatenate final hidden state and intermediate hidden states
+        hidden_state = torch.cat([hidden_state, intermediate_hidden_states], dim=-1)
+
+        if output_hidden_states:
+            hidden_states = tuple(all_intermediate_hidden_states) + tuple(global_output[1])
+        else:
+            hidden_states = None
+
+        if output_attentions:
+            # global transformer in contrast to `self.transformer` doesn't always return hidden states so we might go index out-of-range
+            global_attn = tuple(global_output[2]) if output_hidden_states else tuple(global_output[1])
+            attentions = tuple(output[2]) + global_attn
+        else:
+            attentions = None
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, hidden_states, attentions] if v is not None)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_state,
+            hidden_states=hidden_states,
+            attentions=attentions,
+        )
+
+
+class QEffMllamaTextModel(MllamaTextModel):
+    """
+    Copied from MllamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/mllama/modeling_mllama.py
+    The only differences are:
+        - add new args cache idx for the kv retention
+    """
 
     def forward(
         self,
@@ -462,28 +693,6 @@ def forward(
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
     ) -> Union[Tuple, BaseModelOutputWithPast]:
-        """
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoProcessor, MllamaTextModel
-
-        >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
-        >>> model = MllamaTextModel.from_pretrained(checkpoint)
-        >>> processor = AutoProcessor.from_pretrained(checkpoint)
-
-        >>> text = "<|image|>If I had to write a haiku for this one"
-        >>> inputs = processor(text=text, return_tensors="pt")
-
-        >>> output = model(**inputs)
-
-        >>> print(output.last_hidden_state.shape)
-        torch.Size([1, 13, 4096])
-        ```
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -513,20 +722,28 @@ def forward(
         if cache_position is None:
             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
             cache_position = torch.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+                past_seen_tokens,
+                past_seen_tokens + inputs_embeds.shape[1],
+                device=inputs_embeds.device,
             )
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
         causal_mask = self._update_causal_mask(
-            attention_mask, inputs_embeds, cache_position, position_ids, past_key_values, output_attentions
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            position_ids,
+            past_key_values,
+            output_attentions,
         )
 
         # embed positions
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        position_embeddings = None
 
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
@@ -552,7 +769,6 @@ def forward(
             # TODO: vbaddi: since past_key_values are retained from previous states, the condition for is_cross_attention_cache_empty is False
             # so explicitly making it true in order to skip the cross attention for language model
             # comment once there is vision and cross attention support
-            is_cross_attention_cache_empty = True
             if is_cross_attention_layer and cross_attention_states is None and is_cross_attention_cache_empty:
                 continue
 
@@ -710,39 +926,6 @@ def forward(
         cache_position: Optional[torch.LongTensor] = None,
         num_logits_to_keep: int = 0,
     ) -> Union[Tuple, CausalLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-            num_logits_to_keep (`int`, *optional*):
-                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
-                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-
-        Returns:
-
-        Example:
-
-        ```python
-        >>> from transformers import AutoTokenizer, MllamaForCausalLM
-
-        >>> model = MllamaForCausalLM.from_pretrained("Llama-3.2-11B-Vision")
-        >>> tokenizer = AutoTokenizer.from_pretrained("Llama-3.2-11B-Vision")
-
-        >>> prompt = "If I had to write a haiku, it would be:"
-        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6)
-        >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        >>> print(result)
-        If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful.
-        I love the idea of snowflakes gently falling, each one
-        ```
-        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -797,3 +980,325 @@ def forward(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+
+class QEffMllamaVisionEncoder(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.cross_attention_layers = self.model.config.get_text_config().cross_attention_layers
+
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        aspect_ratio_mask: Optional[torch.Tensor] = None,
+        aspect_ratio_ids: Optional[torch.Tensor] = None,
+    ) -> List[Tuple[torch.Tensor]]:
+        vision_outputs = self.model.vision_model(
+            pixel_values=pixel_values,
+            aspect_ratio_ids=aspect_ratio_ids,
+            aspect_ratio_mask=aspect_ratio_mask,
+        )
+        cross_attention_states = vision_outputs[0]
+        cross_attention_states = self.model.multi_modal_projector(cross_attention_states).reshape(
+            -1, cross_attention_states.shape[-2], self.model.hidden_size
+        )
+
+        bsz = pixel_values.shape[0]
+        outputs = []
+        for i in self.cross_attention_layers:
+            cross_attn = self.model.language_model.model.layers[i].cross_attn
+            key_states = cross_attn.k_proj(cross_attention_states)
+            value_states = cross_attn.v_proj(cross_attention_states)
+            key_states = key_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, -1, cross_attn.num_key_value_heads, cross_attn.head_dim).transpose(
+                1, 2
+            )
+            outputs.append((key_states, value_states))
+        return outputs
+
+
+class QEffMllamaForConditionalGeneration(MllamaForConditionalGeneration):
+    def get_qeff_vision_encoder(self):
+        return QEffMllamaVisionEncoder(self)
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        aspect_ratio_mask: Optional[torch.Tensor] = None,
+        aspect_ratio_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_states: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        batch_index: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        num_logits_to_keep: int = 0,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if pixel_values is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if pixel_values is not None and cross_attention_states is not None:
+            raise ValueError("`pixel_values` and `cross_attention_states` cannot be provided simultaneously")
+
+        if pixel_values is not None:
+            if aspect_ratio_ids is None:
+                raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
+            # get vision tokens from vision model
+            vision_outputs = self.vision_model(
+                pixel_values=pixel_values,
+                aspect_ratio_ids=aspect_ratio_ids,
+                aspect_ratio_mask=aspect_ratio_mask,
+                output_hidden_states=output_hidden_states,
+                output_attentions=output_attentions,
+                return_dict=return_dict,
+            )
+            cross_attention_states = vision_outputs[0]
+            cross_attention_states = self.multi_modal_projector(cross_attention_states).reshape(
+                -1, cross_attention_states.shape[-2], self.hidden_size
+            )
+
+        if cross_attention_mask is not None:
+            cross_attention_mask, full_text_row_masked_out_mask = _prepare_cross_attention_mask(
+                cross_attention_mask,
+                num_vision_tokens=self.vision_model.num_patches,
+                dtype=self.dtype,
+            )
+        else:
+            full_text_row_masked_out_mask = None
+
+        if cross_attention_mask is not None and cache_position is not None:
+            cross_attention_mask = cross_attention_mask[:, :, cache_position]
+            full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]
+
+        outputs = self.language_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cross_attention_states=cross_attention_states,
+            cross_attention_mask=cross_attention_mask,
+            full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+            past_key_values=past_key_values,
+            batch_index=batch_index,
+            use_cache=use_cache,
+            inputs_embeds=inputs_embeds,
+            labels=labels,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            num_logits_to_keep=num_logits_to_keep,
+        )
+
+        return outputs
+
+    def get_dummy_inputs(self, kv_offload: bool = False):
+        BS = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
+        SEQ_LEN = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
+        CTX_LEN = constants.ONNX_EXPORT_CTX_LEN
+
+        txt_cfg = self.config.get_text_config()
+        num_hidden_layers = txt_cfg.num_hidden_layers
+        cross_attention_layers = txt_cfg.cross_attention_layers
+        num_key_value_heads = txt_cfg.num_key_value_heads
+        head_dim = txt_cfg.hidden_size // txt_cfg.num_attention_heads
+
+        vis_cfg = self.config.vision_config
+        num_patches = (vis_cfg.image_size // vis_cfg.patch_size) ** 2 + 1
+        image_tokens_len = vis_cfg.max_num_tiles * num_patches
+
+        if vis_cfg := getattr(self.config, "vision_config", None):
+            img_size = getattr(vis_cfg, "image_size", 448)
+            max_num_img_tiles = getattr(vis_cfg, "max_num_tiles", 4)
+        else:
+            img_size = 448
+            max_num_img_tiles = 4
+
+        # vision inputs
+        vision_inputs = {
+            "pixel_values": torch.zeros(
+                (BS, MAX_NUM_IMG, max_num_img_tiles, NUM_CHANNEL, img_size, img_size), dtype=torch.float32
+            ),
+            "aspect_ratio_ids": torch.ones((BS, MAX_NUM_IMG), dtype=torch.int64),
+            "aspect_ratio_mask": torch.ones((BS, MAX_NUM_IMG, max_num_img_tiles), dtype=torch.int64),
+        }
+
+        # lang_inputs
+        lang_inputs = {
+            "input_ids": torch.zeros((BS, SEQ_LEN), dtype=torch.int64),
+            "cross_attention_mask": torch.zeros((BS, SEQ_LEN, MAX_NUM_IMG, max_num_img_tiles), dtype=torch.int64),
+            "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
+        }
+
+        lang_inputs["position_ids"] = torch.where(
+            lang_inputs.pop("attention_mask") == 1,
+            torch.arange(lang_inputs["input_ids"].shape[1]).view(1, -1),
+            -1,
+        )
+
+        lang_inputs["past_key_values"] = DynamicCache(num_hidden_layers)
+        lang_inputs["past_key_values"].key_cache = [0] * num_hidden_layers
+        lang_inputs["past_key_values"].value_cache = [0] * num_hidden_layers
+
+        for i in range(num_hidden_layers):
+            if i in cross_attention_layers:
+                idx = cross_attention_layers.index(i)
+                assert idx == ((i - 3) // 5), f"{i}, {(i - 3) // 5}"
+                lang_inputs["past_key_values"].key_cache[i] = torch.zeros(
+                    1, num_key_value_heads, image_tokens_len, head_dim
+                )
+                lang_inputs["past_key_values"].value_cache[i] = torch.zeros(
+                    1, num_key_value_heads, image_tokens_len, head_dim
+                )
+            else:
+                lang_inputs["past_key_values"].key_cache[i] = torch.zeros(1, num_key_value_heads, CTX_LEN, head_dim)
+                lang_inputs["past_key_values"].value_cache[i] = torch.zeros(1, num_key_value_heads, CTX_LEN, head_dim)
+
+        lang_inputs["past_key_values"] = lang_inputs["past_key_values"].to_legacy_cache()
+        lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, CTX_LEN - 1)
+        inputs = {}
+
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+            inputs["lang"] = lang_inputs
+        else:
+            inputs = {**vision_inputs, **lang_inputs}
+
+        return inputs
+
+    def get_specializations(
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
+    ):
+        vis_cfg = self.config.vision_config
+        max_num_images = compiler_options.pop("max_num_images", 1)
+        prefill_seq_len = prefill_seq_len if prefill_seq_len else 32
+        ctx_len = ctx_len if ctx_len else 128
+        if img_size is None and hasattr(vis_cfg, "image_size"):
+            img_size = getattr(vis_cfg, "image_size")
+        elif img_size is None:
+            img_size = 448
+            logger.warning("Setting `img_size=448` as it was neither passed nor found in vision_config")
+
+        vision = [{"batch_size": batch_size, "max_num_images": max_num_images, "img_size": img_size}]
+        lang = [
+            {
+                "batch_size": batch_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+                "max_num_images": max_num_images,
+                "img_size": img_size,
+            },
+            {
+                "batch_size": batch_size,
+                "seq_len": "1",
+                "ctx_len": ctx_len,
+                "max_num_images": max_num_images,
+                "img_size": img_size,
+            },
+        ]
+        specializations = {}
+
+        if kv_offload:
+            specializations["vision"] = vision
+            specializations["lang"] = lang
+            return specializations
+        else:
+            return lang
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
+        txt_cfg = self.config.get_text_config()
+        num_hidden_layers = txt_cfg.num_hidden_layers
+        cross_attention_layers = txt_cfg.cross_attention_layers
+
+        vision_dynamic_axes = {
+            "pixel_values": {0: "batch_size", 1: "max_num_images", 4: "img_size", 5: "img_size"},
+            "aspect_ratio_ids": {0: "batch_size", 1: "max_num_images"},
+            "aspect_ratio_mask": {0: "batch_size", 1: "max_num_images"},
+        }
+
+        lang_dynamic_axes = {
+            "input_ids": {0: "batch_size", 1: "seq_len"},
+            "position_ids": {0: "batch_size", 1: "seq_len"},
+            "cross_attention_mask": {0: "batch_size", 1: "seq_len", 2: "max_num_images"},
+        }
+
+        for i in range(num_hidden_layers):
+            if i in cross_attention_layers:
+                lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size"}
+                lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size"}
+            else:
+                lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
+                lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+            dynamic_axes["lang"] = lang_dynamic_axes
+        else:
+            dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes}
+        return dynamic_axes
+
+    def get_output_names(self, kv_offload: bool = False):
+        txt_cfg = self.config.get_text_config()
+        num_hidden_layers = txt_cfg.num_hidden_layers
+
+        vision_output_names = []
+        for i in self.config.text_config.cross_attention_layers:
+            vision_output_names.append(f"past_key.{i}")
+            vision_output_names.append(f"past_value.{i}")
+
+        lang_output_names = [
+            "logits",
+            *[f"past_{kv}.{i}_RetainedState" for i in range(num_hidden_layers) for kv in ["key", "value"]],
+        ]
+
+        output_names = {}
+        if kv_offload:
+            output_names["vision"] = vision_output_names
+            output_names["lang"] = lang_output_names
+        else:
+            return lang_output_names
+        return output_names
+
+    def get_inputs_info(self):
+        return [
+            IOInfo(
+                name="pixel_values",
+                datatype=torch.float32,
+                shape=("batch_size", "max_num_images", 4, 3, "img_size", "img_size"),
+            ),
+            IOInfo(name="aspect_ratio_ids", datatype=torch.int64, shape=("batch_size", "max_num_images")),
+            IOInfo(name="aspect_ratio_mask", datatype=torch.int64, shape=("batch_size", "max_num_images", 4)),
+            IOInfo(name="input_ids", datatype=torch.int64, shape=("batch_size", "seq_len")),
+            IOInfo(
+                name="cross_attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len", "max_num_images", 4)
+            ),
+            IOInfo(name="attention_mask", datatype=torch.int64, shape=("batch_size", "seq_len")),
+        ]
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index c2e3777bc..ee4d9776c 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -6,27 +6,43 @@
 # ----------------------------------------------------------------------------
 
 import hashlib
-import logging
 import warnings
 from pathlib import Path
+from time import perf_counter
 from typing import List, Optional, Union
 
 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import AutoModel, AutoModelForCausalLM, PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import (
+    AutoModel,
+    AutoModelForCausalLM,
+    AutoModelForImageTextToText,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+    TextStreamer,
+)
 
 import QEfficient
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.transformers.models.pytorch_transforms import CustomOpsTransform, KVCacheTransform, SpDTransform
+from QEfficient.generation.text_generation_inference import CloudAI100ExecInfoNew, PerfMetrics, get_compilation_dims
+from QEfficient.transformers.models.pytorch_transforms import (
+    CustomOpsTransform,
+    KVCacheModuleMethodMapperTransform,
+    KVCacheTransform,
+    SpDTransform,
+    VlmKVOffloadTransform,
+    VlmNoKVOffloadTransform,
+)
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
 from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
+from QEfficient.utils.logging_utils import logger
 
-logger = logging.getLogger(__file__)
+MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"]
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -37,19 +53,778 @@ class QEFFTransformersBase(QEFFBaseModel):
     _hf_auto_class: type
 
     def __init__(self, model: nn.Module) -> None:
-        if hasattr(model.config, "quantization_config") and not isinstance(
-            model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values())
+        if (
+            hasattr(model, "config")
+            and hasattr(model.config, "quantization_config")
+            and not isinstance(model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values()))
+        ):
+            raise AssertionError("Please use `from_pretrained` method to load quantized models")
+
+        super().__init__(model)
+
+    def __repr__(self) -> str:
+        return self.__class__.__name__ + "\n" + self.model.__repr__()
+
+    @classmethod
+    @with_replaced_quantizers
+    def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
+        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
+            logger.warning('Updating attn_implementation="eager"')
+
+        if kwargs.get("low_cpu_mem_usage", None):
+            logger.warning("Updating low_cpu_mem_usage=False")
+
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+
+        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        return cls(model, is_tlm=is_tlm)
+
+    @property
+    def model_name(self) -> str:
+        mname = self.model.__class__.__name__
+        if mname.startswith("QEff") or mname.startswith("QEFF"):
+            mname = mname[4:]
+        return mname
+
+
+class QEFFAutoModel(QEFFTransformersBase):
+    """
+    The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub.
+    Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization.
+
+    ``Mandatory`` Args:
+        :model (nn.Module): PyTorch model
+
+    .. code-block:: python
+
+        from QEfficient import QEFFAutoModel
+        from transformers import AutoTokenizer
+
+        # Initialize the model using from_pretrained similar to transformers.AutoModel.
+        model = QEFFAutoModel.from_pretrained("model_name")
+
+        # Now you can directly compile the model for Cloud AI 100
+        model.compile(num_cores=16)  # Considering you have a Cloud AI 100 SKU
+
+        #prepare input
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        inputs = tokenizer("My name is", return_tensors="pt")
+
+        # You can now execute the model
+        model.generate(inputs)
+    """
+
+    _hf_auto_class = AutoModel
+    _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
+    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+
+    def __init__(self, model: nn.Module, **kwargs):
+        super().__init__(model)
+        self.model.config.use_cache = True
+        self.num_layers = model.config.num_hidden_layers
+
+    @classmethod
+    @with_replaced_quantizers
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        """
+        This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel.
+        Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
+
+        This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API
+        Args:
+            :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory.
+            :args, kwargs: Additional arguments to pass to transformers.AutoModel.
+
+        .. code-block:: python
+
+            from QEfficient import QEFFAutoModel
+            from transformers import AutoTokenizer
+
+            # Initialize the model using from_pretrained similar to transformers.AutoModel.
+            model = QEFFAutoModel.from_pretrained("model_name")
+
+            # Now you can directly compile the model for Cloud AI 100
+            model.compile(num_cores=16)  # Considering you have a Cloud AI 100 SKU
+
+            #prepare input
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            inputs = tokenizer("My name is", return_tensors="pt")
+
+            # You can now execute the model
+            model.generate(inputs)
+        """
+        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
+            logger.warning('Updating attn_implementation="eager"')
+
+        if kwargs.get("low_cpu_mem_usage", None):
+            logger.warning("Updating low_cpu_mem_usage=False")
+
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False})
+        try:
+            model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+            warnings.warn("Removing pooling layer from the model if exist")
+        except TypeError:
+            kwargs.pop("add_pooling_layer", None)
+            model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+
+        # This is support models that should be classified to in a different auto class but transformers load them via this class
+        kv_offload = kwargs.pop("kv_offload", None)
+        if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP:
+            return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__](
+                model, kv_offload=kv_offload
+            )
+
+        return cls(model)
+
+    @property
+    def model_hash(self) -> str:
+        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
+        # Using same card name will result in same hash. But, using a relative path for one run and
+        # absolute path for another run will result in different hash.
+        # The added complexity to resolve different paths to same location is not worth pursuing.
+        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
+
+        # Compute the hash with: model_config, transforms
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
+
+    def export(self, export_dir: Optional[str] = None) -> str:
+        """
+        Exports the model to ``ONNX`` format using ``torch.onnx.export``.
+
+        ``Optional`` Args:
+           :export_dir (str, optional): The directory path to store ONNX-graph.
+
+        Returns:
+            :str: Path of the generated ``ONNX`` graph.
+        """
+        bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
+        seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
+
+        example_inputs = {
+            "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64),
+            "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64),
+        }
+
+        dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}}
+
+        output_names = ["output"]
+
+        return self._export(
+            example_inputs,
+            output_names,
+            dynamic_axes,
+            export_dir=export_dir,
+        )
+
+    def compile(
+        self,
+        onnx_path: Optional[str] = None,
+        compile_dir: Optional[str] = None,
+        *,
+        seq_len: int = 32,
+        batch_size: int = 1,
+        num_devices: int = 1,
+        num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
+        **compiler_options,
+    ) -> str:
+        """
+        This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package.
+        If the model has not been exported yet, this method will handle the export process.
+        You can pass any other arguments that the `qaic-exec` takes as extra kwargs.
+
+        ``Optional`` Args:
+            :onnx_path (str, optional): Path to pre-exported onnx model.
+            :compile_dir (str, optional): Path for saving the qpc generated.
+            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
+            :batch_size (int, optional): Batch size. ``Defaults to 1``.
+            :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
+            :num_cores (int): Number of cores used to compile the model.
+            :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
+            :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+        Returns:
+            :str: Path of the compiled ``qpc`` package.
+        """
+
+        specializations = [
+            {"batch_size": batch_size, "seq_len": seq_len},
+        ]
+
+        return self._compile(
+            onnx_path,
+            compile_dir,
+            compile_only=True,
+            specializations=specializations,
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            **compiler_options,
+        )
+
+    def generate(
+        self,
+        inputs: torch.Tensor,
+        device_ids: List[int] = None,
+        runtime_ai100: bool = True,
+    ) -> Union[torch.Tensor, np.ndarray]:
+        """
+        This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
+        ``Mandatory`` Args:
+            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
+        ``optional`` Args:
+            :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+        Returns:
+            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
+        """
+        # AI_100 runtime
+        if runtime_ai100:
+            if not isinstance(self.qpc_path, Path):
+                raise TypeError("Please run compile API first!")
+
+            return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids)
+        # PyTorch runtime
+        else:
+            return self.pytorch_feature_generate(model=self.model, inputs=inputs)
+
+    def cloud_ai_100_feature_generate(
+        self,
+        inputs: torch.Tensor,
+        device_ids: List[int] = [0],
+    ) -> np.ndarray:
+        """
+        Generates features with list of prompts using AI 100 runtime.
+
+        ``Mandatory`` Args:
+            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
+        ``Optional`` Args:
+            device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0].
+
+        Returns:
+           np.ndarray: A list of dictionaries containing the generated output features.
+        """
+
+        if self.qpc_session is None:
+            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids)
+            self.batch_size = self.qpc_session.bindings[0].dims[0]
+            self.seq_len = self.qpc_session.bindings[0].dims[1]
+        # Prepare input
+        input_ids_len = inputs["input_ids"].shape[1]
+        input_ids = np.array(
+            torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0)
+        )
+        attention_mask = np.array(
+            torch.nn.functional.pad(
+                inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0
+            )
+        )
+
+        inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
+
+        outputs = {
+            "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype(
+                np.float32
+            ),
+        }
+        self.qpc_session.set_buffers(outputs)
+        outputs = self.qpc_session.run(inputs)
+        outputs = outputs["output"][:, :input_ids_len, :]
+        return outputs
+
+    def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:
+        """
+        Generates features from a list of text prompts using a PyTorch model.
+
+        ``Mandatory`` Args:
+            :model: The transformed PyTorch model used for generating features.
+            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
+
+        Returns:
+            torch.Tensor: A list of output features generated by the model for each prompt.
+        """
+        return model(**inputs)
+
+
+class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel):
+    _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform]
+    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+
+    def __init__(self, model: nn.modules):
+        super().__init__(model)
+        self.model = model.get_qeff_vision_encoder()
+
+    def export(self, inputs, output_names, dynamic_axes, export_dir=None):
+        return self._export(inputs, output_names, dynamic_axes, export_dir)
+
+    def compile(
+        self,
+        compile_dir,
+        compile_only,
+        specializations,
+        convert_to_fp16,
+        mxfp6_matmul,
+        mdp_ts_num_devices,
+        aic_num_cores,
+        custom_io,
+        **compiler_options,
+    ) -> str:
+        return self._compile(
+            compile_dir=compile_dir,
+            compile_only=compile_only,
+            specializations=specializations,
+            convert_to_fp16=convert_to_fp16,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=mdp_ts_num_devices,
+            aic_num_cores=aic_num_cores,
+            custom_io=custom_io,
+            **compiler_options,
+        )
+
+    @property
+    def model_hash(self) -> str:
+        # Compute the hash with: model_config, continuous_batching, transforms
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable({"QEffVisionEncoderForTextImageToTextModel": True}))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
+
+    @property
+    def model_name(self) -> str:
+        mname = self.model.__class__.__name__
+        if mname.startswith("QEff") or mname.startswith("QEFF"):
+            mname = mname[4:]
+        return mname
+
+
+class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
+    _pytorch_transforms = [
+        AwqToMatmulNbitsTransform,
+        GPTQToMatmulNbitsTransform,
+        CustomOpsTransform,
+        KVCacheTransform,
+        VlmKVOffloadTransform,
+    ]
+    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+
+    def __init__(self, model):
+        super().__init__(model)
+        # self.model.config.text_config.use_cache=True
+
+    def export(self, inputs, output_names, dynamic_axes, export_dir=None):
+        return self._export(inputs, output_names, dynamic_axes, export_dir)
+
+    def compile(
+        self,
+        compile_dir,
+        compile_only,
+        specializations,
+        convert_to_fp16,
+        mxfp6_matmul,
+        mdp_ts_num_devices,
+        aic_num_cores,
+        custom_io,
+        **compiler_options,
+    ) -> str:
+        return self._compile(
+            compile_dir=compile_dir,
+            compile_only=compile_only,
+            specializations=specializations,
+            convert_to_fp16=convert_to_fp16,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=mdp_ts_num_devices,
+            aic_num_cores=aic_num_cores,
+            custom_io=custom_io,
+            **compiler_options,
+        )
+
+    @property
+    def model_hash(self) -> str:
+        # Compute the hash with: model_config, continuous_batching, transforms
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable({"QEffCausalLMForTextImageToTextModel": True}))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
+
+    @property
+    def model_name(self) -> str:
+        mname = self.model.__class__.__name__
+        if mname.startswith("QEff") or mname.startswith("QEFF"):
+            mname = mname[4:]
+        return mname
+
+
+class _QEffAutoModelForImageTextToTextDualQPC:
+    _hf_auto_class = AutoModelForImageTextToText
+    UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"]
+
+    def __init__(
+        self,
+        model: nn.Module,
+        **kwargs,
+    ):
+        if kwargs.pop("full_batch_size", None):
+            raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
+        self.model = model
+        self.config = model.config
+        if self.model_name in self.UNSUPPORTED_MODELS:
+            raise NotImplementedError(f"kv_offload is not yet supported for {self.model.__class__.__name__}")
+        self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
+        self.lang_model = QEffCausalLMForTextImageToTextModel(model)
+
+        self.input_shapes, self.output_names = None, None
+
+    @property
+    def model_name(self) -> str:
+        mname = self.model.__class__.__name__
+        if mname.startswith("QEff") or mname.startswith("QEFF"):
+            mname = mname[4:]
+        return mname
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs):
+        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
+            logger.warning('Updating attn_implementation="eager"')
+
+        if kwargs.get("low_cpu_mem_usage", None):
+            logger.warning("Updating low_cpu_mem_usage=False")
+
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return cls(model, **kwargs)
+
+    @property
+    def onnx_path(self):
+        return [self.vision_model.onnx_path, self.lang_model.onnx_path]
+
+    @property
+    def qpc_path(self):
+        return [self.vision_model.qpc_path, self.lang_model.qpc_path]
+
+    def export(
+        self,
+        export_dir: Optional[str] = None,
+        **kwargs,
+    ) -> str:
+        inputs = self.model.get_dummy_inputs(kv_offload=True)
+        dynamic_axes = self.model.get_onnx_dynamic_axes(kv_offload=True)
+        output_names = self.model.get_output_names(kv_offload=True)
+        self.vision_model.export(
+            inputs["vision"],
+            output_names["vision"],
+            dynamic_axes["vision"],
+            export_dir,
+        )
+
+        self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir)
+
+    def compile(
+        self,
+        img_size: int,
+        vision_onnx_path: Optional[str] = None,
+        lang_onnx_path: Optional[str] = None,
+        compile_dir: Optional[str] = None,
+        *,
+        prefill_seq_len: Optional[int] = None,
+        ctx_len: Optional[int] = None,
+        batch_size: int = 1,
+        full_batch_size: Optional[int] = None,
+        kv_cache_batch_size: Optional[int] = None,
+        num_devices: int = 1,
+        num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
+        mxint8_kv_cache: bool = False,
+        num_speculative_tokens: Optional[int] = None,
+        enable_qnn: bool = False,
+        qnn_config: Optional[str] = None,
+        **compiler_options,
+    ) -> str:
+        if (
+            any(
+                param is not None
+                for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config]
+            )
+            or enable_qnn
         ):
-            raise AssertionError("Please use `from_pretrained` method to load quantized models")
+            raise ValueError(
+                f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
+                f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, "
+                f"enable_qnn={enable_qnn}, qnn_config={qnn_config}"
+            )
+
+        output_names = self.model.get_output_names(kv_offload=True)
+
+        specializations = self.model.get_specializations(
+            batch_size=batch_size,
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=ctx_len,
+            img_size=img_size,
+            kv_offload=True,
+            kv_offlaod=True,
+            **compiler_options,
+        )
+
+        custom_io_vision = {}
+        kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+        custom_io_vision["pixel_values"] = kv_cache_dtype
+        for output_name in output_names["vision"]:
+            custom_io_vision[output_name] = kv_cache_dtype
+
+        if vision_onnx_path:
+            self.vision_model.onnx_path = vision_onnx_path
+        if lang_onnx_path:
+            self.lang_model.onnx_path = lang_onnx_path
+
+        if (self.vision_model.onnx_path is None and vision_onnx_path is None) or (
+            self.lang_model.onnx_path is None and lang_onnx_path is None
+        ):
+            self.export()
+
+        if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6:
+            logger.warning(
+                "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6"
+            )
+
+        self.vision_model._compile(
+            compile_dir,
+            compile_only=True,
+            specializations=specializations["vision"],
+            convert_to_fp16=True,
+            mxfp6_matmul=False,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            custom_io=custom_io_vision,
+            **compiler_options,
+        )
+
+        custom_io_lang = {}
+        # Inputs
+        for output_name in output_names["lang"]:
+            if output_name.startswith("past_"):
+                custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype
+
+        # outputs
+        for output_name in output_names["lang"]:
+            if output_name.startswith("past_"):
+                custom_io_lang[output_name] = kv_cache_dtype
+
+        self.lang_model._compile(
+            compile_dir,
+            compile_only=True,
+            retained_state=True,
+            specializations=specializations["lang"],
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            custom_io=custom_io_lang,
+            **compiler_options,
+        )
+
+    def generate(
+        self,
+        inputs: torch.Tensor,
+        streamer: Optional[TextStreamer] = None,
+        device_ids: List[int] = None,
+        runtime_ai100: bool = True,
+        generation_len: Optional[int] = None,
+    ) -> Union[torch.Tensor, np.ndarray]:
+        """
+        This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
+        ``Mandatory`` Args:
+            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
+        ``optional`` Args:
+            :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+        Returns:
+            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
+        """
+        if not runtime_ai100:
+            raise NotImplementedError("PyTorch execution is not supported yet for this model!")
+
+        return self.kv_offload_generate(
+            inputs=inputs, device_ids=device_ids, streamer=streamer, generation_len=generation_len
+        )
+
+    def kv_offload_generate(
+        self,
+        inputs: List[str] = None,
+        streamer: Optional[TextStreamer] = None,
+        device_ids: List[int] = None,
+        generation_len: int = None,
+    ):
+        lang_session = QAICInferenceSession(self.lang_model.qpc_path, device_ids, activate=False)
+
+        vision_session = QAICInferenceSession(self.vision_model.qpc_path, device_ids)
+
+        batch_size, ctx_len, fbs = get_compilation_dims(self.lang_model.qpc_path)
+
+        pad_token_id = 1
+
+        # Skip inputs/outputs
+        lang_session.skip_buffers(
+            [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")]
+        )
+
+        # Read prompt and ctx len from session
+        batch_size = max(
+            [x[lang_session.binding_index_map["input_ids"]][1][0] for x in lang_session.allowed_shapes]
+            + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[0]]
+        )
+
+        prefill_seq_len = max(
+            [x[lang_session.binding_index_map["input_ids"]][1][1] for x in lang_session.allowed_shapes]
+            + [lang_session.bindings[lang_session.binding_index_map["input_ids"]].dims[1]]
+        )
+
+        input_len = inputs["attention_mask"].sum(1, keepdims=True)
+        input_ids_length = inputs["input_ids"].shape[1]
+        num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
+        padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
+
+        if generation_len is None:
+            generation_len = ctx_len - input_len.max()
+        assert generation_len > 0, "generation length should be greater than zero"
+        generated_ids = np.full((batch_size, generation_len + 1), pad_token_id)
+
+        # Prepare inputs for prefill
+        prefill_start = perf_counter()
+
+        inputs["input_ids"] = torch.nn.functional.pad(
+            inputs["input_ids"],
+            (0, padded_len - input_ids_length),
+            "constant",
+            1,
+        )
+        inputs["attention_mask"] = torch.nn.functional.pad(
+            inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0
+        )
+        if "cross_attention_mask" in inputs:
+            inputs["cross_attention_mask"] = torch.nn.functional.pad(
+                inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length)
+            )
+
+        for k, v in inputs.items():
+            inputs[k] = np.array(v)
+
+        vision_inputs = {
+            k: v for k, v in inputs.items() if k in {"pixel_values", "aspect_ratio_ids", "aspect_ratio_mask"}
+        }
+
+        vision_inputs["pixel_values"] = vision_inputs["pixel_values"].astype("float16")
+        vision_outputs = vision_session.run(vision_inputs)
+
+        lang_inputs = {k: v for k, v in inputs.items() if k not in vision_inputs}
+        lang_inputs["position_ids"] = np.where(
+            lang_inputs.pop("attention_mask"), np.arange(padded_len), -1
+        )  # Need to use -1 as position_ids for invalid tokens
+
+        vision_session.deactivate()
+        lang_session.activate()
+
+        lang_session.set_buffers(vision_outputs)
+
+        # Run prefill
+        for i in range(num_chunks):
+            chunk_inputs = lang_inputs.copy()
+            chunk_inputs["input_ids"] = lang_inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
+            chunk_inputs["position_ids"] = lang_inputs["position_ids"][
+                :, i * prefill_seq_len : (i + 1) * prefill_seq_len
+            ]
+            outputs = lang_session.run(chunk_inputs)
 
+        prefill_time = perf_counter() - prefill_start
+        # Skip inputs/outputs again
+        lang_session.skip_buffers(
+            [x for x in lang_session.input_names + lang_session.output_names if x.startswith("past_")]
+        )
+
+        # Get first token
+        lang_inputs["input_ids"] = outputs["logits"].argmax(2)
+        lang_inputs["position_ids"] = input_len.numpy()
+        if "cross_attention_mask" in lang_inputs:
+            bs, _, num_images, img_tiles = lang_inputs["cross_attention_mask"].shape
+            lang_inputs["cross_attention_mask"] = torch.ones((bs, 1, num_images, img_tiles), dtype=torch.int64).numpy()
+        generated_ids[:, 0] = lang_inputs["input_ids"].squeeze(1)
+
+        if streamer:
+            streamer.put(lang_inputs["input_ids"][0])
+
+        # Decode loop
+        decode_start = perf_counter()
+        for num_token in range(1, generation_len):
+            outputs = lang_session.run(lang_inputs)
+
+            # Prepare inputs for next iteration
+            lang_inputs["input_ids"] = outputs["logits"].argmax(2)
+            lang_inputs["position_ids"] += 1
+            generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1)
+
+            if streamer:
+                streamer.put(lang_inputs["input_ids"][0])
+
+        decode_end = perf_counter()
+        if streamer:
+            streamer.end()
+
+        decode_perf = (num_token - 1) / (decode_end - decode_start)
+        total_time = decode_end - prefill_start
+        total_perf = num_token / total_time
+
+        return CloudAI100ExecInfoNew(
+            batch_size=batch_size,
+            generated_ids=generated_ids,
+            perf_metrics=PerfMetrics(
+                prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time
+            ),
+        )
+
+
+class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase):
+    _hf_auto_class = AutoModelForImageTextToText
+    _pytorch_transforms = [
+        AwqToMatmulNbitsTransform,
+        GPTQToMatmulNbitsTransform,
+        CustomOpsTransform,
+        KVCacheTransform,
+        KVCacheModuleMethodMapperTransform,
+        VlmNoKVOffloadTransform,
+    ]
+    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+
+    def __init__(
+        self,
+        model: nn.Module,
+        **kwargs,
+    ):
+        if kwargs.pop("full_batch_size", None):
+            raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
         super().__init__(model)
 
-    def __repr__(self) -> str:
-        return self.__class__.__name__ + "\n" + self.model.__repr__()
+        # to handle internvl models
+        if hasattr(self.model.config, "llm_config") and hasattr(self.model.config, "vision_config"):
+            self.model.config.llm_config.use_cache = True
+            self.model.config.llm_config._attn_implementation = "eager"
+            self.model.config.vision_config.use_flash_attn = "false"
+        else:
+            self.model.config.text_config.use_cache = True
 
     @classmethod
-    @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path,
+        *args,
+        **kwargs,
+    ):
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
@@ -57,9 +832,288 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = Fals
             logger.warning("Updating low_cpu_mem_usage=False")
 
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        from transformers import AutoConfig
 
-        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        return cls(model, is_tlm=is_tlm)
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
+        config._attn_implementation = "eager"
+        config.vision_config.use_flash_attn = "false"
+        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, config, *args, **kwargs)
+
+        return cls(model, **kwargs)
+
+    def export(
+        self,
+        export_dir: Optional[str] = None,
+        **kwargs,
+    ) -> str:
+        inputs = self.model.get_dummy_inputs()
+        dynamic_axes = self.model.get_onnx_dynamic_axes()
+        output_names = self.model.get_output_names()
+        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+
+    def compile(
+        self,
+        onnx_path: Optional[str] = None,
+        img_size: Optional[int] = None,
+        compile_dir: Optional[str] = None,
+        *,
+        prefill_seq_len: Optional[int] = None,
+        ctx_len: Optional[int] = None,
+        batch_size: int = 1,
+        full_batch_size: Optional[int] = None,
+        kv_cache_batch_size: Optional[int] = None,
+        num_devices: int = 1,
+        num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
+        mxint8_kv_cache: bool = False,
+        num_speculative_tokens: Optional[int] = None,
+        enable_qnn: bool = False,
+        qnn_config: Optional[str] = None,
+        **compiler_options,
+    ) -> str:
+        if (
+            any(
+                param is not None
+                for param in [full_batch_size, kv_cache_batch_size, num_speculative_tokens, qnn_config]
+            )
+            or enable_qnn
+        ):
+            raise ValueError(
+                f"Expected 'full_batch_size', 'kv_cache_batch_size', 'num_speculative_tokens', and 'qnn_config' to be None, and 'enable_qnn' to be False but got: "
+                f"full_batch_size={full_batch_size}, kv_cache_batch_size={kv_cache_batch_size}, num_speculative_tokens={num_speculative_tokens}, "
+                f"enable_qnn={enable_qnn}, qnn_config={qnn_config}"
+            )
+
+        output_names = self.model.get_output_names()
+
+        # Get specializations from modelling file
+        # TODO: expose this via the auto class as well
+        specializations = self.model.get_specializations(
+            batch_size=batch_size,
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=ctx_len,
+            img_size=img_size,
+            **compiler_options,
+        )
+
+        custom_io = {}
+        kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
+        # inputs
+        for input_name in output_names:
+            if input_name.endswith("_RetainedState"):
+                custom_io[input_name[: -len("_RetainedState")]] = kv_cache_dtype
+
+        # outputs
+        for output_name in output_names:
+            if output_name.endswith("_RetainedState"):
+                custom_io[output_name] = kv_cache_dtype
+
+        if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul:
+            logger.warning(
+                f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True"
+            )
+
+        self._compile(
+            onnx_path,
+            compile_dir,
+            compile_only=True,
+            retained_state=True,
+            specializations=specializations,
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            custom_io=custom_io,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            **compiler_options,
+        )
+        return self.qpc_path
+
+    def get_onnx_dynamic_axes(self):
+        return self.model.get_onnx_dynamic_axes()
+
+    def generate(
+        self,
+        inputs: torch.Tensor,
+        streamer: Optional[TextStreamer] = None,
+        device_ids: List[int] = None,
+        runtime_ai100: bool = True,
+        generation_len: Optional[int] = None,
+    ) -> Union[torch.Tensor, np.ndarray]:
+        """
+        This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
+        ``Mandatory`` Args:
+            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
+        ``optional`` Args:
+            :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
+            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
+        Returns:
+            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
+        """
+        if not runtime_ai100:
+            raise NotImplementedError("PyTorch execution is not supported yet for this model!")
+
+        return self.cloud_ai_100_generate(
+            inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer
+        )
+
+    def auto_correct_inputs(self, inputs):
+        checked = True
+        inputs_info = self.model.get_inputs_info()
+        for valid_input_info in inputs_info:
+            if valid_input_info.name not in inputs:
+                checked = False
+                break
+            if inputs[valid_input_info.name].dtype != valid_input_info.datatype:
+                checked = False
+                break
+
+        if not checked:
+            err_str: str = (
+                "Expected following input names and shapes to be passed\n"
+                + "\n".join([val.__repr__() for val in inputs_info])
+                + "got"
+                + f"{[(k, v.shape, v.dtype) for k, v in inputs.items()]}"
+            )
+
+            raise RuntimeError(err_str)
+
+        return {k: v for k, v in inputs.items() if k in [iinfo.name for iinfo in inputs_info]}
+
+    def cloud_ai_100_generate(
+        self,
+        inputs: torch.Tensor,
+        device_ids: List[int],
+        enable_debug_logs: bool = False,
+        generation_len: int = None,
+        streamer: Optional[TextStreamer] = None,
+    ) -> np.ndarray:
+        inputs = self.auto_correct_inputs(inputs)
+        qpc_session = QAICInferenceSession(
+            self.qpc_path, device_ids, enable_debug_logs=enable_debug_logs, activate=False
+        )
+
+        batch_size, ctx_len, fbs = get_compilation_dims(self.qpc_path)
+
+        pad_token_id = 1
+
+        # Skip inputs/outputs
+        qpc_session.skip_buffers(
+            [
+                x
+                for x in qpc_session.input_names + qpc_session.output_names
+                if x.startswith("past_") or x.endswith("_RetainedState")
+            ]
+        )
+
+        # Read prompt and ctx len from session
+        batch_size = max(
+            [x[qpc_session.binding_index_map["input_ids"]][1][0] for x in qpc_session.allowed_shapes]
+            + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[0]]
+        )
+
+        prefill_seq_len = max(
+            [x[qpc_session.binding_index_map["input_ids"]][1][1] for x in qpc_session.allowed_shapes]
+            + [qpc_session.bindings[qpc_session.binding_index_map["input_ids"]].dims[1]]
+        )
+
+        input_len = inputs["attention_mask"].sum(1, keepdims=True)
+        input_ids_length = inputs["input_ids"].shape[1]
+
+        num_chunks = -(input_ids_length // -prefill_seq_len)  # ceil divide without float
+
+        padded_len = num_chunks * prefill_seq_len  # Convert to a multiple of prompt_len
+        if generation_len is None:
+            generation_len = ctx_len - input_len.max()
+
+        assert generation_len > 0, "generation length should be greater than zero"
+        generated_ids = np.full((batch_size, generation_len + 1), pad_token_id)
+
+        # Prepare inputs for prefill
+        prefill_start = perf_counter()
+
+        inputs["input_ids"] = torch.nn.functional.pad(
+            inputs["input_ids"],
+            (0, padded_len - input_ids_length),
+            "constant",
+            1,
+        )
+        inputs["attention_mask"] = torch.nn.functional.pad(
+            inputs["attention_mask"], (0, padded_len - input_ids_length), "constant", 0
+        )
+        if "cross_attention_mask" in inputs:
+            inputs["cross_attention_mask"] = torch.nn.functional.pad(
+                inputs["cross_attention_mask"], (0, 0, 0, 0, 0, padded_len - input_ids_length)
+            )
+        for k, v in inputs.items():
+            inputs[k] = np.array(v)
+
+        if "pixel_values_RetainedState" in qpc_session.output_names:
+            inputs["pixel_values"] = inputs["pixel_values"].astype("float16")
+
+        inputs["position_ids"] = np.where(inputs.pop("attention_mask"), np.arange(padded_len), -1)
+
+        qpc_session.activate()
+
+        # Run prefill
+
+        for i in range(num_chunks):
+            chunk_inputs = inputs.copy()
+            chunk_inputs["input_ids"] = inputs["input_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
+            chunk_inputs["position_ids"] = inputs["position_ids"][:, i * prefill_seq_len : (i + 1) * prefill_seq_len]
+            outputs = qpc_session.run(chunk_inputs)
+
+        prefill_time = perf_counter() - prefill_start
+        # Get first token
+        inputs["input_ids"] = outputs["logits"].argmax(2)
+        inputs["position_ids"] = input_len.numpy()
+
+        if "cross_attention_mask" in inputs:
+            bs, _, num_images, img_tiles = inputs["cross_attention_mask"].shape
+            inputs["cross_attention_mask"] = torch.ones((bs, 1, num_images, img_tiles), dtype=torch.int64).numpy()
+
+        generated_ids[:, 0] = inputs["input_ids"].squeeze(1)
+        if streamer:
+            streamer.put(inputs["input_ids"][0])
+
+        if "pixel_values_RetainedState" in qpc_session.output_names:
+            qpc_session.skip_buffers(["pixel_values"])
+            inputs.pop("pixel_values")
+
+        # Decode loop
+        decode_start = perf_counter()
+        for num_token in range(1, generation_len):
+            outputs = qpc_session.run(inputs)
+            # Prepare inputs for next iteration
+            inputs["input_ids"] = outputs["logits"].argmax(2)
+            inputs["position_ids"] += 1
+            generated_ids[:, num_token] = inputs["input_ids"].squeeze(1)
+            if streamer:
+                streamer.put(inputs["input_ids"][0])
+
+        decode_end = perf_counter()
+        if streamer:
+            streamer.end()
+
+        decode_perf = (num_token - 1) / (decode_end - decode_start)
+        total_time = decode_end - prefill_start
+        total_perf = num_token / total_time
+
+        return CloudAI100ExecInfoNew(
+            batch_size=batch_size,
+            generated_ids=generated_ids,
+            perf_metrics=PerfMetrics(
+                prefill_time=prefill_time, decode_perf=decode_perf, total_perf=total_perf, total_time=total_time
+            ),
+        )
+
+    @property
+    def model_hash(self) -> str:
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash.update(to_hashable({"QEFFAutoModelForImageTextToText1QPC": True}))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
 
     @property
     def model_name(self) -> str:
@@ -69,7 +1123,58 @@ def model_name(self) -> str:
         return mname
 
 
-class QEFFAutoModelForCausalLM(QEFFTransformersBase):
+class QEFFAutoModelForImageTextToText:
+    """
+    A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach
+    Attributes:
+        _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models.
+    """
+
+    _hf_auto_class = AutoModelForImageTextToText
+
+    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs):
+        if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload:
+            # For models with mxfp6 accuracy issue, we will use kv_offload=True by default
+            if kv_offload is None:
+                kv_offload = True
+            else:
+                logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
+        elif kv_offload is None:
+            kv_offload = False
+
+        if kv_offload:
+            return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
+        else:
+            return _QEFFAutoModelForImageTextToTextSingleQPC(model, **kwargs)
+
+    @classmethod
+    @with_replaced_quantizers
+    def from_pretrained(cls, pretrained_model_name_or_path: str, kv_offload: Optional[bool] = None, **kwargs):
+        """Used to load models supported by transformers.AutoModelForImageTextToText for Cloud AI 100.
+
+        Args:
+            pretrained_model_name_or_path (str): Path or model card name on HuggingFace
+            kv_offload (Optional[bool], optional): Should the KV of vision encoder be offloaded to CPU and use Two QPC. Defaults to None.
+
+        Returns:
+            _type_: _description_
+        """
+        # TODO: add a check to see if kv_offload is allowed for given model by loading the config and checking architecture or type of config here.
+        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
+            logger.warning('Updating attn_implementation="eager"')
+
+        if kwargs.get("low_cpu_mem_usage", None):
+            logger.warning("Updating low_cpu_mem_usage=False")
+
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        return cls(model, kv_offload=kv_offload, **kwargs)
+
+
+MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP = {"InternVLChatModel": QEFFAutoModelForImageTextToText}
+
+
+class QEFFAutoModelForCausalLM(QEFFBaseModel):
     """
     The QEFF class is designed for manipulating any causal language model from the HuggingFace hub.
     Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization.
@@ -104,16 +1209,18 @@ def __init__(
         is_tlm: bool = False,
         **kwargs,
     ):
-        model_class_name = model.__class__.__name__
-        if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")):
-            raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}")
-
         # TODO: remove from version 1.20
         if kwargs.pop("full_batch_size", None):
             continuous_batching = True
             warnings.warn(
                 "full_batch_size argument is deprecated. Use continuous_batching=True instead.", DeprecationWarning, 2
             )
+        if hasattr(model.config, "quantization_config") and not isinstance(
+            model.config.quantization_config, tuple(QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.values())
+        ):
+            logger.warning(
+                "Please use `from_pretrained` method to load quantized models, might give unexpected results"
+            )
 
         super().__init__(model)
 
@@ -127,6 +1234,16 @@ def __init__(
             self.model, transformed = SpDTransform.apply(self.model)
         self.is_tlm = is_tlm
 
+    @property
+    def model_name(self) -> str:
+        mname = self.model.__class__.__name__
+        if mname.startswith("QEff") or mname.startswith("QEFF"):
+            mname = mname[4:]
+        return mname
+
+    def __repr__(self) -> str:
+        return self.__class__.__name__ + "\n" + self.model.__repr__
+
     @classmethod
     def from_pretrained(
         cls, pretrained_model_name_or_path, continuous_batching: bool = False, is_tlm: bool = False, *args, **kwargs
@@ -135,6 +1252,7 @@ def from_pretrained(
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM.
         Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
 
+        This API can also be used as exception for VLM model since transformers support loading InternChatVL models via AutoModel API we support it via AutoModelForCausalLM API
         Args:
             :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory.
             :continuous_batching (bool): Whether this model will be used for continuous batching in future. If this is not set True here, the model can not be exported/compiled for continuous batching later.
@@ -157,16 +1275,29 @@ def from_pretrained(
             tokenizer = AutoTokenizer.from_pretrained(model_name)
             model.generate(prompts=["Hi there!!"], tokenizer=tokenizer)
         """
-
         if kwargs.pop("full_batch_size", None):
             continuous_batching = True
             warnings.warn(
                 "full_batch_size argument is deprecated. Use continuous_batching=True instead.", DeprecationWarning, 2
             )
 
-        self = super().from_pretrained(pretrained_model_name_or_path, is_tlm=is_tlm, *args, **kwargs)
-        self.continuous_batching = continuous_batching
-        return self
+        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
+            logger.warning('Updating attn_implementation="eager"')
+
+        if kwargs.get("low_cpu_mem_usage", None):
+            logger.warning("Updating low_cpu_mem_usage=False")
+
+        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
+        model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+
+        # This is support models that should be classified to in a different auto class but transformers load them via this class
+        kv_offload = kwargs.pop("kv_offload", None)
+        if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP:
+            return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__](
+                model, kv_offload=kv_offload
+            )
+
+        return cls(model, is_tlm=is_tlm, continuous_batching=continuous_batching)
 
     @property
     def model_hash(self) -> str:
@@ -313,7 +1444,7 @@ def compile(
             "batch_size": 1 if self.continuous_batching else batch_size,
             "seq_len": prefill_seq_len,
             "ctx_len": ctx_len,
-            # TODO: should be renamed to kv_cache_batch_size in specialzation too
+            # TODO: should be renamed to kv_cache_batch_size in specialization too
         }
         prefill_specialization.update({"num_logits_to_keep": 1}) if self.is_tlm else ...
         if self.continuous_batching:
@@ -419,258 +1550,3 @@ def generate(
             )
         else:
             raise NotImplementedError("Only AI_100 runtime is supported right now via generate API")
-
-
-class QEFFAutoModel(QEFFTransformersBase):
-    """
-    The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub.
-    Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization.
-
-    ``Mandatory`` Args:
-        :model (nn.Module): PyTorch model
-
-    .. code-block:: python
-
-        from QEfficient import QEFFAutoModel
-        from transformers import AutoTokenizer
-
-        # Initialize the model using from_pretrained similar to transformers.AutoModel.
-        model = QEFFAutoModel.from_pretrained("model_name")
-
-        # Now you can directly compile the model for Cloud AI 100
-        model.compile(num_cores=16)  # Considering you have a Cloud AI 100 SKU
-
-        #prepare input
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        inputs = tokenizer("My name is", return_tensors="pt")
-
-        # You can now execute the model
-        model.generate(inputs)
-    """
-
-    _hf_auto_class = AutoModel
-    _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform]
-    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
-
-    def __init__(self, model: nn.Module, **kwargs):
-        super().__init__(model)
-        self.model.config.use_cache = True
-        self.num_layers = model.config.num_hidden_layers
-
-    @classmethod
-    @with_replaced_quantizers
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        """
-        This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModel.
-        Once the model is initialized, you can use other methods such as export, compile, and generate on the same object.
-
-        Args:
-            :pretrained_name_or_path (str): Model card name from HuggingFace or local path to model directory.
-            :args, kwargs: Additional arguments to pass to transformers.AutoModel.
-
-        .. code-block:: python
-
-            from QEfficient import QEFFAutoModel
-            from transformers import AutoTokenizer
-
-            # Initialize the model using from_pretrained similar to transformers.AutoModel.
-            model = QEFFAutoModel.from_pretrained("model_name")
-
-            # Now you can directly compile the model for Cloud AI 100
-            model.compile(num_cores=16)  # Considering you have a Cloud AI 100 SKU
-
-            #prepare input
-            tokenizer = AutoTokenizer.from_pretrained(model_name)
-            inputs = tokenizer("My name is", return_tensors="pt")
-
-            # You can now execute the model
-            model.generate(inputs)
-        """
-        if kwargs.get("attn_implementation", None) not in {None, "eager"}:
-            logger.warning('Updating attn_implementation="eager"')
-
-        if kwargs.get("low_cpu_mem_usage", None):
-            logger.warning("Updating low_cpu_mem_usage=False")
-
-        kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False, "add_pooling_layer": False})
-        try:
-            model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-            warnings.warn("Removing pooling layer from the model if exist")
-        except TypeError:
-            kwargs.pop("add_pooling_layer", None)
-            model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
-        return cls(model)
-
-    @property
-    def model_hash(self) -> str:
-        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
-        # Using same card name will result in same hash. But, using a relative path for one run and
-        # absolute path for another run will result in different hash.
-        # The added complexity to resolve different paths to same location is not worth pursuing.
-        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
-
-        # Compute the hash with: model_config, transforms
-        mhash = hashlib.sha256()
-        mhash.update(to_hashable(self.model.config.to_diff_dict()))
-        mhash.update(to_hashable(self._transform_names()))
-        mhash = mhash.hexdigest()[:16]
-        return mhash
-
-    def export(self, export_dir: Optional[str] = None) -> str:
-        """
-        Exports the model to ``ONNX`` format using ``torch.onnx.export``.
-
-        ``Optional`` Args:
-           :export_dir (str, optional): The directory path to store ONNX-graph.
-
-        Returns:
-            :str: Path of the generated ``ONNX`` graph.
-        """
-        bs = constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE
-        seq_len = constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN
-
-        example_inputs = {
-            "input_ids": torch.zeros((bs, seq_len), dtype=torch.int64),
-            "attention_mask": torch.ones((bs, seq_len), dtype=torch.int64),
-        }
-
-        dynamic_axes = {"input_ids": {0: "batch_size", 1: "seq_len"}, "attention_mask": {0: "batch_size", 1: "seq_len"}}
-
-        output_names = ["output"]
-
-        return self._export(
-            example_inputs,
-            output_names,
-            dynamic_axes,
-            export_dir=export_dir,
-        )
-
-    def compile(
-        self,
-        onnx_path: Optional[str] = None,
-        compile_dir: Optional[str] = None,
-        *,
-        seq_len: int = 32,
-        batch_size: int = 1,
-        num_devices: int = 1,
-        num_cores: int = 16,  # FIXME: Make this mandatory arg
-        mxfp6_matmul: bool = False,
-        **compiler_options,
-    ) -> str:
-        """
-        This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package.
-        If the model has not been exported yet, this method will handle the export process.
-        You can pass any other arguments that the `qaic-exec` takes as extra kwargs.
-
-        ``Optional`` Args:
-            :onnx_path (str, optional): Path to pre-exported onnx model.
-            :compile_dir (str, optional): Path for saving the qpc generated.
-            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
-            :batch_size (int, optional): Batch size. ``Defaults to 1``.
-            :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
-            :num_cores (int): Number of cores used to compile the model.
-            :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
-            :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
-            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
-        Returns:
-            :str: Path of the compiled ``qpc`` package.
-        """
-
-        specializations = [
-            {"batch_size": batch_size, "seq_len": seq_len},
-        ]
-
-        return self._compile(
-            onnx_path,
-            compile_dir,
-            compile_only=True,
-            specializations=specializations,
-            convert_to_fp16=True,
-            mxfp6_matmul=mxfp6_matmul,
-            mdp_ts_num_devices=num_devices,
-            aic_num_cores=num_cores,
-            **compiler_options,
-        )
-
-    def generate(
-        self,
-        inputs: torch.Tensor,
-        device_ids: List[int] = None,
-        runtime_ai100: bool = True,
-    ) -> Union[torch.Tensor, np.ndarray]:
-        """
-        This method generates output by executing PyTorch runtime or the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
-        ``Mandatory`` Args:
-            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
-        ``optional`` Args:
-            :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
-            :runtime_ai100 (bool, optional): ``AI_100`` and ``PyTorch`` runtime is supported as of now. Defaults to ``True`` for ``AI_100`` runtime.
-        Returns:
-            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
-        """
-        # AI_100 runtime
-        if runtime_ai100:
-            if not isinstance(self.qpc_path, Path):
-                raise TypeError("Please run compile API first!")
-
-            return self.cloud_ai_100_feature_generate(inputs=inputs, device_ids=device_ids)
-        # PyTorch runtime
-        else:
-            return self.pytorch_feature_generate(model=self.model, inputs=inputs)
-
-    def cloud_ai_100_feature_generate(
-        self,
-        inputs: torch.Tensor,
-        device_ids: List[int] = [0],
-    ) -> np.ndarray:
-        """
-        Generates features with list of prompts using AI 100 runtime.
-
-        ``Mandatory`` Args:
-            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
-        ``Optional`` Args:
-            device_ids (List[int], optional): A list of device IDs to use for the session. Defaults to [0].
-
-        Returns:
-           np.ndarray: A list of dictionaries containing the generated output features.
-        """
-
-        if self.qpc_session is None:
-            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids)
-            self.batch_size = self.qpc_session.bindings[0].dims[0]
-            self.seq_len = self.qpc_session.bindings[0].dims[1]
-        # Prepare input
-        input_ids_len = inputs["input_ids"].shape[1]
-        input_ids = np.array(
-            torch.nn.functional.pad(inputs["input_ids"], (0, self.seq_len - inputs["input_ids"].size(1)), "constant", 0)
-        )
-        attention_mask = np.array(
-            torch.nn.functional.pad(
-                inputs["attention_mask"], (0, self.seq_len - inputs["attention_mask"].size(1)), "constant", 0
-            )
-        )
-
-        inputs = dict(input_ids=input_ids, attention_mask=attention_mask)
-
-        outputs = {
-            "output": np.random.randn(self.batch_size, self.seq_len, self.qpc_session.bindings[2].dims[2]).astype(
-                np.float32
-            ),
-        }
-        self.qpc_session.set_buffers(outputs)
-        outputs = self.qpc_session.run(inputs)
-        outputs = outputs["output"][:, :input_ids_len, :]
-        return outputs
-
-    def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray]) -> List[torch.Tensor]:
-        """
-        Generates features from a list of text prompts using a PyTorch model.
-
-        ``Mandatory`` Args:
-            :model: The transformed PyTorch model used for generating features.
-            :inputs (Union[torch.Tensor, np.ndarray]): inputs to run the execution.
-
-        Returns:
-            torch.Tensor: A list of output features generated by the model for each prompt.
-        """
-        return model(**inputs)
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 6b8d00689..6e107d77b 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -51,6 +51,9 @@
     LlamaModel,
     LlamaRMSNorm,
 )
+from transformers.models.llava.modeling_llava import (
+    LlavaForConditionalGeneration,
+)
 from transformers.models.mistral.modeling_mistral import (
     MistralAttention,
     MistralDecoderLayer,
@@ -69,11 +72,14 @@
 from transformers.models.mllama.modeling_mllama import (
     MllamaCrossAttentionDecoderLayer,
     MllamaForCausalLM,
+    MllamaForConditionalGeneration,
+    MllamaRotaryEmbedding,
     MllamaSelfAttentionDecoderLayer,
     MllamaTextCrossAttention,
     MllamaTextModel,
     MllamaTextRMSNorm,
     MllamaTextSelfAttention,
+    MllamaVisionModel,
 )
 from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel
 from transformers.models.phi.modeling_phi import PhiAttention, PhiDecoderLayer, PhiForCausalLM, PhiModel
@@ -98,7 +104,7 @@
     Starcoder2Model,
 )
 
-from QEfficient.base.pytorch_transforms import ModuleMappingTransform
+from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMethodMapperTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.models.codegen.modeling_codegen import (
@@ -143,12 +149,16 @@
     QEffGPTJForCausalLM,
     QEffGPTJModel,
 )
+from QEfficient.transformers.models.internvl.modeling_internvl import QEffInternVisionEmbeddings, QEffInternVLModel
 from QEfficient.transformers.models.llama.modeling_llama import (
     QEffLlamaAttention,
     QEffLlamaDecoderLayer,
     QEffLlamaForCausalLM,
     QEffLlamaModel,
 )
+from QEfficient.transformers.models.llava.modeling_llava import (
+    QEffLlavaForConditionalGeneration,
+)
 from QEfficient.transformers.models.mistral.modeling_mistral import (
     QEffMistralAttention,
     QEffMistralDecoderLayer,
@@ -165,10 +175,14 @@
 from QEfficient.transformers.models.mllama.modeling_mllama import (
     QEffMllamaCrossAttentionDecoderLayer,
     QEffMllamaForCausalLM,
+    QEffMllamaForConditionalGeneration,
+    QEffMllamaRotaryEmbedding,
     QEffMllamaSelfAttentionDecoderLayer,
-    QEffMllamaTextCrossAttention,
+    QEffMllamaTextCrossAttentionSingleQPC,
+    QEffMllamaTextCrossAttentionTwoQPC,
     QEffMllamaTextModel,
     QEffMllamaTextSelfAttention,
+    QEffMllamaVisionModel,
 )
 from QEfficient.transformers.models.mpt.modeling_mpt import (
     QEffMptAttention,
@@ -243,6 +257,8 @@ class KVCacheTransform(ModuleMappingTransform):
         LlamaDecoderLayer: QEffLlamaDecoderLayer,
         LlamaModel: QEffLlamaModel,
         LlamaForCausalLM: QEffLlamaForCausalLM,
+        # Llava
+        LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration,
         # Gemma
         GemmaAttention: QEffGemmaAttention,
         GemmaDecoderLayer: QEffGemmaDecoderLayer,
@@ -254,12 +270,15 @@ class KVCacheTransform(ModuleMappingTransform):
         Gemma2Model: QEffGemma2Model,
         Gemma2ForCausalLM: QEffGemma2ForCausalLM,
         # mllama
-        MllamaForCausalLM: QEffMllamaForCausalLM,
-        MllamaTextModel: QEffMllamaTextModel,
+        MllamaTextRMSNorm: CustomRMSNormAIC,
         MllamaTextSelfAttention: QEffMllamaTextSelfAttention,
-        MllamaTextCrossAttention: QEffMllamaTextCrossAttention,
-        MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer,
         MllamaSelfAttentionDecoderLayer: QEffMllamaSelfAttentionDecoderLayer,
+        MllamaCrossAttentionDecoderLayer: QEffMllamaCrossAttentionDecoderLayer,
+        MllamaRotaryEmbedding: QEffMllamaRotaryEmbedding,
+        MllamaVisionModel: QEffMllamaVisionModel,
+        MllamaTextModel: QEffMllamaTextModel,
+        MllamaForCausalLM: QEffMllamaForCausalLM,
+        MllamaForConditionalGeneration: QEffMllamaForConditionalGeneration,
         # Mistral
         MistralAttention: QEffMistralAttention,
         MistralDecoderLayer: QEffMistralDecoderLayer,
@@ -344,3 +363,34 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
             )
 
         return model, transformed
+
+
+class VlmKVOffloadTransform(ModuleMappingTransform):
+    # supported architectures
+    _module_mapping = {
+        # Llama
+        MllamaTextCrossAttention: QEffMllamaTextCrossAttentionTwoQPC,
+    }
+
+
+class VlmNoKVOffloadTransform(ModuleMappingTransform):
+    # supported architectures
+    _module_mapping = {
+        # Llama
+        MllamaTextCrossAttention: QEffMllamaTextCrossAttentionSingleQPC,
+    }
+
+
+class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
+    _match_string_replace_method = {
+        "InternVLChatModel": {
+            "forward": QEffInternVLModel.forward,
+            "get_dummy_inputs": QEffInternVLModel.get_dummy_inputs,
+            "get_specializations": QEffInternVLModel.get_specializations,
+            "get_onnx_dynamic_axes": QEffInternVLModel.get_onnx_dynamic_axes,
+            "get_output_names": QEffInternVLModel.get_output_names,
+            "get_inputs_info": QEffInternVLModel.get_inputs_info,
+        },
+        "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
+    }
+    _match_class_replace_method = {}
diff --git a/QEfficient/transformers/models/qwen2/modeling_qwen2.py b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
index a8562ca1f..b1db315f8 100644
--- a/QEfficient/transformers/models/qwen2/modeling_qwen2.py
+++ b/QEfficient/transformers/models/qwen2/modeling_qwen2.py
@@ -19,19 +19,141 @@
     BaseModelOutputWithPast,
     CausalLMOutputWithPast,
 )
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
 from transformers.models.qwen2.modeling_qwen2 import (
     Qwen2Attention,
+    Qwen2Config,
     Qwen2DecoderLayer,
     Qwen2ForCausalLM,
     Qwen2Model,
-    apply_rotary_pos_emb,
+    Qwen2RotaryEmbedding,
     logger,
     repeat_kv,
+    rotate_half,
 )
 
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
 
 
+#  Can be replaced with llama/modeling_llama.py::QEffLlamaRotaryEmbedding but keeping it following transformers ideology
+class QEffQwen2RotaryEmbedding(Qwen2RotaryEmbedding):
+    """
+    Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+    The only differences are:
+    - Add static sin/cos computations.
+    """
+
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[Qwen2Config] = None,
+    ):
+        super(Qwen2RotaryEmbedding, self).__init__()  # Initialize nn.Module
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:
+            logger.warning_once(
+                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.45"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            self.max_seq_len_cached = max_position_embeddings
+            self.original_max_seq_len = max_position_embeddings
+        else:
+            # BC: "rope_type" was originally "type"
+            if config.rope_scaling is not None:
+                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+            else:
+                self.rope_type = "default"
+            self.max_seq_len_cached = config.max_position_embeddings
+            self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=self.original_max_seq_len, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
+            self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
+        )
+
+
+def apply_qeff_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding with Multimodal Sections to the query and key tensors (https://qwenlm.github.io/blog/qwen2-vl/).
+
+    Explanation:
+        Multimodal 3D rotary position embedding is an extension to 1D rotary position embedding. The input embedding
+        sequence contains vision (images / videos) embedding and text embedding or just contains text embedding. For
+        vision embedding part, we apply rotary position embedding on temporal, height and width dimension seperately.
+        Here we split the channel dimension to 3 chunks for the temporal, height and width rotary position embedding.
+        For text embedding part, we just apply 1D rotary position embedding. The three rotary position index (temporal,
+        height and width) of text embedding is always the same, so the text embedding rotary position embedding has no
+        difference with modern LLMs.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        mrope_section(`List(int)`):
+            Multimodal rope section is for channel dimension of temporal, height and width in rope calculation.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+
+    return q_embed.to(q.dtype), k_embed.to(k.dtype)
+
+
 class QEffQwen2Attention(Qwen2Attention):
     """
     Copied from Qwen2Attention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2/modeling_qwen2.py
@@ -39,6 +161,20 @@ class QEffQwen2Attention(Qwen2Attention):
     - add new args position idx for the cache_kwargs for kv retention
     """
 
+    def __init__(self, config, layer_idx=None):
+        super().__init__(config, layer_idx)
+        # Define the general __qeff_init__() for any changes in the init calls
+        # Set the init in the module mapping pytorch transforms
+        self.config = config
+        self.__qeff_init__()
+
+    def __qeff_init__(self):
+        self.rotary_emb = QEffQwen2RotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -71,18 +207,8 @@ def forward(
                 )
             kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
-        if position_embeddings is None:
-            logger.warning_once(
-                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
-                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
-                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
-                "removed and `position_embeddings` will be mandatory."
-            )
-            cos, sin = self.rotary_emb(value_states, position_ids)
-        else:
-            cos, sin = position_embeddings
-
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_qeff_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
         if past_key_value is not None:
             # Update the cache_kwargs with position_ids for Cloud AI 100
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 2729267d6..6e70226f3 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,9 +8,11 @@
 import json
 import os
 import subprocess
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import requests
+import torch
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
 from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
@@ -394,3 +396,25 @@ def create_json(file_path: str, json_data: object):
             json.dump(json_data, file, indent=4)
     except Exception as e:
         print(f"Failed to create JSON File {file_path}: {e}")
+
+
+def model_swap(func):
+    def wrapper(*args, **kwargs):
+        if "model" in kwargs and kwargs["model"] is not None:
+            original_model = args[0].model
+            args[0].model = kwargs["model"]
+            onnx_path = func(*args, **kwargs)
+            args[0].model = original_model
+            return onnx_path
+
+    return wrapper
+
+
+@dataclass
+class IOInfo:
+    name: str
+    datatype: torch.dtype
+    shape: Tuple[Union[int, str], ...]
+
+    def __repr__(self):
+        return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index cc64df4bd..a5cc6fda1 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -13,6 +13,18 @@
 ROOT_DIR = os.path.dirname(QEFF_DIR)
 QEFF_CACHE_DIR_NAME = "qeff_cache"
 
+ONNX_EXPORT_EXAMPLE_BATCH_SIZE = 1
+ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32
+ONNX_EXPORT_EXAMPLE_FBS = 4
+ONNX_EXPORT_EXAMPLE_NLK = 2  # Number of Logits to Keep
+ONNX_EXPORT_OPSET = 13
+ONNX_EXPORT_MAX_NUM_IMAGES = 1
+ONNX_EXPORT_MAX_IMAGE_TILES = 4
+ONNX_EXPORT_IMAGE_WIDTH = 560
+ONNX_EXPORT_IMAGE_LENGHT = 560
+ONNX_EXPORT_IMAGE_DEPTH = 3
+ONNX_EXPORT_CTX_LEN = 1024
+
 
 # Store the qeff_models inside the ~/.cache directory or over-ride with an env variable.
 def get_models_dir():
@@ -124,6 +136,7 @@ class QnnConstants:
         "--float_bitwidth ",
         "--preserve_io_datatype",
         "--onnx_skip_simplification",
+        "--onnx_defer_loading",
     ]
 
     IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [
diff --git a/pyproject.toml b/pyproject.toml
index 9867181ca..571da78dc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.45.2",
+    "transformers==4.46.0",
     "huggingface-hub==0.27.0",
     "peft==0.13.2",
     "datasets==2.20.0",
@@ -32,6 +32,7 @@ dependencies = [
     "numpy==1.26.4",
     "protobuf==3.20.2",
     "onnxscript==0.1.0.dev20240327",
+    "pillow===10.4.0",
     "sympy",
     "tensorboard",
     "fire",

From 725a7c1464fdb57276aeb03cebb2cbb6bfe6f692 Mon Sep 17 00:00:00 2001
From: asmigosw <quic_asmigosw@quicinc.com>
Date: Mon, 17 Feb 2025 20:01:22 +0530
Subject: [PATCH 070/138] Migrating HL compile and export to infer APIs (#214)

Migrating HL compile API and export API to infer APIs

---------

Signed-off-by: Asmita Goswami <quic_asmigosw@quicinc.com>
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Co-authored-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/base/common.py                     |  75 ++++--------
 QEfficient/cloud/export.py                    |  45 +++----
 QEfficient/cloud/infer.py                     | 112 +++++++++---------
 .../exporter/export_hf_to_cloud_ai_100.py     |  28 +----
 .../transformers/models/modeling_auto.py      |   4 +
 QEfficient/transformers/transform.py          |   8 +-
 scripts/Jenkinsfile                           |   2 +
 tests/cloud/conftest.py                       |   6 +-
 tests/cloud/high_level_testing.json           |   4 +-
 tests/cloud/test_compile.py                   |  50 --------
 tests/cloud/test_compile_and_execute.py       |  80 +++++++++++++
 tests/cloud/test_execute.py                   |  41 -------
 tests/cloud/test_export.py                    |  10 --
 tests/cloud/test_infer.py                     |  20 ----
 tests/text_generation/test_text_generation.py |   1 -
 .../models/test_causal_lm_models.py           |   2 +
 tests/transformers/spd/test_spd_inference.py  |   1 +
 17 files changed, 195 insertions(+), 294 deletions(-)
 delete mode 100644 tests/cloud/test_compile.py
 create mode 100644 tests/cloud/test_compile_and_execute.py
 delete mode 100644 tests/cloud/test_execute.py

diff --git a/QEfficient/base/common.py b/QEfficient/base/common.py
index ce6b1cdc2..d94e02894 100644
--- a/QEfficient/base/common.py
+++ b/QEfficient/base/common.py
@@ -12,59 +12,20 @@
 QEFFAutoModel provides a common interface for loading the HuggingFace models using either the HF card name of local path of downloaded model.
 """
 
-import os
-from enum import Enum
-from typing import Any, Dict, Type
+from typing import Any
 
 from transformers import AutoConfig
-from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING
+from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
-from QEfficient.utils._utils import login_and_download_hf_lm
-
-
-class QEFF_MODEL_TYPE(Enum):
-    """
-    Defines Names of the different varities of transformer models.
-    """
-
-    CAUSALLM = "LLM"
-    DIFFUSION = "DIFFUSION"
-
-
-MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP: Dict[QEFF_MODEL_TYPE, Type[QEFFBaseModel]] = {
-    QEFF_MODEL_TYPE.CAUSALLM: QEFFAutoModelForCausalLM
-}
-
-AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP: Dict[Type[QEFFBaseModel], QEFF_MODEL_TYPE] = {
-    v: k for k, v in MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP.items()
-}
-
-
-def get_hf_model_type(hf_model_path: str) -> QEFF_MODEL_TYPE:
-    """
-    Loads model config file and returns the type of the model (i.e. LLMs, SD, quantized etc.) as supported by the library.
-    """
-    if not os.path.isdir(hf_model_path):
-        raise FileNotFoundError(
-            "Please pass local dir path where the model is downloaded; use `QEfficient.utils.login_and_download_hf_lm` for downloading hf model"
-        )
-    config, kwargs = AutoConfig.from_pretrained(
-        hf_model_path,
-        return_unused_kwargs=True,
-    )
-
-    if config.__class__ in MODEL_FOR_CAUSAL_LM_MAPPING:
-        return QEFF_MODEL_TYPE.CAUSALLM
-    else:
-        raise NotImplementedError(f"model type {type(config)} is not yet supported")
 
 
 class QEFFCommonLoader:
     """
     Provides HuggingFace model loading interface same as transformers APIs.
     Supports loading any model on HuggingFace.
+    Wrapper on top of Auto Classes
     """
 
     def __init__(self, *args: Any, **kwds: Any) -> None:
@@ -78,14 +39,24 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *args, **kwargs) ->
         """
         Downloads HuggingFace model if already doesn't exist locally, returns QEFFAutoModel object based on type of model.
         """
-        if not os.path.isdir(pretrained_model_name_or_path):
-            pretrained_model_name_or_path = login_and_download_hf_lm(pretrained_model_name_or_path, *args, **kwargs)
-        kwargs.pop("hf_token", None)
-        model_type = get_hf_model_type(hf_model_path=pretrained_model_name_or_path)
-        qeff_auto_model_class = MODEL_TYPE_TO_QEFF_AUTO_MODEL_MAP[model_type]
-        if not issubclass(qeff_auto_model_class, QEFFBaseModel):
-            raise Exception(f"Expected class that inherits {QEFFBaseModel}, got {type(qeff_auto_model_class)}")
-
-        return qeff_auto_model_class.from_pretrained(
-            pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs
+        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
+        architecture = config.architectures[0] if config.architectures else None
+
+        if architecture in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values():
+            model_class = QEFFAutoModelForCausalLM
+        else:
+            raise NotImplementedError(
+                f"Unknown architecture={architecture}, either use specific auto model class for loading the model or raise an issue for support!"
+            )
+
+        local_model_dir = kwargs.pop("local_model_dir", None)
+        hf_token = kwargs.pop("hf_token", None)
+        continuous_batching = True if kwargs.pop("full_batch_size", None) else False
+
+        qeff_model = model_class.from_pretrained(
+            pretrained_model_name_or_path=(local_model_dir if local_model_dir else pretrained_model_name_or_path),
+            token=hf_token,
+            continuous_batching=continuous_batching,
+            **kwargs,
         )
+        return qeff_model
diff --git a/QEfficient/cloud/export.py b/QEfficient/cloud/export.py
index 53184450e..504240b66 100644
--- a/QEfficient/cloud/export.py
+++ b/QEfficient/cloud/export.py
@@ -7,12 +7,10 @@
 
 import argparse
 import os
-from typing import Optional, Union
+from typing import Optional
 
-from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
-
-from QEfficient.exporter.export_hf_to_cloud_ai_100 import qualcomm_efficient_converter
-from QEfficient.utils import check_and_assign_cache_dir, onnx_exists
+from QEfficient.base.common import QEFFCommonLoader
+from QEfficient.utils import check_and_assign_cache_dir
 from QEfficient.utils.logging_utils import logger
 
 # Specifically for Docker images.
@@ -22,10 +20,9 @@
 def get_onnx_model_path(
     model_name: str,
     cache_dir: Optional[str] = None,
-    tokenizer: Optional[Union[PreTrainedTokenizerFast, PreTrainedTokenizer]] = None,
     hf_token: Optional[str] = None,
-    local_model_dir: Optional[str] = None,
     full_batch_size: Optional[int] = None,
+    local_model_dir: Optional[str] = None,
 ):
     """
     exports the model to onnx if pre-exported file is not found and returns onnx_model_path
@@ -39,27 +36,17 @@ def get_onnx_model_path(
         :local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
     """
-    onnx_path_exists, onnx_dir_path, onnx_model_path = onnx_exists(model_name, full_batch_size)
-    if onnx_path_exists:
-        logger.info(f"Pre-exported ONNX files found at {onnx_dir_path}! Jumping to Compilation")
-    else:
-        ###################
-        # hf model -> export
-        ####################
-        # Export to the Onnx
-        logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
-        _, onnx_model_path = qualcomm_efficient_converter(
-            model_name=model_name,
-            local_model_dir=local_model_dir,
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            kv=True,
-            form_factor="cloud",
-            hf_token=hf_token,
-            cache_dir=cache_dir,
-            full_batch_size=full_batch_size,
-        )  # type: ignore
-        logger.info(f"Generated onnx_path: {onnx_model_path}, onnx_dir_path: {onnx_dir_path}")
+    logger.info(f"Exporting Pytorch {model_name} model to ONNX...")
+
+    qeff_model = QEFFCommonLoader.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        cache_dir=cache_dir,
+        hf_token=hf_token,
+        full_batch_size=full_batch_size,
+        local_model_dir=local_model_dir,
+    )
+    onnx_model_path = qeff_model.export()
+    logger.info(f"Generated onnx_path: {onnx_model_path}")
     return onnx_model_path
 
 
@@ -92,8 +79,8 @@ def main(
         model_name=model_name,
         cache_dir=cache_dir,
         hf_token=hf_token,
-        local_model_dir=local_model_dir,
         full_batch_size=full_batch_size,
+        local_model_dir=local_model_dir,
     )
 
 
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 0ba0961e3..4b43c8ded 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -7,13 +7,11 @@
 
 import argparse
 import logging
-import os
+import sys
 from typing import List, Optional
 
-import QEfficient
-from QEfficient.cloud.export import get_onnx_model_path
-from QEfficient.generation.text_generation_inference import cloud_ai_100_exec_kv
-from QEfficient.utils import check_and_assign_cache_dir, get_qpc_dir_path, load_hf_tokenizer, qpc_exists
+from QEfficient.base.common import QEFFCommonLoader
+from QEfficient.utils import check_and_assign_cache_dir, load_hf_tokenizer
 from QEfficient.utils.logging_utils import logger
 
 
@@ -38,6 +36,7 @@ def main(
     allow_mxint8_mdp_io: bool = False,
     enable_qnn: Optional[bool] = False,
     qnn_config: Optional[str] = None,
+    **kwargs,
 ) -> None:
     """
     1. Check if compiled qpc for given config already exists, if it does jump to execute, else
@@ -79,58 +78,47 @@ def main(
         hf_token=hf_token,
     )
 
-    qpc_dir_path = get_qpc_dir_path(
-        model_name,
-        num_cores,
-        mos,
-        batch_size,
-        prompt_len,
-        ctx_len,
-        mxfp6,
-        mxint8,
-        device_group,
-        full_batch_size,
-        enable_qnn=enable_qnn,
-    )
+    if "--mxfp6" in sys.argv:
+        if args.mxfp6:
+            logger.warning("mxfp6 is going to be deprecated in a future release, use -mxfp6_matmul instead.")
+    if "--mxint8" in sys.argv:
+        if args.mxint8:
+            logger.warning("mxint8 is going to be deprecated in a future release, use -mxint8_kv_cache instead.")
 
-    # Handle qpc generation
-    if qpc_exists(qpc_dir_path):
-        logger.info(f"Pre-compiled qpc found at {qpc_dir_path}! Executing with given prompt")
-    else:
-        # Handle onnx model generation
-        onnx_model_path = get_onnx_model_path(
-            model_name, cache_dir, tokenizer, hf_token, local_model_dir, full_batch_size
-        )  # , base_dir_name)
+    qeff_model = QEFFCommonLoader.from_pretrained(
+        pretrained_model_name_or_path=model_name,
+        cache_dir=cache_dir,
+        hf_token=hf_token,
+        full_batch_size=full_batch_size,
+        local_model_dir=local_model_dir,
+    )
 
-        #########
-        # Compile
-        #########
-        _ = QEfficient.compile(
-            onnx_path=onnx_model_path,
-            qpc_path=os.path.dirname(
-                qpc_dir_path
-            ),  # We need to pass parent directory of qpc_dir_path, as the compile function handles the qpcs directory creation
-            num_cores=num_cores,
-            batch_size=batch_size,
-            prompt_len=prompt_len,
-            ctx_len=ctx_len,
-            mxfp6=mxfp6,
-            mxint8=mxint8,
-            aic_enable_depth_first=aic_enable_depth_first,
-            mos=mos,
-            device_group=device_group,
-            full_batch_size=full_batch_size,
-            allow_mxint8_mdp_io=allow_mxint8_mdp_io,
-            enable_qnn=enable_qnn,
-            qnn_config=qnn_config,
-        )
+    #########
+    # Compile
+    #########
+    _ = qeff_model.compile(
+        prefill_seq_len=prompt_len,
+        ctx_len=ctx_len,
+        num_cores=num_cores,
+        mxfp6_matmul=mxfp6,
+        aic_enable_depth_first=aic_enable_depth_first,
+        batch_size=batch_size,
+        mos=mos,
+        mxint8_kv_cache=mxint8,
+        num_devices=(0 if device_group is None else len(device_group)),
+        full_batch_size=full_batch_size,
+        allow_mxint8_mdp_io=allow_mxint8_mdp_io,
+        enable_qnn=enable_qnn,
+        qnn_config=qnn_config,
+        **kwargs,
+    )
 
     #########
     # Execute
     #########
-    cloud_ai_100_exec_kv(
-        tokenizer=tokenizer,
-        qpc_path=qpc_dir_path,
+    _ = qeff_model.generate(
+        tokenizer,
+        prompts=prompt,
         device_id=device_group,
         prompt=prompt,
         prompts_txt_file_path=prompts_txt_file_path,
@@ -162,10 +150,16 @@ def main(
     )
     parser.add_argument("--ctx-len", "--ctx_len", default=128, type=int, help="Context length for text generation.")
     parser.add_argument(
-        "--mxfp6", action="store_true", help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression"
+        "--mxfp6",
+        "--mxfp6_matmul",
+        "--mxfp6-matmul",
+        action="store_true",
+        help="Compress constant MatMul weights to MXFP6 E2M3, default is no compression",
     )
     parser.add_argument(
         "--mxint8",
+        "--mxint8_kv_cache",
+        "--mxint8-kv-cache",
         action="store_true",
         help="Compress Present/Past KV to MXINT8 using CustomIO config, default is False",
     )
@@ -237,8 +231,18 @@ def main(
         type=str,
     )
 
-    args = parser.parse_args()
+    args, compiler_options = parser.parse_known_args()
+    compiler_options_dict = {}
+    for i in range(0, len(compiler_options)):
+        if compiler_options[i].startswith("--"):
+            key = compiler_options[i].lstrip("-")
+            value = (
+                compiler_options[i + 1]
+                if i + 1 < len(compiler_options) and not compiler_options[i + 1].startswith("-")
+                else True
+            )
+            compiler_options_dict[key] = value
     if args.verbose:
         logger.setLevel(logging.INFO)
     del args.verbose  # type: ignore
-    main(**args.__dict__)
+    main(**args.__dict__, **compiler_options_dict)
diff --git a/QEfficient/exporter/export_hf_to_cloud_ai_100.py b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
index 276faf94c..6b6cbe18a 100644
--- a/QEfficient/exporter/export_hf_to_cloud_ai_100.py
+++ b/QEfficient/exporter/export_hf_to_cloud_ai_100.py
@@ -13,7 +13,7 @@
 import torch
 from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE, QEFFCommonLoader
+from QEfficient.base.common import QEFFCommonLoader
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.exporter.export_utils import export_onnx, fix_onnx_fp16, generate_input_files, run_model_on_ort
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
@@ -307,30 +307,6 @@ def export_kvstyle_transformed_model_to_onnx(
     return model_name
 
 
-def export_for_cloud(
-    model_name: str,
-    qeff_model: QEFFBaseModel,
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    onnx_dir_path: str,
-    seq_length: int = Constants.SEQ_LEN,
-    full_batch_size: Optional[int] = None,
-) -> str:
-    # FIXME: move all this to class instead of here, and just call qeff_model.export here.
-    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(qeff_model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:  # type: ignore
-        return export_lm_model_for_cloud(
-            model_name=model_name,
-            qeff_model=qeff_model,  # type: ignore
-            tokenizer=tokenizer,
-            onnx_dir_path=onnx_dir_path,
-            seq_length=seq_length,
-            full_batch_size=full_batch_size,
-        )
-    else:
-        raise NotImplementedError(
-            f"Only model type {QEFFAutoModelForCausalLM.__class__.__name__} is supported for export, got {type(qeff_model)}"
-        )
-
-
 def export_lm_model_for_cloud(
     model_name: str,
     qeff_model: QEFFAutoModelForCausalLM,
@@ -434,7 +410,7 @@ def qualcomm_efficient_converter(
     )
 
     if form_factor == "cloud":
-        generated_onnx_model_path = export_for_cloud(
+        generated_onnx_model_path = export_lm_model_for_cloud(
             model_name=model_name,
             qeff_model=model_kv,
             tokenizer=tokenizer,
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index ee4d9776c..bc5fe54c2 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1209,6 +1209,10 @@ def __init__(
         is_tlm: bool = False,
         **kwargs,
     ):
+        model_class_name = model.__class__.__name__
+        if not (model_class_name.endswith("ForCausalLM") or model_class_name.endswith("LMHeadModel")):
+            raise TypeError(f"Required pytorch module for CausalLM or LMHeadModel, got {model_class_name}")
+
         # TODO: remove from version 1.20
         if kwargs.pop("full_batch_size", None):
             continuous_batching = True
diff --git a/QEfficient/transformers/transform.py b/QEfficient/transformers/transform.py
index 8bb084fbf..f4024a1f3 100644
--- a/QEfficient/transformers/transform.py
+++ b/QEfficient/transformers/transform.py
@@ -10,7 +10,6 @@
 import torch.nn as nn
 import transformers
 
-from QEfficient.base.common import AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP, QEFF_MODEL_TYPE
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_utils import TransformersToQEffModulesDict
@@ -96,8 +95,5 @@ def transform(model: QEFFBaseModel, form_factor="cloud"):
     if form_factor != "cloud":
         raise ValueError("Only form_factor='cloud' is supported as of now!")
     # FIXME: move this to class and use model.transform()
-    if AUTO_MODEL_MAP_TO_MODEL_TYPE_MAP.get(model.__class__, None) == QEFF_MODEL_TYPE.CAUSALLM:
-        transform_lm(model.model)  # type: ignore
-        return model
-    else:
-        raise NotImplementedError(f"Recieved unsupported class of type {type(model)}")
+    transform_lm(model.model)  # type: ignore
+    return model
diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index bbfb38fd2..eafe29fd1 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -69,6 +69,8 @@ pipeline {
                        timeout(time: 15, unit: 'MINUTES') {
                            sh '''
                            sudo docker exec ${BUILD_TAG} bash -c "
+                    	   source /qnn_sdk/bin/envsetup.sh &&
+                      	   source /qnn_sdk/bin/envcheck -c &&
                            cd /efficient-transformers &&
                            . preflight_qeff/bin/activate &&
                            mkdir -p $PWD/cli &&
diff --git a/tests/cloud/conftest.py b/tests/cloud/conftest.py
index 0810afb7d..7e880d005 100644
--- a/tests/cloud/conftest.py
+++ b/tests/cloud/conftest.py
@@ -242,7 +242,7 @@ def pytest_collection_modifyitems(config, items):
     ----------
     Ref: https://docs.pytest.org/en/4.6.x/reference.html#collection-hooks
     """
-    run_first = ["test_export", "test_compile", "test_execute", "test_infer"]
+    run_first = ["test_export", "test_infer"]
     modules_name = {item.module.__name__ for item in items}
     cloud_modules = []
     non_cloud_modules = []
@@ -279,7 +279,7 @@ def pytest_collection_modifyitems(config, items):
         first_model = items[0].callspec.params["model_name"] if hasattr(items[0], "callspec") else None
 
         for item in items:
-            if item.module.__name__ in ["test_export", "test_compile", "test_execute", "test_infer"]:
+            if item.module.__name__ in ["test_export", "test_compile_and_execute", "test_infer"]:
                 if hasattr(item, "callspec"):
                     params = item.callspec.params
                     if not params["enable_qnn"] and params["qnn_config"] is not None:
@@ -289,7 +289,7 @@ def pytest_collection_modifyitems(config, items):
                     if params["enable_qnn"]:
                         item.add_marker(pytest.mark.qnn)
 
-            if item.module.__name__ in ["test_export", "test_compile", "test_execute"]:
+            if item.module.__name__ in ["test_export", "test_compile_and_execute"]:
                 if hasattr(item, "callspec"):
                     params = item.callspec.params
                     if params["model_name"] != first_model:
diff --git a/tests/cloud/high_level_testing.json b/tests/cloud/high_level_testing.json
index fb4d7c19f..d30382dc6 100644
--- a/tests/cloud/high_level_testing.json
+++ b/tests/cloud/high_level_testing.json
@@ -1,6 +1,6 @@
 {
     "license": "SEE LICENSE IN LICENSE FILE",
-    "model_name" : ["gpt2","lu-vae/llama-68m-fft"],
+    "model_name" : ["gpt2"],
     "num_cores" : [16],
     "prompt" : ["My name is"],
     "prompts_txt_file_path" : ["examples/prompts.txt"],
@@ -9,7 +9,7 @@
     "cache_dir" : [null],
     "hf_token" : [null],
     "batch_size" : [1],
-    "prompt_len" : [2,32],
+    "prompt_len" : [32],
     "ctx_len" : [128],
     "mxfp6" : [1],
     "mxint8" : [1],
diff --git a/tests/cloud/test_compile.py b/tests/cloud/test_compile.py
deleted file mode 100644
index 9bfe39647..000000000
--- a/tests/cloud/test_compile.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-
-import os
-
-import pytest
-
-import QEfficient
-import QEfficient.cloud.compile
-
-
-@pytest.mark.cli
-def test_compile(setup, mocker):
-    """
-    test_compile is a HL compile api testing function,
-    checks compile api code flow, object creations, internal api calls, internal returns.
-    ---------
-    Parameters:
-    setup: is a fixture defined in conftest.py module.
-    mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
-    """
-    ms = setup
-    for onnx_model_path in ms.onnx_model_path():
-        if os.path.isfile(onnx_model_path):
-            break
-    else:
-        raise RuntimeError(f"onnx file not found: {ms.onnx_model_path()}")
-    QEfficient.compile(
-        onnx_path=onnx_model_path,
-        qpc_path=os.path.dirname(ms.qpc_dir_path()),
-        num_cores=ms.num_cores,
-        device_group=ms.device_group,
-        aic_enable_depth_first=ms.aic_enable_depth_first,
-        mos=ms.mos,
-        batch_size=ms.batch_size,
-        prompt_len=ms.prompt_len,
-        ctx_len=ms.ctx_len,
-        mxfp6=ms.mxfp6,
-        mxint8=ms.mxint8,
-        full_batch_size=ms.full_batch_size,
-        enable_qnn=ms.enable_qnn,
-    )
-
-    assert os.path.isdir(ms.qpc_dir_path())
-    assert os.path.isfile(ms.specialization_json_path())
-    assert os.path.isfile(ms.custom_io_file_path())
diff --git a/tests/cloud/test_compile_and_execute.py b/tests/cloud/test_compile_and_execute.py
new file mode 100644
index 000000000..9471f6582
--- /dev/null
+++ b/tests/cloud/test_compile_and_execute.py
@@ -0,0 +1,80 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import os
+
+import pytest
+import yaml
+
+import QEfficient
+from QEfficient.cloud.execute import main as execute
+from QEfficient.cloud.export import get_onnx_model_path
+
+
+@pytest.mark.on_qaic
+@pytest.mark.cli
+def test_compile(setup, mocker):
+    """
+    test_compile is a HL compile api testing function,
+    checks compile api code flow, object creations, internal api calls, internal returns.
+    ---------
+    Parameters:
+    setup: is a fixture defined in conftest.py module.
+    mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
+    """
+    ms = setup
+    onnx_model_path = get_onnx_model_path(
+        model_name=ms.model_name,
+        cache_dir=ms.cache_dir,
+        hf_token=ms.hf_token,
+        full_batch_size=ms.full_batch_size,
+        local_model_dir=ms.local_model_dir,
+    )
+
+    base_key = "past_key."
+    base_value = "past_value."
+    precision = "float16"
+
+    data = []
+
+    for i in range(12):
+        data.append({"IOName": f"{base_key}{i}", "Precision": precision})
+        data.append({"IOName": f"{base_value}{i}", "Precision": precision})
+
+    for i in range(12):
+        data.append({"IOName": f"{base_key}{i}_RetainedState", "Precision": precision})
+        data.append({"IOName": f"{base_value}{i}_RetainedState", "Precision": precision})
+
+    with open(((onnx_model_path.parent) / "custom_io.yaml"), "w") as file:
+        yaml.dump(data, file)
+
+    qpc_path = QEfficient.compile(
+        onnx_path=onnx_model_path,
+        qpc_path=os.path.dirname(ms.qpc_dir_path()),
+        num_cores=ms.num_cores,
+        device_group=ms.device_group,
+        custom_io_file_path=(onnx_model_path.parent) / "custom_io.yaml",
+        aic_enable_depth_first=ms.aic_enable_depth_first,
+        mos=ms.mos,
+        batch_size=ms.batch_size,
+        prompt_len=ms.prompt_len,
+        ctx_len=ms.ctx_len,
+        mxfp6=ms.mxfp6,
+        mxint8=ms.mxint8,
+        full_batch_size=ms.full_batch_size,
+        enable_qnn=ms.enable_qnn,
+    )
+
+    execute(
+        model_name=ms.model_name,
+        qpc_path=qpc_path,
+        prompt=ms.prompt,
+        prompts_txt_file_path=ms.prompts_txt_file_path,
+        generation_len=ms.generation_len,
+        hf_token=ms.hf_token,
+        full_batch_size=ms.full_batch_size,
+    )
diff --git a/tests/cloud/test_execute.py b/tests/cloud/test_execute.py
deleted file mode 100644
index a35c2c3bb..000000000
--- a/tests/cloud/test_execute.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-
-import pytest
-
-import QEfficient
-import QEfficient.cloud.execute
-from QEfficient.cloud.execute import main as execute
-
-
-@pytest.mark.on_qaic
-@pytest.mark.cli
-def test_execute(setup, mocker):
-    """
-    test_execute is a HL execute api testing function,
-    checks execute api code flow, object creations, internal api calls, internal returns.
-    ---------
-    Parameters:
-    setup: is a fixture defined in conftest.py module.
-    mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
-    """
-    ms = setup
-    load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.execute, "load_hf_tokenizer")
-    cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.execute, "cloud_ai_100_exec_kv")
-
-    execute(
-        model_name=ms.model_name,
-        qpc_path=ms.qpc_dir_path(),
-        prompt=ms.prompt,
-        prompts_txt_file_path=ms.prompts_txt_file_path,
-        generation_len=ms.generation_len,
-        hf_token=ms.hf_token,
-        full_batch_size=ms.full_batch_size,
-    )
-
-    load_hf_tokenizer_spy.assert_called_once()
-    cloud_ai_100_exec_kv_spy.assert_called_once()
diff --git a/tests/cloud/test_export.py b/tests/cloud/test_export.py
index 4291da23a..a2b717634 100644
--- a/tests/cloud/test_export.py
+++ b/tests/cloud/test_export.py
@@ -5,12 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-import os
 
 import pytest
 
-import QEfficient
-import QEfficient.cloud.export
 from QEfficient.cloud.export import main as export
 
 
@@ -25,8 +22,6 @@ def test_export(setup, mocker):
     mocker: mocker is itself a pytest fixture, uses to mock or spy internal functions.
     """
     ms = setup
-    check_and_assign_cache_dir_spy = mocker.spy(QEfficient.cloud.export, "check_and_assign_cache_dir")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.export, "get_onnx_model_path")
 
     export(
         model_name=ms.model_name,
@@ -34,8 +29,3 @@ def test_export(setup, mocker):
         local_model_dir=ms.local_model_dir,
         full_batch_size=ms.full_batch_size,
     )
-
-    check_and_assign_cache_dir_spy.assert_called_once()
-    get_onnx_model_path_spy.assert_called_once()
-    assert any(os.path.isfile(x) for x in ms.onnx_model_path())
-    assert get_onnx_model_path_spy.spy_return in ms.onnx_model_path()
diff --git a/tests/cloud/test_infer.py b/tests/cloud/test_infer.py
index e28c3a38a..b4e9a4baa 100644
--- a/tests/cloud/test_infer.py
+++ b/tests/cloud/test_infer.py
@@ -5,12 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
-import os
 
 import pytest
 
-import QEfficient
-import QEfficient.cloud.infer
 from QEfficient.cloud.infer import main as infer
 
 
@@ -30,12 +27,6 @@ def test_infer(setup, mocker):
     Ref: https://pytest-mock.readthedocs.io/en/latest/usage.html
     """
     ms = setup
-    load_hf_tokenizer_spy = mocker.spy(QEfficient.cloud.infer, "load_hf_tokenizer")
-    qpc_exists_spy = mocker.spy(QEfficient.cloud.infer, "qpc_exists")
-    get_onnx_model_path_spy = mocker.spy(QEfficient.cloud.infer, "get_onnx_model_path")
-    compile_spy = mocker.spy(QEfficient, "compile")
-    cloud_ai_100_exec_kv_spy = mocker.spy(QEfficient.cloud.infer, "cloud_ai_100_exec_kv")
-
     infer(
         model_name=ms.model_name,
         num_cores=ms.num_cores,
@@ -54,14 +45,3 @@ def test_infer(setup, mocker):
         full_batch_size=ms.full_batch_size,
         enable_qnn=ms.enable_qnn,
     )
-    # tokenizer check
-    load_hf_tokenizer_spy.assert_called_once()
-    # qpc exist check
-    qpc_exists_spy.assert_called_once()
-    if qpc_exists_spy.spy_return is True:
-        assert os.path.isdir(ms.qpc_dir_path())
-    else:
-        get_onnx_model_path_spy.assert_called_once()
-        compile_spy.assert_called_once()
-        assert compile_spy.spy_return == ms.qpc_dir_path()
-    cloud_ai_100_exec_kv_spy.assert_called_once()
diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py
index b8915859e..a1e4265ee 100644
--- a/tests/text_generation/test_text_generation.py
+++ b/tests/text_generation/test_text_generation.py
@@ -91,7 +91,6 @@ def test_generate_text_stream(
     text_generator = TextGeneration(
         tokenizer=tokenizer,
         qpc_path=qpc_path,
-        device_id=device_id,
         ctx_len=ctx_len,
         full_batch_size=full_batch_size,
     )
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 8f23fac89..3e19e1257 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -225,6 +225,7 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer)
 
 
+@pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
 @pytest.mark.on_qaic
 @pytest.mark.parametrize("model_name", spd_test_models)
 def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
@@ -233,6 +234,7 @@ def test_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
     ``Mandatory`` Args:
         :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
     """
+
     if model_name == "microsoft/Phi-3-mini-4k-instruct":
         n_layer = 2  # test only 2 layer models
     else:
diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py
index b9f07e4b6..9c6c7a2de 100644
--- a/tests/transformers/spd/test_spd_inference.py
+++ b/tests/transformers/spd/test_spd_inference.py
@@ -92,6 +92,7 @@ def split_dlm_bonus_token_inputs(dlm_decode_inputs):
     return bonus_token_inputs, dlm_decode_inputs
 
 
+@pytest.mark.skip()  # remove when the SDK 1.20.0 issue solved for compiling this model
 @pytest.mark.parametrize(
     "prompts, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, draft_model_name, target_model_name, full_batch_size",
     configs,

From 5e37dfcff6fb9494fc8e08f1e7bd30af0dc0cf9f Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Tue, 18 Feb 2025 12:44:08 +0530
Subject: [PATCH 071/138] Hotfix-1 for Intern model (#270)

Hotfix-1 for Intern model to handle `kv_offload`

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/modeling_auto.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index bc5fe54c2..62d61931f 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1291,11 +1291,13 @@ def from_pretrained(
         if kwargs.get("low_cpu_mem_usage", None):
             logger.warning("Updating low_cpu_mem_usage=False")
 
+        kv_offload = kwargs.pop("kv_offload", None)
+
         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
 
         # This is support models that should be classified to in a different auto class but transformers load them via this class
-        kv_offload = kwargs.pop("kv_offload", None)
+
         if model.__class__.__name__ in MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP:
             return MISCLASSIFIED_CAUSAL_LM_TO_QEFF_AUTO_CLASS_MAP[model.__class__.__name__](
                 model, kv_offload=kv_offload

From f68dd8da5f05681ac50e7910b744c06eef4dbb14 Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Date: Tue, 18 Feb 2025 14:09:11 +0530
Subject: [PATCH 072/138] [Readme Update] : Deepseek Distills Models Added
 (#263)

Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 docs/source/validate.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/validate.md b/docs/source/validate.md
index 36b660856..d0083545d 100644
--- a/docs/source/validate.md
+++ b/docs/source/validate.md
@@ -9,6 +9,8 @@
 | [CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) |✔️ |
 | [CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) |✔️ |
 | [Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1) |✔️ |
+| [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|✔️ |
+| [DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)|✔️ |
 | [Falcon-40b](https://huggingface.co/tiiuae/falcon-40b) |✔️ |
 | [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) |✔️ |
 | [GPT2](https://huggingface.co/openai-community/gpt2) |✔️ |

From b3bb4be3c1e1f826fbdefe877568d4a0ee636e2d Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Tue, 18 Feb 2025 14:14:23 +0530
Subject: [PATCH 073/138] Enabling FP8 models for `Cloud AI 100` (#248)

This PR enables support for FP8 models by dequantizing the `FP8` values
to `float32` and later these can be compressed to `mxfp6` with `Cloud AI
100 Apps SDK`.

* Added Quantizer for loading FP8 weights from transformers
* Added CompressedTensorsFP8Linear method for executing fp8 linear layer
* Added Compressed layer to linear layer transform

---------

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |  10 +-
 .../transformers/models/modeling_auto.py      |  14 +-
 QEfficient/transformers/quantizers/auto.py    |  41 +-
 .../quantizers/quant_transforms.py            |  19 +-
 .../quantizer_compressed_tensors.py           | 372 ++++++++++++++++++
 README.md                                     |   2 +-
 .../models/test_causal_lm_models.py           |   3 +
 7 files changed, 445 insertions(+), 16 deletions(-)
 create mode 100644 QEfficient/transformers/quantizers/quantizer_compressed_tensors.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 0481ace3e..73c5abc06 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -5,6 +5,8 @@
 #
 # -----------------------------------------------------------------------------
 
+from QEfficient.utils.logging_utils import logger
+
 
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
@@ -20,11 +22,10 @@ def check_qaic_sdk():
         return False
 
 
-QAIC_INSTALLED = check_qaic_sdk()
-
 # Conditionally import QAIC-related modules if the SDK is installed
 __version__ = "0.0.1.dev0"
-if QAIC_INSTALLED:
+
+if check_qaic_sdk():
     from QEfficient.base import (
         QEFFAutoModel,
         QEFFAutoModelForCausalLM,
@@ -52,6 +53,5 @@ def check_qaic_sdk():
         "QEFFCommonLoader",
     ]
 
-    print("QAIC SDK is installed.")
 else:
-    print("QAIC SDK is not installed. Proceeding without it.")
+    logger.warning("QAIC SDK is not installed, eager mode features won't be available!")
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 62d61931f..04eeb6e6b 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -37,7 +37,11 @@
     VlmNoKVOffloadTransform,
 )
 from QEfficient.transformers.quantizers.auto import QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING, with_replaced_quantizers
-from QEfficient.transformers.quantizers.quant_transforms import AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform
+from QEfficient.transformers.quantizers.quant_transforms import (
+    AwqToMatmulNbitsTransform,
+    FP8DeQuantLinearToLinearTransform,
+    GPTQToMatmulNbitsTransform,
+)
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
@@ -1199,7 +1203,13 @@ class QEFFAutoModelForCausalLM(QEFFBaseModel):
     """
 
     _hf_auto_class = AutoModelForCausalLM
-    _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform]
+    _pytorch_transforms = [
+        AwqToMatmulNbitsTransform,
+        GPTQToMatmulNbitsTransform,
+        FP8DeQuantLinearToLinearTransform,
+        CustomOpsTransform,
+        KVCacheTransform,
+    ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(
diff --git a/QEfficient/transformers/quantizers/auto.py b/QEfficient/transformers/quantizers/auto.py
index f4cec3b54..6c522f1d8 100644
--- a/QEfficient/transformers/quantizers/auto.py
+++ b/QEfficient/transformers/quantizers/auto.py
@@ -7,16 +7,43 @@
 
 from transformers.quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING, AUTO_QUANTIZER_MAPPING
 from transformers.quantizers.quantizer_awq import AwqQuantizer
+from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer
 from transformers.quantizers.quantizer_gptq import GptqHfQuantizer
-from transformers.utils.quantization_config import AwqConfig, GPTQConfig
+from transformers.utils.quantization_config import AwqConfig, CompressedTensorsConfig, GPTQConfig
 
 from QEfficient.transformers.quantizers.quantizer_awq import QEffAwqConfig, QEffAwqQuantizer
+from QEfficient.transformers.quantizers.quantizer_compressed_tensors import (
+    QEffCompressedTensorsConfig,
+    QEffCompressedTensorsFP8Quantizer,
+    QEffFP8Config,
+    QEffFP8Quantizer,
+)
 from QEfficient.transformers.quantizers.quantizer_gptq import QEffGPTQConfig, QEffGPTQQuantizer
 
-QEFF_AUTO_QUANTIZER_MAPPING = {"awq": QEffAwqQuantizer, "gptq": QEffGPTQQuantizer}
-QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING = {"awq": QEffAwqConfig, "gptq": QEffGPTQConfig}
-DUPLICATE_AUTO_QUANTIZER_MAPPING = {"awq": AwqQuantizer, "gptq": GptqHfQuantizer}
-DUPLICATE_AUTO_QUANTIZATION_CONFIG_MAPPING = {"awq": AwqConfig, "gptq": GPTQConfig}
+QEFF_AUTO_QUANTIZER_MAPPING = {
+    "awq": QEffAwqQuantizer,
+    "gptq": QEffGPTQQuantizer,
+    "compressed-tensors": QEffCompressedTensorsFP8Quantizer,
+    "fp8": QEffFP8Quantizer,
+}
+QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING = {
+    "awq": QEffAwqConfig,
+    "gptq": QEffGPTQConfig,
+    "compressed-tensors": QEffCompressedTensorsConfig,
+    "fp8": QEffFP8Config,
+}
+DUPLICATE_AUTO_QUANTIZER_MAPPING = {
+    "awq": AwqQuantizer,
+    "gptq": GptqHfQuantizer,
+    "compressed-tensors": CompressedTensorsHfQuantizer,
+    "fp8": None,
+}
+DUPLICATE_AUTO_QUANTIZATION_CONFIG_MAPPING = {
+    "awq": AwqConfig,
+    "gptq": GPTQConfig,
+    "compressed-tensors": CompressedTensorsConfig,
+    "fp8": None,
+}
 
 
 def with_replaced_quantizers(func):
@@ -26,11 +53,11 @@ def wrapper(*args, **kwargs):
 
         for k in QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING.keys():
             # Replace quantization config
-            transformers_replaced_quantization_config_mapping[k] = AUTO_QUANTIZATION_CONFIG_MAPPING[k]
+            transformers_replaced_quantization_config_mapping[k] = AUTO_QUANTIZATION_CONFIG_MAPPING.get(k, None)
             AUTO_QUANTIZATION_CONFIG_MAPPING[k] = QEFF_AUTO_QUANTIZATION_CONFIG_MAPPING[k]
 
             # Replace quantizer
-            transformers_replaced_quantizer_mapping[k] = AUTO_QUANTIZER_MAPPING[k]
+            transformers_replaced_quantizer_mapping[k] = AUTO_QUANTIZER_MAPPING.get(k, None)
             AUTO_QUANTIZER_MAPPING[k] = QEFF_AUTO_QUANTIZER_MAPPING[k]
 
         # Call the function for loading quantized models here
diff --git a/QEfficient/transformers/quantizers/quant_transforms.py b/QEfficient/transformers/quantizers/quant_transforms.py
index b20d8335b..5cf2d6d01 100644
--- a/QEfficient/transformers/quantizers/quant_transforms.py
+++ b/QEfficient/transformers/quantizers/quant_transforms.py
@@ -12,6 +12,7 @@
 from QEfficient.customop.matmulnbits import QuantLinearORT
 from QEfficient.transformers.quantizers.awq import WQLinear_GEMM
 from QEfficient.transformers.quantizers.gptq import QuantLinearGPTQ
+from QEfficient.transformers.quantizers.quantizer_compressed_tensors import FP8DeQuantLinear
 from QEfficient.transformers.quantizers.quantizer_utils import dequantize_gptq, unpack_weights
 
 
@@ -77,7 +78,7 @@ def mutate(cls, original_module: nn.Module, parent_module: nn.Module):
             parent_module (nn.Module): The parent module containing the original module.
 
         Returns:
-            :nn.Module: The new ``QuantLinearORT`` module with unpacked and dequantized weights.
+            :nn.Module: The new ``QuantLinearORT`` module with unpacked and de-quantized weights.
         """
 
         fp16_weight, scales, zeros = cls.unpack_and_dequantize_gptq(
@@ -98,3 +99,19 @@ def mutate(cls, original_module: nn.Module, parent_module: nn.Module):
         new_module.bias = original_module.bias if original_module.bias is not None else None
         new_module.pack(original_module, scales.T, zeros.T, original_module.g_idx)
         return new_module
+
+
+class FP8DeQuantLinearToLinearTransform(ModuleMutatorTransform):
+    _match_class = FP8DeQuantLinear
+
+    @classmethod
+    def mutate(cls, original_module, parent_module):
+        #  -- de-quantizing the weights --
+        dequant_weights = original_module.weight.to(torch.float32) * original_module.weight_scale
+        dequant_linear_layer = nn.Linear(
+            original_module.in_features, original_module.out_features, bias=original_module.bias is not None
+        )
+        dequant_linear_layer.weight = torch.nn.Parameter(dequant_weights)
+        if original_module.bias is not None:
+            dequant_linear_layer.bias = torch.nn.Parameter(original_module.bias.float())
+        return dequant_linear_layer
diff --git a/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py
new file mode 100644
index 000000000..b705086f8
--- /dev/null
+++ b/QEfficient/transformers/quantizers/quantizer_compressed_tensors.py
@@ -0,0 +1,372 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from dataclasses import dataclass
+from enum import Enum
+from typing import List
+
+import torch
+from transformers.quantizers.quantizer_compressed_tensors import CompressedTensorsHfQuantizer
+from transformers.utils.quantization_config import CompressedTensorsConfig, QuantizationConfigMixin, QuantizationMethod
+
+from QEfficient.transformers.quantizers.quantizer_utils import get_keys_to_not_convert
+from QEfficient.utils.logging_utils import logger
+
+FP8_DTYPE = torch.float8_e4m3fn
+
+
+class QEffExtendedQuantizationMethod(str, Enum):
+    FP8 = "fp8"
+
+
+@dataclass
+class FP8QuantizationScheme:
+    dynamic: bool
+    num_bits: int
+    strategy: str
+    symmetric: bool
+    type: str
+
+    def __post_init__(self):
+        if self.num_bits != 8 or self.type != "float" or self.strategy not in ["tensor", "channel", "token"]:
+            raise NotImplementedError(
+                f"Only FP8 compressed-tensors supported, got num_bits={self.num_bits}, type={self.type}, strategy={self.strategy}"
+            )
+
+
+class FP8DeQuantLinear(torch.nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        bias: bool = False,
+    ):
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+
+        self.register_buffer(
+            "weight",
+            torch.empty(
+                (out_features, in_features), dtype=FP8_DTYPE
+            ),  # This is fixed for now and only e4m3fn quantization is prominent
+        )
+
+        if bias:
+            self.register_buffer(
+                "bias",
+                torch.zeros(
+                    (out_features),
+                    dtype=torch.float16,
+                ),
+            )
+        else:
+            self.bias = None
+
+    @classmethod
+    def for_compressed_tensors_fp8_layer(
+        cls,
+        in_features: int,
+        out_features: int,
+        weights_quant_scheme: FP8QuantizationScheme,
+        input_activations_quant_scheme: FP8QuantizationScheme,
+        bias: bool = False,
+    ):
+        fp8_dequant_layer = cls(in_features, out_features, bias)
+        fp8_dequant_layer.weights_quantization_scheme = weights_quant_scheme
+        fp8_dequant_layer.input_activations_quantization_scheme = input_activations_quant_scheme
+
+        if fp8_dequant_layer.weights_quantization_scheme.dynamic:
+            raise NotImplementedError(
+                f"Expected statically quantized weights but got weights quantization scheme dynamic = {fp8_dequant_layer.weights_quantization_scheme.dynamic}"
+            )
+
+        if fp8_dequant_layer.weights_quantization_scheme.strategy == "tensor":
+            fp8_dequant_layer.register_buffer("weight_scale", torch.zeros((1), dtype=torch.float32))
+        elif fp8_dequant_layer.weights_quantization_scheme.strategy == "channel":
+            fp8_dequant_layer.register_buffer("weight_scale", torch.zeros((out_features, 1), dtype=torch.float32))
+        else:
+            raise NotImplementedError(
+                f"Unknown weights quantization strategy {fp8_dequant_layer.weights_quantization_scheme.strategy}, ['channel' or 'tensor'] strategy supported."
+            )
+
+        if not fp8_dequant_layer.input_activations_quantization_scheme.dynamic:
+            if fp8_dequant_layer.input_activations_quantization_scheme.strategy == "tensor":
+                fp8_dequant_layer.register_buffer("input_scale", torch.zeros((1), dtype=torch.float32))
+            elif fp8_dequant_layer.input_activations_quant_scheme.strategy == "token":
+                fp8_dequant_layer.register_buffer("input_scale", torch.zeros((1, in_features), dtype=torch.float32))
+            else:
+                raise NotImplementedError(
+                    f"Unknown input activations quantization strategy {fp8_dequant_layer.input_activations_quantization_scheme.strategy}, ['token' or 'tensor'] strategy supported."
+                )
+
+        return fp8_dequant_layer
+
+    @classmethod
+    def for_fp8_layer(cls, in_features, out_features, activation_quantization_strategy, bias):
+        fp8_dequant_layer = cls(in_features, out_features, bias)
+
+        # -- Always per tensor quantization assumed --
+        fp8_dequant_layer.register_buffer("weight_scale", torch.zeros((), dtype=torch.float32))
+
+        if activation_quantization_strategy == "static":
+            fp8_dequant_layer.register_buffer("input_scale", torch.zeros((), dtype=torch.float32))
+
+        return fp8_dequant_layer
+
+    def forward(self, x):
+        # Only inference supported
+        with torch.no_grad():
+            dequantized_weights = self.weight.to(torch.float32) * self.weight_scale
+            out = torch.matmul(x.float(), dequantized_weights.T)
+            out = out + self.bias if self.bias is not None else out
+
+        return out
+
+
+class QEffFP8Config(QuantizationConfigMixin):
+    def __init__(
+        self, quant_method: str, activation_scheme: str, ignored_layers: List[str] = None, kv_cache_scheme: str = None
+    ):
+        self.quant_method = quant_method
+        self.activation_scheme = activation_scheme
+        self.ignored_layers = ignored_layers
+        self.kv_cache_scheme = kv_cache_scheme
+        if kv_cache_scheme:
+            logger.warning(
+                f"kv_cache_scheme={kv_cache_scheme} will be ignored please use `mxint8_kv_cache=True` during compile call if you want to keep kv cache in int8 at runtime on Cloud AI 100"
+            )
+
+        if quant_method != "fp8" or activation_scheme not in ["static", "dynamic", None]:
+            raise NotImplementedError(
+                f"Expected FP8 quantization with static/dynamic/None activation quantization, go quant_method={quant_method}, activation_scheme={activation_scheme}"
+            )
+
+        self.quant_method = QEffExtendedQuantizationMethod.FP8
+
+
+class QEffFP8Quantizer(CompressedTensorsHfQuantizer):
+    def __init__(self, quantization_config, **kwargs):
+        # TODO: check if more checks are required
+        if not isinstance(quantization_config, QEffFP8Config):
+            raise TypeError(f"Only {QEffFP8Config} is supported for initialization got {type(quantization_config)}")
+
+        self.quantization_config = quantization_config
+
+        # -- Handle extra kwargs below --
+        self.modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+        self.modules_to_not_convert = list(
+            set(self.modules_to_not_convert if self.modules_to_not_convert else [])
+            | set(self.quantization_config.ignored_layers if self.quantization_config.ignored_layers else [])
+        )
+        self.pre_quantized = kwargs.pop("pre_quantized", True)
+
+        if not self.pre_quantized and self.requires_calibration:
+            raise ValueError(
+                f"The quantization method {quantization_config.quant_method} does require the model to be pre-quantized."
+                f" You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure to "
+                f"pass `pre_quantized=True` while knowing what you are doing."
+            )
+
+    def validate_environment(self, *args, **kwargs):
+        return True
+
+    def update_torch_dtype(self, torch_dtype):
+        if torch_dtype not in [None, torch.float32]:
+            logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None")
+        return None
+
+    def _process_model_before_weight_loading(self, model, **kwargs):
+        if not self.modules_to_not_convert or "lm_head" not in self.modules_to_not_convert:
+            self.modules_to_not_convert.extend(get_keys_to_not_convert(model))
+
+        logger.warning(
+            f"activations quantization strategy = {self.quantization_config.activation_scheme}, will be ignored and the layers will be run with de-quantized weights"
+        )
+
+        # -- Defining local method as it uses lot of local variables --
+        def replace_linear_with_fp8_dequant_layer(module):
+            for name, child_module in module.named_children():
+                if isinstance(child_module, torch.nn.Linear) and name not in self.modules_to_not_convert:
+                    compressed_fp8_layer = FP8DeQuantLinear.for_fp8_layer(
+                        child_module.in_features,
+                        child_module.out_features,
+                        self.quantization_config.activation_scheme,
+                        child_module.bias is not None,
+                    )
+                    setattr(module, name, compressed_fp8_layer)
+                else:
+                    replace_linear_with_fp8_dequant_layer(child_module)
+
+        replace_linear_with_fp8_dequant_layer(model)
+
+
+class QEffCompressedTensorsConfig(CompressedTensorsConfig):
+    def __init__(
+        self,
+        config_groups=None,
+        format="dense",
+        quantization_status="initialized",
+        kv_cache_scheme=None,
+        global_compression_ratio=None,
+        ignore=None,
+        sparsity_config=None,
+        quant_method="compressed-tensors",
+        **kwargs,
+    ):
+        self.config_groups = config_groups
+        self.quant_method = quant_method
+        self.kv_cache_scheme = kv_cache_scheme
+        self.format = format
+        self.quantization_status = quantization_status
+        self.global_compression_ratio = global_compression_ratio
+        self.ignore = ignore
+
+        # Validate configuration
+        if len(self.config_groups) != 1:
+            raise NotImplementedError(
+                "Currently only single quantization group is supported, please raise an issue with model details for support!"
+            )
+
+        if quantization_status != "frozen":
+            raise NotImplementedError(f"expected quantization_status=`frozen`, got {quantization_status}")
+
+        if kv_cache_scheme:
+            raise NotImplementedError(f"Expected kv_cache_scheme=None, got {kv_cache_scheme}")
+
+        if format not in ["naive-quantized", "float-quantized"]:
+            raise NotImplementedError(
+                f"Expected quantization format in ['naive_quantized', 'float-quantized']  got {format}"
+            )
+
+        if sparsity_config:
+            raise NotImplementedError(f"Expected sparsity_config to be None, got {sparsity_config}")
+
+        if quant_method != "compressed-tensors":
+            raise NotImplementedError("Only compressed-tensors quant_method is supported for now!")
+
+        if "lm_head" not in self.ignore:
+            raise AttributeError(f"Expected `lm_head` to be present in non-quantized layers got ignore={self.ignore}")
+
+        group_0 = self.config_groups.get("group_0")
+        activations_quantization_config = group_0.get("input_activations")
+        weights_quantization_config = group_0.get("weights")
+        output_activation_quantization_config = group_0.get("output_activations")
+        self.targets = group_0.get("targets")
+
+        if self.targets != ["Linear"]:
+            raise NotImplementedError(f"Only linear targets are supported, got {self.targets}")
+
+        if output_activation_quantization_config:
+            raise NotImplementedError(
+                f"output_activations quantization is not supported got {output_activation_quantization_config}"
+            )
+
+        if (
+            activations_quantization_config.get("block_structure")
+            or activations_quantization_config.get("group_size")
+            or weights_quantization_config.get("block_structure")
+            or weights_quantization_config.get("group_size")
+        ):
+            raise NotImplementedError(f"group_size and block_structure not supported got {group_0}")
+
+        self.weights_quantization_scheme = FP8QuantizationScheme(
+            weights_quantization_config.get("dynamic"),
+            weights_quantization_config.get("num_bits"),
+            weights_quantization_config.get("strategy"),
+            weights_quantization_config.get("symmetric"),
+            weights_quantization_config.get("type"),
+        )
+        self.input_activations_quantization_scheme = FP8QuantizationScheme(
+            activations_quantization_config.get("dynamic"),
+            activations_quantization_config.get("num_bits"),
+            activations_quantization_config.get("strategy"),
+            activations_quantization_config.get("symmetric"),
+            activations_quantization_config.get("type"),
+        )
+
+        self.quant_method = QuantizationMethod.COMPRESSED_TENSORS
+
+    def to_dict(self):
+        return {
+            "quantization_config": {
+                "config_groups": self.config_groups,
+                "weights_quantization_scheme": self.weights_quantization_scheme.__dict__,
+                "activations_quantization_scheme": self.input_activations_quantization_scheme.__dict__,
+                "quant_method": self.quant_method,
+                "kv_cache_scheme": self.kv_cache_scheme,
+                "format": self.format,
+                "quantization_status": self.quantization_status,
+                "global_compression_ratio": self.global_compression_ratio,
+                "ignore": self.ignore,
+                "targets": self.targets,
+            },
+            "sparsity_config": None,
+        }
+
+
+class QEffCompressedTensorsFP8Quantizer(CompressedTensorsHfQuantizer):
+    requires_calibration = False
+
+    def __init__(self, quantization_config, **kwargs):
+        # TODO: check if more checks are required
+        if not isinstance(quantization_config, QEffCompressedTensorsConfig):
+            raise TypeError(
+                f"Only {QEffCompressedTensorsConfig} is supported for initialization got {type(quantization_config)}"
+            )
+
+        self.quantization_config = quantization_config
+
+        # -- Handle extra kwargs below --
+        self.modules_to_not_convert = kwargs.pop("modules_to_not_convert", [])
+        self.modules_to_not_convert = list(
+            set(self.modules_to_not_convert if self.modules_to_not_convert else [])
+            | set(self.quantization_config.ignore if self.quantization_config.ignore else [])
+        )
+        self.pre_quantized = kwargs.pop("pre_quantized", True)
+
+        if not self.pre_quantized and self.requires_calibration:
+            raise ValueError(
+                f"The quantization method {quantization_config.quant_method} does require the model to be pre-quantized."
+                f" You explicitly passed `pre_quantized=False` meaning your model weights are not quantized. Make sure to "
+                f"pass `pre_quantized=True` while knowing what you are doing."
+            )
+
+    def validate_environment(self, *args, **kwargs):
+        return True
+
+    def update_torch_dtype(self, torch_dtype):
+        if torch_dtype not in [None, torch.float32]:
+            logger.warning(f"Requested dtype {torch_dtype} is not supported, overriding to None")
+        return None
+
+    def _process_model_before_weight_loading(self, model, **kwargs):
+        if self.quantization_config.targets != ["Linear"]:
+            raise NotImplementedError(
+                f"Only Linear layer with FP8 quantization are supported got targets = {self.quantization_config.targets}"
+            )
+
+        logger.warning(
+            f"activations quantization scheme = {self.quantization_config.input_activations_quantization_scheme.__dict__}, will be ignored and the layers will be run with de-quantized weights"
+        )
+
+        # -- Defining local method as it uses lot of local variables --
+        def replace_linear_with_fp8_dequant_layer(module):
+            for name, child_module in module.named_children():
+                if isinstance(child_module, torch.nn.Linear) and name not in self.modules_to_not_convert:
+                    compressed_fp8_layer = FP8DeQuantLinear.for_compressed_tensors_fp8_layer(
+                        child_module.in_features,
+                        child_module.out_features,
+                        self.quantization_config.weights_quantization_scheme,
+                        self.quantization_config.input_activations_quantization_scheme,
+                        child_module.bias is not None,
+                    )
+                    setattr(module, name, compressed_fp8_layer)
+                else:
+                    replace_linear_with_fp8_dequant_layer(child_module)
+
+        replace_linear_with_fp8_dequant_layer(model)
diff --git a/README.md b/README.md
index 9defeceb7..eec338556 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 ---
 
 *Latest news* :fire: <br>
-
+- [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models.
 - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models.
 - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM.
 - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 3e19e1257..25d08ac15 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -38,6 +38,9 @@
     "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",  # AWQ model
     "TheBloke/Llama-2-7B-GPTQ",  # GPTQ model
     "ibm-granite/granite-20b-code-base",
+    # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
+    "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
+    "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
 ]
 
 spd_test_models = [

From 1d234f5f1b3e012dbf4ddcc07a74ee40a7ac3ec7 Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Tue, 18 Feb 2025 19:48:03 +0530
Subject: [PATCH 074/138] Added example script to use embedding model (#237)

Added example script to use embedding model using ```AutoModel```

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 examples/embedding_model.py | 45 +++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 examples/embedding_model.py

diff --git a/examples/embedding_model.py b/examples/embedding_model.py
new file mode 100644
index 000000000..366e93433
--- /dev/null
+++ b/examples/embedding_model.py
@@ -0,0 +1,45 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+# This is the work example of the Embedding model with the AI 100
+# For more information, visit: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
+
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModel as AutoModel
+
+
+def mean_pooling(model_output, attention_mask):
+    token_embeddings = model_output  # First element of model_output contains all token embeddings
+    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+
+
+# Sentences we want sentence embeddings for
+sentences = "This is an example sentence"
+
+# Load model from HuggingFace Hub
+tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+
+
+qeff_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+qeff_model.compile(num_cores=14)
+
+# Tokenize sentences
+encoded_input = tokenizer(sentences, return_tensors="pt")
+qeff_output = torch.tensor(qeff_model.generate(encoded_input))
+
+# Perform pooling
+sentence_embeddings = mean_pooling(qeff_output, encoded_input["attention_mask"])
+
+# Normalize embeddings
+sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+
+print("Sentence embeddings:")
+print(sentence_embeddings)

From 18b0c4d987c3b489954311d3bce621a9227e25e7 Mon Sep 17 00:00:00 2001
From: quic-jouachen <quic_jouachen@quicinc.com>
Date: Tue, 18 Feb 2025 19:13:38 -0800
Subject: [PATCH 075/138] Add prompt_to_lora_id_mapping adjustment in
 fix_prompts() (#242)

This is regarding the issue reported in
[issue#251](https://github.com/quic/efficient-transformers/issues/251)

The finite lorax feature failed to execute when the number of prompts
provided is less than the full batch size. The solution involves
applying the same adjustment strategy for `prompt_to_lora_id_mapping` as
used for `prompt` in the `fix_prompts()` function located in
`QEfficient/generation/text_generation_inference.py`.

Signed-off-by: Jou-An Chen <quic_jouachen@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../generation/text_generation_inference.py   | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/QEfficient/generation/text_generation_inference.py b/QEfficient/generation/text_generation_inference.py
index d77188914..2dd485a5e 100755
--- a/QEfficient/generation/text_generation_inference.py
+++ b/QEfficient/generation/text_generation_inference.py
@@ -233,6 +233,40 @@ def fix_prompts(prompt: List[str], batch_size: int, full_batch_size: int = None)
     return prompt
 
 
+def fix_prompt_to_lora_id_mapping(prompt_to_lora_id_mapping: List[int], batch_size: int, full_batch_size: int = None):
+    """
+    Adjusts the list of prompt_to_lora_id_mapping to match the required batch size.
+
+    ``Mandatory`` Args:
+        prompt_to_lora_id_mapping (Optional[List[int]]): Mapping to associate prompts with their respective LoRA adapter.
+        batch_size (int): The batch size to process at a time.
+
+    ``Optional`` Args:
+        full_batch_size (Optional[int]): The full batch size if different from batch_size.
+
+    Returns:
+        List[int]: Adjusted list of prompt_to_lora_id_mapping.
+    """
+    exec_batch_size = full_batch_size if full_batch_size is not None else batch_size
+
+    if len(prompt_to_lora_id_mapping) < exec_batch_size:
+        logger.warning(
+            "Prompt_to_lora_id_mapping are less than batch size/full batch size, repeating to required batch size"
+        )
+        prompt_to_lora_id_mapping = (
+            prompt_to_lora_id_mapping * (exec_batch_size // len(prompt_to_lora_id_mapping) + 1)
+        )[:exec_batch_size]
+    elif full_batch_size is None and len(prompt_to_lora_id_mapping) % batch_size != 0:
+        logger.warning(
+            "prompt_to_lora_id_mapping are not multiple of batch size, dropping last incomplete batch from given input prompts"
+        )
+        prompt_to_lora_id_mapping = prompt_to_lora_id_mapping[
+            : batch_size * (len(prompt_to_lora_id_mapping) // batch_size)
+        ]
+
+    return prompt_to_lora_id_mapping
+
+
 def read_prompts_txt_file(prompts_txt_file_path: str):
     prompt = []
     with open(prompts_txt_file_path, "r") as file:
@@ -325,6 +359,10 @@ def cloud_ai_100_exec_kv(
     batch_size, ctx_len, full_batch_size = get_compilation_dims(qpc_path)
     prompt: List[str] = get_input_prompts(prompt, prompts_txt_file_path)
     prompt = fix_prompts(prompt, batch_size, full_batch_size)
+    if prompt_to_lora_id_mapping is not None:
+        prompt_to_lora_id_mapping = fix_prompt_to_lora_id_mapping(
+            prompt_to_lora_id_mapping, batch_size, full_batch_size
+        )
     generate_text = TextGeneration(
         tokenizer=tokenizer,
         qpc_path=qpc_path,

From b82c27a3bf2b623d273aa4fd00e844ac300eed4e Mon Sep 17 00:00:00 2001
From: quic-akuruvil <quic_akuruvil@quicinc.com>
Date: Wed, 19 Feb 2025 13:37:14 +0530
Subject: [PATCH 076/138] Add support for model
 ibm-granite/granite-3.1-8b-instruct (#239)

Added new model support.

Source:
https://huggingface.co/ibm-granite/granite-3.1-8b-instruct

- CB support for the same

---------

Signed-off-by: Ann <quic_akuruvil@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/granite/__init__.py   |   7 +
 .../models/granite/modeling_granite.py        | 523 ++++++++++++++++++
 .../transformers/models/pytorch_transforms.py |  19 +-
 README.md                                     |   4 +
 docs/source/validate.md                       |   2 +
 .../models/test_causal_lm_models.py           |   2 +
 tests/transformers/test_causal_lm.py          |   1 +
 7 files changed, 557 insertions(+), 1 deletion(-)
 create mode 100644 QEfficient/transformers/models/granite/__init__.py
 create mode 100644 QEfficient/transformers/models/granite/modeling_granite.py

diff --git a/QEfficient/transformers/models/granite/__init__.py b/QEfficient/transformers/models/granite/__init__.py
new file mode 100644
index 000000000..da26921c5
--- /dev/null
+++ b/QEfficient/transformers/models/granite/__init__.py
@@ -0,0 +1,7 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
diff --git a/QEfficient/transformers/models/granite/modeling_granite.py b/QEfficient/transformers/models/granite/modeling_granite.py
new file mode 100644
index 000000000..6116317b3
--- /dev/null
+++ b/QEfficient/transformers/models/granite/modeling_granite.py
@@ -0,0 +1,523 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
+from transformers.cache_utils import Cache, DynamicCache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_outputs import (
+    BaseModelOutputWithPast,
+    CausalLMOutputWithPast,
+)
+from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS
+from transformers.models.granite.modeling_granite import (
+    GraniteAttention,
+    GraniteConfig,
+    GraniteForCausalLM,
+    GraniteModel,
+    GraniteRotaryEmbedding,
+    logger,
+    repeat_kv,
+    rotate_half,
+)
+
+from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
+
+
+class QEffGraniteRotaryEmbedding(GraniteRotaryEmbedding):
+    """
+    Copied from LlamaForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
+    The only differences are:
+    - Add static sin/cos computations.
+    """
+
+    def __init__(
+        self,
+        dim=None,
+        max_position_embeddings=2048,
+        base=10000,
+        device=None,
+        scaling_factor=1.0,
+        rope_type="default",
+        config: Optional[GraniteConfig] = None,
+    ):
+        super(GraniteRotaryEmbedding, self).__init__()  # Initialize nn.Module
+        # TODO (joao): remove the `if` below, only used for BC
+        self.rope_kwargs = {}
+        if config is None:
+            logger.warning_once(
+                "`LlamaRotaryEmbedding` can now be fully parameterized by passing the model config through the "
+                "`config` argument. All other arguments will be removed in v4.45"
+            )
+            self.rope_kwargs = {
+                "rope_type": rope_type,
+                "factor": scaling_factor,
+                "dim": dim,
+                "base": base,
+                "max_position_embeddings": max_position_embeddings,
+            }
+            self.rope_type = rope_type
+            self.max_seq_len_cached = max_position_embeddings
+            self.original_max_seq_len = max_position_embeddings
+        else:
+            # BC: "rope_type" was originally "type"
+            if config.rope_scaling is not None:
+                self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+            else:
+                self.rope_type = "default"
+            self.max_seq_len_cached = config.max_position_embeddings
+            self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=self.original_max_seq_len, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
+            self.sin_cached[:seq_len].to(dtype=x.dtype) * self.attention_scaling,
+        )
+
+
+def qeff_apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+
+    # Apply rotation
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    # Cast back to original dtype
+    return q_embed.to(q.dtype), k_embed.to(k.dtype)
+
+
+class QEffGraniteAttention(GraniteAttention):
+    def __init__(self, config: GraniteConfig, layer_idx: Optional[int] = None):
+        super().__init__(config, layer_idx)
+        # Define the general __qeff_init__() for any changes in the init calls
+        # Set the init in the module mapping pytorch transforms
+        self.config = config
+        self.__qeff_init__()
+
+    def __qeff_init__(self):
+        self.rotary_emb = QEffGraniteRotaryEmbedding(config=self.config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        position_embeddings: Tuple[torch.Tensor, torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        batch_index: Optional[torch.LongTensor] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = qeff_apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "cache_position": cache_position,
+                "batch_index": batch_index,
+                "position_ids": position_ids,
+            }
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+        if attention_mask is not None:
+            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
+
+        attn_weights = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        dropout = 0.0 if not self.training else self.attention_dropout
+        attn_weights = F.dropout(attn_weights, p=dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights, past_key_value
+
+
+class QEffGraniteModel(GraniteModel):
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        batch_index: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        inputs_embeds = inputs_embeds * self.embedding_multiplier  # main diff with Llama
+
+        return_legacy_cache = False
+        if use_cache and not isinstance(past_key_values, Cache):
+            return_legacy_cache = True
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            else:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, position_ids, past_key_values, output_attentions
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = None
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = None
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    causal_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                    position_embeddings,
+                )
+            elif batch_index is not None:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    batch_index=batch_index,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                    position_embeddings=position_embeddings,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+        return output if return_dict else output.to_tuple()
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+            else:
+                causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+
+class QEffGraniteForCausalLM(GraniteForCausalLM):
+    """
+    Copied from GraniteForCausalLM: https://github.com/huggingface/transformers/blob/main/src/transformers/models/granite/modeling_granite.py
+    """
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        batch_index: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+            num_logits_to_keep (`int`, *optional*):
+                Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, GraniteForCausalLM
+
+        >>> model = GraniteForCausalLM.from_pretrained("meta-granite/Granite-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-granite/Granite-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            batch_index=batch_index,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = outputs[0]
+        # Cast to INT32 to avoid issue while running in ONNXRT
+        logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        hidden_states = outputs[0][torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
+
+        logits = self.lm_head(hidden_states)
+        logits = logits / self.config.logits_scaling
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 6e107d77b..4b475515e 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -44,6 +44,11 @@
     GPTBigCodeModel,
 )
 from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, GPTJForCausalLM, GPTJModel
+from transformers.models.granite.modeling_granite import (
+    GraniteAttention,
+    GraniteForCausalLM,
+    GraniteModel,
+)
 from transformers.models.llama.modeling_llama import (
     LlamaAttention,
     LlamaDecoderLayer,
@@ -149,7 +154,15 @@
     QEffGPTJForCausalLM,
     QEffGPTJModel,
 )
-from QEfficient.transformers.models.internvl.modeling_internvl import QEffInternVisionEmbeddings, QEffInternVLModel
+from QEfficient.transformers.models.granite.modeling_granite import (
+    QEffGraniteAttention,
+    QEffGraniteForCausalLM,
+    QEffGraniteModel,
+)
+from QEfficient.transformers.models.internvl.modeling_internvl import (
+    QEffInternVisionEmbeddings,
+    QEffInternVLModel,
+)
 from QEfficient.transformers.models.llama.modeling_llama import (
     QEffLlamaAttention,
     QEffLlamaDecoderLayer,
@@ -269,6 +282,10 @@ class KVCacheTransform(ModuleMappingTransform):
         Gemma2DecoderLayer: QEffGemma2DecoderLayer,
         Gemma2Model: QEffGemma2Model,
         Gemma2ForCausalLM: QEffGemma2ForCausalLM,
+        # Granite
+        GraniteModel: QEffGraniteModel,
+        GraniteForCausalLM: QEffGraniteForCausalLM,
+        GraniteAttention: QEffGraniteAttention,
         # mllama
         MllamaTextRMSNorm: CustomRMSNormAIC,
         MllamaTextSelfAttention: QEffMllamaTextSelfAttention,
diff --git a/README.md b/README.md
index eec338556..3d5487e7d 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,8 @@
 
 *Latest news* :fire: <br>
 - [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models.
+
+- [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models.
 - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM.
 - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)
@@ -16,6 +18,8 @@
 <details>
 <summary>More</summary>
 
+- [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
+- [01/2025] Added support for [Ibm-Granite-Guardian] (https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)
 - [09/2024] Added support for [Gemma-2-Family](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)<br>
 - [09/2024] Added support for [CodeGemma-Family](https://huggingface.co/collections/google/codegemma-release-66152ac7b683e2667abdee11)
 - [09/2024] Added support for [Gemma-Family](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b)
diff --git a/docs/source/validate.md b/docs/source/validate.md
index d0083545d..b3327596d 100644
--- a/docs/source/validate.md
+++ b/docs/source/validate.md
@@ -19,6 +19,8 @@
 | [Gemma-2-2b](https://huggingface.co/google/gemma-2-2b) |✔️ |
 | [Gemma-2-9b](https://huggingface.co/google/gemma-2-9b) |✔️ |
 | [Gemma-2-27b](https://huggingface.co/google/gemma-2-27b) |✔️ |
+| [Granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✔️ |
+| [Granite-guardian-3.1-8b](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b) |✔️ |
 | [Granite-20b-code-base](https://huggingface.co/ibm-granite/granite-20b-code-base-8k) | ✔️ |
 | [Granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | ✔️ |
 | [Jais-adapted-7b](https://huggingface.co/inceptionai/jais-adapted-7b) |✔️ |
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index 25d08ac15..a3a855cee 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -41,6 +41,8 @@
     # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic",  # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations
     "neuralmagic/Llama-3.2-3B-Instruct-FP8",  # float quantized compressed-tensor per tensor both weight and activations
     "neuralmagic/Qwen2-0.5B-Instruct-FP8",  # fp8 quant method, static, with lm head ignored
+    "ibm-granite/granite-3.1-2b-instruct",
+    "ibm-granite/granite-guardian-3.1-2b",
 ]
 
 spd_test_models = [
diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py
index 0f1d9db0b..1ceb5a7e0 100644
--- a/tests/transformers/test_causal_lm.py
+++ b/tests/transformers/test_causal_lm.py
@@ -28,6 +28,7 @@
     ("phi3", 256, 2, 4, 128, 512, 127, {"pad_token_id": 0}),
     ("qwen2", 256, 2, 4, 128, 512, 127, {"num_key_value_heads": 2}),
     ("starcoder2", 256, 2, 4, 128, 512, 127, {}),
+    ("granite", 256, 2, 4, 128, 512, 127, {"num_key_value_heads": 2}),
 ]
 
 configs = [

From 5ea09e5c90ec975cf12f51a7130a5c4f1dc42422 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Wed, 19 Feb 2025 23:55:48 +0530
Subject: [PATCH 077/138] HOTFIX/fixed replace quantizers (#273)

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/modeling_auto.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 04eeb6e6b..1fcfb795d 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1259,6 +1259,7 @@ def __repr__(self) -> str:
         return self.__class__.__name__ + "\n" + self.model.__repr__
 
     @classmethod
+    @with_replaced_quantizers
     def from_pretrained(
         cls, pretrained_model_name_or_path, continuous_batching: bool = False, is_tlm: bool = False, *args, **kwargs
     ):

From eded9a9197a46d1723b1d2b9b5a9b618de2f5a53 Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Fri, 21 Feb 2025 16:32:35 +0530
Subject: [PATCH 078/138] HOTFIX/compiler arguments fix for VLM (#274)

Hotfix - compiler arguments fix for VLM

As we are popping few arguments of the compiler option inside function
`get_specializations` and by default `**compiler_options` is passed by
value not reference, so to handle this we are returning the updated
`compiler_options` from the `get_specializations`.

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/internvl/modeling_internvl.py      | 3 ++-
 QEfficient/transformers/models/llava/modeling_llava.py     | 3 ++-
 QEfficient/transformers/models/mllama/modeling_mllama.py   | 4 ++--
 QEfficient/transformers/models/modeling_auto.py            | 7 +++----
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
index 35304d945..318993dde 100644
--- a/QEfficient/transformers/models/internvl/modeling_internvl.py
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -34,7 +34,7 @@ def get_specializations(
             img_size = 448
             logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config")
 
-        return [
+        specializations = [
             {
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
@@ -50,6 +50,7 @@ def get_specializations(
                 "img_size": img_size,
             },
         ]
+        return specializations, compiler_options
 
     def get_onnx_dynamic_axes(
         self,
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
index 847eb9028..82c934670 100644
--- a/QEfficient/transformers/models/llava/modeling_llava.py
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -87,7 +87,7 @@ def get_specializations(
             img_size = 336
             logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config")
 
-        return [
+        specializations = [
             {
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
@@ -103,6 +103,7 @@ def get_specializations(
                 "img_size": img_size,
             },
         ]
+        return specializations, compiler_options
 
     def get_onnx_dynamic_axes(
         self,
diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py
index 610c7be30..9dcddbdfd 100644
--- a/QEfficient/transformers/models/mllama/modeling_mllama.py
+++ b/QEfficient/transformers/models/mllama/modeling_mllama.py
@@ -1228,9 +1228,9 @@ def get_specializations(
         if kv_offload:
             specializations["vision"] = vision
             specializations["lang"] = lang
-            return specializations
+            return specializations, compiler_options
         else:
-            return lang
+            return lang, compiler_options
 
     def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         txt_cfg = self.config.get_text_config()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 1fcfb795d..ca9234c69 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -420,7 +420,6 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
 
     def __init__(self, model):
         super().__init__(model)
-        # self.model.config.text_config.use_cache=True
 
     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
         return self._export(inputs, output_names, dynamic_axes, export_dir)
@@ -533,7 +532,7 @@ def export(
 
     def compile(
         self,
-        img_size: int,
+        img_size: Optional[int] = None,
         vision_onnx_path: Optional[str] = None,
         lang_onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
@@ -567,7 +566,7 @@ def compile(
 
         output_names = self.model.get_output_names(kv_offload=True)
 
-        specializations = self.model.get_specializations(
+        specializations, compiler_options = self.model.get_specializations(
             batch_size=batch_size,
             prefill_seq_len=prefill_seq_len,
             ctx_len=ctx_len,
@@ -892,7 +891,7 @@ def compile(
 
         # Get specializations from modelling file
         # TODO: expose this via the auto class as well
-        specializations = self.model.get_specializations(
+        specializations, compiler_options = self.model.get_specializations(
             batch_size=batch_size,
             prefill_seq_len=prefill_seq_len,
             ctx_len=ctx_len,

From 546c434819765ad9537b2a1096d6bbe4bddabd74 Mon Sep 17 00:00:00 2001
From: shubhagr-quic <quic_shubhagr@quicinc.com>
Date: Fri, 21 Feb 2025 16:49:44 +0530
Subject: [PATCH 079/138] Support for Prefix caching Feature in QNN Compilation
 Path. (#262)

Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/base/modeling_qeff.py              |  3 ++
 QEfficient/cloud/compile.py                   |  2 +-
 QEfficient/cloud/infer.py                     |  2 +-
 QEfficient/compile/qnn_compiler.py            |  3 ++
 .../transformers/models/modeling_auto.py      |  1 +
 ...erate_qnn_network_specialization_config.py | 50 ++++++++++++++++---
 .../models/test_prefix_caching.py             | 39 ++++++++++++++-
 7 files changed, 91 insertions(+), 9 deletions(-)

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index c3a1b6d16..316ee0c74 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -339,6 +339,7 @@ def _qnn_compile(
         mxfp6_matmul: bool = False,
         mxint8_kv_cache: bool = False,
         qnn_config: Optional[str] = None,
+        kv_cache_batch_size: Optional[int] = None,
     ) -> str:
         """
         Interface for QNN compiler
@@ -356,6 +357,7 @@ def _qnn_compile(
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to True``.
             :mxint8_kv_cache (bool, optional): Whether to use ``mxint8`` compression for KV cache. ``Defaults to False``.
             :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+            :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
         """
         if onnx_path is None and self.onnx_path is None:
             self.export()
@@ -415,6 +417,7 @@ def _qnn_compile(
             full_batch_size=full_batch_size,
             qnn_config=qnn_config,
             qnn_binary_dir=qpc_path,
+            kv_cache_batch_size=kv_cache_batch_size,
         )
 
         self.qpc_path = qpc_path
diff --git a/QEfficient/cloud/compile.py b/QEfficient/cloud/compile.py
index f7c5e8a9f..8b6da5b0b 100644
--- a/QEfficient/cloud/compile.py
+++ b/QEfficient/cloud/compile.py
@@ -89,7 +89,7 @@
         default=False,
         help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
              If not provided, the default configuration will be used.\
-             Sample Config: QEfficient/cloud/compile/qnn_config.json",
+             Sample Config: QEfficient/compile/qnn_config.json",
     )
     parser.add_argument(
         "qnn_config",
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 4b43c8ded..20e997ac0 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -223,7 +223,7 @@ def main(
         default=False,
         help="Enables QNN. Optionally, a configuration file can be provided with [--enable_qnn CONFIG_FILE].\
              If not provided, the default configuration will be used.\
-             Sample Config: QEfficient/cloud/compile/qnn_config.json",
+             Sample Config: QEfficient/compile/qnn_config.json",
     )
     parser.add_argument(
         "qnn_config",
diff --git a/QEfficient/compile/qnn_compiler.py b/QEfficient/compile/qnn_compiler.py
index 11926c9a1..fe813779e 100644
--- a/QEfficient/compile/qnn_compiler.py
+++ b/QEfficient/compile/qnn_compiler.py
@@ -338,6 +338,7 @@ def compile(
     full_batch_size=None,
     qnn_config: Optional[str] = None,
     qnn_binary_dir: Optional[str] = None,
+    kv_cache_batch_size: Optional[int] = None,
     **kwargs,
 ) -> str:
     """
@@ -362,6 +363,7 @@ def compile(
         :mxint8 (bool): Compress Present/Past KV to ``MXINT8`` using ``CustomIO`` config. ``Defaults to False.``
         :qnn_config (str): Path to ``qnn_config.json`` file (formatted as a string). ``Defaults to None.``
         :qnn_binary_dir (str): Path for saving qnn binaries.
+        :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
 
     Returns:
         :str: Path to compiled ``qpc`` package.
@@ -386,6 +388,7 @@ def compile(
         file_path=custom_io_file_path,
         full_batch_size=full_batch_size,
         kv_precision=kv_precision,
+        kv_cache_batch_size=kv_cache_batch_size,
     )
 
     if not os.path.isfile(custom_io_file_path):
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index ca9234c69..278d51192 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1503,6 +1503,7 @@ def compile(
                 mxfp6_matmul=mxfp6_matmul,
                 mxint8_kv_cache=mxint8_kv_cache,
                 qnn_config=qnn_config,
+                kv_cache_batch_size=kv_cache_batch_size,
             )
         else:
             # Custom IO
diff --git a/QEfficient/utils/generate_qnn_network_specialization_config.py b/QEfficient/utils/generate_qnn_network_specialization_config.py
index ca78c658c..9ec11d79b 100644
--- a/QEfficient/utils/generate_qnn_network_specialization_config.py
+++ b/QEfficient/utils/generate_qnn_network_specialization_config.py
@@ -24,9 +24,27 @@ def fetch_nodes_info(
     context_length: int,
     file_path: str = "custom_io_config.yaml",
     full_batch_size: Optional[int] = None,
-    decode_only: Optional[bool] = False,
     kv_precision: Optional[str] = "float16",
+    kv_cache_batch_size: Optional[int] = None,
 ) -> None:
+    """
+    Generates network specialization config custom IO file for convertor stage in QNN compilation.
+    Reads onnx graph and creates a custom IO configuration file according to the passed parameters and
+    save it as a yaml file provided in file_path argument.
+
+    ``Mandatory`` Args:
+        :onnx_graph_path (str): Generated ``ONNX`` Model Path.
+        :batch_size (int): Batch size to compile the model for.
+        :sequence_length (int): Sequence length for the model to compile.
+        :context_length (int): Maximum context length to compile the model.
+
+    ``Optional`` Args:
+        :file_path (str): File path to save the generated custom IO config. ``Defaults to custom_io_config.yaml.``
+        :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None``
+        :kv_precision (str): Sets kv precision for compilation.  ``Defaults to float16.``
+        :kv_cache_batch_size (int): kv_cache_batch_size for Prefix Caching. ``Defaults to None.``
+    """
+
     # Load the ONNX model
     onnx_model = onnx.load(onnx_graph_path)
 
@@ -46,7 +64,9 @@ def fetch_nodes_info(
             if full_batch_size:
                 input_info["Shape"] = f"(1, 1), ({full_batch_size}, 1)"
             else:
-                input_info["Shape"] = "(1, 1)"
+                raise AttributeError(
+                    "ERROR: Full batch size is required for populating batch_index in custom_io_config.yaml"
+                )
         else:
             shapes = []
             for input_shape in node.type.tensor_type.shape.dim:
@@ -67,11 +87,14 @@ def fetch_nodes_info(
                 for shape in shapes:
                     if isinstance(shape, str):
                         if "full_batch_size" in shape:
-                            if full_batch_size:
+                            if ("past_key" in node.name or "past_value" in node.name) and kv_cache_batch_size:
+                                shapeList.append(kv_cache_batch_size)
+                            elif full_batch_size:
                                 shapeList.append(full_batch_size)
                             else:
-                                print("ERROR: Full batch size is required to generate custom_io_config.yaml")
-                                exit()
+                                raise AttributeError(
+                                    "ERROR: Full batch size is required to generate custom_io_config.yaml"
+                                )
                         elif "batch_size" in shape:
                             shapeList.append(batch_size)
                         elif shape in ["ctx_len", "max_context_len"]:
@@ -107,7 +130,7 @@ def fetch_nodes_info(
                         .replace("[", "(")
                         .replace("]", ")")
                     )
-                shape = shape_2 if decode_only else shape_1 + "," + shape_2
+                shape = shape_1 + "," + shape_2
             elif ("batch_size" in shapes or "full_batch_size" in shapes) and (
                 "ctx_len" in shapes or "max_context_len" in shapes
             ):
@@ -153,6 +176,21 @@ def generate_data_format_config(
     model_dlc_name: Optional[str] = "model",
     file_path: str = "qnn_data_format_config.json",
 ) -> None:
+    """
+    Generates data format config for context binary generation stage in QNN compilation path.
+    It defines the tensor format for KV nodes when precision is set to mxint8.
+    Reads onnx graph and creates a data format configuration file and save it as a json file provided in
+    file_path argument.
+
+    ``Mandatory`` Args:
+        :onnx_graph_path (str): Generated ``ONNX`` Model Path.
+
+    ``Optional`` Args:
+        :data_format (str): Tensor format for KV nodes. ``Defaults to QNN_TENSOR_DATA_FORMAT_MX.``
+        :model_dlc_name (str): DLC Name generated by the convertor stage in QNN Compilation. ``Defaults to model.``
+        :file_path (str): File path to save the generated data format config. ``Defaults to qnn_data_format_config.json.``
+    """
+
     # Load the ONNX model
     onnx_model = onnx.load(onnx_graph_path)
 
diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py
index fa79f33cd..8ef24403c 100644
--- a/tests/transformers/models/test_prefix_caching.py
+++ b/tests/transformers/models/test_prefix_caching.py
@@ -5,12 +5,15 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
 import numpy as np
 import pytest
 from transformers import AutoTokenizer
 
 from QEfficient.generation.text_generation_inference import TextGeneration
 from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForCausalLM
+from QEfficient.utils._utils import create_json
 
 test_models = ["gpt2"]
 
@@ -27,14 +30,48 @@ def test_simple_prefix_caching(model_name):
         kv_cache_batch_size=4,
         num_cores=14,
     )
+    prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
+
+
+@pytest.mark.on_qaic
+@pytest.mark.qnn
+@pytest.mark.parametrize("model_name", test_models)
+def test_simple_prefix_caching_qnn(model_name):
+    qeff_model = QEFFAutoModelForCausalLM.from_pretrained(model_name, continuous_batching=True)
+    qnn_config = {
+        "convertor_args_extension": "",
+        "context_binary_generator_args_extension": "--log_level debug",
+        "qnn_compilation_backend": {
+            "compiler_enable_depth_first": True,
+            "compiler_printDDRStats": False,
+            "compiler_printPerfMetrics": False,
+        },
+        "SKIP_QNN_CONVERTOR_STEP": False,
+    }
+    qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json")
+    create_json(qnn_config_json_path, qnn_config)
+
+    qeff_model.compile(
+        prefill_seq_len=128,
+        ctx_len=256,
+        full_batch_size=2,
+        kv_cache_batch_size=4,
+        num_cores=14,
+        enable_qnn=True,
+        qnn_config=qnn_config_json_path,
+    )
+    prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
+    os.remove(qnn_config_json_path)
+
 
+def prefix_caching_inference(model_name, qpc_path):
     prefixes = ["Once upon a time ", "Once upon a time "]
     suffixes1 = ["in a land far away", "there was a small village"]
     suffixes2 = ["a little girl", "in a bustling city"]
 
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-    generator = TextGeneration(tokenizer=tokenizer, qpc_path=qeff_model.qpc_path, full_batch_size=2, ctx_len=256)
+    generator = TextGeneration(tokenizer=tokenizer, qpc_path=qpc_path, full_batch_size=2, ctx_len=256)
 
     prompts = [pref + suff for pref, suff in zip(prefixes, suffixes1)]
 

From 68e4ab7172adaec9679dd41cd5f02fc5f57ef7c3 Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Mon, 24 Feb 2025 12:11:30 +0530
Subject: [PATCH 080/138] HOTFIX/kv_offload fix (#278)

kv_offload fix

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/modeling_auto.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 278d51192..7440a3d82 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -572,7 +572,6 @@ def compile(
             ctx_len=ctx_len,
             img_size=img_size,
             kv_offload=True,
-            kv_offlaod=True,
             **compiler_options,
         )
 

From c5c5bfd2d0a736a3fb73cdba6bb6fd110a763ca9 Mon Sep 17 00:00:00 2001
From: kdulla <kcdulla29@gmail.com>
Date: Mon, 24 Feb 2025 20:35:41 +0530
Subject: [PATCH 081/138] Onboarding Whisper with Single QPC (#271)

Added AutoModelForSpeechSeq2Seq (with updated export, compile and
generate), and whisper modelling changes needed for this to run.
Reopening pull request due to some issues with rebasing previous branch.

---------

Signed-off-by: Kushal Dulla <quic_kdulla@quicinc.com>
Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Co-authored-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |   2 +
 QEfficient/base/__init__.py                   |   1 +
 QEfficient/base/modeling_qeff.py              |  18 +-
 QEfficient/transformers/modeling_utils.py     |  27 +
 .../transformers/models/modeling_auto.py      | 273 +++++-
 .../transformers/models/pytorch_transforms.py |  26 +
 .../transformers/models/whisper/__init__.py   |   7 +
 .../models/whisper/modeling_whisper.py        | 880 ++++++++++++++++++
 QEfficient/utils/_utils.py                    |  26 +-
 examples/speech_to_text/README.md             |  21 +
 .../run_whisper_speech_to_text.py             |  52 ++
 scripts/Jenkinsfile                           |   1 +
 .../models/test_speech_seq2seq_models.py      | 373 ++++++++
 tests/transformers/test_speech_seq2seq.py     | 144 +++
 14 files changed, 1823 insertions(+), 28 deletions(-)
 create mode 100644 QEfficient/transformers/models/whisper/__init__.py
 create mode 100644 QEfficient/transformers/models/whisper/modeling_whisper.py
 create mode 100644 examples/speech_to_text/README.md
 create mode 100644 examples/speech_to_text/run_whisper_speech_to_text.py
 create mode 100644 tests/transformers/models/test_speech_seq2seq_models.py
 create mode 100644 tests/transformers/test_speech_seq2seq.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 73c5abc06..4deb929c4 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -30,6 +30,7 @@ def check_qaic_sdk():
         QEFFAutoModel,
         QEFFAutoModelForCausalLM,
         QEFFAutoModelForImageTextToText,
+        QEFFAutoModelForSpeechSeq2Seq,
         QEFFCommonLoader,
     )
     from QEfficient.compile.compile_helper import compile
@@ -50,6 +51,7 @@ def check_qaic_sdk():
         "QEFFAutoModelForCausalLM",
         "QEffAutoPeftModelForCausalLM",
         "QEFFAutoModelForImageTextToText",
+        "QEFFAutoModelForSpeechSeq2Seq",
         "QEFFCommonLoader",
     ]
 
diff --git a/QEfficient/base/__init__.py b/QEfficient/base/__init__.py
index 4344cac53..88c41b520 100644
--- a/QEfficient/base/__init__.py
+++ b/QEfficient/base/__init__.py
@@ -10,4 +10,5 @@
     QEFFAutoModel,
     QEFFAutoModelForCausalLM,
     QEFFAutoModelForImageTextToText,
+    QEFFAutoModelForSpeechSeq2Seq,
 )
diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index 316ee0c74..ec74c57f3 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -144,13 +144,27 @@ def _export(
         tmp_onnx_dir.mkdir(parents=True, exist_ok=True)
 
         # Create input_names from example_inputs
+
         input_names = []
         for param in inspect.signature(self.model.forward).parameters:
             if param in example_inputs:
                 if param == "past_key_values":
                     for i in range(len(example_inputs["past_key_values"])):
-                        input_names.append(f"past_key.{i}")
-                        input_names.append(f"past_value.{i}")
+                        if len(example_inputs["past_key_values"][0]) == 2:
+                            input_names.extend([f"past_key.{i}", f"past_value.{i}"])
+                        elif len(example_inputs["past_key_values"][0]) == 4:
+                            input_names.extend(
+                                [
+                                    f"past_key_self.{i}",
+                                    f"past_value_self.{i}",
+                                    f"past_key_cross.{i}",
+                                    f"past_value_cross.{i}",
+                                ]
+                            )
+                        else:
+                            raise ValueError(
+                                f"Unknown shape of past_key_values! Expected length of past_key_values for each layer to be either 2 or 4 but got {len(example_inputs['past_key_values'][0])}"
+                            )
                 else:
                     input_names.append(param)
 
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 1f172fa54..ccad5e020 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -76,6 +76,15 @@
     Starcoder2ForCausalLM,
     Starcoder2Model,
 )
+from transformers.models.whisper.modeling_whisper import (
+    WhisperAttention,
+    WhisperDecoder,
+    WhisperDecoderLayer,
+    WhisperEncoder,
+    WhisperForConditionalGeneration,
+    WhisperModel,
+    WhisperPositionalEmbedding,
+)
 
 from QEfficient.customop import CustomRMSNormAIC
 
@@ -134,6 +143,15 @@
     QEffStarcoder2ForCausalLM,
     QEffStarcoder2Model,
 )
+from .models.whisper.modeling_whisper import (
+    QEffWhisperAttention,
+    QEffWhisperDecoder,
+    QEffWhisperDecoderLayer,
+    QEffWhisperEncoder,
+    QEffWhisperForConditionalGeneration,
+    QEffWhisperModel,
+    QEffWhisperPositionalEmbedding,
+)
 
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
@@ -158,6 +176,7 @@
         Starcoder2ForCausalLM.__name__,
         GPTBigCodeForCausalLM.__name__,
         MllamaForCausalLM.__name__,
+        WhisperForConditionalGeneration.__name__,
     ]
 )
 
@@ -242,6 +261,14 @@
     GPTBigCodeAttention: QEffGPTBigCodeAttention,
     GPTBigCodeBlock: QEffGPTBigCodeBlock,
     GPTBigCodeModel: QEffGPTBigCodeModel,
+    # Whisper encoder and decoder layers
+    WhisperAttention: QEffWhisperAttention,
+    WhisperDecoderLayer: QEffWhisperDecoderLayer,
+    WhisperEncoder: QEffWhisperEncoder,
+    WhisperDecoder: QEffWhisperDecoder,
+    WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding,
+    WhisperModel: QEffWhisperModel,
+    WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration,
 }
 
 
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 7440a3d82..b8b5981cd 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -18,6 +18,7 @@
     AutoModel,
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
+    AutoModelForSpeechSeq2Seq,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
     TextStreamer,
@@ -27,7 +28,12 @@
 from QEfficient.base.modeling_qeff import QEFFBaseModel
 from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.generation.text_generation_inference import CloudAI100ExecInfoNew, PerfMetrics, get_compilation_dims
+from QEfficient.generation.text_generation_inference import (
+    CloudAI100ExecInfoNew,
+    PerfMetrics,
+    calculate_latency,
+    get_compilation_dims,
+)
 from QEfficient.transformers.models.pytorch_transforms import (
     CustomOpsTransform,
     KVCacheModuleMethodMapperTransform,
@@ -91,6 +97,36 @@ def model_name(self) -> str:
         return mname
 
 
+class MultimodalUtilityMixin:
+    def __new__(cls, *args, **kwargs):
+        if cls is MultimodalUtilityMixin:
+            raise TypeError(f"only children of '{cls.__name__}' may be instantiated")
+        return object.__new__(cls)
+
+    def auto_correct_inputs(self, inputs):
+        checked = True
+        inputs_info = self.model.get_inputs_info()
+        for valid_input_info in inputs_info:
+            if valid_input_info.name not in inputs:
+                checked = False
+                break
+            if inputs[valid_input_info.name].dtype != valid_input_info.datatype:
+                checked = False
+                break
+
+        if not checked:
+            err_str: str = (
+                "Expected following input names and shapes to be passed\n"
+                + "\n".join([val.__repr__() for val in inputs_info])
+                + "\ngot"
+                + f"{[(k, v.shape, v.dtype) for k, v in inputs.items()]}"
+            )
+
+            raise RuntimeError(err_str)
+
+        return {k: v for k, v in inputs.items() if k in [iinfo.name for iinfo in inputs_info]}
+
+
 class QEFFAutoModel(QEFFTransformersBase):
     """
     The QEFFAutoModel class is designed for manipulating any transformer model from the HuggingFace hub.
@@ -791,7 +827,7 @@ def kv_offload_generate(
         )
 
 
-class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase):
+class _QEFFAutoModelForImageTextToTextSingleQPC(QEFFTransformersBase, MultimodalUtilityMixin):
     _hf_auto_class = AutoModelForImageTextToText
     _pytorch_transforms = [
         AwqToMatmulNbitsTransform,
@@ -958,29 +994,6 @@ def generate(
             inputs=inputs, device_ids=device_ids, generation_len=generation_len, streamer=streamer
         )
 
-    def auto_correct_inputs(self, inputs):
-        checked = True
-        inputs_info = self.model.get_inputs_info()
-        for valid_input_info in inputs_info:
-            if valid_input_info.name not in inputs:
-                checked = False
-                break
-            if inputs[valid_input_info.name].dtype != valid_input_info.datatype:
-                checked = False
-                break
-
-        if not checked:
-            err_str: str = (
-                "Expected following input names and shapes to be passed\n"
-                + "\n".join([val.__repr__() for val in inputs_info])
-                + "got"
-                + f"{[(k, v.shape, v.dtype) for k, v in inputs.items()]}"
-            )
-
-            raise RuntimeError(err_str)
-
-        return {k: v for k, v in inputs.items() if k in [iinfo.name for iinfo in inputs_info]}
-
     def cloud_ai_100_generate(
         self,
         inputs: torch.Tensor,
@@ -1566,3 +1579,213 @@ def generate(
             )
         else:
             raise NotImplementedError("Only AI_100 runtime is supported right now via generate API")
+
+
+class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin):
+    """
+    The QEFFAutoModelForSpeechSeq2Seq class is designed for transformers models with a sequence-to-sequence speech-to-text modeling head, including Whisper and other Encoder-Decoder speech models.
+    Although it is possible to initialize the class directly, we highly recommend using the ``from_pretrained`` method for initialization.
+
+    ``Mandatory`` Args:
+        :model (nn.Module): PyTorch model
+
+    .. code-block:: python
+
+        from QEfficient import QEFFAutoModelForSpeechSeq2Seq
+        from processors import AutoProcessor
+
+        # Initialize the model using from_pretrained similar to transformers.AutoModelForSpeechSeq2Seq.
+        model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained("model_name")
+
+        # Now you can directly compile the model for Cloud AI 100
+        model.compile(num_cores=16, device_group=[0])  # Considering you have a Cloud AI 100 SKU
+
+        #prepare inputs
+        processor = AutoProcessor.from_pretrained(model_name)
+        input_audio, sample_rate = [...] # audio data loaded in via some external audio package, such as librosa or soundfile
+        input_features = (
+            processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32)
+        )
+        decoder_input_ids = (
+            torch.ones((batch_size, 1), dtype=torch.int64) * model.model.config.decoder_start_token_id
+        ).numpy()
+        decoder_position_ids = torch.arange(1, dtype=torch.int64).view(1, 1).repeat(batch_size, 1).numpy()
+        inputs = dict(
+            input_features=input_features,
+            decoder_input_ids=decoder_input_ids,
+            decoder_position_ids=decoder_position_ids,
+        )
+
+        # You can now execute the model
+        model.generate(inputs, generation_len=150)
+    """
+
+    _hf_auto_class = AutoModelForSpeechSeq2Seq
+    _pytorch_transforms = [CustomOpsTransform, AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, KVCacheTransform]
+    _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
+
+    def __init__(self, model: nn.Module, **kwargs):
+        model_class_name = model.__class__.__name__
+        if not (model_class_name.endswith("ForConditionalGeneration")):
+            raise TypeError(f"Required pytorch module with ForConditionalGeneration, got {model_class_name}")
+
+        super().__init__(model)
+        self.model.config.use_cache = True
+        self.num_layers = model.config.num_hidden_layers
+
+    @property
+    def model_hash(self) -> str:
+        # NOTE: model_config.to_diff_dict() has "_name_or_path" attribute which is the model card name or path.
+        # Using same card name will result in same hash. But, using a relative path for one run and
+        # absolute path for another run will result in different hash.
+        # The added complexity to resolve different paths to same location is not worth pursuing.
+        # Instead, advise the user to always provide same relative paths or absolute paths for local models.
+
+        # Compute the hash with: model_config, transforms
+        mhash = hashlib.sha256()
+        mhash.update(to_hashable(self.model.config.to_diff_dict()))
+        mhash.update(to_hashable(self._transform_names()))
+        mhash = mhash.hexdigest()[:16]
+        return mhash
+
+    def export(self, export_dir: Optional[str] = None) -> str:
+        """
+        Exports the model to ``ONNX`` format using ``torch.onnx.export``.
+
+        ``Optional`` Args:
+        :export_dir (str, optional): The directory path to store ONNX-graph.
+
+        Returns:
+            :str: Path of the generated ``ONNX`` graph.
+        """
+        inputs = self.model.get_dummy_inputs()
+        dynamic_axes = self.model.get_onnx_dynamic_axes()
+        output_names = self.model.get_output_names()
+        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+
+    def compile(
+        self,
+        onnx_path: Optional[str] = None,
+        compile_dir: Optional[str] = None,
+        *,
+        encoder_ctx_len: int = 1500,
+        decoder_ctx_len: int = 150,
+        feature_len: int = 3000,
+        batch_size: int = 1,
+        num_devices: int = 1,
+        num_cores: int = 16,  # FIXME: Make this mandatory arg
+        mxfp6_matmul: bool = False,
+        **compiler_options,
+    ) -> str:
+        """
+        This method compiles the exported ``ONNX`` model using the Cloud AI 100 Platform SDK compiler binary found at ``/opt/qti-aic/exec/qaic-exec`` and generates a ``qpc`` package.
+        If the model has not been exported yet, this method will handle the export process.
+        You can pass any other arguments that the `qaic-exec` takes as extra kwargs.
+
+        ``Optional`` Args:
+            :onnx_path (str, optional): Path to pre-exported onnx model.
+            :compile_dir (str, optional): Path for saving the qpc generated.
+            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
+            :batch_size (int, optional): Batch size. ``Defaults to 1``.
+            :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
+            :num_cores (int): Number of cores used to compile the model.
+            :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
+            :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+        Returns:
+            :str: Path of the compiled ``qpc`` package.
+        """
+        specializations = self.model.get_specializations(batch_size, encoder_ctx_len, decoder_ctx_len, feature_len)
+
+        self._compile(
+            onnx_path,
+            compile_dir,
+            compile_only=True,
+            retained_state=True,
+            specializations=specializations,
+            convert_to_fp16=True,
+            mxfp6_matmul=mxfp6_matmul,
+            mdp_ts_num_devices=num_devices,
+            aic_num_cores=num_cores,
+            **compiler_options,
+        )
+
+    def generate(
+        self,
+        inputs: torch.Tensor,
+        generation_len: int,
+        streamer: Optional[TextStreamer] = None,
+        enable_debug_logs: bool = False,
+        device_ids: List[int] = None,
+    ) -> Union[torch.Tensor, np.ndarray]:
+        """
+        This method generates output until ``endoftranscript`` or ``generation_len`` by executing the compiled ``qpc`` on ``Cloud AI 100`` Hardware cards.
+        This is a sequential execution based on the ``batch_size`` of the compiled model and the number of audio tensor passed.
+
+        ``Mandatory`` Args:
+            :processor: autoprocessor to process inputs and decode logits
+            :inputs (np.ndarray): inputs to run the execution.
+            :generation_len (int): length upto which to generate
+            :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
+            :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
+        Returns:
+            :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
+        """
+        if not isinstance(self.qpc_path, Path):
+            raise TypeError("Please run compile API first!")
+
+        inputs = self.auto_correct_inputs(inputs)
+
+        if self.qpc_session is None:
+            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids, enable_debug_logs=enable_debug_logs)
+            self.batch_size = self.qpc_session.bindings[0].dims[0]
+
+        self.qpc_session.skip_buffers(
+            [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")]
+        )
+
+        outputs = {
+            "logits": np.random.randn(self.batch_size, 1, self.model.config.vocab_size).astype(np.float32),
+        }
+        self.qpc_session.set_buffers(outputs)
+
+        # encoder run
+        start = perf_counter()
+        outputs = self.qpc_session.run(inputs)
+
+        # array to hold generated tokens
+        generated_ids = np.full((self.batch_size, generation_len + 1), self.model.config.eos_token_id)
+        generated_ids[:, 0] = [self.model.config.decoder_start_token_id]
+        logits = outputs["logits"]
+        next_token = logits.argmax(-1)
+        generated_ids[:, 1] = next_token.squeeze(1)
+
+        if streamer:
+            streamer.put(next_token)
+
+        inputs["input_features"] = np.zeros((self.batch_size, self.model.config.num_mel_bins, 1)).astype(np.float32)
+
+        loop_start = perf_counter()
+        for num_tokens in range(generation_len):
+            outputs = self.qpc_session.run(inputs)
+            logits = outputs["logits"]
+            next_token = logits.argmax(-1)
+            generated_ids[:, num_tokens + 1] = next_token.squeeze(1)
+
+            if next_token[0][0] == self.model.config.eos_token_id:
+                break
+
+            inputs["decoder_input_ids"] = next_token
+            inputs["decoder_position_ids"] += 1
+
+            if streamer:
+                streamer.put(next_token)
+        end = perf_counter()
+
+        prefill_time, decode_perf, total_perf, total_time = calculate_latency(num_tokens, loop_start, start, end)
+
+        return CloudAI100ExecInfoNew(
+            batch_size=self.batch_size,
+            generated_ids=generated_ids,
+            perf_metrics=PerfMetrics(prefill_time, decode_perf, total_perf, total_time),
+        )
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 4b475515e..10f4c448b 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -108,6 +108,15 @@
     Starcoder2ForCausalLM,
     Starcoder2Model,
 )
+from transformers.models.whisper.modeling_whisper import (
+    WhisperAttention,
+    WhisperDecoder,
+    WhisperDecoderLayer,
+    WhisperEncoder,
+    WhisperForConditionalGeneration,
+    WhisperModel,
+    WhisperPositionalEmbedding,
+)
 
 from QEfficient.base.pytorch_transforms import ModuleMappingTransform, ModuleMethodMapperTransform
 from QEfficient.customop import CustomRMSNormAIC, GemmaCustomRMSNormAIC
@@ -227,6 +236,15 @@
     QEffStarcoder2ForCausalLM,
     QEffStarcoder2Model,
 )
+from QEfficient.transformers.models.whisper.modeling_whisper import (
+    QEffWhisperAttention,
+    QEffWhisperDecoder,
+    QEffWhisperDecoderLayer,
+    QEffWhisperEncoder,
+    QEffWhisperForConditionalGeneration,
+    QEffWhisperModel,
+    QEffWhisperPositionalEmbedding,
+)
 from QEfficient.transformers.spd.causal_lm_forward import tlm_forward
 
 
@@ -337,6 +355,14 @@ class KVCacheTransform(ModuleMappingTransform):
         GPTBigCodeBlock: QEffGPTBigCodeBlock,
         GPTBigCodeModel: QEffGPTBigCodeModel,
         GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM,
+        # Whisper encoder and decoder layers
+        WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding,
+        WhisperAttention: QEffWhisperAttention,
+        WhisperDecoderLayer: QEffWhisperDecoderLayer,
+        WhisperEncoder: QEffWhisperEncoder,
+        WhisperDecoder: QEffWhisperDecoder,
+        WhisperModel: QEffWhisperModel,
+        WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration,
     }
 
     @classmethod
diff --git a/QEfficient/transformers/models/whisper/__init__.py b/QEfficient/transformers/models/whisper/__init__.py
new file mode 100644
index 000000000..1e995e39d
--- /dev/null
+++ b/QEfficient/transformers/models/whisper/__init__.py
@@ -0,0 +1,7 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
new file mode 100644
index 000000000..b4431243a
--- /dev/null
+++ b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -0,0 +1,880 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+import random
+from typing import Optional, Tuple
+
+import numpy as np
+import torch
+from torch import nn
+from transformers.cache_utils import Cache, EncoderDecoderCache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.modeling_outputs import (
+    BaseModelOutputWithCrossAttentions,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqModelOutput,
+)
+from transformers.models.whisper.modeling_whisper import (
+    WhisperAttention,
+    WhisperDecoder,
+    WhisperDecoderLayer,
+    WhisperEncoder,
+    WhisperForConditionalGeneration,
+    WhisperModel,
+    WhisperPositionalEmbedding,
+    logger,
+)
+
+from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
+from QEfficient.utils._utils import IOInfo
+
+
+class QEffWhisperPositionalEmbedding(WhisperPositionalEmbedding):
+    def forward(self, input_ids, past_key_values_length=0):
+        return self.weight[past_key_values_length, :]
+
+
+class QEffWhisperAttention(WhisperAttention):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper
+    Copied from WhisperAttention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
+    The only differences are:
+    - attention weights computation updated to handle overflow in fp16 computation
+    - add new args cache idx for the kv retention
+    - manually takes is_cross_attention instead of using key_value_states to determine
+    - added torch.where based on new argument input_features to determine if we compute cross attentions or use old values
+    """
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids_layer: torch.Tensor = None,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        is_cross_attention: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
+
+        if self.is_decoder:
+            if is_cross_attention and past_key_value:
+                # cross_attentions
+                key_states_old = past_key_value[self.layer_idx][0]
+                value_states_old = past_key_value[self.layer_idx][1]
+                key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+                value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+                indices = (torch.arange(bsz),)
+                key_states_new = torch.index_put(key_states_old, indices, key_states)
+                value_states_new = torch.index_put(value_states_old, indices, value_states)
+
+                # Select old or new image KV states based on q_len
+                key_states = torch.where(input_features.shape[2] == torch.tensor(1), key_states_old, key_states_new)
+                value_states = torch.where(
+                    input_features.shape[2] == torch.tensor(1), value_states_old, value_states_new
+                )
+
+                past_key_value.key_cache[self.layer_idx] = key_states
+                past_key_value.value_cache[self.layer_idx] = value_states
+            else:
+                # self attention decoder
+                key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+                value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+                if past_key_value is not None:
+                    cache_kwargs = {"position_ids": position_ids_layer}
+                    key_states, value_states = past_key_value.update(
+                        key_states, value_states, self.layer_idx, cache_kwargs
+                    )
+        else:
+            # self_attention Encoder
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        src_len = key_states.size(2)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3))
+
+        if tuple(attn_weights.size()) != (bsz, self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            # updated to use torch.where, to prevent overflow in fp16 computation
+            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if tuple(attn_output.size()) != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned aross GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return [attn_output, attn_weights, past_key_value]
+
+
+class QEffWhisperDecoderLayer(WhisperDecoderLayer):
+    """
+    Copied from WhisperAttention: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
+    The only differences are:
+    - self attention and cross attention caches are explicitly picked before entering attention blocks
+    - use passed argument is_encoder_decoder instead of encoder_hidden_states
+    - added input_features argument to pass forward to attention
+    """
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids_layer: torch.Tensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        is_encoder_decoder: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size *(decoder_attention_heads,)*.
+            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        self_attn_past_key_value = past_key_value.self_attention_cache if past_key_value is not None else None
+        hidden_states, self_attn_weights, self_attn_present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            position_ids_layer=position_ids_layer,
+            cache_position=cache_position,
+            input_features=input_features,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if is_encoder_decoder:
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+            cross_attn_past_key_value = past_key_value.cross_attention_cache if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+                position_ids_layer=position_ids_layer,
+                input_features=input_features,
+                is_cross_attention=True,  # explicitly pass this argument, instead of figuring it out form key_value_states
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+
+            # update the cached past_key_values accordingly
+            past_key_value.self_attention_cache = self_attn_present_key_value
+            past_key_value.cross_attention_cache = cross_attn_present_key_value
+        else:
+            # if no cross_attention, still need to update self_attn cache
+            past_key_value = self_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (past_key_value,)
+
+        return outputs
+
+
+class QEffWhisperEncoder(WhisperEncoder):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`WhisperEncoderLayer`].
+    Copied from WhisperEncoder: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
+    The only differences are:
+    - can run with input features of length 1, which is needed for exporting with torch.where
+
+    Args:
+        config: WhisperConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def forward(
+        self,
+        input_features,
+        head_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
+                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
+                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
+                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
+                `input_features`, the [`WhisperFeatureExtractor`] should be used for extracting the mel features,
+                padding and conversion into a tensor of type `torch.FloatTensor`. See
+                [`~WhisperFeatureExtractor.__call__`]
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            cross_key_values:
+                Torch.tensor to give shape of cross_attention_values
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
+        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
+
+        inputs_embeds = inputs_embeds.permute(0, 2, 1)
+        embed_pos = self.embed_positions.weight
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (len(self.layers)), (
+                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            )
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        None,
+                        (head_mask[idx] if head_mask is not None else None),
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        None,
+                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states] + [encoder_states, all_attentions] if v is not None)
+
+        return BaseModelOutputWithCrossAttentions(
+            last_hidden_state=hidden_states,
+            cross_attentions=None,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
+        )
+
+
+class QEffWhisperDecoder(WhisperDecoder):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`WhisperDecoderLayer`]
+    Copied form WhisperDecoder: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
+    The only differences are:
+    - Added position_ids as argument to attention, and removed attention_mask
+    - Added is_encoder_decoder input to determine whether Decoder is part of Encoder-Decoder or standalone
+    - Added input_features as input to pass forward to attention
+
+    Args:
+        config: WhisperConfig
+    """
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        position_ids=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        is_encoder_decoder: Optional[bool] = False,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`WhisperTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
+                on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
+                shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
+                `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
+                control over how to convert `input_ids` indices into associated vectors than the model's internal
+                embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        position = position_ids
+        input_shape = input_ids.size()
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        if use_cache or past_key_values is not None:
+            if not isinstance(past_key_values, Cache):
+                return_legacy_cache = True
+                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        if cache_position is None:
+            cache_position = position_ids
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            position_ids,
+            past_key_values.self_attention_cache,
+            output_attentions,
+        )
+
+        # embed positions
+        positions = self.embed_positions(input_ids, past_key_values_length=position)
+        hidden_states = inputs_embeds + positions
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                assert attn_mask.size()[0] == (len(self.layers)), (
+                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+            if self.training and (dropout_probability < self.layerdrop):
+                continue
+
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    logger.warning(
+                        "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
+                    )
+                    use_cache = False
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, use_cache)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    None,  # encoder attention mask
+                    head_mask[idx] if head_mask is not None else None,
+                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                    None,  # past_key_value
+                    cache_position,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=causal_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    cross_attn_layer_head_mask=(
+                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
+                    ),
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    position_ids_layer=position_ids,
+                    cache_position=cache_position,
+                    input_features=input_features,
+                    is_encoder_decoder=is_encoder_decoder,
+                )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if return_legacy_cache:
+            next_cache = next_cache.to_legacy_cache()
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+            else:
+                causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+
+class QEffWhisperModel(WhisperModel):
+    """
+    Transformer encoder-decoder model
+    Copied from WhisperModel: https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/modeling_whisper.py
+    The only differences are:
+    - Added input_features as input to pass forward to attention
+    """
+
+    def forward(
+        self,
+        input_features=None,
+        attention_mask=None,
+        decoder_input_ids=None,
+        decoder_position_ids=None,
+        decoder_attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        encoder_outputs=None,
+        past_key_values=None,
+        decoder_inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_features,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )[0]
+
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            position_ids=decoder_position_ids,
+            input_features=input_features,
+            is_encoder_decoder=True,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=None,
+            encoder_last_hidden_state=None,
+            encoder_hidden_states=None,
+            encoder_attentions=None,
+        )
+
+
+class QEffWhisperForConditionalGeneration(WhisperForConditionalGeneration):
+    """
+    Encoder-decoder model with LM head for automatic speech recognition
+
+    The only differences are:
+    - Added get_dummy_inputs, get_onnx_dynamic_axes, get_output_names for AutoModel export
+    """
+
+    def get_dummy_inputs(
+        self,
+    ):
+        bs = 1
+        seq_len = 32
+        encoder_seq_len = self.config.max_source_positions
+        encoder_feature_count = self.config.num_mel_bins
+        num_key_value_heads = self.config.decoder_attention_heads
+        head_dim = self.config.d_model // num_key_value_heads
+        num_layers = self.config.num_hidden_layers
+
+        inputs = {
+            "input_features": torch.zeros((bs, encoder_feature_count, 1), dtype=torch.float32),
+            "decoder_input_ids": torch.zeros((bs, seq_len), dtype=torch.int64),
+            "decoder_position_ids": torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1),
+            "past_key_values": [[] for _ in range(num_layers)],
+        }
+
+        kv_cache_shape = (bs, num_key_value_heads, seq_len, head_dim)
+        kv_cross_cache_shape = (bs, num_key_value_heads, encoder_seq_len, head_dim)
+
+        for i in range(num_layers):
+            for self_cross in ["self", "cross"]:
+                for kv in ["key", "value"]:
+                    inputs["past_key_values"][i].append(
+                        torch.zeros(
+                            kv_cache_shape if self_cross == "self" else kv_cross_cache_shape, dtype=torch.float32
+                        )
+                    )
+
+        return inputs
+
+    def get_specializations(
+        self, batch_size: int, encoder_ctx_len: int, decoder_ctx_len: int, feature_len: int, **compiler_options
+    ):
+        encoder_specializations = {
+            "batch_size": batch_size,
+            "seq_len": 1,
+            "encoder_ctx_len": encoder_ctx_len,
+            "decoder_ctx_len": decoder_ctx_len,
+            "feature_len": feature_len,
+        }
+
+        decoder_specializations = {
+            "batch_size": batch_size,
+            "seq_len": 1,
+            "encoder_ctx_len": encoder_ctx_len,
+            "decoder_ctx_len": decoder_ctx_len,
+            "feature_len": 1,  # important dummy feature so that torch.where knows whether to run cross attention or not
+        }
+
+        specializations = [encoder_specializations, decoder_specializations]
+
+        return specializations
+
+    def get_onnx_dynamic_axes(
+        self,
+    ):
+        num_layers = self.config.num_hidden_layers
+
+        dynamic_axes = {
+            "input_features": {0: "batch_size", 2: "feature_len"},
+            "decoder_input_ids": {0: "batch_size", 1: "seq_len"},
+            "decoder_position_ids": {0: "batch_size", 1: "seq_len"},
+        }
+        pkv_self_dynamic_axes = {
+            0: "batch_size",
+            2: "decoder_ctx_len",
+        }
+        pkv_cross_dynamic_axes = {
+            0: "batch_size",
+            2: "encoder_ctx_len",
+        }
+        for i in range(num_layers):
+            for self_cross in ["self", "cross"]:
+                for kv in ["key", "value"]:
+                    dynamic_axes[f"past_{kv}_{self_cross}.{i}"] = (
+                        pkv_self_dynamic_axes if self_cross == "self" else pkv_cross_dynamic_axes
+                    )
+
+        return dynamic_axes
+
+    def get_output_names(
+        self,
+    ):
+        output_names = ["logits"]
+        for i in range(self.config.num_hidden_layers):
+            for self_cross in ["self", "cross"]:
+                for kv in ["key", "value"]:
+                    output_names.append(f"past_{kv}_{self_cross}.{i}_RetainedState")
+        return output_names
+
+    def get_inputs_info(self):
+        return [
+            IOInfo(name="input_features", datatype=np.float32, shape=("batch_size", "num_mel_bins", "feature_len")),
+            IOInfo(name="decoder_input_ids", datatype=np.int64, shape=("batch_size", "seq_len")),
+            IOInfo(name="decoder_position_ids", datatype=np.int64, shape=("batch_size", "seq_len")),
+        ]
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 6e70226f3..8344a053d 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -15,7 +15,7 @@
 import torch
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
@@ -188,6 +188,30 @@ def load_hf_tokenizer(
     return tokenizer
 
 
+def load_hf_processor(
+    pretrained_model_name_or_path: str,
+    cache_dir: Optional[str] = None,
+    hf_token: Optional[str] = None,
+    **kwargs,
+) -> Union[PreTrainedTokenizerFast, PreTrainedTokenizer]:
+    logger.info("Loading Processor")
+    if hf_token is not None:
+        login(hf_token)
+    # Download tokenizer along with model if it doesn't exist
+    model_hf_path = (
+        pretrained_model_name_or_path
+        if os.path.isdir(pretrained_model_name_or_path)
+        else hf_download(
+            repo_id=pretrained_model_name_or_path,
+            cache_dir=cache_dir,
+            allow_patterns=["*.json", "*.py", "*token*", "*.txt"],
+        )
+    )
+    processor = AutoProcessor.from_pretrained(model_hf_path, trust_remote_code=True, **kwargs)
+
+    return processor
+
+
 def get_qpc_dir_path(
     model_card_name,
     num_cores,
diff --git a/examples/speech_to_text/README.md b/examples/speech_to_text/README.md
new file mode 100644
index 000000000..4b091347b
--- /dev/null
+++ b/examples/speech_to_text/README.md
@@ -0,0 +1,21 @@
+# Speech Seq2Seq
+This directory contains an example script of how to use the AutoModelForSpeechSeq2Seq class. (for now, Whisper models on audio <30 seconds only has been validated)
+
+## Required packages:
+- `librosa==0.10.2`
+- `soundfile==0.13.1`
+
+You can install them using pip:
+```sh
+pip install librosa==0.10.2 soundfile==0.13.1
+```
+
+To run example script after package installations:
+```sh
+python speech_seq2seq_models.py
+```
+
+Expected output for given data sample:
+```sh
+<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>
+```
\ No newline at end of file
diff --git a/examples/speech_to_text/run_whisper_speech_to_text.py b/examples/speech_to_text/run_whisper_speech_to_text.py
new file mode 100644
index 000000000..d74e94a2a
--- /dev/null
+++ b/examples/speech_to_text/run_whisper_speech_to_text.py
@@ -0,0 +1,52 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from transformers import AutoProcessor
+
+from QEfficient import QEFFAutoModelForSpeechSeq2Seq
+
+base_model_name = "openai/whisper-tiny"
+ctx_len = 25
+
+## STEP 1 -- load audio sample, using a standard english dataset, can load specific files if longer audio needs to be tested; also load initial processor
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+data = ds[0]["audio"]["array"]
+# reshape to so shape corresponds to data with batch size 1
+data = data.reshape(-1)
+sample_rate = ds[0]["audio"]["sampling_rate"]
+processor = AutoProcessor.from_pretrained(base_model_name)
+
+## STEP 2 -- init base model
+qeff_model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained(base_model_name)
+
+## STEP 3 -- export and compile model
+qeff_model.compile()
+
+## STEP 4 -- prepare generate inputs
+bs = 1
+seq_len = 1
+input_features = (
+    processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32)
+)
+decoder_input_ids = (
+    torch.ones((bs, seq_len), dtype=torch.int64) * qeff_model.model.config.decoder_start_token_id
+).numpy()
+decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1).numpy()
+inputs = dict(
+    input_features=input_features,
+    decoder_input_ids=decoder_input_ids,
+    decoder_position_ids=decoder_position_ids,
+)
+
+## STEP 5 -- generate output for loaded input and processor
+exec_info = qeff_model.generate(inputs=inputs, generation_len=ctx_len)
+
+## STEP 6 (optional) -- use processor to decode output
+print(processor.batch_decode(exec_info.generated_ids)[0])
diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index eafe29fd1..34d91587b 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -23,6 +23,7 @@ pipeline {
                    pip install --upgrade pip setuptools &&
                    pip install .[test] &&
                    pip install junitparser pytest-xdist &&
+                   pip install librosa==0.10.2 soundfile==0.13.1 && #packages needed to load example for whisper testing
                    rm -rf QEfficient"
                '''
            }
diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py
new file mode 100644
index 000000000..af83c9354
--- /dev/null
+++ b/tests/transformers/models/test_speech_seq2seq_models.py
@@ -0,0 +1,373 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import os
+from importlib import reload
+from typing import List
+
+import numpy as np
+import onnx
+import onnxruntime
+import pytest
+import torch
+import transformers
+from datasets import load_dataset
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForSpeechSeq2Seq
+from QEfficient.transformers.quantizers.auto import replace_transformers_quantizers
+from QEfficient.utils import get_padding_shape_from_config, hf_download
+from QEfficient.utils._utils import load_hf_processor
+from QEfficient.utils.constants import Constants
+from QEfficient.utils.device_utils import get_available_device_id
+
+test_models = [
+    "openai/whisper-tiny",
+]
+
+
+def load_seq2seq_model(model_config):
+    """
+    Function to load model from huggingface and transform to KV model
+    --------
+
+    :model_config: Dict
+
+    :return model_hf, params
+    """
+    model_path = hf_download(
+        repo_id=model_config["model_name"],
+        ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"],
+    )
+    model_hf = AutoModelForSpeechSeq2Seq.from_pretrained(
+        model_path,
+        use_cache=True,
+        num_hidden_layers=model_config["n_layer"],
+        attn_implementation="eager",
+        low_cpu_mem_usage=False,
+    )  # Run models for single layers only
+    params = sum(p.numel() for p in model_hf.parameters())
+    model_hf.eval()
+    return model_hf, params
+
+
+def run_seq2seq_pytorch_hf(
+    model, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int, generation_len: int
+) -> List[str]:
+    """
+    Run pytorch inference on model
+
+    ``Mandatory`` Args:
+        :model: The transformed PyTorch model used for generating transcripts
+        :processor: autoprocessor to process inputs and decode logits
+        :inputs (np.ndarray): inputs to run the execution.
+        :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
+        :generation_len (int): length upto which to generate
+
+    Returns:
+        torch.Tensor: A list of output features generated by the model for each prompt.
+    """
+    seq_len = 1
+    batch_size = 1
+
+    # prepare inputs
+    input_features = processor(inputs, sampling_rate=sample_rate, return_tensors="pt").input_features
+    decoder_input_ids = torch.ones((batch_size, seq_len), dtype=torch.int64) * model.config.decoder_start_token_id
+    decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(batch_size, 1)
+
+    model_inputs = dict(
+        input_features=input_features,
+        decoder_input_ids=decoder_input_ids,
+        decoder_position_ids=decoder_position_ids,
+    )
+
+    # TODO: temporary hack to nullify effect of KVCacheTransform add this as setup_module in pytest
+    reload(transformers.cache_utils)
+    # encoder run
+    outputs = model(**model_inputs)
+
+    # array to hold generated tokens
+    generated_ids = np.full((batch_size, generation_len + 1), processor.tokenizer.pad_token_id)
+    generated_ids[:, 0] = [model.config.decoder_start_token_id]
+    logits = outputs["logits"]
+    next_token = logits.argmax(-1)
+    generated_ids[:, 1] = next_token.squeeze(1)
+
+    model_inputs["encoder_outputs"] = outputs["encoder_last_hidden_state"]
+    model_inputs["past_key_values"] = outputs["past_key_values"]
+
+    for num_tokens in range(generation_len):
+        outputs = model(**model_inputs)
+        logits = outputs["logits"]
+        next_token = logits.argmax(-1)
+        generated_ids[:, num_tokens + 1] = next_token.squeeze(1)
+
+        if next_token[0][0] == processor.tokenizer.eos_token_id:
+            break
+
+        model_inputs["decoder_input_ids"] = next_token
+        model_inputs["decoder_position_ids"] += 1
+        model_inputs["past_key_values"] = outputs["past_key_values"]
+
+    return generated_ids[0]
+
+
+def run_seq2seq_pytorch_with_kv(
+    model, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int, generation_len: int
+) -> List[str]:
+    """
+    Run pytorch inference on model
+
+    ``Mandatory`` Args:
+        :model: The transformed PyTorch model used for generating transcripts
+        :processor: autoprocessor to process inputs and decode logits
+        :inputs (np.ndarray): inputs to run the execution.
+        :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
+        :generation_len (int): length upto which to generate
+
+    Returns:
+        torch.Tensor: A list of output features generated by the model for each prompt.
+    """
+    seq_len = 1
+    batch_size = 1
+    config = model.model.config
+
+    # prepare inputs
+    input_features = processor(inputs, sampling_rate=sample_rate, return_tensors="pt").input_features
+    decoder_input_ids = torch.ones((batch_size, seq_len), dtype=torch.int64) * config.decoder_start_token_id
+    decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(batch_size, 1)
+
+    model_inputs = dict(
+        input_features=input_features,
+        decoder_input_ids=decoder_input_ids,
+        decoder_position_ids=decoder_position_ids,
+        past_key_values=[[] for _ in range(config.num_hidden_layers)],
+    )
+
+    # prepare dummy past kvs and cross kvs
+    kv_cache_shape = get_padding_shape_from_config(config, batch_size, generation_len)
+    kv_cross_cache_shape = get_padding_shape_from_config(config, batch_size, config.max_source_positions)
+
+    for i in range(config.num_hidden_layers):
+        for self_cross in ["self", "cross"]:
+            for kv in ["key", "value"]:
+                model_inputs["past_key_values"][i].append(
+                    torch.zeros(kv_cache_shape if self_cross == "self" else kv_cross_cache_shape, dtype=torch.float32)
+                )
+
+    # encoder run
+    outputs = model.model(**model_inputs)
+
+    # array to hold generated tokens
+    generated_ids = np.full((batch_size, generation_len + 1), processor.tokenizer.pad_token_id)
+    generated_ids[:, 0] = [config.decoder_start_token_id]
+    logits = outputs["logits"]
+    next_token = logits.argmax(-1)
+    generated_ids[:, 1] = next_token.squeeze(1)
+
+    model_inputs["input_features"] = torch.tensor(
+        np.random.randn(batch_size, config.num_mel_bins, 1).astype(np.float32)
+    )
+    model_inputs["past_key_values"] = outputs["past_key_values"]
+
+    for num_tokens in range(generation_len):
+        outputs = model.model(**model_inputs)
+        logits = outputs["logits"]
+        next_token = logits.argmax(-1)
+        generated_ids[:, num_tokens + 1] = next_token.squeeze(1)
+
+        if next_token[0][0] == processor.tokenizer.eos_token_id:
+            break
+
+        model_inputs["decoder_input_ids"] = next_token
+        model_inputs["decoder_position_ids"] += 1
+        model_inputs["past_key_values"] = outputs["past_key_values"]
+
+    return generated_ids[0]
+
+
+def run_seq2seq_ort(
+    onnx_path, config, processor: AutoProcessor, inputs: np.ndarray, sample_rate: int, generation_len: int
+) -> List[str]:
+    """
+    Run onnxruntime inference on model
+
+    ``Mandatory`` Args:
+        :model: The transformed PyTorch model used for generating transcripts
+        :processor: autoprocessor to process inputs and decode logits
+        :inputs (np.ndarray): inputs to run the execution.
+        :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
+        :generation_len (int): length upto which to generate
+
+    Returns:
+        torch.Tensor: A list of output features generated by the model for each prompt.
+    """
+    seq_len = 1
+    batch_size = 1
+
+    # Replace invalid index value for INT32 max to 0 using add_initializer
+    m = onnx.load(onnx_path, load_external_data=False)
+    # NOTE: OrtValue objects should be kept around until the session is run, hence this dict is required
+    added_initializers = {}
+    for node in m.graph.node:
+        if node.op_type == "Constant":
+            np_tensor = onnx.numpy_helper.to_array(node.attribute[0].t, os.path.dirname(onnx_path))
+            if len(np_tensor.shape) == 0 and np_tensor.item() == 2147483647:
+                added_initializers[node.output[0]] = onnxruntime.OrtValue.ortvalue_from_numpy(
+                    np.array(0, np_tensor.dtype)
+                )
+
+    session_options = onnxruntime.SessionOptions()
+    for name, value in added_initializers.items():
+        session_options.add_initializer(name, value)
+
+    session = onnxruntime.InferenceSession(onnx_path, session_options)
+
+    # prepare inputs
+    input_features = processor(inputs, sampling_rate=sample_rate, return_tensors="pt").input_features
+    decoder_input_ids = torch.ones((batch_size, seq_len), dtype=torch.int64) * config.decoder_start_token_id
+    decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(batch_size, 1)
+
+    model_inputs = dict(
+        input_features=input_features,
+        decoder_input_ids=decoder_input_ids,
+        decoder_position_ids=decoder_position_ids,
+    )
+
+    # prepare dummy past kvs and cross kvs
+    kv_cache_shape = get_padding_shape_from_config(config, batch_size, generation_len)
+    kv_cross_cache_shape = get_padding_shape_from_config(config, batch_size, config.max_source_positions)
+
+    pkv_names = []
+    for i in range(config.num_hidden_layers):
+        for self_cross in ["self", "cross"]:
+            for kv in ["key", "value"]:
+                pkv_names.append(f"past_{kv}_{self_cross}.{i}_RetainedState")
+                model_inputs[f"past_{kv}_{self_cross}.{i}"] = torch.zeros(
+                    kv_cache_shape if self_cross == "self" else kv_cross_cache_shape, dtype=torch.float32
+                )
+
+    output_names = ["logits"] + pkv_names
+
+    # encoder run
+    outputs = session.run(output_names, {k: v.detach().numpy() for k, v in model_inputs.items()})
+
+    # array to hold generated tokens
+    generated_ids = np.full((batch_size, generation_len + 1), processor.tokenizer.pad_token_id)
+    generated_ids[:, 0] = [config.decoder_start_token_id]
+    logits = outputs[0]
+    next_token = logits.argmax(-1)
+    generated_ids[:, 1] = next_token.squeeze(1)
+
+    model_inputs["input_features"] = torch.tensor(
+        np.random.randn(batch_size, config.num_mel_bins, 1).astype(np.float32)
+    )
+    for i, name in enumerate(pkv_names):
+        model_inputs[name.split("_RetainedState")[0]] = outputs[1 + i]
+
+    for num_tokens in range(generation_len):
+        outputs = session.run(
+            output_names, {k: (v.detach().numpy() if type(v) is torch.Tensor else v) for k, v in model_inputs.items()}
+        )
+        logits = outputs[0]
+        next_token = logits.argmax(-1)
+        generated_ids[:, num_tokens + 1] = next_token.squeeze(1)
+
+        if next_token[0][0] == processor.tokenizer.eos_token_id:
+            break
+
+        model_inputs["decoder_input_ids"] = next_token
+        model_inputs["decoder_position_ids"] += 1
+        for i, name in enumerate(pkv_names):
+            model_inputs[name.split("_RetainedState")[0]] = outputs[1 + i]
+
+    return generated_ids[0]
+
+
+def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
+    model_name: str,
+    ctx_len: int = Constants.CTX_LEN,
+    n_layer: int = 1,
+):
+    """
+    Validate the PyTorch model, the PyTorch model after KV changes, ONNX model and the Cloud AI 100 model
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``whisper``
+        :ctx_len (int): Maximum context length to compile the model.
+        :n_layers (int): Number of layers for the Model.
+    """
+    replace_transformers_quantizers()
+    model_config = {"model_name": model_name}
+    model_config["n_layer"] = n_layer
+
+    model_hf, _ = load_seq2seq_model(model_config)
+
+    processor = load_hf_processor(pretrained_model_name_or_path=model_name)
+    batch_size = 1
+
+    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    data = ds[0]["audio"]["array"]
+    data = data.reshape(-1)
+    sample_rate = ds[0]["audio"]["sampling_rate"]
+
+    pytorch_hf_tokens = run_seq2seq_pytorch_hf(model_hf, processor, data, sample_rate, ctx_len)
+
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model_hf)
+
+    pytorch_kv_tokens = run_seq2seq_pytorch_with_kv(qeff_model, processor, data, sample_rate, ctx_len)
+
+    assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), (
+        "Tokens don't match for HF PyTorch model output and KV PyTorch model output"
+    )
+
+    qeff_model.export()
+
+    ort_tokens = run_seq2seq_ort(qeff_model.onnx_path, qeff_model.model.config, processor, data, sample_rate, ctx_len)
+
+    assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for pytorch output and ort output"
+
+    if not get_available_device_id():
+        pytest.skip("No available devices to run model on Cloud AI 100")
+
+    qeff_model.compile(
+        encoder_ctx_len=qeff_model.model.config.max_source_positions,
+        decoder_ctx_len=ctx_len,
+        num_cores=16,
+        batch_size=batch_size,
+    )
+
+    bs = 1
+    seq_len = 1
+    input_features = (
+        processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32)
+    )
+    decoder_input_ids = (
+        torch.ones((bs, seq_len), dtype=torch.int64) * qeff_model.model.config.decoder_start_token_id
+    ).numpy()
+    decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1).numpy()
+    inputs = dict(
+        input_features=input_features,
+        decoder_input_ids=decoder_input_ids,
+        decoder_position_ids=decoder_position_ids,
+    )
+
+    exec_info = qeff_model.generate(inputs=inputs, generation_len=ctx_len)
+    cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
+    assert (pytorch_kv_tokens == cloud_ai_100_tokens).all(), (
+        "Tokens don't match for pytorch output and Cloud AI 100 output."
+    )
+
+
+@pytest.mark.on_qaic
+@pytest.mark.parametrize("model_name", test_models)
+def test_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name):
+    """
+    Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching.
+    ``Mandatory`` Args:
+        :model_name (str): Hugging Face Model Card name, Example: ``gpt2``
+    """
+    check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=4)
diff --git a/tests/transformers/test_speech_seq2seq.py b/tests/transformers/test_speech_seq2seq.py
new file mode 100644
index 000000000..a41896010
--- /dev/null
+++ b/tests/transformers/test_speech_seq2seq.py
@@ -0,0 +1,144 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# ----------------------------------------------------------------------------
+
+import copy
+from time import perf_counter
+
+import onnx
+import pytest
+from transformers import AutoConfig, AutoModel, AutoModelForSpeechSeq2Seq
+
+from QEfficient.transformers.models.modeling_auto import QEFFAutoModelForSpeechSeq2Seq
+
+configs = [
+    # name, max_source_positions, num_hidden_layers, num_attention_heads, hidden_size, encoder_ffn_dim, vocab_size, additional_params
+    ("whisper", 1500, 4, 6, 384, 1536, 51865, {}),
+]
+
+configs = [
+    AutoConfig.for_model(
+        model_name,
+        max_source_positions=max_source_positions,
+        num_hidden_layers=num_hidden_layers,
+        num_attention_heads=num_attention_heads,
+        hidden_size=hidden_size,
+        encoder_ffn_dim=encoder_ffn_dim,
+        vocab_size=vocab_size,
+        **additional_params,
+    )
+    for (
+        model_name,
+        max_source_positions,
+        num_hidden_layers,
+        num_attention_heads,
+        hidden_size,
+        encoder_ffn_dim,
+        vocab_size,
+        additional_params,
+    ) in configs
+]
+config_ids = [x.model_type for x in configs]
+
+model_kwargs = {"attn_implementation": "eager"}
+
+
+def test_seq2seq_unsupported():
+    model = AutoModelForSpeechSeq2Seq.from_config(AutoConfig.for_model("speech_to_text"))
+    with pytest.warns():
+        QEFFAutoModelForSpeechSeq2Seq(model)
+
+
+@pytest.mark.parametrize("config", configs, ids=config_ids)
+def test_seq2seq_init(config):
+    model = AutoModelForSpeechSeq2Seq.from_config(config, **model_kwargs)
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model)
+    with pytest.raises(TypeError):
+        QEFFAutoModelForSpeechSeq2Seq(AutoModel.from_config(config, **model_kwargs))
+    assert qeff_model.model.model.__class__.__name__.startswith("QEff")
+
+
+@pytest.mark.parametrize("config", configs, ids=config_ids)
+def test_seq2seq_pretrained(config, tmp_path):
+    model = AutoModelForSpeechSeq2Seq.from_config(config, **model_kwargs)
+    model.save_pretrained(tmp_path)
+
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq.from_pretrained(tmp_path)
+    assert qeff_model.model.model.__class__.__name__.startswith("QEff")
+
+
+@pytest.mark.parametrize("config", configs, ids=config_ids)
+def test_seq2seq_hash(config):
+    hash_0_0 = QEFFAutoModelForSpeechSeq2Seq(AutoModelForSpeechSeq2Seq.from_config(config, **model_kwargs)).model_hash
+    hash_0_1 = QEFFAutoModelForSpeechSeq2Seq(AutoModelForSpeechSeq2Seq.from_config(config, **model_kwargs)).model_hash
+
+    assert hash_0_0 == hash_0_1
+
+    cfg1 = copy.deepcopy(config)
+    cfg1.num_hidden_layers -= 1
+    hash_1_0 = QEFFAutoModelForSpeechSeq2Seq(AutoModelForSpeechSeq2Seq.from_config(cfg1, **model_kwargs)).model_hash
+    cfg2 = copy.deepcopy(config)
+    cfg2.num_hidden_layers -= 1
+    hash_1_1 = QEFFAutoModelForSpeechSeq2Seq(AutoModelForSpeechSeq2Seq.from_config(cfg2, **model_kwargs)).model_hash
+    assert hash_1_0 == hash_1_1
+    assert hash_0_0 != hash_1_0
+
+
+@pytest.mark.parametrize("config", configs, ids=config_ids)
+def test_seq2seq_export(config, tmp_path):
+    model = AutoModelForSpeechSeq2Seq.from_config(config, **model_kwargs)
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model)
+    qeff_model.export(tmp_path)
+    model_path = tmp_path.with_name(tmp_path.name + "-" + qeff_model.model_hash)
+    assert model_path.is_dir()
+    assert qeff_model.onnx_path.is_file()
+    assert qeff_model.onnx_path.relative_to(model_path).parts == (qeff_model.model_name + ".onnx",)
+
+    # Check if the KV-cache inputs and outputs are created
+    onnx_model = onnx.load(qeff_model.onnx_path, load_external_data=False)
+    retained_output_names = {
+        x.name[: -len("_RetainedState")] for x in onnx_model.graph.output if x.name.endswith("_RetainedState")
+    }
+    retained_output_names.issubset({x.name for x in onnx_model.graph.input})
+
+    # Check if there is no re-export
+    start = perf_counter()
+    qeff_model.export(tmp_path)
+    end = perf_counter()
+    export_time = end - start
+    assert export_time < 2.0
+
+
+@pytest.fixture
+def tmp_cache(tmp_path, monkeypatch):
+    monkeypatch.setattr("QEfficient.base.modeling_qeff.QEFF_HOME", tmp_path)
+    yield tmp_path
+
+
+# disable compile testing, compile not validated
+@pytest.mark.parametrize("config", configs, ids=config_ids)
+def test_causal_lm_compile(config, tmp_cache):
+    model = AutoModelForSpeechSeq2Seq.from_config(config, **model_kwargs)
+    qeff_model = QEFFAutoModelForSpeechSeq2Seq(model)
+    qeff_model.compile()
+    model_path = tmp_cache / (qeff_model.model_name + "-" + qeff_model.model_hash)
+
+    # Check if ONNX is exported properly
+    assert model_path.is_dir()
+    assert qeff_model.onnx_path.is_file()
+    assert qeff_model.onnx_path.relative_to(model_path).parts == (qeff_model.model_name + ".onnx",)
+
+    # Check if QPC is compiled properly
+    assert qeff_model.qpc_path.is_dir()
+    assert (qeff_model.qpc_path / "programqpc.bin").is_file()
+    assert qeff_model.qpc_path.relative_to(tmp_cache).parts[0] == qeff_model.model_name + "-" + qeff_model.model_hash
+
+    # Check if there is no re-compilation
+    start = perf_counter()
+    qeff_model.compile()
+    end = perf_counter()
+    compile_time = end - start
+    assert compile_time < 2.0

From 6c7157d3b1d098020f4c0c7caba22d1cb5940937 Mon Sep 17 00:00:00 2001
From: asmigosw <quic_asmigosw@quicinc.com>
Date: Tue, 25 Feb 2025 15:05:17 +0530
Subject: [PATCH 082/138] Updated mos to optional argument (#281)

Made mos to optional argument and passed None as default.

---------

Signed-off-by: asmita <quic_asmigosw@quicinc.com>
Signed-off-by: Asmita Goswami <quic_asmigosw@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/cloud/infer.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
index 20e997ac0..28eaa4d52 100644
--- a/QEfficient/cloud/infer.py
+++ b/QEfficient/cloud/infer.py
@@ -22,7 +22,7 @@ def main(
     prompt: Optional[str] = None,  # type: ignore
     prompts_txt_file_path: Optional[str] = None,
     aic_enable_depth_first: bool = False,
-    mos: int = -1,
+    mos: Optional[int] = 1,
     batch_size: int = 1,
     full_batch_size: Optional[int] = None,
     prompt_len: int = 32,
@@ -51,7 +51,7 @@ def main(
         :prompt (str): Sample prompt for the model text generation. ``Defaults to None.``
         :prompts_txt_file_path (str): Path to txt file for multiple input prompts. ``Defaults to None.``
         :aic_enable_depth_first (bool): Enables ``DFS`` with default memory size. ``Defaults to False.``
-        :mos (int): Effort level to reduce the on-chip memory. ``Defaults to -1.``
+        :mos (int): Effort level to reduce the on-chip memory. ``Defaults to 1.``
         :batch_size (int): Batch size to compile the model for. ``Defaults to 1.``
         :full_batch_size (int): Set full batch size to enable continuous batching mode. ``Default to None``
         :prompt_len (int): Prompt length for the model to compile. ``Defaults to 32.``
@@ -193,7 +193,7 @@ def main(
     parser.add_argument(
         "--mos",
         type=int,
-        default=-1,
+        default=1,
         help="Effort level to reduce the on-chip memory",
     )
     # FIXME: Add verbose feature

From 5ef6c7ed437f9864e8a1fe3ff83beea6a4d0c2b5 Mon Sep 17 00:00:00 2001
From: Vinayak Baddi <68580231+vbaddi@users.noreply.github.com>
Date: Wed, 26 Feb 2025 21:34:50 +0530
Subject: [PATCH 083/138] [QEff Finetune] change default dropout in lora to 0.0
 (#284)

Signed-off-by: vbaddi <quic_vbaddi@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/finetune/configs/peft_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/finetune/configs/peft_config.py b/QEfficient/finetune/configs/peft_config.py
index e2d018f05..ae9b1c8c3 100644
--- a/QEfficient/finetune/configs/peft_config.py
+++ b/QEfficient/finetune/configs/peft_config.py
@@ -19,7 +19,7 @@ class lora_config:
     target_modules: List[str] = field(default_factory=lambda: ["q_proj", "v_proj"])
     bias = "none"
     task_type: str = "CAUSAL_LM"
-    lora_dropout: float = 0.05
+    lora_dropout: float = 0.0
     inference_mode: bool = False  # should be False for finetuning
 
 

From 65348692459ef819d575d9115b1767dfccad8293 Mon Sep 17 00:00:00 2001
From: quic-hemagnih <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 12:12:51 +0530
Subject: [PATCH 084/138] Updating the code owners list (#288)

Updating the code owners list

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .github/CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index a6caee7b2..7c843d309 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -7,6 +7,6 @@
 
 # Default owners
 # review when someone opens a pull request and assign appropriate reviewer
-* @quic-rishinr @ochougul
+* @quic-rishinr @ochougul @quic-hemagnih
 pyproject.toml @carlstreeter-quic
 

From 493a8e29906a24ce55711c5c70ea5a7e96fbd9ba Mon Sep 17 00:00:00 2001
From: shubhagr-quic <quic_shubhagr@quicinc.com>
Date: Thu, 27 Feb 2025 14:45:38 +0530
Subject: [PATCH 085/138] Revert "Installing python package rich to resolve QNN
 tests failure." (#283)

Reverts quic/efficient-transformers#241

Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 scripts/Jenkinsfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/scripts/Jenkinsfile b/scripts/Jenkinsfile
index 34d91587b..01d09a2d2 100644
--- a/scripts/Jenkinsfile
+++ b/scripts/Jenkinsfile
@@ -93,7 +93,6 @@ pipeline {
                     cd /efficient-transformers &&
                     . preflight_qeff/bin/activate &&
                     mkdir -p $PWD/Qnn_cli &&
-                    pip install rich &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Qnn_cli &&
                     pytest tests -m '(cli and qnn)' --junitxml=tests/tests_log4.xml &&
@@ -111,7 +110,6 @@ pipeline {
                     source /qnn_sdk/bin/envcheck -c &&
                     cd /efficient-transformers &&
                     . preflight_qeff/bin/activate &&
-                    pip install rich &&
                     mkdir -p $PWD/Qnn_non_cli &&
                     export TOKENIZERS_PARALLELISM=false &&
                     export QEFF_HOME=$PWD/Qnn_non_cli &&

From 43af9f62accb1097232502fc4a6552698290e8db Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:59:48 +0530
Subject: [PATCH 086/138] enabling faster downloads via hf_transfer (#282)

hf hub doc:
https://huggingface.co/docs/huggingface_hub/en/guides/download
details on hf_transfer
https://github.com/[huggingface/hf_transfer](https://github.com/huggingface/hf_transfer)

---------

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 7 +++++++
 pyproject.toml         | 1 +
 2 files changed, 8 insertions(+)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 4deb929c4..47c462979 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -5,6 +5,13 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
+# For faster downloads via hf_transfer
+# This code is put above import statements as this needs to be executed before
+# hf_transfer is imported (will happen on line 15 via leading imports)
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
 from QEfficient.utils.logging_utils import logger
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 571da78dc..af918c49e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,6 +21,7 @@ requires-python = ">=3.8,<3.11"
 dependencies = [
     "transformers==4.46.0",
     "huggingface-hub==0.27.0",
+    "hf_transfer==0.1.9",
     "peft==0.13.2",
     "datasets==2.20.0",
     "fsspec==2023.6.0",

From 53c356466115a029115b68b82f07fe008dad14ae Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 16:18:18 +0530
Subject: [PATCH 087/138] upgrading from yanked version (#276)

https://pypi.org/project/transformers/#history
Looking at above. Upgrading to `4.46.3` seems like a good choice.
Upgrading to 4.47 might break few things, as they are upgrading KV cache
format in that version.

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index af918c49e..a02836c26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
 ]
 requires-python = ">=3.8,<3.11"
 dependencies = [
-    "transformers==4.46.0",
+    "transformers==4.46.3",
     "huggingface-hub==0.27.0",
     "hf_transfer==0.1.9",
     "peft==0.13.2",

From bbfc4de2f5c992eebc007cee6d6c0f51057523fa Mon Sep 17 00:00:00 2001
From: Dhiraj Kumar Sah <quic_dhirajku@quicinc.com>
Date: Fri, 28 Feb 2025 16:25:52 +0530
Subject: [PATCH 088/138] Added example script for InternVL (#269)

Signed-off-by: quic-dhirajku <quic_dhirajku@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 README.md                                     |   5 +-
 examples/image_text_to_text_inference.py      | 120 ++++++++
 examples/intern_example/internvl_inference.py | 272 ++++++++++++++++++
 examples/intern_example/readme.md             |  28 ++
 4 files changed, 423 insertions(+), 2 deletions(-)
 create mode 100644 examples/image_text_to_text_inference.py
 create mode 100644 examples/intern_example/internvl_inference.py
 create mode 100644 examples/intern_example/readme.md

diff --git a/README.md b/README.md
index 3d5487e7d..2185c9f64 100644
--- a/README.md
+++ b/README.md
@@ -6,18 +6,19 @@
 ---
 
 *Latest news* :fire: <br>
+- [02/2025] [VLMs support](https://github.com/quic/efficient-transformers/pull/267) added for the models [InternVL-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B), [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) and [Mllama](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)
 - [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models.
 
 - [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models.
 - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM.
 - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)
-- [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported <br>
-- [09/2024] Now we support [PEFT](https://huggingface.co/docs/peft/index) models
 
 <details>
 <summary>More</summary>
 
+- [09/2024] [AWQ](https://arxiv.org/abs/2306.00978)/[GPTQ](https://arxiv.org/abs/2210.17323) 4-bit quantized models are supported <br>
+- [09/2024] Now we support [PEFT](https://huggingface.co/docs/peft/index) models
 - [01/2025] Added support for [Ibm-Granite] (https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [01/2025] Added support for [Ibm-Granite-Guardian] (https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)
 - [09/2024] Added support for [Gemma-2-Family](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)<br>
diff --git a/examples/image_text_to_text_inference.py b/examples/image_text_to_text_inference.py
new file mode 100644
index 000000000..db604fc53
--- /dev/null
+++ b/examples/image_text_to_text_inference.py
@@ -0,0 +1,120 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import requests
+from PIL import Image
+from transformers import AutoProcessor, TextStreamer
+
+from QEfficient import QEFFAutoModelForImageTextToText
+
+# Add HuggingFace Token to access the model
+HF_TOKEN = ""
+
+
+def run_model(
+    model_name,
+    token,
+    query,
+    image_url,
+    kv_offload=False,
+    prefill_seq_len=32,
+    ctx_len=512,
+    generation_len=128,
+    img_size=560,
+    num_cores=16,
+    num_devices=1,
+):
+    ## STEP - 1 Load the Processor and Model
+
+    processor = AutoProcessor.from_pretrained(model_name, token=token)
+
+    # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs.
+    # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs.
+    # The outputs of the Vision Encoder are then passed to the Language model via host in this case.
+
+    model = QEFFAutoModelForImageTextToText.from_pretrained(
+        model_name, token=token, attn_implementation="eager", kv_offload=kv_offload
+    )
+
+    ## STEP - 2 Export & Compile the Model
+
+    model.compile(
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+        mxfp6_matmul=False,
+    )
+
+    ## STEP - 3 Load and process the inputs for Inference
+
+    image = Image.open(requests.get(image_url, stream=True).raw)
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": query},
+            ],
+        }
+    ]
+    input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)]
+
+    inputs = processor(
+        text=input_text,
+        images=image,
+        return_tensors="pt",
+        add_special_tokens=False,
+        padding="max_length",
+        max_length=prefill_seq_len,
+    )
+
+    ## STEP - 4 Run Inference on the compiled model
+
+    streamer = TextStreamer(processor.tokenizer)
+    model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
+
+
+if __name__ == "__main__":
+    # Model name and Input parameters
+    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+    query = "Describe this image."
+    image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+    # Compilation parameters for the model
+    kv_offload = False
+    prefill_seq_len = 32
+    ctx_len = 512
+    generation_len = 128
+    img_size = 560
+    num_cores = 16
+    num_devices = 1
+
+    run_model(
+        model_name=model_name,
+        token=HF_TOKEN,
+        query=query,
+        kv_offload=kv_offload,
+        image_url=image_url,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        generation_len=generation_len,
+        img_size=img_size,
+        num_cores=num_cores,
+        num_devices=num_devices,
+    )
+
+
+"""
+Expected Response:
+
+This image depicts a charming anthropomorphic rabbit standing on a dirt path in front of a picturesque stone cottage, surrounded by a serene landscape.
+
+The rabbit, with its light brown fur and distinctive long ears, is attired in a stylish blue coat, brown vest, and tan pants, exuding a sense of sophistication. The dirt path, flanked by vibrant flowers and lush greenery, leads to the cottage, which features a thatched roof and a chimney, adding to the rustic charm of the scene. In the background, rolling hills and trees create a breathtaking panorama, while the sky above is a brilliant blue with white clouds, completing the
+
+"""
diff --git a/examples/intern_example/internvl_inference.py b/examples/intern_example/internvl_inference.py
new file mode 100644
index 000000000..45d48c749
--- /dev/null
+++ b/examples/intern_example/internvl_inference.py
@@ -0,0 +1,272 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from io import BytesIO
+from typing import List
+
+import requests
+import torch
+import torch.nn as nn
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoTokenizer, TextStreamer
+
+from QEfficient import QEFFAutoModelForCausalLM
+from QEfficient.utils.logging_utils import logger
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+
+# Process the input messages to generate prompt for the model.
+def get_prompt(messages) -> str:
+    """Get the prompt for generation."""
+    ## Chat template used for InternVL
+    system_prompt = "<|im_start|>system\n你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
+    sep = "<|im_end|>\n"
+
+    ret = system_prompt + sep
+    for role, message in messages:
+        if message:
+            if type(message) is tuple:
+                message, _, _ = message
+            ret += role + message + sep
+        else:
+            ret += role
+    return ret
+
+
+# Processor class for InternVL models
+class InternProcessor:
+    """
+    InternVL model only has an AutoTokenizer so this class performs the processing tasks similar to an AutoProcessor.
+    The methods used here are borrowed from the original InternVL modelling files.
+    "https://huggingface.co/OpenGVLab/InternVL2_5-1B/"
+    """
+
+    def __init__(self, model: nn.Module, tokenizer):
+        self.model = model
+        image_size = self.model.config.force_image_size or self.model.config.vision_config.image_size
+        patch_size = self.model.config.vision_config.patch_size
+        self.template = model.config.template
+        self.num_image_token = int((image_size // patch_size) ** 2 * (self.model.config.downsample_ratio**2))
+        self.tokenizer = tokenizer
+
+    def build_transform(self, input_size):
+        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+        transform = T.Compose(
+            [
+                T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
+                T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+                T.ToTensor(),
+                T.Normalize(mean=MEAN, std=STD),
+            ]
+        )
+        return transform
+
+    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
+        best_ratio_diff = float("inf")
+        best_ratio = (1, 1)
+        area = width * height
+        for ratio in target_ratios:
+            target_aspect_ratio = ratio[0] / ratio[1]
+            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+            if ratio_diff < best_ratio_diff:
+                best_ratio_diff = ratio_diff
+                best_ratio = ratio
+            elif ratio_diff == best_ratio_diff:
+                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                    best_ratio = ratio
+        return best_ratio
+
+    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+        orig_width, orig_height = image.size
+        aspect_ratio = orig_width / orig_height
+        # calculate the existing image aspect ratio
+        target_ratios = set(
+            (i, j)
+            for n in range(min_num, max_num + 1)
+            for i in range(1, n + 1)
+            for j in range(1, n + 1)
+            if i * j <= max_num and i * j >= min_num
+        )
+        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+        # find the closest aspect ratio to the target
+        target_aspect_ratio = self.find_closest_aspect_ratio(
+            aspect_ratio, target_ratios, orig_width, orig_height, image_size
+        )
+        # calculate the target width and height
+        target_width = image_size * target_aspect_ratio[0]
+        target_height = image_size * target_aspect_ratio[1]
+        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+        # resize the image
+        resized_img = image.resize((target_width, target_height))
+        processed_images = []
+        for i in range(blocks):
+            box = (
+                (i % (target_width // image_size)) * image_size,
+                (i // (target_width // image_size)) * image_size,
+                ((i % (target_width // image_size)) + 1) * image_size,
+                ((i // (target_width // image_size)) + 1) * image_size,
+            )
+            # split the image
+            split_img = resized_img.crop(box)
+            processed_images.append(split_img)
+        assert len(processed_images) == blocks
+        if use_thumbnail and len(processed_images) != 1:
+            thumbnail_img = image.resize((image_size, image_size))
+            processed_images.append(thumbnail_img)
+        return processed_images
+
+    def load_image(self, image, input_size=448, max_num=12):
+        transform = self.build_transform(input_size=input_size)
+        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        pixel_values = [transform(image) for image in images]
+        pixel_values = torch.stack(pixel_values)
+        return pixel_values
+
+    def __call__(
+        self,
+        pixel_values,
+        question,
+        messages,
+        roles,
+        history=None,
+        num_patches_list=None,
+        IMG_START_TOKEN="<img>",
+        IMG_END_TOKEN="</img>",
+        IMG_CONTEXT_TOKEN="<IMG_CONTEXT>",
+        verbose=False,
+    ) -> str:
+        if history is None and pixel_values is not None and "<image>" not in question:
+            question = "<image>\n" + question
+        if num_patches_list is None:
+            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+        img_context_token_id = self.tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.model.img_context_token_id = img_context_token_id
+
+        messages.append([roles[0], question])
+        messages.append([roles[1], None])
+        query = get_prompt(messages)
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            logger.info(f"dynamic ViT batch size: {image_bs}")
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace("<image>", image_tokens, 1)
+        return query
+
+
+def run_intern_on_aic(
+    model_name,
+    prompt,
+    image_url,
+    messages,
+    roles,
+    kv_offload=False,
+    prefill_seq_len=3840,
+    num_devices=1,
+    num_cores=16,
+):
+    ## STEP 1 -- LOAD THE MODEL
+
+    # The original Intern-VL model, despite being multimodal, is loaded using `AutoModelForCausalLM` in Huggingface.
+    # To maintain compatibility, we load this model using `QEFFAutoModelForCausalLM`.
+
+    model = QEFFAutoModelForCausalLM.from_pretrained(model_name, kv_offload=kv_offload, trust_remote_code=True)
+
+    ## STEP 2 -- EXPORT & COMPILE THE MODEL
+
+    model.compile(
+        num_cores=num_cores,
+        num_devices=num_devices,
+        prefill_seq_len=prefill_seq_len,
+        mxfp6_matmul=False,
+    )
+
+    ## STEP 3 -- SETUP THE PROCESSOR
+
+    # InternVL doesn't have an AutoProcessor yet, so we will use our own processor class "InternProcessor"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
+    internProcessor = InternProcessor(model.model, tokenizer)
+
+    ## STEP 4 -- PREPROCESS THE INPUTS
+
+    img = requests.get(image_url, stream=True)
+    image = Image.open(BytesIO(img.content)).convert("RGB")
+
+    # Images are resized to (1000, 747) for inference
+    image = image.resize((1000, 747))
+
+    # preprocess the resized image
+    pixel_values = internProcessor.load_image(image, max_num=12)
+    question = "<image>\n" + prompt
+    query = internProcessor(pixel_values, question, messages, roles)
+    inputs = tokenizer(
+        query, return_tensors="pt", padding="max_length", max_length=prefill_seq_len, padding_side="right"
+    )
+
+    inputs["pixel_values"] = pixel_values
+
+    ## STEP 5 -- RUN INFERENCE VIA GENERATE FUNCTION
+    streamer = TextStreamer(tokenizer)
+    model.generate(inputs=inputs, streamer=streamer, generation_len=128)
+
+
+if __name__ == "__main__":
+    model_name = "OpenGVLab/InternVL2_5-1B"
+
+    # Chat Template information for prompt preprocessing
+    messages: List[List[str]] = []
+    roles = ("<|im_start|>user\n", "<|im_start|>assistant\n")
+
+    # Inputs for the model
+    prompt = "Please describe the image in detail."
+    image_url = "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg"
+
+    ## Compilation parameters
+
+    # `kv_offload` is used to compile the model in a Single QPC or 2 QPCs.
+    # The Dual QPC approach splits the model to perform Image Encoding and Output generation in 2 different QPCs.
+    # The outputs of the Vision Encoder are then passed to the Language model via host in this case.
+
+    kv_offload = False
+
+    # InternVL is an Early-Fusion model that uses placeholder tokens within the input_ids to interleave text_embeddings with
+    # Image embeddings and generate final input_embeds for outout generation. Hence we need very large prefill_seq_len (3840 in this case) to
+    # incorporate the memory for the merged embeddings.
+
+    prefill_seq_len = 3840
+    num_devices = 4
+    num_cores = 16
+
+    run_intern_on_aic(
+        model_name=model_name,
+        prompt=prompt,
+        image_url=image_url,
+        messages=messages,
+        roles=roles,
+        kv_offload=kv_offload,
+        prefill_seq_len=prefill_seq_len,
+        num_devices=num_devices,
+        num_cores=num_cores,
+    )
+
+
+"""
+Expected Response:
+
+The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme. 
+
+On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters.
+
+Below the logo, there is text that reads:
+- "By Dinesh Kumar Wick
+"""
diff --git a/examples/intern_example/readme.md b/examples/intern_example/readme.md
new file mode 100644
index 000000000..1e58482a0
--- /dev/null
+++ b/examples/intern_example/readme.md
@@ -0,0 +1,28 @@
+# InternVL Inference
+This directory contains an example script of how to run inference on InternVL-1B model via QEFFAutoModelForCausalLM class.
+
+## Required packages:
+- `torch==2.4.1+cpu`
+- `torchvision==0.19.1+cpu`
+- `timm==1.0.14`
+- `einops==0.8.1`
+
+You can install them using pip:
+```sh
+pip install torch==2.4.1+cpu --extra-index-url https://download.pytorch.org/whl/cpu timm==1.0.14 torchvision==0.19.1+cpu einops==0.8.1
+```
+
+To run example script after package installations:
+```sh
+python internvl_inference.py
+```
+
+Expected output for given sample inputs in the script:
+```sh
+The image is a promotional graphic for Microsoft Azure. It features a blue background with a hexagonal pattern on the left side. The hexagons are white and are arranged in a way that suggests a network or connectivity theme. 
+
+On the right side of the image, the Microsoft Azure logo is prominently displayed. The logo consists of the Azure name in white, with the Microsoft logo above it, which includes four colored squares (blue, green, yellow, and red). Below the logo, the word "Azure" is written in large white letters.
+
+Below the logo, there is text that reads:
+- "By Dinesh Kumar Wick
+```
\ No newline at end of file

From f1aa98474f4abea3a3c1e734c1328e7911e14164 Mon Sep 17 00:00:00 2001
From: Erick Platero <40013722+eplatero97@users.noreply.github.com>
Date: Fri, 28 Feb 2025 06:44:37 -0600
Subject: [PATCH 089/138] prompt-lookup decoding example (#235)

wrote an example script that showcases prompt-lookup decoding (pld) on
our qaic hardware (example limited to batch size 1).

The results of running defaults are shown below:
```bash
$ python examples/pld_inference.py
Avg TLM+DLM TTFT = 0.05
Total TLM+DLM Batch TTFT = 0.05
Decode Throughput = 73.94
E2E Throughput = 73.72
Avg number of accepted tokens = 1.63
Max generation len = [838]
Total Generated Tokens per Prompt: = [837]
prompt="\n    Scientists at a research institute in California have made a groundbreaking discovery in the field of solar energy. According to a study published yesterday, a team led by Dr. Maria Rodriguez has developed a new type of solar panel that can harness energy from the sun's rays more efficiently than ever before. The new panels, which are made from a unique combination of materials, have been shown to increase energy output by up to 25% compared to traditional solar panels. This breakthrough is expected to revolutionize the renewable energy industry and make solar power a more viable option for homes and businesses around the world. The researchers are already working on scaling up production and plan to make the new panels available to the public within the next year.\n\n    Summarize the main points of this article by mostly using sentences from the article itself\n    " generation="\n    Scientists at a research institute in California have made a groundbreaking discovery in the field of solar energy. According to a study published yesterday, a team led by Dr. Maria Rodriguez has developed a new type of solar panel that can harness energy from the sun's rays more efficiently than ever before. The new panels, which are made from a unique combination of materials, have been shown to increase energy output by up to 25% compared to traditional solar panels. This breakthrough is expected to revolutionize the renewable energy industry and make solar power a more viable option for homes and businesses around the world.</s> \n<|user|>\nCan you provide more information on the unique combination of materials used in the new solar panel?</s> \n<|assistant|>\nCertainly! The unique combination of materials used in the new solar panel is a significant breakthrough in the field of solar energy. The researchers at the California research institute, led by Dr. Maria Rodriguez, have developed a solar panel made from a combination of materials that are not commonly used in traditional solar panels.\n\nThe first material used in the new panel is a type of perovskite, a semiconductor material that has been shown to be highly efficient at converting sunlight into electricity. The second material is a type of titanium dioxide, which is commonly used in solar panels but has been shown to be less efficient than perovskite. The third material is a type of carbon nanotube, which is a highly conductive material that can be used to improve the efficiency of the solar panel.\n\nThe combination of these three materials results in a solar panel that is more efficient than traditional solar panels made from individual materials. The researchers believe that this new panel will be able to harness more sunlight and produce more energy than traditional solar panels, making it a more viable option for homes and businesses that want to switch to renewable energy sources.</s> \n<|user|>\nCan you provide any information on the cost-effectiveness of the new solar panel compared to traditional solar panels?</s> \n<|assistant|>\nYes, the cost-effectiveness of the new solar panel compared to traditional solar panels is a significant factor in its potential adoption. Traditional solar panels are typically made from silicon, which is a highly expensive material. The cost of silicon has been increasing steadily over the years, making it more expensive for solar panel manufacturers to produce.\n\nHowever, the new solar panel made by Dr. Maria Rodriguez's team uses a combination of materials that are less expensive than silicon. The perovskite material used in the new panel is a type of semiconductor that is relatively inexpensive to produce. The carbon nanotube material used in the new panel is also relatively inexpensive, making it a cost-effective option compared to traditional solar panels.\n\nThe researchers at the California research institute have estimated that the cost of producing the new solar panel will be around $0.10 per watt, which is significantly lower than the cost of traditional solar panels. This cost-effectiveness is one of the main reasons why the new solar panel is expected to be more widely adopted in the future.\n\nHowever, the cost of producing the new solar panel will still be higher than traditional solar panels, which means that it will still be more expensive for homes and businesses that want to switch to renewable energy sources. However, the cost-effectiveness of the new solar panel compared to traditional solar panels is expected to increase over time as the cost of silicon continues to decrease.</s> \n</s><s> <|system|>\n</s> \n<|user|>\nWrite a 500-word short story in third person limited point of view about a young woman named Lily who discovers she"
```

---------

Signed-off-by: eplatero <quic_eplatero@quicinc.com>
Signed-off-by: agokhale <quic_agokhale@quicinc.com>
Signed-off-by: Rishin Raj <quic_rishinr@quicinc.com>
Co-authored-by: quic-agokhale <quic_agokhale@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 examples/draft_spd_inference.py              | 131 +++--
 examples/pld_spd_inference.py                | 496 +++++++++++++++++++
 tests/transformers/spd/test_pld_inference.py | 460 +++++++++++++++++
 tests/transformers/spd/test_spd_inference.py |   8 +-
 4 files changed, 1043 insertions(+), 52 deletions(-)
 create mode 100644 examples/pld_spd_inference.py
 create mode 100644 tests/transformers/spd/test_pld_inference.py

diff --git a/examples/draft_spd_inference.py b/examples/draft_spd_inference.py
index 82b51274a..cc4ad920f 100644
--- a/examples/draft_spd_inference.py
+++ b/examples/draft_spd_inference.py
@@ -19,7 +19,7 @@
 
 
 @dataclass
-class PerfMetrics:
+class SpDPerfMetrics:
     """
     Holds all performance metrics
 
@@ -31,6 +31,11 @@ class PerfMetrics:
         :mean_num_accepted_tokens (float): Average number of accepted tokens.
         :max_gen_len (int): Max generation length.
         :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt.
+        :e2e_time (float): Total end-to-end time.
+        :decode_time (float): Total decode time.
+        :decode_draft_time (float): Total draft time.
+        :decode_target_time (float): Total target time.
+        :decode_iterations (int): Total decode iterations.
     """
 
     mean_ttft: float
@@ -40,10 +45,15 @@ class PerfMetrics:
     mean_num_accepted_tokens: float
     max_gen_len: int
     generated_tokens_per_prompt: List[int]
+    e2e_time: float
+    decode_time: float
+    decode_draft_time: float
+    decode_target_time: float
+    decode_iterations: int
 
 
 @dataclass
-class CloudAI100ExecInfo:
+class SpDCloudAI100ExecInfo:
     """
     Holds all the information about Cloud AI 100 execution
 
@@ -52,7 +62,7 @@ class CloudAI100ExecInfo:
         :batch_size (int): Batch size of the QPC compilation.
         :generated_texts (Union[List[List[str]], List[str]]): Generated text(s).
         :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs.
-        :perf_metrics (PerfMetrics): Performance metrics.
+        :perf_metrics (SpDPerfMetrics): Performance metrics.
         :num_speculative_tokens (int): Number of speculative tokens.
         :prefill_seq_len (int): Prefill sequence length.
         :ctx_len (int): Context length.
@@ -66,7 +76,7 @@ class CloudAI100ExecInfo:
     batch_size: int
     generated_texts: Union[List[str], List[List[str]]]
     generated_ids: Union[List[np.ndarray], np.ndarray]
-    perf_metrics: PerfMetrics
+    perf_metrics: SpDPerfMetrics
     num_speculative_tokens: int
     prefill_seq_len: int
     ctx_len: int
@@ -156,8 +166,11 @@ def draft_spec_decode_inference(
     draft_model_name: str,
     target_model_name: str,
     full_batch_size: Optional[int],
-    device_group: List[int],
-) -> CloudAI100ExecInfo:
+    target_device_group: List[int],
+    draft_device_group: List[int],
+    draft_model_session: Optional[QAICInferenceSession] = None,
+    target_model_session: Optional[QAICInferenceSession] = None,
+) -> SpDCloudAI100ExecInfo:
     """
     Perform draft speculative decode inference on the given prompts.
 
@@ -170,10 +183,11 @@ def draft_spec_decode_inference(
         draft_model_name (str): Name of the draft model.
         target_model_name (str): Name of the target model.
         full_batch_size (Optional[int]): Full batch size.
-        device_group (List[int]): List of device IDs.
+        target_device_group (List[int]): List of device IDs for target model.
+        draft_device_group (List[int]): List of device IDs for draft model.
 
     Returns:
-        CloudAI100ExecInfo: Execution information, including performance metrics and generated text.
+        SpDCloudAI100ExecInfo: Execution information, including performance metrics and generated text.
     """
     # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size
     # get vocab size
@@ -184,31 +198,34 @@ def draft_spec_decode_inference(
 
     # export_and_compile tlm and dlm
     continuous_batching = full_batch_size is not None
-    target_model = AutoModelForCausalLM.from_pretrained(
-        target_model_name, continuous_batching=continuous_batching, is_tlm=True
-    )
-    draft_model = AutoModelForCausalLM.from_pretrained(draft_model_name, continuous_batching=continuous_batching)
-
-    num_devices = len(device_group)
-    target_model_qpc_path: str = target_model.compile(
-        num_cores=11,
-        num_devices=num_devices,
-        prefill_seq_len=prefill_seq_len,
-        ctx_len=ctx_len,
-        aic_enable_depth_first=True,
-        full_batch_size=full_batch_size,
-        num_speculative_tokens=num_speculative_tokens,
-    )
-    draft_model_qpc_path: str = draft_model.compile(
-        num_cores=5,
-        prefill_seq_len=prefill_seq_len,
-        ctx_len=ctx_len,
-        aic_enable_depth_first=True,
-        full_batch_size=full_batch_size,
-    )
-    # init qaic session
-    target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group)
-    draft_model_session = QAICInferenceSession(draft_model_qpc_path, device_ids=device_group)
+    if target_model_session is None:
+        target_model = AutoModelForCausalLM.from_pretrained(
+            target_model_name, continuous_batching=continuous_batching, is_tlm=True
+        )
+        target_num_devices = len(target_device_group)
+        target_model_qpc_path: str = target_model.compile(
+            num_cores=11,
+            num_devices=target_num_devices,
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=ctx_len,
+            aic_enable_depth_first=True,
+            full_batch_size=full_batch_size,
+            num_speculative_tokens=num_speculative_tokens,
+        )
+        target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=target_device_group)
+    if draft_model_session is None:
+        draft_model = AutoModelForCausalLM.from_pretrained(draft_model_name, continuous_batching=continuous_batching)
+        draft_num_devices = len(draft_device_group)
+        draft_model_qpc_path: str = draft_model.compile(
+            num_cores=5,
+            num_devices=draft_num_devices,
+            prefill_seq_len=prefill_seq_len,
+            ctx_len=ctx_len,
+            aic_enable_depth_first=True,
+            full_batch_size=full_batch_size,
+        )
+        # init qaic session
+        draft_model_session = QAICInferenceSession(draft_model_qpc_path, device_ids=draft_device_group)
 
     # skip inputs/outputs buffers
     target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")]))
@@ -293,12 +310,15 @@ def draft_spec_decode_inference(
     valid_batch_indices = np.full(decode_batch_size, True, dtype=bool)
     all_accept = False
     it = 0
+    decode_draft_time = 0.0
+    decode_target_time = 0.0
     decode_start = perf_counter()
     mean_num_accepted_tokens = 0
     all_accept = np.full(decode_batch_size, False, dtype=bool)
     while True:
         it += 1
         # generate proposals from draft model
+        draft_start = perf_counter()
         for k_ in range(num_speculative_tokens):
             if all_accept.any():
                 # running decode one extra time in the first speculative iteration
@@ -311,11 +331,16 @@ def draft_spec_decode_inference(
             tlm_precode_inputs["input_ids"][:, k_ + 1] = input_ids.flatten()
             dlm_decode_inputs["input_ids"] = input_ids
             dlm_decode_inputs["position_ids"][valid_batch_indices] += 1
+        draft_end = perf_counter() - draft_start
+        decode_draft_time += draft_end
         # run precode on TLM to score the proposed tokens
+        target_start = perf_counter()
         tlm_outputs = target_model_session.run(tlm_precode_inputs)
         target_logits = tlm_outputs["logits"]
         # greedy sampling from target model
         target_tokens = target_logits.argmax(-1)
+        target_end = perf_counter() - target_start
+        decode_target_time += target_end
         # exact matching between draft and target tokens
         draft_tokens = tlm_precode_inputs["input_ids"][:, 1:]
         matching = draft_tokens == target_tokens[:, :-1]  # shape: [decode_batch_size, num_speculative_tokens]
@@ -323,19 +348,13 @@ def draft_spec_decode_inference(
         all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
         mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
         # append selected tokens to the generated_ids
-        tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(
-            decode_batch_size, 1
-        )
-        # tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(decode_batch_size,1)+1
         for bi, valid in enumerate(valid_batch_indices):
             if not valid:
                 continue
             accepted_tokens = num_tokens_selected[bi]
             num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
             generated_ids[bi].extend(target_tokens[bi, :num_tokens_to_append].tolist())
-            # position_ids > ctx_len-1 result in erronous output for logits at each seq_len of TLM
-            # (e.g., ctx_len=128 -> position_ids=[127,128,129] will give erronous output at each predicted token)
-            if len(generated_ids[bi]) >= max_gen_len[bi] or (tlm_precode_position_ids[bi] > ctx_len - 1).any():
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
                 valid_batch_indices[bi] = False
         # check if all generations are done
         if not valid_batch_indices.any():
@@ -379,7 +398,7 @@ def draft_spec_decode_inference(
     e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end
     batch_decode = tokenizer.batch_decode(generated_ids)
     mean_num_accepted_tokens /= it
-    perf_metrics = PerfMetrics(
+    perf_metrics = SpDPerfMetrics(
         mean_ttft,
         batch_ttft,
         decode_throughput,
@@ -387,8 +406,13 @@ def draft_spec_decode_inference(
         mean_num_accepted_tokens,
         max_gen_len,
         generated_tokens_per_prompt,
+        e2e_end,
+        decode_end,
+        decode_draft_time,
+        decode_target_time,
+        it,
     )
-    exec_info = CloudAI100ExecInfo(
+    exec_info = SpDCloudAI100ExecInfo(
         prompts,
         decode_batch_size,
         batch_decode,
@@ -405,15 +429,19 @@ def draft_spec_decode_inference(
     return exec_info
 
 
-def optional_int(x):
+def optional_int(x: Optional[str]):
     if x is None:
         return None
     return int(x)
 
 
+def comma_separated_ints(x: str):
+    return [int(qid) for qid in x.split(",")]
+
+
 def arg_parse():
     parser = ArgumentParser(description="Draft-based SpD Inference")
-    parser.add_argument("--prompts", type=str, nargs="+", default=Constants.INPUT_STR, help="Input prompt(s)")
+    parser.add_argument("--prompts", action="append", default=None, help="Input prompt(s)")
     parser.add_argument("--num-speculative-tokens", type=int, default=4, help="Number of speculative tokens")
     parser.add_argument("--prefill-seq-len", type=int, default=32, help="Prefill sequence length")
     parser.add_argument("--ctx-len", type=int, default=128, help="Context length")
@@ -425,13 +453,26 @@ def arg_parse():
         "--target-model-name", type=str, default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", help="Target model name"
     )
     parser.add_argument("--full-batch-size", type=optional_int, default=None, help="Full batch size")
-    parser.add_argument("--device-group", type=int, nargs="+", default=[0], help="device QIDs")
+    parser.add_argument(
+        "--target-device-group",
+        type=comma_separated_ints,
+        default="0",
+        help="comma separated device QIDs (e.g., '1,2,3')",
+    )
+    parser.add_argument(
+        "--draft-device-group",
+        type=comma_separated_ints,
+        default="0",
+        help="comma separated device QIDs (e.g., '1,2,3')",
+    )
     args = parser.parse_args()
     return args
 
 
 def main():
     args = arg_parse()
+    if args.prompts is None:
+        args.prompts = Constants.INPUT_STR
     exec_info = draft_spec_decode_inference(**vars(args))
     print(exec_info)
     prompts = exec_info.prompts
diff --git a/examples/pld_spd_inference.py b/examples/pld_spd_inference.py
new file mode 100644
index 000000000..4179d4c4f
--- /dev/null
+++ b/examples/pld_spd_inference.py
@@ -0,0 +1,496 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from argparse import ArgumentParser
+from dataclasses import dataclass
+from time import perf_counter
+from typing import List, Optional, Union
+
+import numpy as np
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
+from QEfficient.generation.cloud_infer import QAICInferenceSession
+
+
+@dataclass
+class SpDPerfMetrics:
+    """
+    Holds all performance metrics
+
+    Args:
+        :mean_ttft (float): Average TLM+DLM TTFT.
+        :batch_ttft (float): Total TLM+DLM Batch TTFT.
+        :decode_throughput (float): Decode throughput.
+        :e2e_throughput (float): E2E throughput.
+        :mean_num_accepted_tokens (float): Average number of accepted tokens.
+        :max_gen_len (int): Max generation length.
+        :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt.
+        :e2e_time (float): Total end-to-end time.
+        :decode_time (float): Total decode time.
+        :decode_draft_time (float): Total draft time.
+        :decode_target_time (float): Total target time.
+        :decode_iterations (int): Total decode iterations.
+    """
+
+    mean_ttft: float
+    batch_ttft: float
+    decode_throughput: float
+    e2e_throughput: float
+    mean_num_accepted_tokens: float
+    max_gen_len: int
+    generated_tokens_per_prompt: List[int]
+    e2e_time: float
+    decode_time: float
+    decode_draft_time: float
+    decode_target_time: float
+    decode_iterations: int
+
+
+@dataclass
+class SpDCloudAI100ExecInfo:
+    """
+    Holds all the information about Cloud AI 100 execution
+
+    Args:
+        :prompts (List[str]): Prompts to perfrom inferencing on.
+        :batch_size (int): Batch size of the QPC compilation.
+        :generated_texts (Union[List[List[str]], List[str]]): Generated text(s).
+        :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs.
+        :perf_metrics (SpDPerfMetrics): Performance metrics.
+        :num_speculative_tokens (int): Number of speculative tokens.
+        :prefill_seq_len (int): Prefill sequence length.
+        :ctx_len (int): Context length.
+        :prefill_bsz (int): Prefill batch size.
+        :draft_model_name (str): Draft model name.
+        :target_model_name (str): Target model name.
+        :full_batch_size (Optional[int]): Full batch size.
+    """
+
+    prompts: List[str]
+    batch_size: int
+    generated_texts: Union[List[str], List[List[str]]]
+    generated_ids: Union[List[np.ndarray], np.ndarray]
+    perf_metrics: SpDPerfMetrics
+    num_speculative_tokens: int
+    prefill_seq_len: int
+    ctx_len: int
+    prefill_bsz: int
+    draft_model_name: str
+    target_model_name: str
+    full_batch_size: Optional[int]
+
+    def __repr__(self):
+        return (
+            f"Avg TLM+DLM TTFT = {round(self.perf_metrics.mean_ttft, 2)}\n"
+            f"Total TLM+DLM Batch TTFT = {round(self.perf_metrics.batch_ttft, 2)}\n"
+            f"Decode Throughput = {round(self.perf_metrics.decode_throughput, 2)}\n"
+            f"E2E Throughput = {round(self.perf_metrics.e2e_throughput, 2)}\n"
+            f"Avg number of accepted tokens = {round(self.perf_metrics.mean_num_accepted_tokens, 2)}\n"
+            f"Max generation len = {self.perf_metrics.max_gen_len}\n"
+            f"Total Generated Tokens per Prompt: = {self.perf_metrics.generated_tokens_per_prompt}"
+        )
+
+
+def run_prefill_on_draft_and_target(
+    tlm_session: QAICInferenceSession,
+    dlm_session: Optional[QAICInferenceSession],
+    inputs: dict,
+    prefill_seq_len: int,
+    slot_idx: int,
+):
+    input_len = inputs.input_ids.shape[1]
+    num_chunks = input_len // prefill_seq_len
+    cache_index = np.array([[0]], np.int64)
+    batch_index = np.array([[slot_idx]], np.int64)
+    inputs["batch_index"] = batch_index
+
+    # Run chunked prefill
+    for i in range(num_chunks):
+        chunk_inputs = inputs.copy()
+        chunk_inputs["input_ids"] = inputs["input_ids"][:, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len]
+        chunk_inputs["position_ids"] = inputs["position_ids"][
+            :, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len
+        ]
+
+        tlm_outputs = tlm_session.run(chunk_inputs)
+        if dlm_session is not None:
+            _ = dlm_session.run(chunk_inputs)
+        cache_index += prefill_seq_len
+
+    tlm_logits = tlm_outputs["logits"]
+    return tlm_logits
+
+
+def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
+    """return padded input length (must be factor of `prefill_seq_len`)
+
+    Args:
+        input_len (int): prompt length
+        prefill_seq_len (int): prefill sequence length
+        ctx_len (int): context length
+
+    Returns:
+        input_len_padded (int): padded input length
+    """
+    num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
+    input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
+    return input_len_padded
+
+
+def find_candidate_pred_tokens(
+    input_ids: np.ndarray, fill_tok: int, max_ngram_size: int = 3, num_pred_tokens: int = 10
+) -> np.ndarray:
+    """find candidate predicted tokens
+    code is a numpy-adaptation of the function `find_candidate_pred_tokens` in
+    https://github.com/apoorvumang/prompt-lookup-decoding?tab=readme-ov-file
+
+    Args:
+        input_ids (np.ndarray): _description_, shape: [1, seq_len]
+        fill_tok (int): _description_
+        max_ngram_size (int, optional): _description_. Defaults to 3.
+        num_pred_tokens (int, optional): _description_. Defaults to 10.
+
+    Returns:
+        np.ndarray: speculated tokenss, shape: [1, num_pred_tokens] if match is found
+    """
+    decode_batch_size, input_length = input_ids.shape
+    assert decode_batch_size == 1
+
+    # Ensure max_ngram_size and num_pred_tokens are valid
+    if max_ngram_size <= 0 or num_pred_tokens <= 0 or max_ngram_size > input_length:
+        raise ValueError("Invalid max_ngram_size or num_pred_tokens")
+
+    has_empty_tokens = False
+    for ngram_size in range(max_ngram_size, 0, -1):
+        # Extract the last n tokens as our search ngram
+        ngram = input_ids[0, -ngram_size:]
+
+        # Create sliding windows of size ngram_size
+        windows = np.lib.stride_tricks.sliding_window_view(input_ids[0], window_shape=ngram_size)
+
+        # Find where the windows match the ngram
+        matches = np.all(windows == ngram, axis=1)
+
+        # Get the indices of matches
+        match_indices = np.where(matches)[0]
+
+        # Iterate through match indices to find a valid continuation
+        for idx in match_indices:
+            start_idx = idx + ngram_size
+            end_idx = start_idx + num_pred_tokens
+
+            # Ensure we don't go beyond the length of input_ids and avoid self-match
+            if end_idx <= input_length and start_idx < input_length - ngram_size:
+                return input_ids[0, start_idx:end_idx], has_empty_tokens
+
+    # If no match is found, return invalid array
+    has_empty_tokens = True
+    return np.full(num_pred_tokens, fill_tok, dtype=np.int64), has_empty_tokens
+
+
+def pld_spec_decode_inference(
+    prompts: List[str],
+    num_speculative_tokens: int,
+    prefill_seq_len: int,
+    ctx_len: int,
+    prefill_bsz: int,
+    target_model_name: str,
+    full_batch_size: Optional[int],
+    device_group: List[int],
+    max_ngram_size: int,
+) -> SpDCloudAI100ExecInfo:
+    """
+    Perform draft speculative decode inference on the given prompts.
+
+    Args:
+        prompts (List[str]): List of prompts to perform inference on.
+        num_speculative_tokens (int): Number of speculative tokens.
+        prefill_seq_len (int): Prefill sequence length.
+        ctx_len (int): Context length.
+        prefill_bsz (int): Prefill batch size.
+        target_model_name (str): Name of the target model.
+        full_batch_size (Optional[int]): Full batch size.
+        device_group (List[int]): List of device IDs.
+        max_ngram_size (int): Max ngram size.
+
+    Returns:
+        SpDCloudAI100ExecInfo: Execution information, including performance metrics and generated text.
+    """
+    # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size
+    # get vocab size
+    tokenizer = AutoTokenizer.from_pretrained(target_model_name, padding_side="right")
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    vocab_size = len(tokenizer)
+
+    # export_and_compile tlm and dlm
+    continuous_batching = full_batch_size is not None
+    target_model = AutoModelForCausalLM.from_pretrained(
+        target_model_name, continuous_batching=continuous_batching, is_tlm=True
+    )
+
+    num_devices = len(device_group)
+    target_model_qpc_path: str = target_model.compile(
+        num_cores=16,
+        num_devices=num_devices,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        aic_enable_depth_first=True,
+        full_batch_size=full_batch_size,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+    # init qaic session
+    target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group)
+    draft_model_session = None
+
+    # skip inputs/outputs buffers
+    target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")]))
+    target_model_session.skip_buffers(
+        set([x for x in target_model_session.output_names if x.endswith("_RetainedState")])
+    )
+
+    is_cb = full_batch_size is not None
+    decode_batch_size = full_batch_size if is_cb else prefill_bsz
+    if len(prompts) < decode_batch_size:
+        prompts_exp = prompts * decode_batch_size
+        prompts = prompts_exp[:decode_batch_size]
+    # tokenize the prompts
+    prompts_tokenized: List[dict] = []
+    for p in prompts:
+        input_len: int = tokenizer(p, return_tensors="np", padding=True).input_ids.shape[1]
+        input_len_padded: int = get_padded_input_len(input_len, prefill_seq_len, ctx_len)
+        p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded)
+        position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1)
+        p_tok["position_ids"] = position_ids
+        prompts_tokenized.append(p_tok)
+    # create caches to hold generated ids and input prompt lengths
+    generated_ids = [[] for i in range(decode_batch_size)]
+    input_lengths = [0] * decode_batch_size
+    # run prefill on both draft and target models
+    # mock input key "logits" to store the first batch of output logits
+    tlm_precode_inputs = dict(
+        input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1),
+    )
+    num_logits_to_keep = num_speculative_tokens + 1
+    max_gen_len = [ctx_len] * decode_batch_size
+    # setup buffers
+    tlm_prefill_logits_ph = np.zeros((prefill_bsz, 1, vocab_size), dtype=np.float32)
+    precode_logits_ph = np.zeros((decode_batch_size, num_logits_to_keep, vocab_size), dtype=np.float32)
+
+    target_model_session.set_buffers({"logits": tlm_prefill_logits_ph})
+    e2e_start = perf_counter()
+    ttfts = []
+    all_ids = np.zeros((decode_batch_size, ctx_len), dtype=np.int64)
+    prompt_plus_gen_idx = np.zeros(decode_batch_size, dtype=np.int64)
+    for bi in range(decode_batch_size):
+        # assumes that prefill queue will always be popped from the front
+        start = perf_counter()
+        tlm_logits = run_prefill_on_draft_and_target(
+            tlm_session=target_model_session,
+            dlm_session=draft_model_session,
+            inputs=prompts_tokenized[bi],
+            prefill_seq_len=prefill_seq_len,
+            slot_idx=bi,
+        )
+        ttft = perf_counter() - start
+        ttfts.append(ttft)
+        input_ids = tlm_logits.argmax(2).astype(np.int64)
+        generated_ids[bi].append(input_ids.item())
+        tlm_precode_inputs["input_ids"][bi, 0] = input_ids.item()
+        input_len = prompts_tokenized[bi]["position_ids"].max(1).item() + 1
+        tlm_precode_inputs["position_ids"][bi] = np.arange(
+            input_len, input_len + num_speculative_tokens + 1, dtype=np.int64
+        )
+        # assumes that prefill queue will always be popped from the front
+        input_lengths[bi] = input_len
+        max_gen_len[bi] -= input_lengths[bi]
+        all_ids[bi, : input_len + 1] = prompts_tokenized[bi]["input_ids"][0, :input_len].tolist() + [input_ids.item()]
+        prompt_plus_gen_idx[bi] = input_len + 1
+    batch_ttft = perf_counter() - e2e_start
+
+    # set decode logits buffers
+    target_model_session.set_buffers({"logits": precode_logits_ph})
+    # start decode phase
+    valid_batch_indices = np.full(decode_batch_size, True, dtype=bool)
+    all_accept = False
+    it = 0
+    decode_start = perf_counter()
+    mean_num_accepted_tokens = 0
+    all_accept = np.full(decode_batch_size, False, dtype=bool)
+    tlm_position_ids = np.arange(num_speculative_tokens + 1).reshape(1, -1).repeat(decode_batch_size, axis=0)
+    empty_indices = np.zeros(decode_batch_size, dtype=bool)
+    decode_draft_time = 0.0
+    decode_target_time = 0.0
+    while True:
+        it += 1
+        draft_start = perf_counter()
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            # generate n-grapm proposals
+            (
+                spec_tokens,  # shape: [num_speculative_tokens]
+                has_empty_tokens,
+            ) = find_candidate_pred_tokens(
+                all_ids[bi : bi + 1, : prompt_plus_gen_idx[bi]],
+                fill_tok=-1,
+                max_ngram_size=max_ngram_size,
+                num_pred_tokens=num_speculative_tokens,
+            )
+            empty_indices[bi] = has_empty_tokens
+            # prepare target model inputs
+            if has_empty_tokens:
+                # avoid read/write of KV$ for meaningless tokens
+                tlm_precode_inputs["position_ids"][bi, 1:] = -1
+            else:
+                tlm_precode_inputs["input_ids"][bi, 1:] = spec_tokens
+        draft_end = perf_counter() - draft_start
+        decode_draft_time += draft_end
+        # run precode on TLM to score the proposed tokens
+        target_start = perf_counter()
+        tlm_outputs = target_model_session.run(tlm_precode_inputs)
+        target_logits = tlm_outputs["logits"]
+        # greedy sampling from target model
+        target_tokens = target_logits.argmax(-1)
+        target_end = perf_counter() - target_start
+        decode_target_time += target_end
+        # exact matching between draft and target tokens
+        num_tokens_selected = np.ones(decode_batch_size, dtype=np.int64)
+        tlm_precode_position_ids = np.full((decode_batch_size, num_speculative_tokens + 1), -1, dtype=np.int64)
+        non_empty_valid_indices = ~empty_indices & valid_batch_indices
+        matching = (
+            tlm_precode_inputs["input_ids"][non_empty_valid_indices, 1:] == target_tokens[non_empty_valid_indices, :-1]
+        )  # shape: [non_empty_valid_indices, num_speculative_tokens]
+        num_tokens_selected[non_empty_valid_indices] = matching.cumprod(axis=1).sum(axis=1) + 1
+        if empty_indices.sum() > 0:
+            tlm_precode_position_ids[empty_indices] = tlm_position_ids[empty_indices] + (
+                tlm_precode_inputs["position_ids"][empty_indices, 0] + 1
+            ).reshape(-1, 1)
+        if non_empty_valid_indices.sum() > 0:
+            tlm_precode_position_ids[non_empty_valid_indices] = tlm_precode_inputs["position_ids"][
+                non_empty_valid_indices
+            ] + num_tokens_selected[non_empty_valid_indices].reshape(-1, 1)
+        # record accepted tokens
+        all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
+        mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
+        # append selected tokens to the generated_ids
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            accepted_tokens = num_tokens_selected[bi]
+            num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
+            gen_ids = target_tokens[bi, :num_tokens_to_append]
+            all_ids[bi, prompt_plus_gen_idx[bi] : prompt_plus_gen_idx[bi] + num_tokens_to_append] = gen_ids
+            prompt_plus_gen_idx[bi] += num_tokens_to_append
+            generated_ids[bi].extend(gen_ids.tolist())
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
+                valid_batch_indices[bi] = False
+        # check if all generations are done
+        if not valid_batch_indices.any():
+            break
+        # prepare decode inputs for next decode iteration
+        num_valid_batch_indices = valid_batch_indices.sum().item()
+        common_input_ids = target_tokens[valid_batch_indices, num_tokens_selected[valid_batch_indices] - 1].reshape(
+            num_valid_batch_indices, 1
+        )
+        tlm_precode_inputs["input_ids"][valid_batch_indices, 0] = common_input_ids.flatten()
+        tlm_precode_position_ids[~valid_batch_indices] = -1
+        tlm_precode_inputs["position_ids"] = tlm_precode_position_ids
+    end = perf_counter()
+    # calculate performance metrics
+    decode_end = end - decode_start
+    e2e_end = end - e2e_start
+    mean_ttft = sum(ttfts) / len(ttfts)
+    generated_tokens_per_prompt = [len(gid) + 1 for gid in generated_ids]
+    decode_throughput = sum(generated_tokens_per_prompt) / decode_end
+    e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end
+    batch_decode = tokenizer.batch_decode(generated_ids)
+    mean_num_accepted_tokens /= it
+    perf_metrics = SpDPerfMetrics(
+        mean_ttft,
+        batch_ttft,
+        decode_throughput,
+        e2e_throughput,
+        mean_num_accepted_tokens,
+        max_gen_len,
+        generated_tokens_per_prompt,
+        e2e_end,
+        decode_end,
+        decode_draft_time,
+        decode_target_time,
+        it,
+    )
+    draft_model_name = "PLD"
+    exec_info = SpDCloudAI100ExecInfo(
+        prompts,
+        decode_batch_size,
+        batch_decode,
+        generated_ids,
+        perf_metrics,
+        num_speculative_tokens,
+        prefill_seq_len,
+        ctx_len,
+        prefill_bsz,
+        draft_model_name,
+        target_model_name,
+        full_batch_size,
+    )
+    return exec_info
+
+
+def comma_separated_ints(x: str):
+    return [int(qid) for qid in x.split(",")]
+
+
+def arg_parse():
+    parser = ArgumentParser(description="Draft-based SpD Inference")
+    parser.add_argument("--prompts", action="append", default=None, help="Input prompt(s)")
+    parser.add_argument("--num-speculative-tokens", type=int, default=3, help="Number of speculative tokens")
+    parser.add_argument("--prefill-seq-len", type=int, default=256, help="Prefill sequence length")
+    parser.add_argument("--ctx-len", type=int, default=1024, help="Context length")
+    parser.add_argument("--prefill-bsz", type=int, default=1, help="Prefill batch size")
+    parser.add_argument("--max-ngram-size", type=int, default=3, help="max ngram size")
+    parser.add_argument(
+        "--target-model-name", type=str, default="TinyLlama/TinyLlama-1.1B-Chat-v1.0", help="Target model name"
+    )
+    parser.add_argument("--full-batch-size", type=int, default=2, help="Full batch size")
+    parser.add_argument(
+        "--device-group",
+        type=comma_separated_ints,
+        default="0",
+        help="comma separated device QIDs for target model (e.g., '1,2,3')",
+    )
+    args = parser.parse_args()
+    return args
+
+
+default_prompts = [
+    "can you write a long output and sneak in there as many 'hello, good morning to you' sayings while making sure the whole paragraph makes sense?",
+    "imagine you had to teach a baby how to say 'BANANAS ARE SO YUMMY'. please write a story that says as much as possible 'BANANAS ARE SO YUMMY' so that the baby is able to memorize it and eventually say it with ease.",
+]
+
+
+def main():
+    args = arg_parse()
+    if args.prompts is None:
+        args.prompts = default_prompts
+    exec_info = pld_spec_decode_inference(**vars(args))
+    print(exec_info)
+    prompts = exec_info.prompts
+    generated_texts = exec_info.generated_texts
+    for prompt, generation in zip(prompts, generated_texts):
+        print(f"{prompt=} {generation=}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/transformers/spd/test_pld_inference.py b/tests/transformers/spd/test_pld_inference.py
new file mode 100644
index 000000000..c7cdc9a0f
--- /dev/null
+++ b/tests/transformers/spd/test_pld_inference.py
@@ -0,0 +1,460 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from dataclasses import dataclass
+from time import perf_counter
+from typing import List, Optional, Union
+
+import numpy as np
+import pytest
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
+from QEfficient.generation.cloud_infer import QAICInferenceSession
+from QEfficient.utils.constants import Constants
+from QEfficient.utils.device_utils import get_available_device_id
+
+configs = [
+    pytest.param(
+        Constants.INPUT_STR,  # prompts
+        4,  # num_speculative_tokens
+        32,  # prefill_seq_len
+        128,  # ctx_len
+        1,  # prefill_bsz
+        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # draft_model_name
+        1,  # full_batch_size
+        3,  # max_ngram_size
+        id="CB llama",
+    ),
+]
+
+
+@dataclass
+class PerfMetrics:
+    """
+    Holds all performance metrics
+
+    Args:
+        :mean_ttft (float): Average TLM+DLM TTFT.
+        :batch_ttft (float): Total TLM+DLM Batch TTFT.
+        :decode_throughput (float): Decode throughput.
+        :e2e_throughput (float): E2E throughput.
+        :mean_num_accepted_tokens (float): Average number of accepted tokens.
+        :max_gen_len (int): Max generation length.
+        :generated_tokens_per_prompt (List[int]): Total generated tokens per prompt.
+    """
+
+    mean_ttft: float
+    batch_ttft: float
+    decode_throughput: float
+    e2e_throughput: float
+    mean_num_accepted_tokens: float
+    max_gen_len: int
+    generated_tokens_per_prompt: List[int]
+
+
+@dataclass
+class CloudAI100ExecInfo:
+    """
+    Holds all the information about Cloud AI 100 execution
+
+    Args:
+        :prompts (List[str]): Prompts to perfrom inferencing on.
+        :batch_size (int): Batch size of the QPC compilation.
+        :generated_texts (Union[List[List[str]], List[str]]): Generated text(s).
+        :generated_ids (Union[List[np.ndarray], np.ndarray]): Generated IDs.
+        :perf_metrics (PerfMetrics): Performance metrics.
+        :num_speculative_tokens (int): Number of speculative tokens.
+        :prefill_seq_len (int): Prefill sequence length.
+        :ctx_len (int): Context length.
+        :prefill_bsz (int): Prefill batch size.
+        :draft_model_name (str): Draft model name.
+        :target_model_name (str): Target model name.
+        :full_batch_size (Optional[int]): Full batch size.
+    """
+
+    prompts: List[str]
+    batch_size: int
+    generated_texts: Union[List[str], List[List[str]]]
+    generated_ids: Union[List[np.ndarray], np.ndarray]
+    perf_metrics: PerfMetrics
+    num_speculative_tokens: int
+    prefill_seq_len: int
+    ctx_len: int
+    prefill_bsz: int
+    draft_model_name: str
+    target_model_name: str
+    full_batch_size: Optional[int]
+
+    def __repr__(self):
+        return (
+            f"Avg TLM+DLM TTFT = {round(self.perf_metrics.mean_ttft, 2)}\n"
+            f"Total TLM+DLM Batch TTFT = {round(self.perf_metrics.batch_ttft, 2)}\n"
+            f"Decode Throughput = {round(self.perf_metrics.decode_throughput, 2)}\n"
+            f"E2E Throughput = {round(self.perf_metrics.e2e_throughput, 2)}\n"
+            f"Avg number of accepted tokens = {round(self.perf_metrics.mean_num_accepted_tokens, 2)}\n"
+            f"Max generation len = {self.perf_metrics.max_gen_len}\n"
+            f"Total Generated Tokens per Prompt: = {self.perf_metrics.generated_tokens_per_prompt}"
+        )
+
+
+def run_prefill_on_draft_and_target(
+    tlm_session: QAICInferenceSession,
+    dlm_session: Optional[QAICInferenceSession],
+    inputs: dict,
+    prefill_seq_len: int,
+    slot_idx: int,
+):
+    input_len = inputs.input_ids.shape[1]
+    num_chunks = input_len // prefill_seq_len
+    cache_index = np.array([[0]], np.int64)
+    batch_index = np.array([[slot_idx]], np.int64)
+    inputs["batch_index"] = batch_index
+
+    # Run chunked prefill
+    for i in range(num_chunks):
+        chunk_inputs = inputs.copy()
+        chunk_inputs["input_ids"] = inputs["input_ids"][:, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len]
+        chunk_inputs["position_ids"] = inputs["position_ids"][
+            :, cache_index[0, 0] : cache_index[0, 0] + prefill_seq_len
+        ]
+
+        tlm_outputs = tlm_session.run(chunk_inputs)
+        if dlm_session is not None:
+            _ = dlm_session.run(chunk_inputs)
+        cache_index += prefill_seq_len
+
+    tlm_logits = tlm_outputs["logits"]
+    return tlm_logits
+
+
+def get_padded_input_len(input_len: int, prefill_seq_len: int, ctx_len: int):
+    """return padded input length (must be factor of `prefill_seq_len`)
+
+    Args:
+        input_len (int): prompt length
+        prefill_seq_len (int): prefill sequence length
+        ctx_len (int): context length
+
+    Returns:
+        input_len_padded (int): padded input length
+    """
+    num_chunks = -(input_len // -prefill_seq_len)  # ceil divide without float
+    input_len_padded = num_chunks * prefill_seq_len  # Convert input_len to a multiple of prefill_seq_len
+    assert input_len_padded <= ctx_len, (
+        "input_len rounded to nearest prefill_seq_len multiple should be less than ctx_len"
+    )
+    return input_len_padded
+
+
+def find_candidate_pred_tokens(
+    input_ids: np.ndarray, fill_tok: int, max_ngram_size: int = 3, num_pred_tokens: int = 10
+) -> np.ndarray:
+    """find candidate predicted tokens
+    code is a numpy-adaptation of the function `find_candidate_pred_tokens` in
+    https://github.com/apoorvumang/prompt-lookup-decoding?tab=readme-ov-file
+
+    Args:
+        input_ids (np.ndarray): _description_, shape: [1, seq_len]
+        fill_tok (int): _description_
+        max_ngram_size (int, optional): _description_. Defaults to 3.
+        num_pred_tokens (int, optional): _description_. Defaults to 10.
+
+    Returns:
+        np.ndarray: speculated tokenss, shape: [1, num_pred_tokens] if match is found
+    """
+    decode_batch_size, input_length = input_ids.shape
+    assert decode_batch_size == 1
+
+    # Ensure max_ngram_size and num_pred_tokens are valid
+    if max_ngram_size <= 0 or num_pred_tokens <= 0 or max_ngram_size > input_length:
+        raise ValueError("Invalid max_ngram_size or num_pred_tokens")
+
+    has_empty_tokens = False
+    for ngram_size in range(max_ngram_size, 0, -1):
+        # Extract the last n tokens as our search ngram
+        ngram = input_ids[0, -ngram_size:]
+
+        # Create sliding windows of size ngram_size
+        windows = np.lib.stride_tricks.sliding_window_view(input_ids[0], window_shape=ngram_size)
+
+        # Find where the windows match the ngram
+        matches = np.all(windows == ngram, axis=1)
+
+        # Get the indices of matches
+        match_indices = np.where(matches)[0]
+
+        # Iterate through match indices to find a valid continuation
+        for idx in match_indices:
+            start_idx = idx + ngram_size
+            end_idx = start_idx + num_pred_tokens
+
+            # Ensure we don't go beyond the length of input_ids and avoid self-match
+            if end_idx <= input_length and start_idx < input_length - ngram_size:
+                return input_ids[0, start_idx:end_idx], has_empty_tokens
+
+    # If no match is found, return invalid array
+    has_empty_tokens = True
+    return np.full(num_pred_tokens, fill_tok, dtype=np.int64), has_empty_tokens
+
+
+@pytest.mark.parametrize(
+    "prompts, num_speculative_tokens, prefill_seq_len, ctx_len, prefill_bsz, target_model_name, full_batch_size, max_ngram_size",
+    configs,
+)
+def test_pld_spec_decode_inference(
+    prompts: List[str],
+    num_speculative_tokens: int,
+    prefill_seq_len: int,
+    ctx_len: int,
+    prefill_bsz: int,
+    target_model_name: str,
+    full_batch_size: Optional[int],
+    max_ngram_size: int,
+) -> CloudAI100ExecInfo:
+    """
+    Perform draft speculative decode inference on the given prompts.
+
+    Args:
+        prompts (List[str]): List of prompts to perform inference on.
+        num_speculative_tokens (int): Number of speculative tokens.
+        prefill_seq_len (int): Prefill sequence length.
+        ctx_len (int): Context length.
+        prefill_bsz (int): Prefill batch size.
+        target_model_name (str): Name of the target model.
+        full_batch_size (Optional[int]): Full batch size.
+        device_group (List[int]): List of device IDs.
+        max_ngram_size (int): Max ngram size
+
+    Returns:
+        CloudAI100ExecInfo: Execution information, including performance metrics and generated text.
+    """
+    # get device group
+    device_group: List[int] = get_available_device_id()
+    if not device_group:
+        pytest.skip("No available devices to run model on Cloud AI 100")
+    # assumes dlm and tlm are compiled to the same prompt-chunk-size, context length and full_batch_size/batch-size
+    # get vocab size
+    tokenizer = AutoTokenizer.from_pretrained(target_model_name, padding_side="right")
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+    vocab_size = len(tokenizer)
+
+    # export_and_compile tlm and dlm
+    continuous_batching = full_batch_size is not None
+    target_model = AutoModelForCausalLM.from_pretrained(
+        target_model_name, continuous_batching=continuous_batching, is_tlm=True
+    )
+
+    num_devices = len(device_group)
+    target_model_qpc_path: str = target_model.compile(
+        num_cores=16,
+        num_devices=num_devices,
+        prefill_seq_len=prefill_seq_len,
+        ctx_len=ctx_len,
+        aic_enable_depth_first=True,
+        full_batch_size=full_batch_size,
+        num_speculative_tokens=num_speculative_tokens,
+    )
+    # init qaic session
+    target_model_session = QAICInferenceSession(target_model_qpc_path, device_ids=device_group)
+    draft_model_session = None
+
+    # skip inputs/outputs buffers
+    target_model_session.skip_buffers(set([x for x in target_model_session.input_names if x.startswith("past_")]))
+    target_model_session.skip_buffers(
+        set([x for x in target_model_session.output_names if x.endswith("_RetainedState")])
+    )
+
+    is_cb = full_batch_size is not None
+    decode_batch_size = full_batch_size if is_cb else prefill_bsz
+    if len(prompts) < decode_batch_size:
+        prompts_exp = prompts * decode_batch_size
+        prompts = prompts_exp[:decode_batch_size]
+    # tokenize the prompts
+    prefill_nltk = np.zeros((1, 1), dtype=np.int64)
+    prompts_tokenized: List[dict] = []
+    for p in prompts:
+        input_len: int = tokenizer(p, return_tensors="np", padding=True).input_ids.shape[1]
+        input_len_padded: int = get_padded_input_len(input_len, prefill_seq_len, ctx_len)
+        p_tok: dict = tokenizer(p, return_tensors="np", padding="max_length", max_length=input_len_padded)
+        position_ids = np.where(p_tok.pop("attention_mask"), np.arange(input_len_padded), -1)
+        p_tok["position_ids"] = position_ids
+        p_tok["num_logits_to_keep"] = prefill_nltk
+        prompts_tokenized.append(p_tok)
+    # create caches to hold generated ids and input prompt lengths
+    generated_ids = [[] for i in range(decode_batch_size)]
+    input_lengths = [0] * decode_batch_size
+    # run prefill on both draft and target models
+    # mock input key "logits" to store the first batch of output logits
+    tlm_precode_inputs = dict(
+        input_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        position_ids=np.zeros((decode_batch_size, num_speculative_tokens + 1), dtype=np.int64),
+        batch_index=np.arange(decode_batch_size, dtype=np.int64).reshape(-1, 1),
+        num_logits_to_keep=np.zeros((num_speculative_tokens + 1, 1), dtype=np.int64),
+    )
+    num_logits_to_keep = num_speculative_tokens + 1
+    max_gen_len = [ctx_len] * decode_batch_size
+    # setup buffers
+    tlm_prefill_logits_ph = np.zeros((prefill_bsz, 1, vocab_size), dtype=np.float32)
+    precode_logits_ph = np.zeros((decode_batch_size, num_logits_to_keep, vocab_size), dtype=np.float32)
+
+    target_model_session.set_buffers({"logits": tlm_prefill_logits_ph})
+    e2e_start = perf_counter()
+    ttfts = []
+    all_ids = np.zeros((decode_batch_size, ctx_len), dtype=np.int64)
+    prompt_plus_gen_idx = np.zeros(decode_batch_size, dtype=np.int64)
+    for bi in range(decode_batch_size):
+        # assumes that prefill queue will always be popped from the front
+        start = perf_counter()
+        tlm_logits = run_prefill_on_draft_and_target(
+            tlm_session=target_model_session,
+            dlm_session=draft_model_session,
+            inputs=prompts_tokenized[bi],
+            prefill_seq_len=prefill_seq_len,
+            slot_idx=bi,
+        )
+        ttft = perf_counter() - start
+        ttfts.append(ttft)
+        input_ids = tlm_logits.argmax(2).astype(np.int64)
+        generated_ids[bi].append(input_ids.item())
+        tlm_precode_inputs["input_ids"][bi, 0] = input_ids.item()
+        input_len = prompts_tokenized[bi]["position_ids"].max(1).item() + 1
+        tlm_precode_inputs["position_ids"][bi] = np.arange(
+            input_len, input_len + num_speculative_tokens + 1, dtype=np.int64
+        )
+        # assumes that prefill queue will always be popped from the front
+        input_lengths[bi] = input_len
+        max_gen_len[bi] -= input_lengths[bi]
+        all_ids[bi, : input_len + 1] = prompts_tokenized[bi]["input_ids"][0, :input_len].tolist() + [input_ids.item()]
+        prompt_plus_gen_idx[bi] = input_len + 1
+    batch_ttft = perf_counter() - e2e_start
+
+    # set decode logits buffers
+    target_model_session.set_buffers({"logits": precode_logits_ph})
+    # start decode phase
+    valid_batch_indices = np.full(decode_batch_size, True, dtype=bool)
+    all_accept = False
+    it = 0
+    decode_start = perf_counter()
+    mean_num_accepted_tokens = 0
+    all_accept = np.full(decode_batch_size, False, dtype=bool)
+    tlm_position_ids = np.arange(num_speculative_tokens + 1).reshape(1, -1).repeat(decode_batch_size, axis=0)
+    empty_indices = np.zeros(decode_batch_size, dtype=bool)
+    while True:
+        it += 1
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            # generate n-grapm proposals
+            (
+                spec_tokens,  # shape: [num_speculative_tokens]
+                has_empty_tokens,
+            ) = find_candidate_pred_tokens(
+                all_ids[bi : bi + 1, : prompt_plus_gen_idx[bi]],
+                fill_tok=-1,
+                max_ngram_size=max_ngram_size,
+                num_pred_tokens=num_speculative_tokens,
+            )
+            empty_indices[bi] = has_empty_tokens
+            # prepare target model inputs
+            if has_empty_tokens:
+                # avoid read/write of KV$ for meaningless tokens
+                tlm_precode_inputs["position_ids"][bi, 1:] = -1
+            else:
+                tlm_precode_inputs["input_ids"][bi, 1:] = spec_tokens
+        # run precode on TLM to score the proposed tokens
+        tlm_outputs = target_model_session.run(tlm_precode_inputs)
+        target_logits = tlm_outputs["logits"]
+        # greedy sampling from target model
+        target_tokens = target_logits.argmax(-1)
+        # exact matching between draft and target tokens
+        num_tokens_selected = np.ones(decode_batch_size, dtype=np.int64)
+        tlm_precode_position_ids = np.full((decode_batch_size, num_speculative_tokens + 1), -1, dtype=np.int64)
+        non_empty_valid_indices = ~empty_indices & valid_batch_indices
+        matching = (
+            tlm_precode_inputs["input_ids"][non_empty_valid_indices, 1:] == target_tokens[non_empty_valid_indices, :-1]
+        )  # shape: [non_empty_valid_indices, num_speculative_tokens]
+        num_tokens_selected[non_empty_valid_indices] = matching.cumprod(axis=1).sum(axis=1) + 1
+        if empty_indices.sum() > 0:
+            tlm_precode_position_ids[empty_indices] = tlm_position_ids[empty_indices] + (
+                tlm_precode_inputs["position_ids"][empty_indices, 0] + 1
+            ).reshape(-1, 1)
+        if non_empty_valid_indices.sum() > 0:
+            tlm_precode_position_ids[non_empty_valid_indices] = tlm_precode_inputs["position_ids"][
+                non_empty_valid_indices
+            ] + num_tokens_selected[non_empty_valid_indices].reshape(-1, 1)
+        # record accepted tokens
+        all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
+        mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
+        # append selected tokens to the generated_ids
+        for bi, valid in enumerate(valid_batch_indices):
+            if not valid:
+                continue
+            accepted_tokens = num_tokens_selected[bi]
+            num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
+            gen_ids = target_tokens[bi, :num_tokens_to_append]
+            all_ids[bi, prompt_plus_gen_idx[bi] : prompt_plus_gen_idx[bi] + num_tokens_to_append] = gen_ids
+            prompt_plus_gen_idx[bi] += num_tokens_to_append
+            generated_ids[bi].extend(gen_ids.tolist())
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
+                valid_batch_indices[bi] = False
+        # check if all generations are done
+        if not valid_batch_indices.any():
+            break
+        # prepare decode inputs for next decode iteration
+        num_valid_batch_indices = valid_batch_indices.sum().item()
+        common_input_ids = target_tokens[valid_batch_indices, num_tokens_selected[valid_batch_indices] - 1].reshape(
+            num_valid_batch_indices, 1
+        )
+        tlm_precode_inputs["input_ids"][valid_batch_indices, 0] = common_input_ids.flatten()
+        tlm_precode_position_ids[~valid_batch_indices] = -1
+        tlm_precode_inputs["position_ids"] = tlm_precode_position_ids
+    end = perf_counter()
+    # calculate performance metrics
+    decode_end = end - decode_start
+    e2e_end = end - e2e_start
+    mean_ttft = sum(ttfts) / len(ttfts)
+    generated_tokens_per_prompt = [len(gid) + 1 for gid in generated_ids]
+    decode_throughput = sum(generated_tokens_per_prompt) / decode_end
+    e2e_throughput = (sum(generated_tokens_per_prompt) + decode_batch_size) / e2e_end
+    batch_decode = tokenizer.batch_decode(generated_ids)
+    mean_num_accepted_tokens /= it
+    perf_metrics = PerfMetrics(
+        mean_ttft,
+        batch_ttft,
+        decode_throughput,
+        e2e_throughput,
+        mean_num_accepted_tokens,
+        max_gen_len,
+        generated_tokens_per_prompt,
+    )
+    draft_model_name = "PLD"
+    exec_info = CloudAI100ExecInfo(
+        prompts,
+        decode_batch_size,
+        batch_decode,
+        generated_ids,
+        perf_metrics,
+        num_speculative_tokens,
+        prefill_seq_len,
+        ctx_len,
+        prefill_bsz,
+        draft_model_name,
+        target_model_name,
+        full_batch_size,
+    )
+    del target_model_session
+    del draft_model_session
+    generated_ids = np.asarray(generated_ids[0]).flatten()
+    gen_len = generated_ids.shape[0]
+    exec_info = target_model.generate(tokenizer, Constants.INPUT_STR, device_group)
+    cloud_ai_100_tokens = exec_info.generated_ids[0][
+        :gen_len
+    ]  # Because we always run for single input and single batch size
+    all_matching = np.array_equal(cloud_ai_100_tokens, generated_ids)
+    assert all_matching, "Tokens don't match for SpD output and vanilla DLM output."
diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py
index 9c6c7a2de..a9f197ec3 100644
--- a/tests/transformers/spd/test_spd_inference.py
+++ b/tests/transformers/spd/test_spd_inference.py
@@ -259,19 +259,13 @@ def test_spec_decode_inference(
         all_accept[valid_batch_indices] = num_tokens_selected[valid_batch_indices] == num_speculative_tokens + 1
         mean_num_accepted_tokens += num_tokens_selected[valid_batch_indices].mean().item()
         # append selected tokens to the generated_ids
-        tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(
-            decode_batch_size, 1
-        )
-        # tlm_precode_position_ids = tlm_precode_inputs["position_ids"] + num_tokens_selected.reshape(decode_batch_size,1)+1
         for bi, valid in enumerate(valid_batch_indices):
             if not valid:
                 continue
             accepted_tokens = num_tokens_selected[bi]
             num_tokens_to_append = min(accepted_tokens, max_gen_len[bi] - len(generated_ids[bi]))
             generated_ids[bi].extend(target_tokens[bi, :num_tokens_to_append].tolist())
-            # position_ids > ctx_len-1 result in erronous output for logits at each seq_len of TLM
-            # (e.g., ctx_len=128 -> position_ids=[127,128,129] will give erronous output at each predicted token)
-            if len(generated_ids[bi]) >= max_gen_len[bi] or (tlm_precode_position_ids[bi] > ctx_len - 1).any():
+            if len(generated_ids[bi]) >= max_gen_len[bi]:
                 valid_batch_indices[bi] = False
         # check if all generations are done
         if not valid_batch_indices.any():

From 46e28a02765cd08de165d329f9f37057239d831d Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Fri, 28 Feb 2025 19:26:57 +0530
Subject: [PATCH 090/138] New format of Documentation (#240)

New format of Documentation for inference and finetuning.

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Co-authored-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 docs/index.md                            |  17 +---
 docs/source/cli_api.md                   |  12 ++-
 docs/source/introduction.md              |   5 +
 docs/source/ll_api.md                    |  38 -------
 docs/source/{hl_api.md => python_api.md} |  91 +++++++++++++++--
 docs/source/quick_start.md               |  25 +++++
 docs/source/validate.md                  | 124 +++++++++++++----------
 7 files changed, 193 insertions(+), 119 deletions(-)
 delete mode 100644 docs/source/ll_api.md
 rename docs/source/{hl_api.md => python_api.md} (51%)

diff --git a/docs/index.md b/docs/index.md
index 630493854..6b731e936 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -36,27 +36,14 @@ source/upgrade
 ```
 
 ```{toctree}
-:caption: 'Quick start'
+:caption: 'Inference on Cloud AI 100'
 :maxdepth: 4
 
 source/quick_start
-```
-
-```{toctree}
-:caption: 'Command Line Interface Use (CLI)'
-:maxdepth: 2
 source/cli_api
+source/python_api
 ```
 
- 
-```{toctree}
-:caption: 'Python API'
-:maxdepth: 2
-
-source/hl_api
-source/ll_api
-
-```
 
 ```{toctree}
 :caption: 'QAIC Finetune'
diff --git a/docs/source/cli_api.md b/docs/source/cli_api.md
index 603f0141c..a6ec86554 100644
--- a/docs/source/cli_api.md
+++ b/docs/source/cli_api.md
@@ -1,30 +1,32 @@
 
+# Command Line Interface Use (CLI)
+
 ```{NOTE}
 Use ``bash terminal``, else if using ``ZSH terminal`` then ``device_group``should be in single quotes e.g.  ``'--device_group [0]'``
 ```
 
 (infer_api)=
-# `QEfficient.cloud.infer`
+## `QEfficient.cloud.infer`
 ```{eval-rst}
 .. automodule:: QEfficient.cloud.infer.main
 ``` 
-# `QEfficient.cloud.execute`
+## `QEfficient.cloud.execute`
 ```{eval-rst}
 .. automodule:: QEfficient.cloud.execute.main
 ```
-# `QEfficient.cloud.compile`
+## `QEfficient.cloud.compile`
 ```{eval-rst}
    .. automodule:: QEfficient.compile.compile_helper.compile
    .. code-block:: bash
     
         python -m QEfficient.cloud.compile OPTIONS
 ```
-# `QEfficient.cloud.export`
+## `QEfficient.cloud.export`
 ```{eval-rst}
    .. automodule:: QEfficient.cloud.export.main
    
 ```
-# `QEfficient.cloud.finetune`
+## `QEfficient.cloud.finetune`
 ```{eval-rst}
    .. automodule:: QEfficient.cloud.finetune.main
    
diff --git a/docs/source/introduction.md b/docs/source/introduction.md
index 772de4efc..d842b40c4 100644
--- a/docs/source/introduction.md
+++ b/docs/source/introduction.md
@@ -23,6 +23,9 @@ For other models, there is comprehensive documentation to inspire upon the chang
 ***Latest news*** : <br>
 
 - [coming soon] Support for more popular [models](models_coming_soon)<br>
+- [01/2025] [FP8 models support](https://huggingface.co/collections/neuralmagic/fp8-llms-for-vllm-666742ed2b78b7ac8df13127) Added support for inference of FP8 models.
+
+- [01/2025] Added support for [Ibm-Granite](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
 - [11/2024] [finite adapters support](https://github.com/quic/efficient-transformers/pull/153) allows mixed adapter usage for peft models.
 - [11/2024] [Speculative decoding TLM](https://github.com/quic/efficient-transformers/pull/119) QEFFAutoModelForCausalLM model can be compiled for returning more than 1 logits during decode for TLM.
 - [11/2024] Added support for [Meta-Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct), [Meta-Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) and [Meta-Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)
@@ -31,6 +34,8 @@ For other models, there is comprehensive documentation to inspire upon the chang
 <details>
 <summary>More</summary>
 
+- [01/2025] Added support for [Ibm-Granite](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)
+- [01/2025] Added support for [Ibm-Granite-Guardian](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)
 - [09/2024] Added support for [Gemma-2-Family](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)<br>
 - [09/2024] Added support for [CodeGemma-Family](https://huggingface.co/collections/google/codegemma-release-66152ac7b683e2667abdee11)
 - [09/2024] Added support for [Gemma-Family](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b)
diff --git a/docs/source/ll_api.md b/docs/source/ll_api.md
deleted file mode 100644
index 8cdb974bc..000000000
--- a/docs/source/ll_api.md
+++ /dev/null
@@ -1,38 +0,0 @@
-# Low Level API
-
-## `convert_to_cloud_kvstyle`
-```{eval-rst}
-.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
-   :members:
-   :show-inheritance:
-   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle
-```
-## `convert_to_cloud_bertstyle`
-```{eval-rst}
-.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
-   :members:
-   :show-inheritance:
-   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle
-```
-
-## `utils`
-
-```{eval-rst}
-.. automodule:: QEfficient.utils.device_utils
-   :members:
-   :show-inheritance:
-```
-
-```{eval-rst}
-.. automodule:: QEfficient.utils.generate_inputs
-   :members:
-   :undoc-members:
-   :show-inheritance:
-```
-
-```{eval-rst}
-.. automodule:: QEfficient.utils.run_utils
-   :members:
-   :undoc-members:
-   :show-inheritance:
-```
\ No newline at end of file
diff --git a/docs/source/hl_api.md b/docs/source/python_api.md
similarity index 51%
rename from docs/source/hl_api.md
rename to docs/source/python_api.md
index d5f2e10f7..668861373 100644
--- a/docs/source/hl_api.md
+++ b/docs/source/python_api.md
@@ -1,34 +1,64 @@
+# Python API
+
 **This page give you an overview about the all the APIs that you might need to integrate the `QEfficient` into your python applications.**
 
-# High Level API
+## High Level API
+
+### `QEFFAutoModelForCausalLM`
 
-## `QEFFAutoModelForCausalLM`
 ```{eval-rst}
 .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForCausalLM
    :member-order: bysource
    :members:
 ``` 
-## `QEFFAutoModel`
+
+(QEFFAutoModel)=
+### `QEFFAutoModel`
+
 ```{eval-rst}
 .. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModel
    :member-order: bysource
    :members:
 ``` 
-## `QEffAutoPeftModelForCausalLM`
+
+(QEffAutoPeftModelForCausalLM)=
+### `QEffAutoPeftModelForCausalLM`
+
 ```{eval-rst}
 .. autoclass:: QEfficient.peft.auto.QEffAutoPeftModelForCausalLM
    :member-order: bysource
    :members:
 ```
 
-## `QEffAutoLoraModelForCausalLM`
+(QEffAutoLoraModelForCausalLM)=
+### `QEffAutoLoraModelForCausalLM`
+
 ```{eval-rst}
 .. autoclass:: QEfficient.peft.lora.auto.QEffAutoLoraModelForCausalLM
    :member-order: bysource
    :members:
 ```
 
-## `export`
+(QEFFAutoModelForImageTextToText)=
+### `QEFFAutoModelForImageTextToText`
+
+```{eval-rst}
+.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForImageTextToText
+   :member-order: bysource
+   :members:
+```
+
+(QEFFAutoModelForSpeechSeq2Seq)=
+### `QEFFAutoModelForSpeechSeq2Seq`
+
+```{eval-rst}
+.. autoclass:: QEfficient.transformers.models.modeling_auto.QEFFAutoModelForSpeechSeq2Seq
+   :member-order: bysource
+   :members:
+```
+
+### `export`
+
 ```{eval-rst}
 .. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
    :members:
@@ -37,7 +67,9 @@
 .. deprecated::
    This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.export instead
 ```
-## `compile`
+
+### `compile`
+
 ```{eval-rst}
 .. automodule:: QEfficient.compile.compile_helper
    :members:
@@ -50,10 +82,53 @@
 .. deprecated::
    This function will be deprecated in version 1.19, please use QEFFAutoModelForCausalLM.compile instead
 ```
-## `Execute`
+
+### `Execute`
+
 ```{eval-rst}
 .. automodule:: QEfficient.generation.text_generation_inference
    :members:
    :show-inheritance:
    :exclude-members:  latency_stats_bertstyle,cloud_ai_100_exec_kv_helper
 ```
+## Low Level API
+
+### `convert_to_cloud_kvstyle`
+
+```{eval-rst}
+.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
+   :members:
+   :show-inheritance:
+   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_bertstyle
+```
+
+### `convert_to_cloud_bertstyle`
+
+```{eval-rst}
+.. automodule:: QEfficient.exporter.export_hf_to_cloud_ai_100
+   :members:
+   :show-inheritance:
+   :exclude-members: qualcomm_efficient_converter, convert_to_cloud_kvstyle
+```
+
+### `utils`
+
+```{eval-rst}
+.. automodule:: QEfficient.utils.device_utils
+   :members:
+   :show-inheritance:
+```
+
+```{eval-rst}
+.. automodule:: QEfficient.utils.generate_inputs
+   :members:
+   :undoc-members:
+   :show-inheritance:
+```
+
+```{eval-rst}
+.. automodule:: QEfficient.utils.run_utils
+   :members:
+   :undoc-members:
+   :show-inheritance:
+```
\ No newline at end of file
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 55e0746ef..88093e134 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -1,3 +1,4 @@
+# Quick Start
 
 QEfficient Library was designed with one goal:
 
@@ -8,6 +9,30 @@ To achieve this, we have 2 levels of APIs, with different levels of abstraction.
 
 2. Python high level APIs offer more granular control, ideal for when customization is necessary.
 
+## Supported Features
+
+| Feature | Impact |
+| --- | --- |
+| Context Length Specializations (upcoming) | Increases the maximum context length that models can handle, allowing for better performance on tasks requiring long sequences of text. |
+| Swift KV (upcoming) | Reduces computational overhead during inference by optimizing key-value pair processing, leading to improved throughput. |
+| Block Attention (in progress) | Reduces inference latency and computational cost by dividing context into blocks and reusing key-value states, particularly useful in RAG. |
+| [Vision Language Model](QEFFAutoModelForImageTextToText) | Provides support for the AutoModelForImageTextToText class from the transformers library, enabling advanced vision-language tasks. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/image_text_to_text_inference.py) for more **details**. |
+| [Speech Sequence to Sequence Model](QEFFAutoModelForSpeechSeq2Seq) | Provides support for the QEFFAutoModelForSpeechSeq2Seq Facilitates speech-to-text sequence models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/speech_to_text/run_whisper_speech_to_text.py) for more **details**. |
+| Support for FP8 Execution | Enables execution with FP8 precision, significantly improving performance and reducing memory usage for computational tasks. |
+| Prefill caching  | Enhances inference speed by caching key-value pairs for shared prefixes, reducing redundant computations and improving efficiency. |
+|Prompt-Lookup Decoding | Speeds up text generation by using overlapping parts of the input prompt and the generated text, making the process faster without losing quality. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/pld_spd_inference.py) for more **details**.|
+| [PEFT LoRA support](QEffAutoPeftModelForCausalLM) | Enables parameter-efficient fine-tuning using low-rank adaptation techniques, reducing the computational and memory requirements for fine-tuning large models. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/peft_models.py) for more **details**. |
+| [QNN support](#qnn-compilation) | Enables compilation using QNN SDK, making Qeff adaptable for various backends in the future. |
+| [Embedding model support](QEFFAutoModel) | Facilitates the generation of vector embeddings for retrieval tasks. |
+| [Speculative Decoding](#draft-based-speculative-decoding) | Accelerates text generation by using a draft model to generate preliminary predictions, which are then verified by the target model, reducing latency and improving efficiency. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/draft_spd_inference.py) for more **details**. |
+| [Finite lorax](QEffAutoLoraModelForCausalLM) | Users can activate multiple LoRA adapters and compile them with the base model. At runtime, they can specify which prompt should use which adapter, enabling mixed adapter usage within the same batch. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/lora_models.py) for more **details**. |
+| Python and CPP Inferencing API support | Provides flexibility while running inference with Qeff and enabling integration with various applications and improving accessibility for developers. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/examples/cpp_execution/text_inference_using_cpp.py) for more **details**.|
+| [Continuous batching](#continuous-batching) | Optimizes throughput and latency by dynamically batching requests, ensuring efficient use of computational resources. |
+| AWQ and GPTQ support | Supports advanced quantization techniques, improving model efficiency and performance on AI 100. |
+| Support serving successive requests in same session | An API that yields tokens as they are generated, facilitating seamless integration with various applications and enhancing accessibility for developers. |
+| Perplexity calculation | A script for computing the perplexity of a model, allowing for the evaluation of model performance and comparison across different models and datasets. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/scripts/perplexity_computation/calculate_perplexity.py) for more **details**. |
+| KV Heads Replication Script| A sample script for replicating key-value (KV) heads for the Llama-3-8B-Instruct model, running inference with the original model, replicating KV heads, validating changes, and exporting the modified model to ONNX format. Refer [sample script](https://github.com/quic/efficient-transformers/blob/main/scripts/replicate_kv_head/replicate_kv_heads.py) for more **details**.|
+
 ## Transformed models and QPC storage
 
 By default, the library exported models and Qaic Program Container (QPC) files, which are compiled and inference-ready model binaries generated by the compiler, are stored in `~/.cache/qeff_cache`. You can customize this storage path using the following environment variables:
diff --git a/docs/source/validate.md b/docs/source/validate.md
index b3327596d..49acd268d 100644
--- a/docs/source/validate.md
+++ b/docs/source/validate.md
@@ -1,59 +1,77 @@
 (validated_models)=
 # Validated Models
-``Note- All validated models support Continuous Batching functionality.``
-| Model Name | Model Support |
-| --- | --- |
-| [CodeGemma-2b](https://huggingface.co/google/codegemma-2b) |✔️ |
-| [CodeGemma-7b](https://huggingface.co/google/codegemma-7b) |✔️ |
-| [CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf) |✔️ |
-| [CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf) |✔️ |
-| [CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) |✔️ |
-| [Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1) |✔️ |
-| [DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)|✔️ |
-| [DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)|✔️ |
-| [Falcon-40b](https://huggingface.co/tiiuae/falcon-40b) |✔️ |
-| [GPT-J-6B](https://huggingface.co/EleutherAI/gpt-j-6b) |✔️ |
-| [GPT2](https://huggingface.co/openai-community/gpt2) |✔️ |
-| [Gemma-2b](https://huggingface.co/google/gemma-2b) |✔️ |
-| [Gemma-7b](https://huggingface.co/google/gemma-7b) |✔️ |
-| [Gemma-2-2b](https://huggingface.co/google/gemma-2-2b) |✔️ |
-| [Gemma-2-9b](https://huggingface.co/google/gemma-2-9b) |✔️ |
-| [Gemma-2-27b](https://huggingface.co/google/gemma-2-27b) |✔️ |
-| [Granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct) |✔️ |
-| [Granite-guardian-3.1-8b](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b) |✔️ |
-| [Granite-20b-code-base](https://huggingface.co/ibm-granite/granite-20b-code-base-8k) | ✔️ |
-| [Granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k) | ✔️ |
-| [Jais-adapted-7b](https://huggingface.co/inceptionai/jais-adapted-7b) |✔️ |
-| [Jais-adapted-13b-chat](https://huggingface.co/inceptionai/jais-adapted-13b-chat) |✔️ |
-| [Jais-adapted-70b](https://huggingface.co/inceptionai/jais-adapted-70b) |✔️ |
-| [Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) |✔️ |
-| [Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) |✔️ |
-| [Llama-2-70b](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) |✔️ |
-| [Llama-3-8b](https://huggingface.co/meta-llama/Meta-Llama-3-8B) |✔️ |
-| [Llama-3-70b](https://huggingface.co/meta-llama/Meta-Llama-3-70B) |✔️ |
-| [Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B) |✔️ |
-| [Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B) |✔️ |
-| [Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) |✔️ |
-| [Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B) |✔️ |
-| [Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) |✔️ |
-| [MPT-7b](https://huggingface.co/mosaicml/mpt-7b) |✔️ |
-| [Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) |✔️ |
-| [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) |✔️ |
-| [Phi3-Mini-4K-Instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) |✔️ |
-| [Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct) |✔️ |
-| [Starcoder1-15B](https://huggingface.co/bigcode/starcoder) |✔️ |
-| [Starcoder2-15B](https://huggingface.co/bigcode/starcoder2-15b) |✔️ |
-| [Vicuna-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0) |✔️ |
-| [Vicuna-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3) |✔️ |
-| [Vicuna-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5) |✔️ |
+
+## Text-only Language Models
+
+### Text Generation Task
+**QEff Auto Class:** `QEFFAutoModelForCausalLM`
+
+| Architecture            | Model Family       | Representative Models                                                                 | CB Support |
+|-------------------------|--------------------|--------------------------------------------------------------------------------------|------------|
+| **FalconForCausalLM**   | Falcon             | [tiiuae/falcon-40b]((https://huggingface.co/tiiuae/falcon-40b))                                                                    | ✔️          |
+| **GemmaForCausalLM**    | CodeGemma          | [google/codegemma-2b](https://huggingface.co/google/codegemma-2b)<br>[google/codegemma-7b](https://huggingface.co/google/codegemma-7b)                                           | ✔️          |
+|                         | Gemma              | [google/gemma-2b](https://huggingface.co/google/gemma-2b)<br>[google/gemma-7b](https://huggingface.co/google/gemma-7b)<br>[google/gemma-2-2b](https://huggingface.co/google/gemma-2-2b)<br>[google/gemma-2-9b](https://huggingface.co/google/gemma-2-9b)<br>[google/gemma-2-27b](https://huggingface.co/google/gemma-2-27b)        | ✔️          |
+| **GPTBigCodeForCausalLM** | Starcoder1.5      | [bigcode/starcoder](https://huggingface.co/bigcode/starcoder)                                                                   | ✔️          |
+|                         | Starcoder2         | [bigcode/starcoder2-15b](https://huggingface.co/bigcode/starcoder2-15b)                                                              | ✔️          |
+| **GPTJForCausalLM**     | GPT-J              | [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)                                                                 | ✔️          |
+| **GPT2LMHeadModel**     | GPT-2              | [openai-community/gpt2](https://huggingface.co/openai-community/gpt2)                                                               | ✔️          |
+| **GraniteForCausalLM**  | Granite 3.1        | [ibm-granite/granite-3.1-8b-instruct](https://huggingface.co/ibm-granite/granite-3.1-8b-instruct)<br>[ibm-granite/granite-guardian-3.1-8b](https://huggingface.co/ibm-granite/granite-guardian-3.1-8b)          | ✔️          |
+|                         | Granite 20B        | [ibm-granite/granite-20b-code-base-8k](https://huggingface.co/ibm-granite/granite-20b-code-base-8k)<br>[ibm-granite/granite-20b-code-instruct-8k](https://huggingface.co/ibm-granite/granite-20b-code-instruct-8k)    | ✔️          |
+| **InternVLChatModel**   | Intern-VL          | [OpenGVLab/InternVL2_5-1B](https://huggingface.co/OpenGVLab/InternVL2_5-1B)                                                            |            |
+| **LlamaForCausalLM**    | CodeLlama          | [codellama/CodeLlama-7b-hf](https://huggingface.co/codellama/CodeLlama-7b-hf)<br>[codellama/CodeLlama-13b-hf](https://huggingface.co/codellama/CodeLlama-13b-hf)<br>[codellama/CodeLlama-34b-hf](https://huggingface.co/codellama/CodeLlama-34b-hf) | ✔️          |
+|                         | DeepSeek-R1-Distill-Llama | [deepseek-ai/DeepSeek-R1-Distill-Llama-70B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B)                                      | ✔️          |
+|                         | InceptionAI-Adapted | [inceptionai/jais-adapted-7b](https://huggingface.co/inceptionai/jais-adapted-7b)<br>[inceptionai/jais-adapted-13b-chat](https://huggingface.co/inceptionai/jais-adapted-13b-chat)<br>[inceptionai/jais-adapted-70b](https://huggingface.co/inceptionai/jais-adapted-70b) | ✔️          |
+|                         | Llama 3.3          | [meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct)                                                   | ✔️          |
+|                         | Llama 3.2          | [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B)<br>[meta-llama/Llama-3.2-3B](https://huggingface.co/meta-llama/Llama-3.2-3B)                                  | ✔️          |
+|                         | Llama 3.1          | [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)<br>[meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B)                                 | ✔️          |
+|                         | Llama 3            | [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)<br>[meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)                           | ✔️          |
+|                         | Llama 2            | [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf)<br>[meta-llama/Llama-2-13b-chat-hf](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)<br>[meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | ✔️          |
+|                         | Vicuna             | [lmsys/vicuna-13b-delta-v0](https://huggingface.co/lmsys/vicuna-13b-delta-v0)<br>[lmsys/vicuna-13b-v1.3](https://huggingface.co/lmsys/vicuna-13b-v1.3)<br>[lmsys/vicuna-13b-v1.5](https://huggingface.co/lmsys/vicuna-13b-v1.5)         | ✔️          |
+| **MistralForCausalLM**  | Mistral            | [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)                                                  | ✔️          |
+| **MixtralForCausalLM**  | Codestral<br>Mixtral | [mistralai/Codestral-22B-v0.1](https://huggingface.co/mistralai/Codestral-22B-v0.1)<br>[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)                        | ✔️          |
+| **MPTForCausalLM**      | MPT                | [mosaicml/mpt-7b](https://huggingface.co/mosaicml/mpt-7b)                                                                     | ✔️          |
+| **Phi3ForCausalLM**     | Phi-3, Phi-3.5     | [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct)                                                    | ✔️          |
+| **QwenForCausalLM**     | DeepSeek-R1-Distill-Qwen | [DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B)                                                   | ✔️          |
+|                         | Qwen2, Qwen2.5     | [Qwen/Qwen2-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2-1.5B-Instruct)                                                            | ✔️          |
+
+## Embedding Models
+
+### Text Embedding Task
+**QEff Auto Class:** `QEFFAutoModel`
+
+| Architecture | Model Family | Representative Models          |
+|--------------|--------------|---------------------------------|
+| **BertModel** | BERT-based   | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)<br> [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)<br>[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <br>[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2)          |
+| **LlamaModel** | Llama-based  | [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
+| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) |
+| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) |
+| **MPNetForMaskedLM** | MPNet | [sentence-transformers/multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1) |
+| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) |
+| **MistralModel** | Mistral | [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
+
+## Multimodal Language Models
+
+### Vision-Language Models (Text + Image Generation)
+**QEff Auto Class:** `QEFFAutoModelImageTextToText`
+
+| Architecture                | Model Family | Representative Models                  |
+|-----------------------------|--------------|----------------------------------------|
+| **LlavaForConditionalGeneration** | LLaVA-1.5   | [llava-hf/llava-1.5-7b-hf](https://huggingface.co/llava-hf/llava-1.5-7b-hf)               |
+| **MllamaForConditionalGeneration** | Llama 3.2   | [meta-llama/Llama-3.2-11B-Vision Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)<br>[meta-llama/Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) |
+
+### Audio Models
+(Automatic Speech Recognition) - Transcription Task
+**QEff Auto Class:** `QEFFAutoModelForSpeechSeq2Seq`
+
+| Architecture | Model Family | Representative Models                                                                 |
+|--------------|--------------|----------------------------------------------------------------------------------------|
+| **Whisper**  | Whisper      | [openai/whisper-tiny](https://huggingface.co/openai/whisper-tiny)<br>[openai/whisper-base](https://huggingface.co/openai/whisper-base)<br>[openai/whisper-small](https://huggingface.co/openai/whisper-small)<br>[openai/whisper-medium](https://huggingface.co/openai/whisper-medium)<br>[openai/whisper-large](https://huggingface.co/openai/whisper-large)<br>[openai/whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) |
 
 (models_coming_soon)=
 # Models Coming Soon
 
-* [Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)
-* [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)
-* [Chatglm2-6b](https://huggingface.co/THUDM/chatglm2-6b)
-* [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)
-* [Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B)
-* [Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)
-* [Llama-3.2-90B-Vision](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision)
\ No newline at end of file
+| Architecture            | Model Family | Representative Models                      |
+|-------------------------|--------------|--------------------------------------------|
+| **BaichuanForCausalLM** | Baichuan2    | [baichuan-inc/Baichuan2-7B-Base](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base)             |
+| **CohereForCausalLM**   | Command-R    | [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01)             |
+| **DbrxForCausalLM**     | DBRX         | [databricks/dbrx-base](https://huggingface.co/databricks/dbrx-base)                       |
\ No newline at end of file

From e7796a43ff145d0a3cc7bbb288d11658368cc39f Mon Sep 17 00:00:00 2001
From: Amit Raj <168538872+quic-amitraj@users.noreply.github.com>
Date: Fri, 28 Feb 2025 21:57:37 +0530
Subject: [PATCH 091/138] Removed warning and override of mxfp6 for internal
 use (#277)

compilation fix and enabled mxfp6 for vision encoder

---------

Signed-off-by: Amit Raj <quic_amitraj@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/modeling_auto.py      | 25 ++-----------------
 1 file changed, 2 insertions(+), 23 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index b8b5981cd..8bca3b94a 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -52,8 +52,6 @@
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
 
-MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 = ["MllamaForConditionalGeneration"]
-
 
 class QEFFTransformersBase(QEFFBaseModel):
     """
@@ -627,17 +625,12 @@ def compile(
         ):
             self.export()
 
-        if mxfp6_matmul and self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6:
-            logger.warning(
-                "Due to accuracy issues of vision model fixing it's precision to fp16, while language model will be compiled for mxfp6"
-            )
-
         self.vision_model._compile(
             compile_dir,
             compile_only=True,
             specializations=specializations["vision"],
             convert_to_fp16=True,
-            mxfp6_matmul=False,
+            mxfp6_matmul=mxfp6_matmul,
             mdp_ts_num_devices=num_devices,
             aic_num_cores=num_cores,
             custom_io=custom_io_vision,
@@ -946,11 +939,6 @@ def compile(
             if output_name.endswith("_RetainedState"):
                 custom_io[output_name] = kv_cache_dtype
 
-        if self.model_name in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and mxfp6_matmul:
-            logger.warning(
-                f"It is advised to use fp16 precision during compilation for {self.model.__class__.__name__} to avoid accuracy issues, got mxfp6_matmul=True"
-            )
-
         self._compile(
             onnx_path,
             compile_dir,
@@ -1147,16 +1135,7 @@ class QEFFAutoModelForImageTextToText:
 
     _hf_auto_class = AutoModelForImageTextToText
 
-    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = None, **kwargs):
-        if model.config.architectures[0] in MODELS_WITH_ACCURACY_ISSUE_FOR_MXFP6 and not kv_offload:
-            # For models with mxfp6 accuracy issue, we will use kv_offload=True by default
-            if kv_offload is None:
-                kv_offload = True
-            else:
-                logger.warning(f"Advised to use kv_offload=True for {model.__class__.__name__}")
-        elif kv_offload is None:
-            kv_offload = False
-
+    def __new__(self, model: nn.Module, kv_offload: Optional[bool] = True, **kwargs):
         if kv_offload:
             return _QEffAutoModelForImageTextToTextDualQPC(model, **kwargs)
         else:

From b9a74cf8b63b527c04007a19ae02e22c3c9bc653 Mon Sep 17 00:00:00 2001
From: mohiso22 <quic_mohisoni@quicinc.com>
Date: Fri, 28 Feb 2025 22:03:50 +0530
Subject: [PATCH 092/138] Added support of 2qpcs for internvl and llava (#279)

Signed-off-by: Mohit Soni <quic_mohisoni@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/internvl/modeling_internvl.py      | 147 +++++++++++++++---
 .../models/llava/modeling_llava.py            | 147 +++++++++++++++---
 .../models/mllama/modeling_mllama.py          |   3 +
 .../transformers/models/modeling_auto.py      |  17 +-
 .../transformers/models/pytorch_transforms.py |   2 +
 5 files changed, 261 insertions(+), 55 deletions(-)

diff --git a/QEfficient/transformers/models/internvl/modeling_internvl.py b/QEfficient/transformers/models/internvl/modeling_internvl.py
index 318993dde..c39e7b65d 100644
--- a/QEfficient/transformers/models/internvl/modeling_internvl.py
+++ b/QEfficient/transformers/models/internvl/modeling_internvl.py
@@ -14,9 +14,57 @@
 from QEfficient.utils.logging_utils import logger
 
 
+class QEffInternEncoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, pixel_values):
+        vit_embeds = self.model.extract_feature(pixel_values)
+        return vit_embeds
+
+
+class QEffInternDecoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.config = self.model.language_model.config
+
+    def forward(self, input_ids, vit_embeds, position_ids, past_key_values):
+        # TODO: Check if Hardcoding this is okay, i.e. check if this value is common for all intern models
+        IMG_CONTEXT_TOKEN = 151667
+
+        input_embeds = self.model.language_model.get_input_embeddings()(input_ids)
+        B, N, C = input_embeds.shape
+        image_input_embeds = input_embeds.reshape(B * N, C)
+        image_input_ids = input_ids.reshape(B * N)
+        selected = image_input_ids == IMG_CONTEXT_TOKEN
+        indices1 = selected.unsqueeze(0).to(torch.int64).cumsum(1) - 1
+        indices0 = torch.arange(selected.unsqueeze(0).shape[0]).view(-1, 1)
+        image_features_expanded = vit_embeds.reshape(-1, C).unsqueeze(0)[indices0, indices1]
+        image_input_embeds = torch.where(selected.unsqueeze(0).unsqueeze(-1), image_features_expanded, input_embeds)
+        inputs_embeds = torch.where(input_ids.shape[1] == torch.tensor(1), input_embeds, image_input_embeds)
+        outputs = self.model.language_model(
+            inputs_embeds=inputs_embeds, position_ids=position_ids, past_key_values=past_key_values, use_cache=True
+        )
+        return outputs.logits, vit_embeds, outputs.past_key_values
+
+
 class QEffInternVLModel(nn.Module):
+    def get_qeff_vision_encoder(self):
+        return QEffInternEncoderWrapper(self)
+
+    def get_qeff_language_decoder(self):
+        return QEffInternDecoderWrapper(self)
+
     def get_specializations(
-        self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
     ):
         # TODO: check if this should be named num_patches or something else
         num_patches = compiler_options.pop("num_patches", None)
@@ -33,8 +81,18 @@ def get_specializations(
         elif img_size is None:
             img_size = 448
             logger.warning("Setting img_size to be 448, as it was neither passed nor found in vision_config")
-
-        specializations = [
+        if img_size != 448 and kv_offload:
+            raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")
+        vision = [
+            {
+                "batch_size": batch_size,
+                "num_patches": num_patches,
+                "img_size": img_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+            }
+        ]
+        lang = [
             {
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
@@ -50,46 +108,75 @@ def get_specializations(
                 "img_size": img_size,
             },
         ]
-        return specializations, compiler_options
 
-    def get_onnx_dynamic_axes(
-        self,
-    ):
+        specializations = {}
+
+        if kv_offload:
+            specializations["vision"] = vision
+            specializations["lang"] = lang
+            return specializations, compiler_options
+        else:
+            return lang, compiler_options
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         # Define dynamic axes
-        dynamic_axes = {}
-        dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
-        dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
+        vision_dynamic_axes = {}
+        lang_dynamic_axes = {}
+        lang_dynamic_axes["input_ids"] = {0: "batch_size", 1: "seq_len"}
+        lang_dynamic_axes["position_ids"] = {0: "batch_size", 1: "seq_len"}
+        vision_dynamic_axes["pixel_values"] = {0: "num_patches", 2: "img_size", 3: "img_size"}
 
         pkv_dynamic_axes = {0: "batch_size", 2: "ctx_len"}
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
+                lang_dynamic_axes[f"past_{kv}.{i}"] = pkv_dynamic_axes
 
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+            dynamic_axes["lang"] = lang_dynamic_axes
+        else:
+            dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes}
         return dynamic_axes
 
-    def get_output_names(
-        self,
-    ):
-        output_names = ["logits", "pixel_values_RetainedState"]
+    def get_output_names(self, kv_offload: bool = False):
+        vision_output_names = ["vit_embeds"]
+        lang_output_names = ["logits"]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                output_names.append(f"past_{kv}.{i}_RetainedState")
+                lang_output_names.append(f"past_{kv}.{i}_RetainedState")
+
+        output_names = {}
+        if kv_offload:
+            lang_output_names.insert(1, "vit_embeds_RetainedState")
+            output_names["vision"] = vision_output_names
+            output_names["lang"] = lang_output_names
+        else:
+            lang_output_names.insert(1, "pixel_values_RetainedState")
+            return lang_output_names
         return output_names
 
     def get_dummy_inputs(self, kv_offload: bool = False):
-        if kv_offload:
-            raise ValueError("kv_offload method not supported for InternVL yet!")
         num_patches = 13
         C = 3
         if vis_cfg := getattr(self.config, "vision_config", None):
             img_size = getattr(vis_cfg, "image_size", 448)
         else:
             img_size = 448
+        if img_size != 448 and kv_offload:
+            raise NotImplementedError("Image Size other than 448 is not supported for Intern models yet.")
+
+        # Taken from the modeling files of OpenGVLab/InternVL2_5-1B
+        feature_size = int((((self.config.vision_config.hidden_size**0.5) * self.config.downsample_ratio) ** 2))
 
         # Define shapes
         inputs_shapes = {}
         inputs_shapes["input_ids"] = (constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
+        inputs_shapes["vit_embeds"] = (
+            num_patches,
+            feature_size,
+            self.language_model.config.hidden_size,
+        )
         inputs_shapes["position_ids"] = (
             constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE,
             constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
@@ -97,14 +184,16 @@ def get_dummy_inputs(self, kv_offload: bool = False):
         inputs_shapes["pixel_values"] = (num_patches, C, img_size, img_size)
 
         # Define inputs
-        inputs = {}
-        inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
-        inputs["position_ids"] = (
+        vision_inputs = {}
+        lang_inputs = {}
+        vision_inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
+        lang_inputs["input_ids"] = torch.zeros((inputs_shapes["input_ids"]), dtype=torch.int64)
+        lang_inputs["vit_embeds"] = torch.zeros((inputs_shapes["vit_embeds"]), dtype=torch.float32)
+        lang_inputs["position_ids"] = (
             torch.arange(constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN, dtype=torch.int64)
             .view(1, constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN)
             .repeat(constants.ONNX_EXPORT_EXAMPLE_BATCH_SIZE, 1)
         )
-        inputs["pixel_values"] = torch.zeros((inputs_shapes["pixel_values"]), dtype=torch.float32)
 
         # Add data for KV
         kv_cache_shape = get_padding_shape_from_config(
@@ -113,10 +202,18 @@ def get_dummy_inputs(self, kv_offload: bool = False):
             seq_len=constants.ONNX_EXPORT_EXAMPLE_SEQ_LEN,
         )
 
-        inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
+        lang_inputs["past_key_values"] = [[] for _ in range(self.language_model.config.num_hidden_layers)]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+                lang_inputs["past_key_values"][i].append(torch.zeros(kv_cache_shape, dtype=torch.float32))
+
+        inputs = {}
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+            inputs["lang"] = lang_inputs
+        else:
+            lang_inputs.pop("vit_embeds")
+            inputs = {**vision_inputs, **lang_inputs}
 
         return inputs
 
diff --git a/QEfficient/transformers/models/llava/modeling_llava.py b/QEfficient/transformers/models/llava/modeling_llava.py
index 82c934670..93d6f4c3b 100644
--- a/QEfficient/transformers/models/llava/modeling_llava.py
+++ b/QEfficient/transformers/models/llava/modeling_llava.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import torch
+import torch.nn as nn
 import torch.utils.checkpoint
 from transformers.models.llava.modeling_llava import (
     LlavaForConditionalGeneration,
@@ -20,7 +21,57 @@
 CTX_LEN = 1024
 
 
+class QEFFLlavaEncoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+
+    def forward(self, pixel_values):
+        # Image features
+        image_outputs = self.model.vision_tower(pixel_values, output_hidden_states=True)
+        selected_image_feature = image_outputs.hidden_states[self.model.config.vision_feature_layer]
+        vision_feature_select_strategy = self.model.config.vision_feature_select_strategy
+        if vision_feature_select_strategy == "default":
+            selected_image_feature = selected_image_feature[:, 1:]
+        elif vision_feature_select_strategy == "full":
+            selected_image_feature = selected_image_feature
+        else:
+            raise ValueError(f"Unexpected select feature strategy: {self.model.config.vision_feature_select_strategy}")
+        image_features = self.model.multi_modal_projector(selected_image_feature)
+
+        return image_features
+
+
+class QEFFLlavaDecoderWrapper(nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.config = self.model.config
+
+    def forward(self, input_ids, image_features, position_ids, past_key_values):
+        inputs_embeds = self.model.get_input_embeddings()(input_ids)
+        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+        mask = input_ids == self.model.config.image_token_index
+        indices1 = mask.to(torch.int64).cumsum(1) - 1
+        indices0 = torch.arange(mask.shape[0]).view(-1, 1)
+        image_features_expanded = image_features[indices0, indices1]
+        inputs_embeds = torch.where(mask.unsqueeze(-1), image_features_expanded, inputs_embeds)
+        outputs = self.model.language_model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+        )
+
+        return outputs.logits, image_features, outputs.past_key_values
+
+
 class QEffLlavaForConditionalGeneration(LlavaForConditionalGeneration):
+    def get_qeff_vision_encoder(self):
+        return QEFFLlavaEncoderWrapper(self)
+
+    def get_qeff_language_decoder(self):
+        return QEFFLlavaDecoderWrapper(self)
+
     def forward(self, input_ids, position_ids, pixel_values, past_key_values):
         inputs_embeds = self.get_input_embeddings()(input_ids)
         # Image features
@@ -50,7 +101,7 @@ def forward(self, input_ids, position_ids, pixel_values, past_key_values):
         )
         return outputs.logits, pixel_values, outputs.past_key_values
 
-    def get_dummy_inputs(self, **kwargs):
+    def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
         num_layers = self.config.text_config.num_hidden_layers
         num_key_value_heads = self.config.text_config.num_key_value_heads
         head_dim = self.config.text_config.hidden_size // self.config.text_config.num_attention_heads
@@ -58,25 +109,44 @@ def get_dummy_inputs(self, **kwargs):
             img_size = getattr(vis_cfg, "image_size", 336)
         else:
             img_size = 336
-        inputs = {
+        if img_size != 336 and kv_offload:
+            raise NotImplementedError("Image Size other than 336 is not supported for Llava models yet.")
+        vision_inputs = {
+            "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=torch.float32),
+        }
+        lang_inputs = {
             "input_ids": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
+            "image_features": torch.ones((BS, 576, self.language_model.config.hidden_size), dtype=torch.float32),
             "attention_mask": torch.ones((BS, SEQ_LEN), dtype=torch.int64),
-            "pixel_values": torch.zeros((BS, NUM_CHANNEL, img_size, img_size), dtype=torch.float32),
         }
-        inputs["position_ids"] = inputs.pop("attention_mask").cumsum(1)
-        inputs["past_key_values"] = []
+        lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
+        lang_inputs["past_key_values"] = []
         for i in range(num_layers):
-            inputs["past_key_values"].append(
+            lang_inputs["past_key_values"].append(
                 (
                     torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim),
                     torch.zeros(BS, num_key_value_heads, CTX_LEN, head_dim),
                 )
             )
-        inputs["position_ids"] = torch.full(inputs["position_ids"].shape, CTX_LEN - 1)
+        lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, CTX_LEN - 1)
+        inputs = {}
+
+        if kv_offload:
+            inputs["vision"] = vision_inputs
+            inputs["lang"] = lang_inputs
+        else:
+            lang_inputs.pop("image_features")
+            inputs = {**vision_inputs, **lang_inputs}
         return inputs
 
     def get_specializations(
-        self, batch_size: int, prefill_seq_len: int, ctx_len: int, img_size: int, **compiler_options
+        self,
+        batch_size: int,
+        prefill_seq_len: int,
+        ctx_len: int,
+        img_size: int,
+        kv_offload: bool = False,
+        **compiler_options,
     ):
         max_num_images = compiler_options.pop("max_num_images", 1)
         prefill_seq_len = prefill_seq_len if prefill_seq_len else SEQ_LEN
@@ -86,8 +156,18 @@ def get_specializations(
         elif img_size is None:
             img_size = 336
             logger.warning("Setting img_size to be 336, as it was neither passed nor found in vision_config")
-
-        specializations = [
+        if img_size != 336 and kv_offload:
+            raise NotImplementedError("Image Size other than 336 is not supported for Llava models yet.")
+        vision = [
+            {
+                "batch_size": batch_size,
+                "max_num_images": max_num_images,
+                "img_size": img_size,
+                "seq_len": prefill_seq_len,
+                "ctx_len": ctx_len,
+            }
+        ]
+        lang = [
             {
                 "batch_size": batch_size,
                 "seq_len": prefill_seq_len,
@@ -103,32 +183,53 @@ def get_specializations(
                 "img_size": img_size,
             },
         ]
-        return specializations, compiler_options
+        specializations = {}
 
-    def get_onnx_dynamic_axes(
-        self,
-    ):
+        if kv_offload:
+            specializations["vision"] = vision
+            specializations["lang"] = lang
+            return specializations, compiler_options
+        else:
+            return lang, compiler_options
+
+    def get_onnx_dynamic_axes(self, kv_offload: bool = False):
         # Define dynamic axes
         num_layers = self.config.text_config.num_hidden_layers
 
-        dynamic_axes = {
+        vision_dynamic_axes = {
+            "pixel_values": {0: "batch_size", 2: "img_size", 3: "img_size"},
+        }
+        lang_dynamic_axes = {
             "input_ids": {0: "batch_size", 1: "seq_len"},
             "position_ids": {0: "batch_size", 1: "seq_len"},
-            "pixel_values": {0: "batch_size", 2: "img_size", 3: "img_size"},
         }
         for i in range(num_layers):
-            dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
-            dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
+            lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
 
+        dynamic_axes = {}
+        if kv_offload:
+            dynamic_axes["vision"] = vision_dynamic_axes
+            dynamic_axes["lang"] = lang_dynamic_axes
+        else:
+            dynamic_axes = {**vision_dynamic_axes, **lang_dynamic_axes}
         return dynamic_axes
 
-    def get_output_names(
-        self,
-    ):
-        output_names = ["logits", "pixel_values_RetainedState"]
+    def get_output_names(self, kv_offload: bool = False):
+        vision_output_names = ["image_features"]
+        lang_output_names = ["logits"]
         for i in range(self.language_model.config.num_hidden_layers):
             for kv in ["key", "value"]:
-                output_names.append(f"past_{kv}.{i}_RetainedState")
+                lang_output_names.append(f"past_{kv}.{i}_RetainedState")
+
+        output_names = {}
+        if kv_offload:
+            lang_output_names.insert(1, "image_features_RetainedState")
+            output_names["vision"] = vision_output_names
+            output_names["lang"] = lang_output_names
+        else:
+            lang_output_names.insert(1, "pixel_values_RetainedState")
+            return lang_output_names
         return output_names
 
     def get_inputs_info(self):
diff --git a/QEfficient/transformers/models/mllama/modeling_mllama.py b/QEfficient/transformers/models/mllama/modeling_mllama.py
index 9dcddbdfd..8d2141240 100644
--- a/QEfficient/transformers/models/mllama/modeling_mllama.py
+++ b/QEfficient/transformers/models/mllama/modeling_mllama.py
@@ -1022,6 +1022,9 @@ class QEffMllamaForConditionalGeneration(MllamaForConditionalGeneration):
     def get_qeff_vision_encoder(self):
         return QEffMllamaVisionEncoder(self)
 
+    def get_qeff_language_decoder(self):
+        return self
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 8bca3b94a..54b7828c8 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -390,7 +390,13 @@ def pytorch_feature_generate(self, model, inputs: Union[torch.Tensor, np.ndarray
 
 
 class QEffVisionEncoderForTextImageToTextModel(QEFFBaseModel):
-    _pytorch_transforms = [AwqToMatmulNbitsTransform, GPTQToMatmulNbitsTransform, CustomOpsTransform, KVCacheTransform]
+    _pytorch_transforms = [
+        AwqToMatmulNbitsTransform,
+        GPTQToMatmulNbitsTransform,
+        CustomOpsTransform,
+        KVCacheTransform,
+        KVCacheModuleMethodMapperTransform,
+    ]
     _onnx_transforms = [FP16ClipTransform, SplitTensorsTransform]
 
     def __init__(self, model: nn.modules):
@@ -454,6 +460,7 @@ class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
 
     def __init__(self, model):
         super().__init__(model)
+        self.model = model.get_qeff_language_decoder()
 
     def export(self, inputs, output_names, dynamic_axes, export_dir=None):
         return self._export(inputs, output_names, dynamic_axes, export_dir)
@@ -502,7 +509,6 @@ def model_name(self) -> str:
 
 class _QEffAutoModelForImageTextToTextDualQPC:
     _hf_auto_class = AutoModelForImageTextToText
-    UNSUPPORTED_MODELS = ["LlavaForConditionalGeneration", "InternVLChatModel"]
 
     def __init__(
         self,
@@ -513,8 +519,6 @@ def __init__(
             raise NotImplementedError("Continuous batching is not supported for image-text-to-text models yet.")
         self.model = model
         self.config = model.config
-        if self.model_name in self.UNSUPPORTED_MODELS:
-            raise NotImplementedError(f"kv_offload is not yet supported for {self.model.__class__.__name__}")
         self.vision_model = QEffVisionEncoderForTextImageToTextModel(model)
         self.lang_model = QEffCausalLMForTextImageToTextModel(model)
 
@@ -640,12 +644,12 @@ def compile(
         custom_io_lang = {}
         # Inputs
         for output_name in output_names["lang"]:
-            if output_name.startswith("past_"):
+            if output_name.endswith("_RetainedState"):
                 custom_io_lang[output_name[: -len("_RetainedState")]] = kv_cache_dtype
 
         # outputs
         for output_name in output_names["lang"]:
-            if output_name.startswith("past_"):
+            if output_name.endswith("_RetainedState"):
                 custom_io_lang[output_name] = kv_cache_dtype
 
         self.lang_model._compile(
@@ -799,7 +803,6 @@ def kv_offload_generate(
             lang_inputs["input_ids"] = outputs["logits"].argmax(2)
             lang_inputs["position_ids"] += 1
             generated_ids[:, num_token] = lang_inputs["input_ids"].squeeze(1)
-
             if streamer:
                 streamer.put(lang_inputs["input_ids"][0])
 
diff --git a/QEfficient/transformers/models/pytorch_transforms.py b/QEfficient/transformers/models/pytorch_transforms.py
index 10f4c448b..8152f0676 100644
--- a/QEfficient/transformers/models/pytorch_transforms.py
+++ b/QEfficient/transformers/models/pytorch_transforms.py
@@ -433,6 +433,8 @@ class KVCacheModuleMethodMapperTransform(ModuleMethodMapperTransform):
             "get_onnx_dynamic_axes": QEffInternVLModel.get_onnx_dynamic_axes,
             "get_output_names": QEffInternVLModel.get_output_names,
             "get_inputs_info": QEffInternVLModel.get_inputs_info,
+            "get_qeff_vision_encoder": QEffInternVLModel.get_qeff_vision_encoder,
+            "get_qeff_language_decoder": QEffInternVLModel.get_qeff_language_decoder,
         },
         "InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
     }

From 756e72935f46827dc3d20694542939f397516f96 Mon Sep 17 00:00:00 2001
From: shubhagr-quic <quic_shubhagr@quicinc.com>
Date: Mon, 3 Mar 2025 13:31:49 +0530
Subject: [PATCH 093/138] Removed onnx_defer_loading flag. (#295)

Removing onnx_defer_loading flag which was originally removed in
_[Removed onnx_defer_loading from Immutable Convertor Args. PR: 230]_
but got added back later in _[Mllama(single + dual) + InternVL(single) +
Llava (single) PR: 267]_ maybe becausing of rebasing.

Signed-off-by: Shubham Agrawal <quic_shubhagr@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/utils/constants.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index a5cc6fda1..6c2bba0c6 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -136,7 +136,6 @@ class QnnConstants:
         "--float_bitwidth ",
         "--preserve_io_datatype",
         "--onnx_skip_simplification",
-        "--onnx_defer_loading",
     ]
 
     IMMUTABLE_CONTEXT_BIN_GEN_ARGS = [

From 5f2bd31942028170e8467b27901c27d37236bb9c Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Date: Mon, 3 Mar 2025 20:29:36 +0530
Subject: [PATCH 094/138] Code for SDK configs Inclusion (#203)

This will create a config JSON file, which contains all the details
about compilation and SDK versions.

Currently, this code is added in the code block of
QEFFAutoModelForCausalLM.compile.

The config would look like below:

```
{
    "huggingface_config": {
        "vocab_size": 50257,
        "n_positions": 1024,
        "n_embd": 768,
        "n_layer": 12,
        "n_head": 12,
        "n_inner": null,
        "activation_function": "gelu_new",
        "resid_pdrop": 0.1,
        "embd_pdrop": 0.1,
        "attn_pdrop": 0.1,
        "layer_norm_epsilon": 1e-05,
        "initializer_range": 0.02,
        "summary_type": "cls_index",
        "summary_use_proj": true,
        "summary_activation": null,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": true,
        "scale_attn_weights": true,
        "use_cache": true,
        "scale_attn_by_inverse_layer_idx": false,
        "reorder_and_upcast_attn": false,
        "bos_token_id": 50256,
        "eos_token_id": 50256,
        "return_dict": true,
        "output_hidden_states": false,
        "output_attentions": false,
        "torchscript": false,
        "torch_dtype": null,
        "use_bfloat16": false,
        "tf_legacy_loss": false,
        "pruned_heads": {},
        "tie_word_embeddings": true,
        "chunk_size_feed_forward": 0,
        "is_encoder_decoder": false,
        "is_decoder": false,
        "cross_attention_hidden_size": null,
        "add_cross_attention": false,
        "tie_encoder_decoder": false,
        "max_length": 20,
        "min_length": 0,
        "do_sample": false,
        "early_stopping": false,
        "num_beams": 1,
        "num_beam_groups": 1,
        "diversity_penalty": 0.0,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 1.0,
        "typical_p": 1.0,
        "repetition_penalty": 1.0,
        "length_penalty": 1.0,
        "no_repeat_ngram_size": 0,
        "encoder_no_repeat_ngram_size": 0,
        "bad_words_ids": null,
        "num_return_sequences": 1,
        "output_scores": false,
        "return_dict_in_generate": false,
        "forced_bos_token_id": null,
        "forced_eos_token_id": null,
        "remove_invalid_values": false,
        "exponential_decay_length_penalty": null,
        "suppress_tokens": null,
        "begin_suppress_tokens": null,
        "architectures": [
            "GPT2LMHeadModel"
        ],
        "finetuning_task": null,
        "id2label": {
            "0": "LABEL_0",
            "1": "LABEL_1"
        },
        "label2id": {
            "LABEL_0": 0,
            "LABEL_1": 1
        },
        "tokenizer_class": null,
        "prefix": null,
        "pad_token_id": null,
        "sep_token_id": null,
        "decoder_start_token_id": null,
        "task_specific_params": {
            "text-generation": {
                "do_sample": true,
                "max_length": 50
            }
        },
        "problem_type": null,
        "_name_or_path": "gpt2",
        "_commit_hash": "607a30d783dfa663caf39e06633721c8d4cfcd7e",
        "_attn_implementation_internal": "eager",
        "transformers_version": null,
        "model_type": "gpt2",
        "n_ctx": 1024
    },
    "qpc_config": {
        "QEff_config": {
            "pytorch_transforms": [
                "AwqToMatmulNbitsTransform",
                "GPTQToMatmulNbitsTransform",
                "CustomOpsTransform",
                "KVCacheTransform"
            ],
            "onnx_transforms": [
                "FP16ClipTransform",
                "SplitTensorsTransform"
            ],
            "onnx_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/GPT2LMHeadModel.onnx"
        },
        "aic_compiler_config": {
            "apps_sdk_version": "1.20.0",
            "compile_dir": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47",
            "specializtions_file_path": "/root/.cache/qeff_models/GPT2LMHeadModel-36f0eca92731bb47/specializations.json",
            "prefill_seq_len": 32,
            "ctx_len": 128,
            "batch_size": 1,
            "full_batch_size": null,
            "num_devices": 1,
            "num_cores": 16,
            "mxfp6_matmul": false,
            "mxint8_kv_cache": false,
            "num_speculative_tokens": null
        },
        "qnn_config": {
            "enable_qnn": true,
            "qnn_config_path": "QEfficient/compile/qnn_config.json",
            "product": "QAIRT",
            "os": {
                "Ubuntu": 22.04,
                "Windows": 11
            },
            "sdk_flavor": [
                "aic"
            ],
            "version": "2.31.0",
            "build_id": "250109072054_3882",
            "qnn_backend_api_version": "2.18.0",
            "tensorflow": "2.10.1",
            "tflite": "2.3.0",
            "torch": "1.13.1",
            "onnx": "1.16.1",
            "onnxruntime": "1.17.1",
            "onnxsimplifier": "0.4.36",
            "android-ndk": "r26c",
            "platform": "AIC.1.20.0.14"
        }
    }
}
```

Note: The code structure may change.

---------

Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/base/modeling_qeff.py              |   6 +-
 QEfficient/peft/auto.py                       |   4 +
 QEfficient/peft/lora/auto.py                  |   4 +
 .../transformers/models/modeling_auto.py      |  24 ++++
 QEfficient/utils/__init__.py                  |   1 +
 QEfficient/utils/_utils.py                    | 114 +++++++++++++++++-
 QEfficient/utils/constants.py                 |   2 +
 tests/peft/lora/test_lora_model.py            |   4 +
 tests/peft/test_peft_model.py                 |   2 +
 tests/qnn_tests/test_causal_lm_models_qnn.py  |   8 +-
 tests/text_generation/test_text_generation.py |   3 +
 .../models/test_causal_lm_models.py           |   7 +-
 .../models/test_embedding_models.py           |   2 +
 .../models/test_prefix_caching.py             |   2 +
 .../models/test_speech_seq2seq_models.py      |   1 +
 tests/transformers/spd/test_spd_inference.py  |   3 +
 tests/transformers/test_causal_lm.py          |   2 +
 tests/transformers/test_speech_seq2seq.py     |   2 +
 18 files changed, 185 insertions(+), 6 deletions(-)

diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
index ec74c57f3..f2b3714fa 100644
--- a/QEfficient/base/modeling_qeff.py
+++ b/QEfficient/base/modeling_qeff.py
@@ -23,7 +23,7 @@
 from QEfficient.base.pytorch_transforms import PytorchTransform
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
 from QEfficient.generation.cloud_infer import QAICInferenceSession
-from QEfficient.utils import constants
+from QEfficient.utils import constants, dump_qconfig
 from QEfficient.utils._utils import load_json
 from QEfficient.utils.cache import QEFF_HOME, to_hashable
 
@@ -211,6 +211,7 @@ def _export(
         self.onnx_path = onnx_path
         return onnx_path
 
+    @dump_qconfig
     def _compile(
         self,
         onnx_path: Optional[str] = None,
@@ -336,8 +337,10 @@ def _compile(
             )
 
         self.qpc_path = qpc_path
+
         return qpc_path
 
+    @dump_qconfig
     def _qnn_compile(
         self,
         onnx_path: Optional[str] = None,
@@ -435,4 +438,5 @@ def _qnn_compile(
         )
 
         self.qpc_path = qpc_path
+
         return qpc_path
diff --git a/QEfficient/peft/auto.py b/QEfficient/peft/auto.py
index 377caa3e7..deb64fae1 100644
--- a/QEfficient/peft/auto.py
+++ b/QEfficient/peft/auto.py
@@ -107,6 +107,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.get_base_model().config.__dict__
+
     def load_adapter(self, model_id: str, adapter_name: str):
         """Loads a new adapter from huggingface hub or local path
 
diff --git a/QEfficient/peft/lora/auto.py b/QEfficient/peft/lora/auto.py
index c13979968..7f2a5cd84 100644
--- a/QEfficient/peft/lora/auto.py
+++ b/QEfficient/peft/lora/auto.py
@@ -90,6 +90,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.model.config.__dict__
+
     def download_adapter(
         self,
         adapter_model_id: str,
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 54b7828c8..5852740b4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -229,6 +229,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
@@ -447,6 +451,10 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.model.vision_model.config.__dict__
+
 
 class QEffCausalLMForTextImageToTextModel(QEFFBaseModel):
     _pytorch_transforms = [
@@ -506,6 +514,10 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.language_model.config.__dict__
+
 
 class _QEffAutoModelForImageTextToTextDualQPC:
     _hf_auto_class = AutoModelForImageTextToText
@@ -1128,6 +1140,10 @@ def model_name(self) -> str:
             mname = mname[4:]
         return mname
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
 
 class QEFFAutoModelForImageTextToText:
     """
@@ -1320,6 +1336,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
@@ -1630,6 +1650,10 @@ def model_hash(self) -> str:
         mhash = mhash.hexdigest()[:16]
         return mhash
 
+    @property
+    def get_model_config(self) -> dict:
+        return self.model.config.__dict__
+
     def export(self, export_dir: Optional[str] = None) -> str:
         """
         Exports the model to ``ONNX`` format using ``torch.onnx.export``.
diff --git a/QEfficient/utils/__init__.py b/QEfficient/utils/__init__.py
index 2506b9233..a7f17e6bc 100755
--- a/QEfficient/utils/__init__.py
+++ b/QEfficient/utils/__init__.py
@@ -11,6 +11,7 @@
 )
 from QEfficient.utils._utils import (  # noqa: F401
     check_and_assign_cache_dir,
+    dump_qconfig,
     get_num_layers_from_config,
     get_onnx_dir_name,
     get_padding_shape_from_config,
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 8344a053d..ea9044e2c 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,16 +8,18 @@
 import json
 import os
 import subprocess
+import xml.etree.ElementTree as ET
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import requests
 import torch
+import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
 from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
 
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
 
 
@@ -442,3 +444,113 @@ class IOInfo:
 
     def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
+
+
+def dump_qconfig(func):
+    def wrapper(self, *args, **kwargs):
+        result = func(self, *args, **kwargs)
+        create_and_dump_qconfigs(
+            self.qpc_path,
+            self.onnx_path,
+            self.get_model_config,
+            [cls.__name__ for cls in self._pytorch_transforms],
+            [cls.__name__ for cls in self._onnx_transforms],
+            kwargs.get("specializations"),
+            kwargs.get("mdp_ts_num_devices", 1),
+            kwargs.get("num_speculative_tokens"),
+            **{
+                k: v
+                for k, v in kwargs.items()
+                if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"]
+            },
+        )
+        return result
+
+    return wrapper
+
+
+def create_and_dump_qconfigs(
+    qpc_path,
+    onnx_path,
+    huggingface_config,
+    pytorch_transforms,
+    onnx_transforms,
+    specializations,
+    mdp_ts_num_devices,
+    num_speculative_tokens,
+    **compiler_options,
+):
+    """
+    This Method creates a JSON file which contains all the configs for a model.
+    Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and
+    many other compilation options.
+    """
+    qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
+    enable_qnn = True if "qnn_config" in compiler_options else None
+
+    qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json")
+    onnx_path = str(onnx_path)
+    specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json"))
+    compile_dir = str(os.path.dirname(qpc_path))
+    qnn_config_path = (
+        (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None
+    )
+
+    # Extract QAIC SDK Apps Version from SDK XML file
+    tree = ET.parse(Constants.SDK_APPS_XML)
+    root = tree.getroot()
+    qaic_version = root.find(".//base_version").text
+
+    # Extract QNN SDK details from YAML file if the environment variable is set
+    qnn_sdk_details = None
+    qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
+    if qnn_sdk_path:
+        qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML)
+        with open(qnn_sdk_yaml_path, "r") as file:
+            qnn_sdk_details = yaml.safe_load(file)
+
+    # Ensure all objects in the configs dictionary are JSON serializable
+    def make_serializable(obj):
+        if isinstance(obj, (int, float, str, bool, type(None))):
+            return obj
+        elif isinstance(obj, (list, tuple)):
+            return [make_serializable(item) for item in obj]
+        elif isinstance(obj, dict):
+            return {key: make_serializable(value) for key, value in obj.items()}
+        elif hasattr(obj, "__dict__"):
+            return make_serializable(vars(obj))
+        return str(obj)
+
+    qconfigs = {
+        "huggingface_config": make_serializable(huggingface_config),
+        "qpc_config": {
+            "QEff_config": {
+                "pytorch_transforms": make_serializable(pytorch_transforms),
+                "onnx_transforms": make_serializable(onnx_transforms),
+                "onnx_path": onnx_path,
+            },
+        },
+    }
+
+    aic_compiler_config = {
+        "apps_sdk_version": qaic_version,
+        "compile_dir": compile_dir,
+        "specializations_file_path": specializations_file_path,
+        "specializations": make_serializable(specializations),
+        "mdp_ts_num_devices": mdp_ts_num_devices,
+        "num_speculative_tokens": num_speculative_tokens,
+        **compiler_options,
+    }
+    qnn_config = {
+        "enable_qnn": enable_qnn,
+        "qnn_config_path": qnn_config_path,
+    }
+    # Put AIC or qnn details.
+    if enable_qnn:
+        qconfigs["qpc_config"]["qnn_config"] = qnn_config
+        if qnn_sdk_details:
+            qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details)
+    else:
+        qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
+
+    create_json(qconfig_file_path, qconfigs)
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
index 6c2bba0c6..3852adcda 100644
--- a/QEfficient/utils/constants.py
+++ b/QEfficient/utils/constants.py
@@ -75,12 +75,14 @@ class Constants:
     MAX_QPC_LIMIT = 30
     MAX_RETRIES = 5  # This constant will be used set the maximum number of retry attempts for downloading a model using huggingface_hub snapshot_download
     NUM_SPECULATIVE_TOKENS = 2
+    SDK_APPS_XML = "/opt/qti-aic/versions/apps.xml"  # This xml file is parsed to find out the SDK version.
 
 
 @dataclass
 class QnnConstants:
     # QNN PATH to be read from environment variable.
     QNN_SDK_PATH_ENV_VAR_NAME = "QNN_SDK_ROOT"
+    QNN_SDK_YAML = "sdk.yaml"
 
     # QNN Compilation tools
     QAIRT_CONVERTER = "{}/bin/{}/qairt-converter"
diff --git a/tests/peft/lora/test_lora_model.py b/tests/peft/lora/test_lora_model.py
index 4726fb8c5..69a6282fb 100644
--- a/tests/peft/lora/test_lora_model.py
+++ b/tests/peft/lora/test_lora_model.py
@@ -4,6 +4,8 @@
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
+
+import os
 from pathlib import Path
 from time import perf_counter
 
@@ -225,6 +227,7 @@ def test_auto_lora_model_for_causal_lm_noncb_export_compile_generate(
     # test compile
     qeff_model.compile(prefill_seq_len=32, ctx_len=64)
     assert Path(qeff_model.qpc_path).is_dir()
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
     # test generate
     prompts = ["hello!", "hi", "hello, my name is", "hey"]
@@ -249,6 +252,7 @@ def test_auto_lora_model_for_causal_lm_cb_compile_generate(base_model_name, adap
     # test compile
     qeff_model.compile(prefill_seq_len=32, ctx_len=64, full_batch_size=2)
     assert Path(qeff_model.qpc_path).is_dir()
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
     # test generate
     prompts = ["hello!", "hi", "hello, my name is", "hey"]
diff --git a/tests/peft/test_peft_model.py b/tests/peft/test_peft_model.py
index 6a9a957b2..c4e331a9d 100644
--- a/tests/peft/test_peft_model.py
+++ b/tests/peft/test_peft_model.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 from time import perf_counter
 
 import numpy as np
@@ -187,3 +188,4 @@ def test_auto_peft_model_for_causal_lm_compile_generate(base_config, adapter_con
     end = perf_counter()
     compile_time_1 = end - start
     assert compile_time_1 < 0.01 * compile_time_0
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
diff --git a/tests/qnn_tests/test_causal_lm_models_qnn.py b/tests/qnn_tests/test_causal_lm_models_qnn.py
index fe906fe7e..65acab157 100644
--- a/tests/qnn_tests/test_causal_lm_models_qnn.py
+++ b/tests/qnn_tests/test_causal_lm_models_qnn.py
@@ -5,6 +5,8 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
 import numpy as np
 import pytest
 from transformers import AutoModelForCausalLM
@@ -98,7 +100,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -106,6 +108,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         aic_enable_depth_first=False,
         enable_qnn=True,
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
     exec_info = qeff_model.generate(tokenizer, prompts=Constants.INPUT_STR)
     cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
     gen_len = ort_tokens.shape[-1]
@@ -136,7 +139,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -145,6 +148,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
         full_batch_size=full_batch_size,
         enable_qnn=True,
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
     exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts)
 
     assert all(
diff --git a/tests/text_generation/test_text_generation.py b/tests/text_generation/test_text_generation.py
index a1e4265ee..f7d3cd6cb 100644
--- a/tests/text_generation/test_text_generation.py
+++ b/tests/text_generation/test_text_generation.py
@@ -5,6 +5,8 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
+
 import pytest
 from transformers import AutoModelForCausalLM
 
@@ -101,3 +103,4 @@ def test_generate_text_stream(
     assert cloud_ai_100_output == stream_tokens, (
         f"Deviation in output observed while comparing regular execution and streamed output: {cloud_ai_100_output} != {stream_tokens}"
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py
index a3a855cee..418386780 100644
--- a/tests/transformers/models/test_causal_lm_models.py
+++ b/tests/transformers/models/test_causal_lm_models.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 from typing import Optional
 
 import numpy as np
@@ -127,7 +128,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -141,6 +142,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     assert (ort_tokens == cloud_ai_100_tokens[:, :gen_len]).all(), (
         "Tokens don't match for ONNXRT output and Cloud AI 100 output."
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
 
     # testing for CB models
     model_hf, _ = load_causal_lm_model(model_config)
@@ -165,7 +167,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
     if not get_available_device_id():
         pytest.skip("No available devices to run model on Cloud AI 100")
 
-    _ = qeff_model.compile(
+    qpc_path = qeff_model.compile(
         prefill_seq_len=prompt_len,
         ctx_len=ctx_len,
         num_cores=14,
@@ -182,6 +184,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(
             for pt_token, cloud_token in zip(pytorch_hf_tokens, exec_info_fbs.generated_ids)
         ]
     ), "Tokens don't match for  HF PyTorch model output and Cloud AI 100 output."
+    assert os.path.isfile(os.path.join(os.path.dirname(qpc_path), "qconfig.json"))
 
 
 # FIXME: there should be a CB test here
diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py
index 1c2d5196c..e681f5093 100644
--- a/tests/transformers/models/test_embedding_models.py
+++ b/tests/transformers/models/test_embedding_models.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 
 import numpy as np
 import onnxruntime as ort
@@ -77,6 +78,7 @@ def check_embed_pytorch_vs_ort_vs_ai100(
     mad = np.mean(np.abs(ai100_output - onnx_outputs[0]))
     print("Mad for onnx and AI 100 output is ", mad)
     assert mad <= 10**-3, f"MAD is too high for onnx and Pytorch: {mad}"
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
 @pytest.mark.on_qaic
diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py
index 8ef24403c..c787a3c96 100644
--- a/tests/transformers/models/test_prefix_caching.py
+++ b/tests/transformers/models/test_prefix_caching.py
@@ -31,6 +31,7 @@ def test_simple_prefix_caching(model_name):
         num_cores=14,
     )
     prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
 @pytest.mark.on_qaic
@@ -61,6 +62,7 @@ def test_simple_prefix_caching_qnn(model_name):
         qnn_config=qnn_config_json_path,
     )
     prefix_caching_inference(model_name=model_name, qpc_path=qeff_model.qpc_path)
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
     os.remove(qnn_config_json_path)
 
 
diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py
index af83c9354..99f715863 100644
--- a/tests/transformers/models/test_speech_seq2seq_models.py
+++ b/tests/transformers/models/test_speech_seq2seq_models.py
@@ -360,6 +360,7 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
     assert (pytorch_kv_tokens == cloud_ai_100_tokens).all(), (
         "Tokens don't match for pytorch output and Cloud AI 100 output."
     )
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
 
 
 @pytest.mark.on_qaic
diff --git a/tests/transformers/spd/test_spd_inference.py b/tests/transformers/spd/test_spd_inference.py
index a9f197ec3..205f00a00 100644
--- a/tests/transformers/spd/test_spd_inference.py
+++ b/tests/transformers/spd/test_spd_inference.py
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------
 
+import os
 from time import perf_counter
 from typing import List, Optional
 
@@ -331,3 +332,5 @@ def test_spec_decode_inference(
     ]  # Because we always run for single input and single batch size
     all_matching = np.array_equal(cloud_ai_100_tokens, generated_ids)
     assert all_matching, "Tokens don't match for SpD output and vanilla DLM output."
+    assert os.path.isfile(os.path.join(os.path.dirname(target_model_qpc_path), "qconfig.json"))
+    assert os.path.isfile(os.path.join(os.path.dirname(draft_model_qpc_path), "qconfig.json"))
diff --git a/tests/transformers/test_causal_lm.py b/tests/transformers/test_causal_lm.py
index 1ceb5a7e0..64376db62 100644
--- a/tests/transformers/test_causal_lm.py
+++ b/tests/transformers/test_causal_lm.py
@@ -6,6 +6,7 @@
 # ----------------------------------------------------------------------------
 
 import copy
+import os
 from time import perf_counter
 
 import onnx
@@ -170,3 +171,4 @@ def test_causal_lm_compile(config, cb, tmp_cache):
     end = perf_counter()
     compile_time = end - start
     assert compile_time < 2.0
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))
diff --git a/tests/transformers/test_speech_seq2seq.py b/tests/transformers/test_speech_seq2seq.py
index a41896010..15d6152e3 100644
--- a/tests/transformers/test_speech_seq2seq.py
+++ b/tests/transformers/test_speech_seq2seq.py
@@ -6,6 +6,7 @@
 # ----------------------------------------------------------------------------
 
 import copy
+import os
 from time import perf_counter
 
 import onnx
@@ -142,3 +143,4 @@ def test_causal_lm_compile(config, tmp_cache):
     end = perf_counter()
     compile_time = end - start
     assert compile_time < 2.0
+    assert os.path.isfile(os.path.join(os.path.dirname(qeff_model.qpc_path), "qconfig.json"))

From a45b5c49aa957d43698e8cfe05cb1c3ec42bd1df Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Date: Thu, 6 Mar 2025 11:56:27 +0530
Subject: [PATCH 095/138] Docs string added for the Image class and granite
 models are added in validation page (#303)

Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/modeling_auto.py      | 62 ++++++++++++++++++-
 docs/source/quick_start.md                    |  6 +-
 docs/source/validate.md                       | 10 +--
 3 files changed, 70 insertions(+), 8 deletions(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 5852740b4..07aff78ff 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1147,9 +1147,69 @@ def get_model_config(self) -> dict:
 
 class QEFFAutoModelForImageTextToText:
     """
-    A factory class for creating QEFFAutoModelForImageTextToText instances with for single and Dual QPC approach
+    The QEFFAutoModelForImageTextToText class is used to work with multimodal language models from the HuggingFace hub.
+    While you can initialize the class directly, it's best to use the ``from_pretrained`` method for this purpose. This class supports both single and dual QPC approaches.
     Attributes:
         _hf_auto_class (class): The Hugging Face AutoModel class for ImageTextToText models.
+
+    ``Mandatory`` Args:
+        :pretrained_model_name_or_path (str): Model card name from HuggingFace or local path to model directory.
+
+    ``Optional`` Args:
+        :kv_offload (bool): Flag to toggle between single and dual QPC approaches. If set to False, the Single QPC approach will be used; otherwise, the dual QPC approach will be applied. Defaults to True.
+
+    .. code-block:: python
+        import requests
+        from PIL import Image
+        from transformers import AutoProcessor, TextStreamer
+
+        from QEfficient import QEFFAutoModelForImageTextToText
+
+        # Add HuggingFace Token to access the model
+        HF_TOKEN = ""
+        model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
+        query = "Describe this image."
+        image_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg"
+
+        ## STEP - 1 Load the Processor and Model, and kv_offload=True/False for dual and single qpc
+        processor = AutoProcessor.from_pretrained(model_name, token=token)
+        model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, attn_implementation="eager", kv_offload=False)
+
+        ## STEP - 2 Export & Compile the Model
+        model.compile(
+            prefill_seq_len=32,
+            ctx_len=512,
+            img_size=560,
+            num_cores=16,
+            num_devices=1,
+            mxfp6_matmul=False,
+        )
+
+        ## STEP - 3 Load and process the inputs for Inference
+        image = Image.open(requests.get(image_url, stream=True).raw)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": query},
+                ],
+            }
+        ]
+        input_text = [processor.apply_chat_template(messages, add_generation_prompt=True)]
+        inputs = processor(
+            text=input_text,
+            images=image,
+            return_tensors="pt",
+            add_special_tokens=False,
+            padding="max_length",
+            max_length=prefill_seq_len,
+        )
+
+        ## STEP - 4 Run Inference on the compiled model
+        streamer = TextStreamer(processor.tokenizer)
+        model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
+
     """
 
     _hf_auto_class = AutoModelForImageTextToText
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
index 88093e134..2ccb013e9 100644
--- a/docs/source/quick_start.md
+++ b/docs/source/quick_start.md
@@ -239,7 +239,7 @@ Use the qualcomm_efficient_converter API to export the KV transformed Model to O
 
 generated_qpc_path = qeff_model.compile(
     num_cores=14,
-    mxfp6=True,
+    mxfp6_matmul=True,
 )
 ```
 
@@ -250,8 +250,8 @@ Benchmark the model on Cloud AI 100, run the infer API to print tokens and tok/s
 ```Python
 # post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on AI 100
 # We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach
-
-qeff_model.generate(prompts=["My name is"])
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+qeff_model.generate(prompts=["My name is"],tokenizer=tokenizer)
 ```
 End to End demo examples for various models are available in **notebooks** directory. Please check them out.
 
diff --git a/docs/source/validate.md b/docs/source/validate.md
index 49acd268d..acd4c11da 100644
--- a/docs/source/validate.md
+++ b/docs/source/validate.md
@@ -41,13 +41,15 @@
 
 | Architecture | Model Family | Representative Models          |
 |--------------|--------------|---------------------------------|
-| **BertModel** | BERT-based   | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)<br> [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)<br>[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <br>[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2)          |
+| **BertModel** | BERT-based   | [BAAI/bge-base-en-v1.5](https://huggingface.co/BAAI/bge-base-en-v1.5)<br> [BAAI/bge-large-en-v1.5](https://huggingface.co/BAAI/bge-large-en-v1.5)<br>[BAAI/bge-small-en-v1.5](https://huggingface.co/BAAI/bge-small-en-v1.5) <br>[e5-large-v2](https://huggingface.co/intfloat/e5-large-v2) |
 | **LlamaModel** | Llama-based  | [intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
-| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) |
-| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) |
 | **MPNetForMaskedLM** | MPNet | [sentence-transformers/multi-qa-mpnet-base-cos-v1](https://huggingface.co/sentence-transformers/multi-qa-mpnet-base-cos-v1) |
-| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) |
 | **MistralModel** | Mistral | [e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) |
+| **NomicBertModel** | NomicBERT | [nomic-embed-text-v1.5](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) |
+| **Qwen2ForCausalLM** | Qwen2 | [stella_en_1.5B_v5](https://huggingface.co/NovaSearch/stella_en_1.5B_v5) |
+| **RobertaModel**     | RoBERTa |  [ibm-granite/granite-embedding-30m-english](https://huggingface.co/ibm-granite/granite-embedding-30m-english)<br> [ibm-granite/granite-embedding-125m-english](https://huggingface.co/ibm-granite/granite-embedding-125m-english) |
+| **XLMRobertaForSequenceClassification** | XLM-RoBERTa | [bge-reranker-v2-m3bge-reranker-v2-m3](https://huggingface.co/BAAI/bge-reranker-v2-m3) |
+| **XLMRobertaModel**    | XLM-RoBERTa  |[ibm-granite/granite-embedding-107m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-107m-multilingual)<br> [ibm-granite/granite-embedding-278m-multilingual](https://huggingface.co/ibm-granite/granite-embedding-278m-multilingual)  |
 
 ## Multimodal Language Models
 

From a276806bbacc41dd6b8171714b0283c1eee39e99 Mon Sep 17 00:00:00 2001
From: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Date: Thu, 6 Mar 2025 15:36:34 +0530
Subject: [PATCH 096/138] [Bug-Fix :] QEFFAutoModelForCausalLM __repr__()
 Method Fixed (#307)

This is just small fixes done for printing the
`QEFFAutoModelForCausalLM`'s instance by changing the `__repr__(self)`
method.

Signed-off-by: Abukhoyer Shaik <quic_abukhoye@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/modeling_auto.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 07aff78ff..a87c39fb4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -1325,7 +1325,7 @@ def model_name(self) -> str:
         return mname
 
     def __repr__(self) -> str:
-        return self.__class__.__name__ + "\n" + self.model.__repr__
+        return self.__class__.__name__ + "\n" + self.model.__repr__()
 
     @classmethod
     @with_replaced_quantizers

From d88e12492d1690b367487ca20137609dee887558 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 00:39:03 +0530
Subject: [PATCH 097/138] added initial version of SwiftKV for AI 100

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py        |  29 ++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 411 ++++++++++++++++++
 exps/run_swiftkv.py                           |  28 ++
 3 files changed, 468 insertions(+)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
 create mode 100644 exps/run_swiftkv.py

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index a5c375c6e..fe56b197c 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -36,6 +36,35 @@ class QEffDynamicCache(DynamicCache):
 
     """
 
+    def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            position_ids = cache_kwargs.get("position_ids")
+            self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
+            self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
+
+    def read_only(self, layer_idx, cache_kwargs):
+        position_ids = cache_kwargs.get("position_ids")
+        ctx_len = position_ids.shape[-1]
+        ctx_indices = torch.arange(ctx_len)[None, None, ...]
+        gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
+        invalid_mask = ctx_indices > gather_limit
+
+        if torch.onnx.is_in_onnx_export():
+            invalid_idx_value = torch.iinfo(torch.int32).max
+        else:
+            invalid_idx_value = 0
+
+        ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
+        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
+        k_out = CtxGatherFunc.apply(k_out, ctx_indices)
+        v_out = CtxGatherFunc.apply(v_out, ctx_indices)
+        v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)
+        return k_out, v_out
+
     def update(
         self,
         key_states: torch.Tensor,
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
new file mode 100644
index 000000000..a33c83d3a
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -0,0 +1,411 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+import logging
+import math
+from typing import List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from transformers.cache_utils import Cache, StaticCache
+from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaRMSNorm, repeat_kv
+
+from QEfficient.transformers.cache_utils import QEffDynamicCache
+from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
+from QEfficient.transformers.models.llama.modeling_llama import (
+    QEffLlamaDecoderLayer,
+    QEffLlamaRotaryEmbedding,
+    qeff_apply_rotary_pos_emb,
+)
+
+logger = logging.get_logger(__name__)
+
+
+class LlamaSwiftKVAttention(LlamaAttention):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__(config, layer_idx)
+        self.hidden_size = config.hidden_size
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj_swiftkv = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj_swiftkv = nn.Linear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj_swiftkv = nn.Linear(
+            self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.rotary_emb = QEffLlamaRotaryEmbedding(config=self.config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids,
+        past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        attention_mask=None,
+    ) -> torch.Tensor:
+        bsz, q_len, _ = hidden_states.size()
+        query, _ = self.q_proj_swiftkv(hidden_states)
+
+        # Reshape the query, key, and value tensors.
+        query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = position_ids.shape[-1]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len = past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        key_states, value_states = past_key_value.read_only(self.layer_idx, position_ids=position_ids)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, _ = qeff_apply_rotary_pos_emb(query_states, torch.empty_like(key_states), cos, sin, position_ids)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=torch.float32), attn_weights)
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        # attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, past_key_value
+
+
+class LlamaSwiftKVDecoderLayer(nn.Module):
+    def __init__(self, config, layer_idx) -> None:
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
+        self.mlp = LlamaMLP(config)
+        self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self, hidden_states: torch.Tensor, position_ids: torch.Tensor, past_key_values, causal_mask
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states, past_key_values = self.self_attn(
+            hidden_states=hidden_states,
+            position_ids=position_ids,
+            past_key_value=past_key_values,
+            attention_mask=causal_mask,
+        )
+
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, past_key_values
+
+
+class LlamaSwiftKVModel(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.vocab_size = config.vocab_size
+        self.config = config
+
+        self.embed_tokens = nn.Embedding(
+            self.vocab_size, config.hidden_size, None
+        )  # TODO: Not sure if padding_idx shoudl eb NONE
+        self.layers = torch.nn.ModuleList(
+            [
+                QEffLlamaDecoderLayer(config=config, layer_idx=idx)
+                if idx < config.num_key_value_layers
+                else LlamaSwiftKVDecoderLayer(config=config, layer_idx=idx)
+                for idx in range(config.num_hidden_layers)
+            ]
+        )
+        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.norm_swiftkv = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def _run_swiftkv_layers(
+        self, hidden_states: torch.Tensor, position_ids: torch.Tensor, past_key_values, causal_mask
+    ) -> torch.Tensor:
+        for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
+            layer = self.layers[layer_idx]
+
+            hidden_states, past_key_values = layer(hidden_states, position_ids, past_key_values, causal_mask)
+
+        return hidden_states, past_key_values
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool,
+    ):
+        self.config._attn_implementation = "eager"
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_static_cache = isinstance(past_key_values, StaticCache)
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else past_seen_tokens
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0`")
+            causal_mask = attention_mask
+        else:
+            causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+            else:
+                causal_mask = _create_causal_mask(position_ids=position_ids, target_length=target_length)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor],
+        position_ids: torch.Tensor,
+        past_key_values: List[torch.Tensor],
+    ):
+        inputs_embeds = self.embed_tokens(input_ids)
+
+        # kept for BC (non `Cache` `past_key_values` inputs)
+        use_cache = True
+
+        if use_cache and not isinstance(past_key_values, Cache):
+            if past_key_values is None:
+                past_key_values = QEffDynamicCache()
+            else:
+                past_key_values = QEffDynamicCache.from_legacy_cache(past_key_values)
+                logger.warning_once(
+                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
+                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
+                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
+                )
+
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        cache_position = torch.arange(
+            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+        )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(None, inputs_embeds, cache_position, past_key_values, False)
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        next_decoder_cache = None
+
+        for layer_idx in range(self.config.num_key_value_layers):
+            layer = self.layers[layer_idx]
+            hidden_states, next_decoder_cache = layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=False,
+                use_cache=True,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+        swiftkv_hidden_states = self.norm_swiftkv(hidden_states)
+
+        ####################################
+        ## THE MAGIC OF SWIFT KV BEGINS HERE
+        ####################################
+        for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
+            self_attn = self.layers[layer_idx].self_attn
+            key_states = self_attn.k_proj_swiftkv(swiftkv_hidden_states)
+            value_states = self_attn.v_proj_swiftkv(swiftkv_hidden_states)
+            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+            kv_seq_len = key_states.shape[-2]
+            if past_key_values is not None:
+                if self.layer_idx is None:
+                    raise ValueError(
+                        f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                        "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                        "with a layer index."
+                    )
+                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
+
+            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            _, key_states = qeff_apply_rotary_pos_emb(
+                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
+            )
+            cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
+            past_key_values.write_only(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        hidden_states, next_decoder_cache = self._run_swiftkv_layers(
+            hidden_states, position_ids, past_key_values, causal_mask
+        )
+        ####################################
+        ## THE MAGIC OF SWIFT KV ENDS HERE
+        ####################################
+
+        next_cache = next_decoder_cache.to_legacy_cache()
+        return hidden_states, next_cache
+
+
+class LlamaSwiftKVForCausalLM(nn.Module):
+    """
+    # packed_modules_mapping = {
+    #     "kv_proj_swiftkv": ["k_proj_swiftkv", "v_proj_swiftkv"],
+    #     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+    #     "gate_up_proj": ["gate_proj", "up_proj"],
+    # }
+
+    # # BitandBytes specific attributes
+    # default_bitsandbytes_target_modules = [
+    #     ".gate_proj.",
+    #     ".down_proj.",
+    #     ".up_proj.",
+    #     ".q_proj.",
+    #     ".k_proj.",
+    #     ".v_proj.",
+    #     ".o_proj.",
+    #     ".k_proj_swiftkv.",
+    #     ".v_proj_swiftkv.",
+    # ]
+
+    # # in TP, these weights are partitioned along the column dimension (dim=-1)
+    # column_parallel_weights_modules = [
+    #     ".q_proj_swiftkv.",
+    #     ".down_proj.",
+    #     ".o_proj.",
+    # ]
+    # bitsandbytes_stacked_params_mapping = {
+    #     # shard_name, weight_name, index
+    #     "k_proj_swiftkv": ("kv_proj_swiftkv", 1),
+    #     "v_proj_swiftkv": ("kv_proj_swiftkv", 2),
+    #     "q_proj": ("qkv_proj", 0),
+    #     "k_proj": ("qkv_proj", 1),
+    #     "v_proj": ("qkv_proj", 2),
+    #     "gate_proj": ("gate_up_proj", 0),
+    #     "up_proj": ("gate_up_proj", 1),
+    # }
+    """
+
+    def __init__(self, *, config):
+        super().__init__()
+
+        self.model = LlamaSwiftKVModel(
+            config=config,
+        )
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        past_key_values: Optional[Union[List[torch.FloatTensor]]] = None,
+    ):
+        hidden_states, output_past_key_values = self.model(input_ids, position_ids, past_key_values)
+        logit_index = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        hidden_states = hidden_states[torch.arange(position_ids.shape[0]).view(-1, 1), logit_index]
+        logits = self.lm_head(hidden_states)
+        return logits, output_past_key_values
diff --git a/exps/run_swiftkv.py b/exps/run_swiftkv.py
new file mode 100644
index 000000000..cf180f609
--- /dev/null
+++ b/exps/run_swiftkv.py
@@ -0,0 +1,28 @@
+import json
+import os
+
+from safetensors import safe_open
+
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
+WEIGHTS = "/local/mnt/workspace/open-source/myown/efficient-transformers/cache_dir/swiftkv_model_weights"
+
+
+def load_safetensors(path):
+    state_dict = {}
+    f = safe_open(path, framework="pt", device="cpu")
+    for key in f.keys():
+        tensor = f.get_tensor(key)
+        state_dict[key] = tensor
+    return state_dict
+
+
+config = json.load(open(os.path.join(WEIGHTS, "config.json"), "r"))
+
+config.num_hidden_layers = 1
+
+model = LlamaSwiftKVForCausalLM(config=config)
+state_dict_0 = load_safetensors(os.path.join(WEIGHTS, "model-00001-of-00009.safetensors"))
+
+for k in model.state_dict().keys() - state_dict_0.keys():
+    del state_dict_0[k]

From 860ac4fb0c7abadaf181c2fd7bd308e8dcd5a477 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:36:22 +0530
Subject: [PATCH 098/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py    | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index a33c83d3a..5b5fcd77f 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -22,7 +22,6 @@
 # limitations under the License.
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-import logging
 import math
 from typing import List, Optional, Tuple, Union
 
@@ -30,7 +29,7 @@
 from torch import nn
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.models.llama.modeling_llama import LlamaAttention, LlamaMLP, LlamaRMSNorm, repeat_kv
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -40,12 +39,10 @@
     qeff_apply_rotary_pos_emb,
 )
 
-logger = logging.get_logger(__name__)
 
-
-class LlamaSwiftKVAttention(LlamaAttention):
+class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config, layer_idx) -> None:
-        super().__init__(config, layer_idx)
+        super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
         self.hidden_size = config.hidden_size
@@ -56,7 +53,7 @@ def __init__(self, config, layer_idx) -> None:
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
         self.is_causal = True
-
+        self.layer_idx = layer_idx
         self.q_proj_swiftkv = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
         self.k_proj_swiftkv = nn.Linear(
             self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias

From d0f7479b8c3877934a3e9a73e0fb48d6e0caa9ac Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:39:46 +0530
Subject: [PATCH 099/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 5b5fcd77f..2022d2c9b 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -63,7 +63,7 @@ def __init__(self, config, layer_idx) -> None:
         )
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
 
-        self.rotary_emb = QEffLlamaRotaryEmbedding(config=self.config)
+        self.rotary_emb = QEffLlamaRotaryEmbedding(config=config)
 
     def forward(
         self,

From c6448567ee03d43a7fa8f3974bcbde1b48385f3b Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 01:46:12 +0530
Subject: [PATCH 100/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py  | 1 +
 1 file changed, 1 insertion(+)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 2022d2c9b..4f22e82e0 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -394,6 +394,7 @@ def __init__(self, *, config):
         )
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.config = config
 
     def forward(
         self,

From 551110753d8305fe240c9b0971a14efb170f86bb Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:07:57 +0530
Subject: [PATCH 101/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4f22e82e0..24b88746a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -286,7 +286,9 @@ def forward(
         if position_ids is None:
             position_ids = cache_position.unsqueeze(0)
 
-        causal_mask = self._update_causal_mask(None, inputs_embeds, cache_position, past_key_values, False)
+        causal_mask = self._update_causal_mask(
+            None, inputs_embeds, cache_position, position_ids, past_key_values, False
+        )
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers

From 0089540af889b3f79a8e6d4096796ba5b46a3e3f Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:16:52 +0530
Subject: [PATCH 102/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 24b88746a..8eaef4521 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -292,7 +292,7 @@ def forward(
         hidden_states = inputs_embeds
 
         # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
         next_decoder_cache = None
 
         for layer_idx in range(self.config.num_key_value_layers):
@@ -305,7 +305,7 @@ def forward(
                 output_attentions=False,
                 use_cache=True,
                 cache_position=cache_position,
-                position_embeddings=position_embeddings,
+                position_embeddings=None,
             )
 
         bsz, q_len, _ = hidden_states.size()

From 02b48ff7316eea1151eaeb013007ce1144a1d949 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:23:24 +0530
Subject: [PATCH 103/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py   | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 8eaef4521..19887c77e 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -123,6 +123,8 @@ class LlamaSwiftKVDecoderLayer(nn.Module):
     def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
+        self.num_key_value_heads = config.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)
@@ -318,8 +320,10 @@ def forward(
             self_attn = self.layers[layer_idx].self_attn
             key_states = self_attn.k_proj_swiftkv(swiftkv_hidden_states)
             value_states = self_attn.v_proj_swiftkv(swiftkv_hidden_states)
-            key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-            value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+            key_states = key_states.view(bsz, q_len, self_attn.num_key_value_heads, self_attn.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, q_len, self_attn.num_key_value_heads, self_attn.head_dim).transpose(
+                1, 2
+            )
 
             kv_seq_len = key_states.shape[-2]
             if past_key_values is not None:
@@ -331,12 +335,12 @@ def forward(
                     )
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
 
-            cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+            cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
             _, key_states = qeff_apply_rotary_pos_emb(
                 torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
             )
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
-            past_key_values.write_only(key_states, value_states, self.layer_idx, cache_kwargs)
+            past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask

From 757f10aad4d8c931c1e466713e6d0b56501bc99e Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 12:14:45 +0530
Subject: [PATCH 104/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 19887c77e..20a91ef45 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -124,7 +124,7 @@ def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)

From 16cd02914c60fc89f4b1687c6c7588b9088ab4a0 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 12:33:39 +0530
Subject: [PATCH 105/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py  | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 20a91ef45..b4160a312 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -124,7 +124,6 @@ def __init__(self, config, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_heads)
 
         self.self_attn = LlamaSwiftKVAttention(config=config, layer_idx=layer_idx)
         self.mlp = LlamaMLP(config)

From ee36aa17d454f721fc964d8ca5771a701d912b4e Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:05:36 +0530
Subject: [PATCH 106/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py          | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index b4160a312..4d8bfb754 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -326,13 +326,13 @@ def forward(
 
             kv_seq_len = key_states.shape[-2]
             if past_key_values is not None:
-                if self.layer_idx is None:
+                if self_attn.layer_idx is None:
                     raise ValueError(
-                        f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                        f"The cache structure has changed since version v4.36. If you are using {self_attn.__class__.__name__} "
                         "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                         "with a layer index."
                     )
-                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self.layer_idx)
+                kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
             _, key_states = qeff_apply_rotary_pos_emb(

From 4203a072ae61318ffc8c76806b006d9ab724a1a1 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:08:51 +0530
Subject: [PATCH 107/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4d8bfb754..4015a6c95 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -335,9 +335,7 @@ def forward(
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
-            _, key_states = qeff_apply_rotary_pos_emb(
-                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
-            )
+            _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids)
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 

From ee2f7e1dfd08938b159c00e0fb1689d1bb333e79 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:18:16 +0530
Subject: [PATCH 108/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../transformers/models/llama_swiftkv/modeling_llama_swiftkv.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4015a6c95..8ba2ad78e 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -73,7 +73,7 @@ def forward(
         attention_mask=None,
     ) -> torch.Tensor:
         bsz, q_len, _ = hidden_states.size()
-        query, _ = self.q_proj_swiftkv(hidden_states)
+        query = self.q_proj_swiftkv(hidden_states)
 
         # Reshape the query, key, and value tensors.
         query_states = query.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)

From 80730ddea5fe45e4646c038a4d690e8becef89d2 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 19 Dec 2024 16:21:20 +0530
Subject: [PATCH 109/138] all bugfixes in

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 8ba2ad78e..d93d7cb44 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -90,7 +90,11 @@ def forward(
 
         key_states, value_states = past_key_value.read_only(self.layer_idx, position_ids=position_ids)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-        query_states, _ = qeff_apply_rotary_pos_emb(query_states, torch.empty_like(key_states), cos, sin, position_ids)
+        position_idx = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        position_ids = position_ids[:, position_idx[0]]
+        query_states, _ = qeff_apply_rotary_pos_emb(
+            query_states, torch.empty_like(query_states), cos, sin, position_ids
+        )
 
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
@@ -160,9 +164,7 @@ def __init__(self, config):
         self.vocab_size = config.vocab_size
         self.config = config
 
-        self.embed_tokens = nn.Embedding(
-            self.vocab_size, config.hidden_size, None
-        )  # TODO: Not sure if padding_idx shoudl eb NONE
+        self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, None)
         self.layers = torch.nn.ModuleList(
             [
                 QEffLlamaDecoderLayer(config=config, layer_idx=idx)
@@ -179,9 +181,9 @@ def _run_swiftkv_layers(
     ) -> torch.Tensor:
         for layer_idx in range(self.config.num_key_value_layers, self.config.num_hidden_layers):
             layer = self.layers[layer_idx]
-
             hidden_states, past_key_values = layer(hidden_states, position_ids, past_key_values, causal_mask)
 
+        hidden_states = self.norm(hidden_states)
         return hidden_states, past_key_values
 
     def _update_causal_mask(
@@ -339,15 +341,21 @@ def forward(
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
 
+        last_pos_id = position_ids.to(torch.int32).argmax(1, keepdim=True)
+        orig_hidden_states = hidden_states
+        hidden_states = orig_hidden_states[:, last_pos_id[0], :]
+        causal_mask = causal_mask[:, :, last_pos_id[0], :]
+
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask
         )
+        orig_hidden_states[:, last_pos_id[0], :] = hidden_states
         ####################################
         ## THE MAGIC OF SWIFT KV ENDS HERE
         ####################################
 
         next_cache = next_decoder_cache.to_legacy_cache()
-        return hidden_states, next_cache
+        return orig_hidden_states, next_cache
 
 
 class LlamaSwiftKVForCausalLM(nn.Module):

From 4b073732d07b34b6db6f41a4ec3b108ced4f9f2f Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 19 Dec 2024 16:21:56 +0530
Subject: [PATCH 110/138] added init file

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/models/llama_swiftkv/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/__init__.py

diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
new file mode 100644
index 000000000..e69de29bb

From e34d79aa2592ff6a2c138e5752c8c719050ed35e Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 9 Jan 2025 16:38:13 +0530
Subject: [PATCH 111/138] all changes except BQA are in with this

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index fe56b197c..2a07d9f10 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -47,8 +47,9 @@ def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
             self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
     def read_only(self, layer_idx, cache_kwargs):
+        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         position_ids = cache_kwargs.get("position_ids")
-        ctx_len = position_ids.shape[-1]
+        ctx_len = k_out.shape[2]
         ctx_indices = torch.arange(ctx_len)[None, None, ...]
         gather_limit = position_ids.max(1, keepdim=True).values.unsqueeze(1)
         invalid_mask = ctx_indices > gather_limit
@@ -59,7 +60,7 @@ def read_only(self, layer_idx, cache_kwargs):
             invalid_idx_value = 0
 
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
-        k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
+
         k_out = CtxGatherFunc.apply(k_out, ctx_indices)
         v_out = CtxGatherFunc.apply(v_out, ctx_indices)
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)

From ed909a9c5077dff041054412d2b49bea5e474fdf Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Wed, 5 Feb 2025 09:20:06 +0530
Subject: [PATCH 112/138] more updates

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/__init__.py          |  6 ++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 68 +++----------------
 2 files changed, 14 insertions(+), 60 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
index e69de29bb..d259e435a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/__init__.py
+++ b/QEfficient/transformers/models/llama_swiftkv/__init__.py
@@ -0,0 +1,6 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index d93d7cb44..365f0b6d2 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,25 +1,13 @@
-# coding=utf-8
-# Adapted from
-# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
-# Copyright 2023 The vLLM team.
-# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+# -----------------------------------------------------------------------------
 #
-# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
-# and OPT implementations in this library. It has been modified from its
-# original forms to accommodate minor architectural differences compared
-# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# -----------------------------------------------------------------------------
+# This file is adapted from vllm implementation by snowflake here: https://github.com/Snowflake-Labs/vllm/blob/swiftkv/vllm/model_executor/models/llama_swiftkv.py
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
 import math
@@ -294,8 +282,6 @@ def forward(
         )
         hidden_states = inputs_embeds
 
-        # create position embeddings to be shared across the decoder layers
-        # position_embeddings = self.rotary_emb(hidden_states, position_ids)
         next_decoder_cache = None
 
         for layer_idx in range(self.config.num_key_value_layers):
@@ -359,44 +345,6 @@ def forward(
 
 
 class LlamaSwiftKVForCausalLM(nn.Module):
-    """
-    # packed_modules_mapping = {
-    #     "kv_proj_swiftkv": ["k_proj_swiftkv", "v_proj_swiftkv"],
-    #     "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-    #     "gate_up_proj": ["gate_proj", "up_proj"],
-    # }
-
-    # # BitandBytes specific attributes
-    # default_bitsandbytes_target_modules = [
-    #     ".gate_proj.",
-    #     ".down_proj.",
-    #     ".up_proj.",
-    #     ".q_proj.",
-    #     ".k_proj.",
-    #     ".v_proj.",
-    #     ".o_proj.",
-    #     ".k_proj_swiftkv.",
-    #     ".v_proj_swiftkv.",
-    # ]
-
-    # # in TP, these weights are partitioned along the column dimension (dim=-1)
-    # column_parallel_weights_modules = [
-    #     ".q_proj_swiftkv.",
-    #     ".down_proj.",
-    #     ".o_proj.",
-    # ]
-    # bitsandbytes_stacked_params_mapping = {
-    #     # shard_name, weight_name, index
-    #     "k_proj_swiftkv": ("kv_proj_swiftkv", 1),
-    #     "v_proj_swiftkv": ("kv_proj_swiftkv", 2),
-    #     "q_proj": ("qkv_proj", 0),
-    #     "k_proj": ("qkv_proj", 1),
-    #     "v_proj": ("qkv_proj", 2),
-    #     "gate_proj": ("gate_up_proj", 0),
-    #     "up_proj": ("gate_up_proj", 1),
-    # }
-    """
-
     def __init__(self, *, config):
         super().__init__()
 

From 4e4300d85d828dac9554ecca4bf3159c417763e8 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 06:17:43 +0000
Subject: [PATCH 113/138] Enabling the SwiftKV model in the QEFF Infra

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py     | 19 ++++++++
 .../llama_swiftkv/config_llama_swiftkv.py     | 45 +++++++++++++++++++
 .../llama_swiftkv/modeling_llama_swiftkv.py   | 17 ++++---
 .../transformers/models/modeling_auto.py      |  6 +++
 QEfficient/utils/_utils.py                    |  2 +-
 5 files changed, 82 insertions(+), 7 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index ccad5e020..aec82e8cd 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -153,6 +153,9 @@
     QEffWhisperPositionalEmbedding,
 )
 
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -362,3 +365,19 @@ def _create_causal_mask(
         attention_mask = attention_mask.unsqueeze(1)
 
     return attention_mask
+
+
+# Define a SwiftKV Model card name to Model type dictionary
+# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
+SwiftKVModelCardNameToSwiftKVModelTypeDict: Dict[Type[str], Type[str]] = {
+    # LlamaSwiftKV Model
+    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct": "llama_swiftkv"
+}
+
+# Define a SwiftKV Model type to ConfigClass and ModelArchitecture class dictionary
+# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
+SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
+    # LlamaSwiftKV Model
+    "llama_swiftkv" : [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+}
+
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
new file mode 100644
index 000000000..fa97388de
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -0,0 +1,45 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+
+
+from typing import Optional
+from transformers import LlamaConfig
+
+
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (
+            self.num_hidden_layers - self.num_key_value_layers
+        ) % self.key_value_group_size == 0
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 365f0b6d2..e2bd5a08a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -18,6 +18,7 @@
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
+from transformers.modeling_utils import PreTrainedModel
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -26,10 +27,10 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
 class LlamaSwiftKVAttention(nn.Module):
-    def __init__(self, config, layer_idx) -> None:
+    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.attention_dropout = config.attention_dropout
@@ -112,7 +113,7 @@ def forward(
 
 
 class LlamaSwiftKVDecoderLayer(nn.Module):
-    def __init__(self, config, layer_idx) -> None:
+    def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_key_value_heads = config.num_key_value_heads
@@ -147,7 +148,9 @@ def forward(
 
 
 class LlamaSwiftKVModel(nn.Module):
-    def __init__(self, config):
+    config_class = LlamaSwiftKVConfig
+
+    def __init__(self, config: LlamaSwiftKVConfig):
         super().__init__()
         self.vocab_size = config.vocab_size
         self.config = config
@@ -344,8 +347,10 @@ def forward(
         return orig_hidden_states, next_cache
 
 
-class LlamaSwiftKVForCausalLM(nn.Module):
-    def __init__(self, *, config):
+class LlamaSwiftKVForCausalLM(PreTrainedModel):
+    config_class = LlamaSwiftKVConfig
+
+    def __init__(self, *, config: LlamaSwiftKVConfig):
         super().__init__()
 
         self.model = LlamaSwiftKVModel(
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index a87c39fb4..9d8074a97 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,6 +7,7 @@
 
 import hashlib
 import warnings
+
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -51,6 +52,7 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
+from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -76,6 +78,10 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
+
+        # Load the SwiftKV model if supported
+        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
+
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index ea9044e2c..e3724b90f 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -21,7 +21,7 @@
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
-
+from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
 
 class DownloadRetryLimitExceeded(Exception):
     """

From 0684de33bcd126d9a0cac49d54a988363f8be1dc Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 27 Feb 2025 15:16:14 +0530
Subject: [PATCH 114/138] rebased

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py                   | 3 +--
 .../models/llama_swiftkv/config_llama_swiftkv.py            | 6 +-----
 .../models/llama_swiftkv/modeling_llama_swiftkv.py          | 1 +
 QEfficient/transformers/models/modeling_auto.py             | 1 -
 QEfficient/utils/_utils.py                                  | 2 +-
 5 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index aec82e8cd..42244e288 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -378,6 +378,5 @@ def _create_causal_mask(
 # While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
 SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
     # LlamaSwiftKV Model
-    "llama_swiftkv" : [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
 }
-
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
index fa97388de..77eeb61a3 100644
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -9,8 +9,6 @@
 
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-
-
 from typing import Optional
 from transformers import LlamaConfig
 
@@ -40,6 +38,4 @@ def __init__(
         self.swiftkv = swiftkv
         self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
         self.key_value_group_size = key_value_group_size or 1
-        assert (
-            self.num_hidden_layers - self.num_key_value_layers
-        ) % self.key_value_group_size == 0
\ No newline at end of file
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index e2bd5a08a..4d6888bc7 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -29,6 +29,7 @@
 )
 from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
+
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 9d8074a97..18006c6dc 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -78,7 +78,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         # Load the SwiftKV model if supported
         QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index e3724b90f..ea9044e2c 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -21,7 +21,7 @@
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
-from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
+
 
 class DownloadRetryLimitExceeded(Exception):
     """

From 9fa21da9e3026f78dbe00bba3870788451d1b2a4 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 18:45:54 +0000
Subject: [PATCH 115/138] moving registration of non transformer models during
 initialization of QEfficient

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |  18 +-
 QEfficient/transformers/modeling_utils.py     |  76 ++++++--
 .../models/llama_swiftkv/__init__.py          |   2 +-
 .../llama_swiftkv/config_llama_swiftkv.py     |  41 -----
 .../llama_swiftkv/modeling_llama_swiftkv.py   |  30 ++-
 .../transformers/models/modeling_auto.py      |   3 -
 QEfficient/utils/_utils.py                    | 172 +++++++-----------
 7 files changed, 174 insertions(+), 168 deletions(-)
 delete mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 47c462979..95f690b9c 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
@@ -13,7 +13,23 @@
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
 from QEfficient.utils.logging_utils import logger
+from transformers import AutoConfig
+from QEfficient.transformers.modeling_utils import (
+    get_model_class_type_from_model_type,
+    get_auto_model_class,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+)
 
+# loop over all the models which are not present in transformers and register them
+for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
+    # Register the config class based on model type
+    AutoConfig.register(key, value[0])
+
+    model_class_type = get_model_class_type_from_model_type(key)
+    AutoModelClassName = get_auto_model_class(model_class_type, value[1])
+
+    # Register the non transformer library Class and config class using AutoModelClass
+    AutoModelClassName.register(value[0], value[1])
 
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 42244e288..9619cb816 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -7,6 +7,7 @@
 
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
+import sys
 
 import torch
 import torch.nn as nn
@@ -86,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 from .models.codegen.modeling_codegen import (
@@ -153,8 +155,11 @@
     QEffWhisperPositionalEmbedding,
 )
 
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVForCausalLM,
+    LlamaSwiftKVConfig
+)
 
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
@@ -274,6 +279,19 @@
     WhisperForConditionalGeneration: QEffWhisperForConditionalGeneration,
 }
 
+# Map of model type to config class and Model architecture class
+# While onboarding new models make sure to add the new model card names to this dictionary.
+MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
+    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
+}
+
+# list of sub-strings representing the model type, this is typically taken from llama-swiftkv
+LIST_OF_MODEL_TYPES = {"swiftkv"}
+
+# list of sub-strings used for representing the model Architecture class name, for example LlamaSwiftKVForCausalLM
+MODEL_TYPE_TO_MODEL_CLASS_TYPE = {
+    "swiftkv": "SwiftKVFor"
+}
 
 def _prepare_cross_attention_mask(
     cross_attention_mask: torch.Tensor,
@@ -366,17 +384,47 @@ def _create_causal_mask(
 
     return attention_mask
 
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
 
-# Define a SwiftKV Model card name to Model type dictionary
-# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
-SwiftKVModelCardNameToSwiftKVModelTypeDict: Dict[Type[str], Type[str]] = {
-    # LlamaSwiftKV Model
-    "Snowflake/Llama-3.1-SwiftKV-8B-Instruct": "llama_swiftkv"
-}
 
-# Define a SwiftKV Model type to ConfigClass and ModelArchitecture class dictionary
-# While onboarding new models make sure to add the new SwiftKV model card names to this dictionary.
-SwiftKVModelTypeToConfigClassAndModelArchClassDict = {
-    # LlamaSwiftKV Model
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
-}
+def get_auto_model_class(model_type, NonTransformerModelCls):
+    """
+    Register the Non Transformer Models like swiftkv
+    ---------------------------------------
+    : model_type: str: name of the Non Transformer model for example llama_swiftkv
+    : NonTransformerModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
+    """
+
+    # Construct the AutoModel class name using NonTransformerModel class e.g. SwiftKVModel Class name, this code is written to make things generic
+    nonTransformerModelClsName = NonTransformerModelCls.__name__
+    start_index = nonTransformerModelClsName.find(model_type)
+
+    # Calculate the index after model_type example "SwiftKVFor"
+    substring_start = start_index + len(model_type)
+
+    # Get the substring after model_type example "SwiftKVFor"
+    nonTransformerModel = nonTransformerModelClsName[substring_start:]
+
+    autoModelName = "AutoModelFor" + nonTransformerModel
+
+    # Convert the string to class name
+    autoModelClassName = convert_str_to_class(autoModelName)
+
+    return autoModelClassName
+
+def get_model_class_type_from_model_type(model_type):
+    for substring in LIST_OF_MODEL_TYPES:
+        if (substring in model_type):
+            model_class_type = substring
+            break
+
+    model_class_name = MODEL_TYPE_TO_MODEL_CLASS_TYPE[model_class_type]
+    return model_class_name
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/__init__.py b/QEfficient/transformers/models/llama_swiftkv/__init__.py
index d259e435a..72ba36c8a 100644
--- a/QEfficient/transformers/models/llama_swiftkv/__init__.py
+++ b/QEfficient/transformers/models/llama_swiftkv/__init__.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
deleted file mode 100644
index 77eeb61a3..000000000
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-# The Modules are updated as required by Cloud AI 100 HW requirements.
-
-
-"""Inference-only LLaMA model compatible with HuggingFace weights."""
-
-from typing import Optional
-from transformers import LlamaConfig
-
-
-class LlamaSwiftKVConfig(LlamaConfig):
-    """
-    Args:
-        num_key_value_layers (int, optional):
-            The number of layers, from the first layer, that have keys and
-            values. If None, all layers have keys and values.
-        last_key_value_heads (int, optional):
-            The number of heads in the last layer that have keys and values.
-            If None, the number of heads in the last key-value layer is equal
-            to the number of heads in all the other key-value layers.
-    """
-
-    model_type = "llama_swiftkv"
-
-    def __init__(
-        self,
-        swiftkv: bool = False,
-        num_key_value_layers: Optional[int] = None,
-        key_value_group_size: Optional[int] = None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.swiftkv = swiftkv
-        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
-        self.key_value_group_size = key_value_group_size or 1
-        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 4d6888bc7..7d5c45a7d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,6 +1,6 @@
 # -----------------------------------------------------------------------------
 #
-# Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved.
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
 # SPDX-License-Identifier: BSD-3-Clause
 #
 # -----------------------------------------------------------------------------
@@ -19,6 +19,7 @@
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
+from transformers import LlamaConfig
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -27,8 +28,33 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 18006c6dc..9d7d48293 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -52,7 +52,6 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
-from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -78,8 +77,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-        # Load the SwiftKV model if supported
-        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index ea9044e2c..281c9f89b 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,7 +8,8 @@
 import json
 import os
 import subprocess
-import xml.etree.ElementTree as ET
+import sys
+import warnings
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -17,12 +18,21 @@
 import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
+
+from QEfficient.transformers.modeling_utils import (
+    SwiftKVModelCardNameToSwiftKVModelTypeDict,
+    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
+)
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
 from QEfficient.utils.logging_utils import logger
 
-
 class DownloadRetryLimitExceeded(Exception):
     """
     Used for raising error when hf_download fails to download the model after given max_retries.
@@ -446,111 +456,61 @@ def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
 
 
-def dump_qconfig(func):
-    def wrapper(self, *args, **kwargs):
-        result = func(self, *args, **kwargs)
-        create_and_dump_qconfigs(
-            self.qpc_path,
-            self.onnx_path,
-            self.get_model_config,
-            [cls.__name__ for cls in self._pytorch_transforms],
-            [cls.__name__ for cls in self._onnx_transforms],
-            kwargs.get("specializations"),
-            kwargs.get("mdp_ts_num_devices", 1),
-            kwargs.get("num_speculative_tokens"),
-            **{
-                k: v
-                for k, v in kwargs.items()
-                if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"]
-            },
-        )
-        return result
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
 
-    return wrapper
 
+def register_swiftKV_model(model_type, SwiftkvConfigCls, SwiftKVModelCls):
+    """
+    Register the SwiftKV Models
+    ---------------------------------------
+    : model_type: str: name of the swiftKVModel for example llama_swiftkv
+    : SwiftkVConfigCls: SwiftKV Config class for example LlamaSwiftKVConfig
+    : SwiftKVModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
+    """
+
+    # Register the SwiftKV Config class using AutoConfig
+    AutoConfig.register(model_type, SwiftkvConfigCls)
 
-def create_and_dump_qconfigs(
-    qpc_path,
-    onnx_path,
-    huggingface_config,
-    pytorch_transforms,
-    onnx_transforms,
-    specializations,
-    mdp_ts_num_devices,
-    num_speculative_tokens,
-    **compiler_options,
-):
+    # Construct the AutoModel class name using SwiftKVModel Class name, this code is written to make things generic
+    swiftKvModelName = SwiftKVModelCls.__name__
+    start_index = swiftKvModelName.find("SwiftKVFor")
+
+    # Calculate the index after "SwiftKVFor"
+    substring_start = start_index + len("SwiftKVFor")
+
+    # Get the substring after "SwiftKVFor"
+    swiftKVModel = swiftKvModelName[substring_start:]
+
+    AutoModelName = "AutoModelFor" + swiftKVModel
+
+    # Convert the string to class name
+    AutoModelClassName = convert_str_to_class(AutoModelName)
+
+    # Register the SwiftKVModel Class and config class using AutoModelClass
+    AutoModelClassName.register(SwiftkvConfigCls, SwiftKVModelCls)
+
+
+def QEFFLoadSwiftKVModels(pretrained_model_name_or_path):
     """
-    This Method creates a JSON file which contains all the configs for a model.
-    Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and
-    many other compilation options.
+    Load the SwiftKV Models
+    ---------------------------------------
+    : pretrained_model_name_or_path: str: name of the swiftKVModel for example Snowflake/Llama-3.1-SwiftKV-8B-Instruct
     """
-    qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
-    enable_qnn = True if "qnn_config" in compiler_options else None
-
-    qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json")
-    onnx_path = str(onnx_path)
-    specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json"))
-    compile_dir = str(os.path.dirname(qpc_path))
-    qnn_config_path = (
-        (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None
-    )
+    try:
+        modelType = SwiftKVModelCardNameToSwiftKVModelTypeDict[pretrained_model_name_or_path]
 
-    # Extract QAIC SDK Apps Version from SDK XML file
-    tree = ET.parse(Constants.SDK_APPS_XML)
-    root = tree.getroot()
-    qaic_version = root.find(".//base_version").text
-
-    # Extract QNN SDK details from YAML file if the environment variable is set
-    qnn_sdk_details = None
-    qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
-    if qnn_sdk_path:
-        qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML)
-        with open(qnn_sdk_yaml_path, "r") as file:
-            qnn_sdk_details = yaml.safe_load(file)
-
-    # Ensure all objects in the configs dictionary are JSON serializable
-    def make_serializable(obj):
-        if isinstance(obj, (int, float, str, bool, type(None))):
-            return obj
-        elif isinstance(obj, (list, tuple)):
-            return [make_serializable(item) for item in obj]
-        elif isinstance(obj, dict):
-            return {key: make_serializable(value) for key, value in obj.items()}
-        elif hasattr(obj, "__dict__"):
-            return make_serializable(vars(obj))
-        return str(obj)
-
-    qconfigs = {
-        "huggingface_config": make_serializable(huggingface_config),
-        "qpc_config": {
-            "QEff_config": {
-                "pytorch_transforms": make_serializable(pytorch_transforms),
-                "onnx_transforms": make_serializable(onnx_transforms),
-                "onnx_path": onnx_path,
-            },
-        },
-    }
-
-    aic_compiler_config = {
-        "apps_sdk_version": qaic_version,
-        "compile_dir": compile_dir,
-        "specializations_file_path": specializations_file_path,
-        "specializations": make_serializable(specializations),
-        "mdp_ts_num_devices": mdp_ts_num_devices,
-        "num_speculative_tokens": num_speculative_tokens,
-        **compiler_options,
-    }
-    qnn_config = {
-        "enable_qnn": enable_qnn,
-        "qnn_config_path": qnn_config_path,
-    }
-    # Put AIC or qnn details.
-    if enable_qnn:
-        qconfigs["qpc_config"]["qnn_config"] = qnn_config
-        if qnn_sdk_details:
-            qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details)
-    else:
-        qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
+        SwiftKVConfigCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][0]
+        SwiftKVModelArchCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][1]
+
+        register_swiftKV_model(modelType, SwiftKVConfigCls, SwiftKVModelArchCls)
 
-    create_json(qconfig_file_path, qconfigs)
+    except KeyError:
+        warnings.warn("Requested SwiftKVModel is currently not supported... stay tuned for future releases", Warning)

From cb3b0ba819789cd423c328f5c356e8168fcd9f51 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Tue, 4 Mar 2025 05:18:06 +0000
Subject: [PATCH 116/138] fixed lint warnings

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |   7 +-
 QEfficient/transformers/modeling_utils.py     |  30 ++-
 .../llama_swiftkv/modeling_llama_swiftkv.py   |   6 +-
 .../transformers/models/modeling_auto.py      |   2 -
 QEfficient/utils/_utils.py                    | 172 +++++++++++-------
 5 files changed, 129 insertions(+), 88 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 95f690b9c..cad29d450 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -14,11 +14,13 @@
 
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
+
 from QEfficient.transformers.modeling_utils import (
-    get_model_class_type_from_model_type,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
-    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+    get_model_class_type_from_model_type,
 )
+from QEfficient.utils.logging_utils import logger
 
 # loop over all the models which are not present in transformers and register them
 for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
@@ -31,6 +33,7 @@
     # Register the non transformer library Class and config class using AutoModelClass
     AutoModelClassName.register(value[0], value[1])
 
+
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
     try:
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 9619cb816..a3c69b1ed 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,9 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
+import sys
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
-import sys
 
 import torch
 import torch.nn as nn
@@ -87,9 +87,14 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVConfig,
+    LlamaSwiftKVForCausalLM,
+)
+
 from .models.codegen.modeling_codegen import (
     QEffCodeGenAttention,
     QeffCodeGenBlock,
@@ -155,12 +160,6 @@
     QEffWhisperPositionalEmbedding,
 )
 
-# Placeholder for all non-transformer models
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
-    LlamaSwiftKVForCausalLM,
-    LlamaSwiftKVConfig
-)
-
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -281,17 +280,14 @@
 
 # Map of model type to config class and Model architecture class
 # While onboarding new models make sure to add the new model card names to this dictionary.
-MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {
-    "llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]
-}
+MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {"llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]}
 
 # list of sub-strings representing the model type, this is typically taken from llama-swiftkv
 LIST_OF_MODEL_TYPES = {"swiftkv"}
 
 # list of sub-strings used for representing the model Architecture class name, for example LlamaSwiftKVForCausalLM
-MODEL_TYPE_TO_MODEL_CLASS_TYPE = {
-    "swiftkv": "SwiftKVFor"
-}
+MODEL_TYPE_TO_MODEL_CLASS_TYPE = {"swiftkv": "SwiftKVFor"}
+
 
 def _prepare_cross_attention_mask(
     cross_attention_mask: torch.Tensor,
@@ -384,6 +380,7 @@ def _create_causal_mask(
 
     return attention_mask
 
+
 def convert_str_to_class(className):
     """
     Convert the string to class name
@@ -420,11 +417,12 @@ def get_auto_model_class(model_type, NonTransformerModelCls):
 
     return autoModelClassName
 
+
 def get_model_class_type_from_model_type(model_type):
     for substring in LIST_OF_MODEL_TYPES:
-        if (substring in model_type):
+        if substring in model_type:
             model_class_type = substring
             break
 
     model_class_name = MODEL_TYPE_TO_MODEL_CLASS_TYPE[model_class_type]
-    return model_class_name
\ No newline at end of file
+    return model_class_name
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 7d5c45a7d..f1ec2634d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -15,11 +15,11 @@
 
 import torch
 from torch import nn
+from transformers import LlamaConfig
 from transformers.cache_utils import Cache, StaticCache
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
-from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
-from transformers import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -29,6 +29,7 @@
     qeff_apply_rotary_pos_emb,
 )
 
+
 class LlamaSwiftKVConfig(LlamaConfig):
     """
     Args:
@@ -56,6 +57,7 @@ def __init__(
         self.key_value_group_size = key_value_group_size or 1
         assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
 
+
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
         super().__init__()
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 9d7d48293..a87c39fb4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,7 +7,6 @@
 
 import hashlib
 import warnings
-
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -77,7 +76,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 281c9f89b..ea9044e2c 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,8 +8,7 @@
 import json
 import os
 import subprocess
-import sys
-import warnings
+import xml.etree.ElementTree as ET
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -18,21 +17,12 @@
 import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import (
-    AutoConfig,
-    AutoProcessor,
-    AutoTokenizer,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-)
-
-from QEfficient.transformers.modeling_utils import (
-    SwiftKVModelCardNameToSwiftKVModelTypeDict,
-    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
-)
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
 
+
 class DownloadRetryLimitExceeded(Exception):
     """
     Used for raising error when hf_download fails to download the model after given max_retries.
@@ -456,61 +446,111 @@ def __repr__(self):
         return f"input_name:{self.name}\tdatatype:{self.datatype}\tshape:{self.shape}"
 
 
-def convert_str_to_class(className):
-    """
-    Convert the string to class name
-    ---------
-    :className: `str`- Class name string.
-    Return:
-        Class Name
-    """
-    return getattr(sys.modules[__name__], className)
-
-
-def register_swiftKV_model(model_type, SwiftkvConfigCls, SwiftKVModelCls):
-    """
-    Register the SwiftKV Models
-    ---------------------------------------
-    : model_type: str: name of the swiftKVModel for example llama_swiftkv
-    : SwiftkVConfigCls: SwiftKV Config class for example LlamaSwiftKVConfig
-    : SwiftKVModelCls: SwiftKV model class name for example LlamaSwiftKVForCausalLM
-    """
-
-    # Register the SwiftKV Config class using AutoConfig
-    AutoConfig.register(model_type, SwiftkvConfigCls)
-
-    # Construct the AutoModel class name using SwiftKVModel Class name, this code is written to make things generic
-    swiftKvModelName = SwiftKVModelCls.__name__
-    start_index = swiftKvModelName.find("SwiftKVFor")
-
-    # Calculate the index after "SwiftKVFor"
-    substring_start = start_index + len("SwiftKVFor")
-
-    # Get the substring after "SwiftKVFor"
-    swiftKVModel = swiftKvModelName[substring_start:]
-
-    AutoModelName = "AutoModelFor" + swiftKVModel
-
-    # Convert the string to class name
-    AutoModelClassName = convert_str_to_class(AutoModelName)
+def dump_qconfig(func):
+    def wrapper(self, *args, **kwargs):
+        result = func(self, *args, **kwargs)
+        create_and_dump_qconfigs(
+            self.qpc_path,
+            self.onnx_path,
+            self.get_model_config,
+            [cls.__name__ for cls in self._pytorch_transforms],
+            [cls.__name__ for cls in self._onnx_transforms],
+            kwargs.get("specializations"),
+            kwargs.get("mdp_ts_num_devices", 1),
+            kwargs.get("num_speculative_tokens"),
+            **{
+                k: v
+                for k, v in kwargs.items()
+                if k not in ["specializations", "mdp_ts_num_devices", "num_speculative_tokens", "custom_io"]
+            },
+        )
+        return result
 
-    # Register the SwiftKVModel Class and config class using AutoModelClass
-    AutoModelClassName.register(SwiftkvConfigCls, SwiftKVModelCls)
+    return wrapper
 
 
-def QEFFLoadSwiftKVModels(pretrained_model_name_or_path):
+def create_and_dump_qconfigs(
+    qpc_path,
+    onnx_path,
+    huggingface_config,
+    pytorch_transforms,
+    onnx_transforms,
+    specializations,
+    mdp_ts_num_devices,
+    num_speculative_tokens,
+    **compiler_options,
+):
     """
-    Load the SwiftKV Models
-    ---------------------------------------
-    : pretrained_model_name_or_path: str: name of the swiftKVModel for example Snowflake/Llama-3.1-SwiftKV-8B-Instruct
+    This Method creates a JSON file which contains all the configs for a model.
+    Such as huggingface configs, QEff transforms, QAIC sdk version, QNN sdk, compilation dir, qpc dir and
+    many other compilation options.
     """
-    try:
-        modelType = SwiftKVModelCardNameToSwiftKVModelTypeDict[pretrained_model_name_or_path]
-
-        SwiftKVConfigCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][0]
-        SwiftKVModelArchCls = SwiftKVModelTypeToConfigClassAndModelArchClassDict[modelType][1]
+    qnn_config = compiler_options["qnn_config"] if "qnn_config" in compiler_options else None
+    enable_qnn = True if "qnn_config" in compiler_options else None
+
+    qconfig_file_path = os.path.join(os.path.dirname(qpc_path), "qconfig.json")
+    onnx_path = str(onnx_path)
+    specializations_file_path = str(os.path.join(os.path.dirname(qpc_path), "specializations.json"))
+    compile_dir = str(os.path.dirname(qpc_path))
+    qnn_config_path = (
+        (qnn_config if qnn_config is not None else "QEfficient/compile/qnn_config.json") if enable_qnn else None
+    )
 
-        register_swiftKV_model(modelType, SwiftKVConfigCls, SwiftKVModelArchCls)
+    # Extract QAIC SDK Apps Version from SDK XML file
+    tree = ET.parse(Constants.SDK_APPS_XML)
+    root = tree.getroot()
+    qaic_version = root.find(".//base_version").text
+
+    # Extract QNN SDK details from YAML file if the environment variable is set
+    qnn_sdk_details = None
+    qnn_sdk_path = os.getenv(QnnConstants.QNN_SDK_PATH_ENV_VAR_NAME)
+    if qnn_sdk_path:
+        qnn_sdk_yaml_path = os.path.join(qnn_sdk_path, QnnConstants.QNN_SDK_YAML)
+        with open(qnn_sdk_yaml_path, "r") as file:
+            qnn_sdk_details = yaml.safe_load(file)
+
+    # Ensure all objects in the configs dictionary are JSON serializable
+    def make_serializable(obj):
+        if isinstance(obj, (int, float, str, bool, type(None))):
+            return obj
+        elif isinstance(obj, (list, tuple)):
+            return [make_serializable(item) for item in obj]
+        elif isinstance(obj, dict):
+            return {key: make_serializable(value) for key, value in obj.items()}
+        elif hasattr(obj, "__dict__"):
+            return make_serializable(vars(obj))
+        return str(obj)
+
+    qconfigs = {
+        "huggingface_config": make_serializable(huggingface_config),
+        "qpc_config": {
+            "QEff_config": {
+                "pytorch_transforms": make_serializable(pytorch_transforms),
+                "onnx_transforms": make_serializable(onnx_transforms),
+                "onnx_path": onnx_path,
+            },
+        },
+    }
+
+    aic_compiler_config = {
+        "apps_sdk_version": qaic_version,
+        "compile_dir": compile_dir,
+        "specializations_file_path": specializations_file_path,
+        "specializations": make_serializable(specializations),
+        "mdp_ts_num_devices": mdp_ts_num_devices,
+        "num_speculative_tokens": num_speculative_tokens,
+        **compiler_options,
+    }
+    qnn_config = {
+        "enable_qnn": enable_qnn,
+        "qnn_config_path": qnn_config_path,
+    }
+    # Put AIC or qnn details.
+    if enable_qnn:
+        qconfigs["qpc_config"]["qnn_config"] = qnn_config
+        if qnn_sdk_details:
+            qconfigs["qpc_config"]["qnn_config"].update(qnn_sdk_details)
+    else:
+        qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
 
-    except KeyError:
-        warnings.warn("Requested SwiftKVModel is currently not supported... stay tuned for future releases", Warning)
+    create_json(qconfig_file_path, qconfigs)

From 5d9d1e5755b01d1a63597e957b5ead65ab2bb0f1 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:59:48 +0530
Subject: [PATCH 117/138] enabling faster downloads via hf_transfer (#282)

hf hub doc:
https://huggingface.co/docs/huggingface_hub/en/guides/download
details on hf_transfer
https://github.com/[huggingface/hf_transfer](https://github.com/huggingface/hf_transfer)

---------

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index cad29d450..cf622f2cd 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -14,13 +14,11 @@
 
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
-
 from QEfficient.transformers.modeling_utils import (
     MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
-from QEfficient.utils.logging_utils import logger
 
 # loop over all the models which are not present in transformers and register them
 for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():

From 2ca93606da57e66533b52f02af08e6b50ccf3557 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:36:53 +0000
Subject: [PATCH 118/138] Fixed the compilation errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py                        | 2 +-
 QEfficient/transformers/modeling_utils.py                     | 1 +
 .../models/llama_swiftkv/modeling_llama_swiftkv.py            | 4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index 2a07d9f10..765a12f9d 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -46,7 +46,7 @@ def write_only(self, key_states, value_states, layer_idx, cache_kwargs):
             self.key_cache[layer_idx] = CtxScatterFunc.apply(self.key_cache[layer_idx], position_ids, key_states)
             self.value_cache[layer_idx] = CtxScatterFunc.apply(self.value_cache[layer_idx], position_ids, value_states)
 
-    def read_only(self, layer_idx, cache_kwargs):
+    def read_only(self, layer_idx, **cache_kwargs):
         k_out, v_out = self.key_cache[layer_idx], self.value_cache[layer_idx]
         position_ids = cache_kwargs.get("position_ids")
         ctx_len = k_out.shape[2]
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index a3c69b1ed..db0b86c2a 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index f1ec2634d..26931fced 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -379,8 +379,8 @@ def forward(
 class LlamaSwiftKVForCausalLM(PreTrainedModel):
     config_class = LlamaSwiftKVConfig
 
-    def __init__(self, *, config: LlamaSwiftKVConfig):
-        super().__init__()
+    def __init__(self, config: LlamaSwiftKVConfig):
+        super().__init__(config=config)
 
         self.model = LlamaSwiftKVModel(
             config=config,

From 0f5cfcf86804d1fdb27ee47166bd1a794bf1b81b Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:39:38 +0000
Subject: [PATCH 119/138] Fixed the lint error

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 -
 QEfficient/transformers/modeling_utils.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index cf622f2cd..e60362c34 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,7 +6,6 @@
 # -----------------------------------------------------------------------------
 
 import os
-
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index db0b86c2a..a3c69b1ed 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,7 +87,6 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models

From c1f8a6babdb3da5125dfdf6c925c81b0ad57be07 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:44:12 +0000
Subject: [PATCH 120/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 +
 QEfficient/transformers/modeling_utils.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index e60362c34..cf622f2cd 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import os
+
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index a3c69b1ed..54348c860 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -11,6 +11,8 @@
 
 import torch
 import torch.nn as nn
+import importlib
+
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,
@@ -389,7 +391,8 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    return getattr(sys.modules[__name__], className)
+    module = importlib.import_module("transformers")
+    return getattr(module, className)
 
 
 def get_auto_model_class(model_type, NonTransformerModelCls):

From dc059e44f2eb49c8c59b8b16d4f101b4952d2e22 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:48:14 +0000
Subject: [PATCH 121/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 54348c860..fcb4549d7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,14 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
-import sys
+import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
-import importlib
-
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,

From 4fdebc77dae70ef9721f5d081226ca911e257b9c Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 11:24:56 +0000
Subject: [PATCH 122/138] Address review comments

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 16 +++++++++-------
 QEfficient/transformers/modeling_utils.py |  4 ++--
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index cf622f2cd..53a3a4fef 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -20,16 +20,18 @@
     get_model_class_type_from_model_type,
 )
 
-# loop over all the models which are not present in transformers and register them
-for key, value in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
-    # Register the config class based on model type
-    AutoConfig.register(key, value[0])
+from QEfficient.utils.logging_utils import logger
+
+# loop over all the model types which are not present in transformers and register them
+for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
+    # Register the model config class based on the model type. This will be first element in the tuple
+    AutoConfig.register(model_type, model_cls[0])
 
-    model_class_type = get_model_class_type_from_model_type(key)
-    AutoModelClassName = get_auto_model_class(model_class_type, value[1])
+    model_class_type = get_model_class_type_from_model_type(model_type)
+    AutoModelClassName = get_auto_model_class(model_class_type, model_cls[1])
 
     # Register the non transformer library Class and config class using AutoModelClass
-    AutoModelClassName.register(value[0], value[1])
+    AutoModelClassName.register(model_cls[0], model_cls[1])
 
 
 def check_qaic_sdk():
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index fcb4549d7..e70542ff7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
@@ -280,6 +279,7 @@
 
 # Map of model type to config class and Model architecture class
 # While onboarding new models make sure to add the new model card names to this dictionary.
+# Developers are expected to follow the naming conventions like ForCausalLM while defining the class names
 MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS = {"llama_swiftkv": [LlamaSwiftKVConfig, LlamaSwiftKVForCausalLM]}
 
 # list of sub-strings representing the model type, this is typically taken from llama-swiftkv
@@ -389,7 +389,7 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    module = importlib.import_module("transformers")
+    module = __import__("transformers")
     return getattr(module, className)
 
 

From 5b04f83dc3c70968486b72ebe8410f3fbd2eff55 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 14:18:20 +0000
Subject: [PATCH 123/138] Rebased and fixed the lint errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 53a3a4fef..60aba0d74 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -12,14 +12,13 @@
 # hf_transfer is imported (will happen on line 15 via leading imports)
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 
-from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
+
 from QEfficient.transformers.modeling_utils import (
     MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
-
 from QEfficient.utils.logging_utils import logger
 
 # loop over all the model types which are not present in transformers and register them

From 393c428ea803da10a5352dd1f2879610e73fcb51 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 00:39:03 +0530
Subject: [PATCH 124/138] added initial version of SwiftKV for AI 100

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/cache_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/transformers/cache_utils.py b/QEfficient/transformers/cache_utils.py
index 765a12f9d..e7d6e8275 100644
--- a/QEfficient/transformers/cache_utils.py
+++ b/QEfficient/transformers/cache_utils.py
@@ -60,7 +60,6 @@ def read_only(self, layer_idx, **cache_kwargs):
             invalid_idx_value = 0
 
         ctx_indices = torch.where(invalid_mask, invalid_idx_value, ctx_indices)
-
         k_out = CtxGatherFunc.apply(k_out, ctx_indices)
         v_out = CtxGatherFunc.apply(v_out, ctx_indices)
         v_out = torch.where(invalid_mask.unsqueeze(-1), torch.tensor(0.0, dtype=torch.float32), v_out)

From 273258e941d4cd4f43904efefce56502250bf556 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 02:23:24 +0530
Subject: [PATCH 125/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py        | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 26931fced..f45d30b6d 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -355,6 +355,7 @@ def forward(
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
+<<<<<<< HEAD
             _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids)
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
@@ -363,6 +364,13 @@ def forward(
         orig_hidden_states = hidden_states
         hidden_states = orig_hidden_states[:, last_pos_id[0], :]
         causal_mask = causal_mask[:, :, last_pos_id[0], :]
+=======
+            _, key_states = qeff_apply_rotary_pos_emb(
+                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
+            )
+            cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
+            past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
+>>>>>>> 5259873 (BUGFIX)
 
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask

From 8d43032fd82d58440d16adf5208e50e65b4f07b7 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Tue, 17 Dec 2024 13:08:51 +0530
Subject: [PATCH 126/138] BUGFIX

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../models/llama_swiftkv/modeling_llama_swiftkv.py       | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index f45d30b6d..479a7dbd6 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,3 +1,4 @@
+
 # -----------------------------------------------------------------------------
 #
 # Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
@@ -355,7 +356,6 @@ def forward(
                 kv_seq_len = past_key_values.get_usable_length(kv_seq_len, self_attn.layer_idx)
 
             cos, sin = self_attn.rotary_emb(value_states, seq_len=kv_seq_len)
-<<<<<<< HEAD
             _, key_states = qeff_apply_rotary_pos_emb(torch.empty_like(key_states), key_states, cos, sin, position_ids)
             cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
             past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
@@ -364,13 +364,6 @@ def forward(
         orig_hidden_states = hidden_states
         hidden_states = orig_hidden_states[:, last_pos_id[0], :]
         causal_mask = causal_mask[:, :, last_pos_id[0], :]
-=======
-            _, key_states = qeff_apply_rotary_pos_emb(
-                torch.empty_like(swiftkv_hidden_states), key_states, cos, sin, position_ids
-            )
-            cache_kwargs = {"sin": sin, "cos": cos, "position_ids": position_ids}
-            past_key_values.write_only(key_states, value_states, self_attn.layer_idx, cache_kwargs)
->>>>>>> 5259873 (BUGFIX)
 
         hidden_states, next_decoder_cache = self._run_swiftkv_layers(
             hidden_states, position_ids, past_key_values, causal_mask

From 339ce8918c5a84caa245168f4194fa14fce02a30 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 06:17:43 +0000
Subject: [PATCH 127/138] Enabling the SwiftKV model in the QEFF Infra

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py     |  3 ++
 .../llama_swiftkv/config_llama_swiftkv.py     | 45 +++++++++++++++++++
 .../llama_swiftkv/modeling_llama_swiftkv.py   |  3 +-
 .../transformers/models/modeling_auto.py      |  6 +++
 QEfficient/utils/_utils.py                    |  2 +-
 5 files changed, 57 insertions(+), 2 deletions(-)
 create mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index e70542ff7..51dce5e90 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -159,6 +159,9 @@
     QEffWhisperPositionalEmbedding,
 )
 
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
new file mode 100644
index 000000000..fa97388de
--- /dev/null
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -0,0 +1,45 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+# The Modules are updated as required by Cloud AI 100 HW requirements.
+
+
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
+
+
+
+from typing import Optional
+from transformers import LlamaConfig
+
+
+class LlamaSwiftKVConfig(LlamaConfig):
+    """
+    Args:
+        num_key_value_layers (int, optional):
+            The number of layers, from the first layer, that have keys and
+            values. If None, all layers have keys and values.
+        last_key_value_heads (int, optional):
+            The number of heads in the last layer that have keys and values.
+            If None, the number of heads in the last key-value layer is equal
+            to the number of heads in all the other key-value layers.
+    """
+
+    model_type = "llama_swiftkv"
+
+    def __init__(
+        self,
+        swiftkv: bool = False,
+        num_key_value_layers: Optional[int] = None,
+        key_value_group_size: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.swiftkv = swiftkv
+        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
+        self.key_value_group_size = key_value_group_size or 1
+        assert (
+            self.num_hidden_layers - self.num_key_value_layers
+        ) % self.key_value_group_size == 0
\ No newline at end of file
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 479a7dbd6..79215e3bf 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -21,6 +21,7 @@
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
+from transformers.modeling_utils import PreTrainedModel
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -29,7 +30,7 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-
+from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
 class LlamaSwiftKVConfig(LlamaConfig):
     """
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index a87c39fb4..9d8074a97 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,6 +7,7 @@
 
 import hashlib
 import warnings
+
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -51,6 +52,7 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
+from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -76,6 +78,10 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
+
+        # Load the SwiftKV model if supported
+        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
+
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index ea9044e2c..e3724b90f 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -21,7 +21,7 @@
 
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
 from QEfficient.utils.logging_utils import logger
-
+from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
 
 class DownloadRetryLimitExceeded(Exception):
     """

From b24399b3f7dc760b120d520096bd0e86007c76f0 Mon Sep 17 00:00:00 2001
From: Onkar Chougule <quic_ochougul@quicinc.com>
Date: Thu, 27 Feb 2025 15:16:14 +0530
Subject: [PATCH 128/138] rebased

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 .../llama_swiftkv/config_llama_swiftkv.py     |  6 +----
 .../transformers/models/modeling_auto.py      |  1 -
 QEfficient/utils/_utils.py                    | 25 ++++++++++++++++---
 3 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
index fa97388de..77eeb61a3 100644
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
@@ -9,8 +9,6 @@
 
 """Inference-only LLaMA model compatible with HuggingFace weights."""
 
-
-
 from typing import Optional
 from transformers import LlamaConfig
 
@@ -40,6 +38,4 @@ def __init__(
         self.swiftkv = swiftkv
         self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
         self.key_value_group_size = key_value_group_size or 1
-        assert (
-            self.num_hidden_layers - self.num_key_value_layers
-        ) % self.key_value_group_size == 0
\ No newline at end of file
+        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 9d8074a97..18006c6dc 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -78,7 +78,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         # Load the SwiftKV model if supported
         QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index e3724b90f..931f66225 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,7 +8,12 @@
 import json
 import os
 import subprocess
+<<<<<<< HEAD
 import xml.etree.ElementTree as ET
+=======
+import sys
+import warnings
+>>>>>>> b280225 (rebased)
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -17,11 +22,25 @@
 import yaml
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
-from transformers import AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
-
+from transformers import (
+    AutoConfig,
+    AutoProcessor,
+    AutoTokenizer,
+    PreTrainedTokenizer,
+    PreTrainedTokenizerFast,
+)
+
+<<<<<<< HEAD
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
+=======
+from QEfficient.transformers.modeling_utils import (
+    SwiftKVModelCardNameToSwiftKVModelTypeDict,
+    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
+)
+from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
+>>>>>>> b280225 (rebased)
 from QEfficient.utils.logging_utils import logger
-from QEfficient.transformers.modeling_utils import SwiftKVModelCardNameToSwiftKVModelTypeDict, SwiftKVModelTypeToConfigClassAndModelArchClassDict
+
 
 class DownloadRetryLimitExceeded(Exception):
     """

From 98b5b619e160ae91b7c69d7937c8a99482506a7e Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Thu, 27 Feb 2025 18:45:54 +0000
Subject: [PATCH 129/138] moving registration of non transformer models during
 initialization of QEfficient

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                        |  6 +++
 QEfficient/transformers/modeling_utils.py     | 18 +++++++-
 .../llama_swiftkv/config_llama_swiftkv.py     | 41 -------------------
 .../llama_swiftkv/modeling_llama_swiftkv.py   |  5 ++-
 .../transformers/models/modeling_auto.py      |  3 --
 QEfficient/utils/_utils.py                    | 16 +-------
 6 files changed, 27 insertions(+), 62 deletions(-)
 delete mode 100644 QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 60aba0d74..58eff477c 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -20,6 +20,12 @@
     get_model_class_type_from_model_type,
 )
 from QEfficient.utils.logging_utils import logger
+from transformers import AutoConfig
+from QEfficient.transformers.modeling_utils import (
+    get_model_class_type_from_model_type,
+    get_auto_model_class,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+)
 
 # loop over all the model types which are not present in transformers and register them
 for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 51dce5e90..bd15563d0 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -7,6 +7,7 @@
 
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
+import sys
 
 import torch
 import torch.nn as nn
@@ -86,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models
@@ -159,8 +161,11 @@
     QEffWhisperPositionalEmbedding,
 )
 
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import LlamaSwiftKVForCausalLM
+# Placeholder for all non-transformer models
+from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
+    LlamaSwiftKVForCausalLM,
+    LlamaSwiftKVConfig
+)
 
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
@@ -383,6 +388,15 @@ def _create_causal_mask(
 
     return attention_mask
 
+def convert_str_to_class(className):
+    """
+    Convert the string to class name
+    ---------
+    :className: `str`- Class name string.
+    Return:
+        Class Name
+    """
+    return getattr(sys.modules[__name__], className)
 
 def convert_str_to_class(className):
     """
diff --git a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
deleted file mode 100644
index 77eeb61a3..000000000
--- a/QEfficient/transformers/models/llama_swiftkv/config_llama_swiftkv.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# -----------------------------------------------------------------------------
-#
-# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
-# SPDX-License-Identifier: BSD-3-Clause
-#
-# -----------------------------------------------------------------------------
-# The Modules are updated as required by Cloud AI 100 HW requirements.
-
-
-"""Inference-only LLaMA model compatible with HuggingFace weights."""
-
-from typing import Optional
-from transformers import LlamaConfig
-
-
-class LlamaSwiftKVConfig(LlamaConfig):
-    """
-    Args:
-        num_key_value_layers (int, optional):
-            The number of layers, from the first layer, that have keys and
-            values. If None, all layers have keys and values.
-        last_key_value_heads (int, optional):
-            The number of heads in the last layer that have keys and values.
-            If None, the number of heads in the last key-value layer is equal
-            to the number of heads in all the other key-value layers.
-    """
-
-    model_type = "llama_swiftkv"
-
-    def __init__(
-        self,
-        swiftkv: bool = False,
-        num_key_value_layers: Optional[int] = None,
-        key_value_group_size: Optional[int] = None,
-        **kwargs,
-    ):
-        super().__init__(**kwargs)
-        self.swiftkv = swiftkv
-        self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
-        self.key_value_group_size = key_value_group_size or 1
-        assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 79215e3bf..25a5d592c 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -22,6 +22,7 @@
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
+from transformers import LlamaConfig
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -30,7 +31,6 @@
     QEffLlamaRotaryEmbedding,
     qeff_apply_rotary_pos_emb,
 )
-from QEfficient.transformers.models.llama_swiftkv.config_llama_swiftkv import LlamaSwiftKVConfig
 
 class LlamaSwiftKVConfig(LlamaConfig):
     """
@@ -58,7 +58,10 @@ def __init__(
         self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
         self.key_value_group_size = key_value_group_size or 1
         assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
+<<<<<<< HEAD
 
+=======
+>>>>>>> 9f5bca6 (moving registration of non transformer models during initialization of QEfficient)
 
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 18006c6dc..9d7d48293 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -52,7 +52,6 @@
 from QEfficient.utils import constants, get_padding_shape_from_config
 from QEfficient.utils.cache import to_hashable
 from QEfficient.utils.logging_utils import logger
-from QEfficient.utils._utils import QEFFLoadSwiftKVModels
 
 
 class QEFFTransformersBase(QEFFBaseModel):
@@ -78,8 +77,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-        # Load the SwiftKV model if supported
-        QEFFLoadSwiftKVModels(pretrained_model_name_or_path)
 
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 931f66225..7f74986df 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -8,12 +8,7 @@
 import json
 import os
 import subprocess
-<<<<<<< HEAD
 import xml.etree.ElementTree as ET
-=======
-import sys
-import warnings
->>>>>>> b280225 (rebased)
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -23,22 +18,13 @@
 from huggingface_hub import login, snapshot_download
 from requests.exceptions import HTTPError
 from transformers import (
-    AutoConfig,
     AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
 )
 
-<<<<<<< HEAD
 from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants, QnnConstants
-=======
-from QEfficient.transformers.modeling_utils import (
-    SwiftKVModelCardNameToSwiftKVModelTypeDict,
-    SwiftKVModelTypeToConfigClassAndModelArchClassDict,
-)
-from QEfficient.utils.constants import QEFF_MODELS_DIR, Constants
->>>>>>> b280225 (rebased)
 from QEfficient.utils.logging_utils import logger
 
 
@@ -572,4 +558,4 @@ def make_serializable(obj):
     else:
         qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
 
-    create_json(qconfig_file_path, qconfigs)
+    create_json(qconfig_file_path, qconfigs)
\ No newline at end of file

From 00abd9810b77e39aa8379a5e0adc5f5984c009c5 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Tue, 4 Mar 2025 05:18:06 +0000
Subject: [PATCH 130/138] fixed lint warnings

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                                 |  7 +++++--
 QEfficient/transformers/modeling_utils.py              | 10 ++--------
 .../models/llama_swiftkv/modeling_llama_swiftkv.py     |  6 ++----
 QEfficient/transformers/models/modeling_auto.py        |  2 --
 QEfficient/utils/_utils.py                             |  2 +-
 5 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 58eff477c..2f9f05986 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -21,11 +21,13 @@
 )
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
+
 from QEfficient.transformers.modeling_utils import (
-    get_model_class_type_from_model_type,
+    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
     get_auto_model_class,
-    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS
+    get_model_class_type_from_model_type,
 )
+from QEfficient.utils.logging_utils import logger
 
 # loop over all the model types which are not present in transformers and register them
 for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
@@ -39,6 +41,7 @@
     AutoModelClassName.register(model_cls[0], model_cls[1])
 
 
+
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
     try:
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index bd15563d0..23d804e09 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,9 +5,9 @@
 #
 # -----------------------------------------------------------------------------
 
+import sys
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
-import sys
 
 import torch
 import torch.nn as nn
@@ -87,7 +87,6 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models
@@ -161,12 +160,6 @@
     QEffWhisperPositionalEmbedding,
 )
 
-# Placeholder for all non-transformer models
-from QEfficient.transformers.models.llama_swiftkv.modeling_llama_swiftkv import (
-    LlamaSwiftKVForCausalLM,
-    LlamaSwiftKVConfig
-)
-
 # Define a named tuple for ModelArchitectures
 # Required for the Automation tool
 ModelArchitectures = namedtuple("ModelArchitectures", ["architectures"])
@@ -388,6 +381,7 @@ def _create_causal_mask(
 
     return attention_mask
 
+
 def convert_str_to_class(className):
     """
     Convert the string to class name
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 25a5d592c..7a0d84abd 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -22,7 +22,7 @@
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 from transformers.modeling_utils import PreTrainedModel
-from transformers import LlamaConfig
+from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
@@ -32,6 +32,7 @@
     qeff_apply_rotary_pos_emb,
 )
 
+
 class LlamaSwiftKVConfig(LlamaConfig):
     """
     Args:
@@ -58,10 +59,7 @@ def __init__(
         self.num_key_value_layers = num_key_value_layers or self.num_hidden_layers
         self.key_value_group_size = key_value_group_size or 1
         assert (self.num_hidden_layers - self.num_key_value_layers) % self.key_value_group_size == 0
-<<<<<<< HEAD
 
-=======
->>>>>>> 9f5bca6 (moving registration of non transformer models during initialization of QEfficient)
 
 class LlamaSwiftKVAttention(nn.Module):
     def __init__(self, config: LlamaSwiftKVConfig, layer_idx) -> None:
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
index 9d7d48293..a87c39fb4 100644
--- a/QEfficient/transformers/models/modeling_auto.py
+++ b/QEfficient/transformers/models/modeling_auto.py
@@ -7,7 +7,6 @@
 
 import hashlib
 import warnings
-
 from pathlib import Path
 from time import perf_counter
 from typing import List, Optional, Union
@@ -77,7 +76,6 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(cls, pretrained_model_name_or_path: str, is_tlm: bool = False, *args, **kwargs):
-
         if kwargs.get("attn_implementation", None) not in {None, "eager"}:
             logger.warning('Updating attn_implementation="eager"')
 
diff --git a/QEfficient/utils/_utils.py b/QEfficient/utils/_utils.py
index 7f74986df..8ba5e2c18 100644
--- a/QEfficient/utils/_utils.py
+++ b/QEfficient/utils/_utils.py
@@ -558,4 +558,4 @@ def make_serializable(obj):
     else:
         qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config
 
-    create_json(qconfig_file_path, qconfigs)
\ No newline at end of file
+    create_json(qconfig_file_path, qconfigs)

From 8396903b48784c7b4fa6b42ade4321384b005c7f Mon Sep 17 00:00:00 2001
From: Onkar Chougule <168134249+ochougul@users.noreply.github.com>
Date: Fri, 28 Feb 2025 14:59:48 +0530
Subject: [PATCH 131/138] enabling faster downloads via hf_transfer (#282)

hf hub doc:
https://huggingface.co/docs/huggingface_hub/en/guides/download
details on hf_transfer
https://github.com/[huggingface/hf_transfer](https://github.com/huggingface/hf_transfer)

---------

Signed-off-by: Onkar Chougule <quic_ochougul@quicinc.com>
Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 2f9f05986..00cd941b2 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -19,6 +19,7 @@
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
+
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
 

From 4a5cd48cde0a25790aa71e159331b3140a0e3d85 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:36:53 +0000
Subject: [PATCH 132/138] Fixed the compilation errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 23d804e09..e3b2a158c 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,6 +87,7 @@
     WhisperPositionalEmbedding,
 )
 
+from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models

From 6abceb601014ab6b1754b8e37343edfa47d1b41e Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 10:39:38 +0000
Subject: [PATCH 133/138] Fixed the lint error

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 -
 QEfficient/transformers/modeling_utils.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 00cd941b2..1d9b2db21 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,7 +6,6 @@
 # -----------------------------------------------------------------------------
 
 import os
-
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index e3b2a158c..23d804e09 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -87,7 +87,6 @@
     WhisperPositionalEmbedding,
 )
 
-from transformers import AutoModelForCausalLM
 from QEfficient.customop import CustomRMSNormAIC
 
 # Placeholder for all non-transformer models

From d026845822b337069717d26d9482d830b41fddbf Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:44:12 +0000
Subject: [PATCH 134/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 1 +
 QEfficient/transformers/modeling_utils.py | 5 ++++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 1d9b2db21..00cd941b2 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -6,6 +6,7 @@
 # -----------------------------------------------------------------------------
 
 import os
+
 # For faster downloads via hf_transfer
 # This code is put above import statements as this needs to be executed before
 # hf_transfer is imported (will happen on line 15 via leading imports)
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 23d804e09..dc67aa6c9 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -11,6 +11,8 @@
 
 import torch
 import torch.nn as nn
+import importlib
+
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,
@@ -390,7 +392,8 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    return getattr(sys.modules[__name__], className)
+    module = importlib.import_module("transformers")
+    return getattr(module, className)
 
 def convert_str_to_class(className):
     """

From 7010a75c63de698dd5f244c469866636304113c7 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 5 Mar 2025 11:48:14 +0000
Subject: [PATCH 135/138] fixed ruff errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/transformers/modeling_utils.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index dc67aa6c9..10022a1dc 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,14 +5,12 @@
 #
 # -----------------------------------------------------------------------------
 
-import sys
+import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
 import torch
 import torch.nn as nn
-import importlib
-
 from transformers.models.codegen.modeling_codegen import (
     CodeGenAttention,
     CodeGenBlock,

From 582c1d463ef6c993f54fd9012a2d90b2fe841c2a Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 11:24:56 +0000
Subject: [PATCH 136/138] Address review comments

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                    | 8 --------
 QEfficient/transformers/modeling_utils.py | 3 +--
 2 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 00cd941b2..df7c319f6 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -23,13 +23,6 @@
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
 
-from QEfficient.transformers.modeling_utils import (
-    MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS,
-    get_auto_model_class,
-    get_model_class_type_from_model_type,
-)
-from QEfficient.utils.logging_utils import logger
-
 # loop over all the model types which are not present in transformers and register them
 for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
     # Register the model config class based on the model type. This will be first element in the tuple
@@ -42,7 +35,6 @@
     AutoModelClassName.register(model_cls[0], model_cls[1])
 
 
-
 def check_qaic_sdk():
     """Check if QAIC SDK is installed"""
     try:
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 10022a1dc..1d229f64f 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -5,7 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-import importlib
 from collections import namedtuple
 from typing import Dict, Optional, Tuple, Type
 
@@ -390,7 +389,7 @@ def convert_str_to_class(className):
     Return:
         Class Name
     """
-    module = importlib.import_module("transformers")
+    module = __import__("transformers")
     return getattr(module, className)
 
 def convert_str_to_class(className):

From cc895b7532df08e94b67c21f73366deecc7fc10c Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 14:27:24 +0000
Subject: [PATCH 137/138] Fix the lint errors

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index df7c319f6..0034c9cec 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -19,7 +19,6 @@
     get_auto_model_class,
     get_model_class_type_from_model_type,
 )
-
 from QEfficient.utils.logging_utils import logger
 from transformers import AutoConfig
 

From 0726e752f456e67fb55505b122dc932a8f03b6f7 Mon Sep 17 00:00:00 2001
From: Hem Agnihotri <quic_hemagnih@quicinc.com>
Date: Wed, 12 Mar 2025 15:04:32 +0000
Subject: [PATCH 138/138] rebased and fixed lint erros

Signed-off-by: Hem Agnihotri <quic_hemagnih@quicinc.com>
---
 QEfficient/__init__.py                                |  1 -
 QEfficient/transformers/modeling_utils.py             | 11 -----------
 .../models/llama_swiftkv/modeling_llama_swiftkv.py    |  3 ---
 3 files changed, 15 deletions(-)

diff --git a/QEfficient/__init__.py b/QEfficient/__init__.py
index 0034c9cec..60aba0d74 100644
--- a/QEfficient/__init__.py
+++ b/QEfficient/__init__.py
@@ -20,7 +20,6 @@
     get_model_class_type_from_model_type,
 )
 from QEfficient.utils.logging_utils import logger
-from transformers import AutoConfig
 
 # loop over all the model types which are not present in transformers and register them
 for model_type, model_cls in MODEL_TYPE_TO_CONFIG_CLS_AND_ARCH_CLS.items():
diff --git a/QEfficient/transformers/modeling_utils.py b/QEfficient/transformers/modeling_utils.py
index 1d229f64f..e70542ff7 100644
--- a/QEfficient/transformers/modeling_utils.py
+++ b/QEfficient/transformers/modeling_utils.py
@@ -381,17 +381,6 @@ def _create_causal_mask(
     return attention_mask
 
 
-def convert_str_to_class(className):
-    """
-    Convert the string to class name
-    ---------
-    :className: `str`- Class name string.
-    Return:
-        Class Name
-    """
-    module = __import__("transformers")
-    return getattr(module, className)
-
 def convert_str_to_class(className):
     """
     Convert the string to class name
diff --git a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
index 7a0d84abd..26931fced 100644
--- a/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
+++ b/QEfficient/transformers/models/llama_swiftkv/modeling_llama_swiftkv.py
@@ -1,4 +1,3 @@
-
 # -----------------------------------------------------------------------------
 #
 # Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
@@ -21,8 +20,6 @@
 from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 from transformers.modeling_utils import PreTrainedModel
 from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.llama.modeling_llama import LlamaMLP, LlamaRMSNorm, logger, repeat_kv
 
 from QEfficient.transformers.cache_utils import QEffDynamicCache
 from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask