Commit 5c471b6

SpD, multiprojection heads (#306)
### Objective

This PR implements post-attention hidden size projections used to speculate tokens ahead of the base model. It contains three primary components:

1. extending the base model with multi-projections in `modeling_auto.py`
2. implementing the multi-projection forward pass
3. an app demo of the multi-projection model

### Initial Implementation

The initial implementation gives users the flexibility to define their own projection architecture and pass it to `QEffAutoModelForCausalLM`. QEfficient then simply attaches these projections to the model so they can be used during the forward pass. The attaching of these projections is done with the `accelerate` library, which has a robust implementation for attaching weights to an already existing model. We can implement our own abstraction if needed, but first we must agree on what the external API to the user will be.

> NOTE: Please keep in mind that integrating Medusa will require similar changes (instead of multiple hidden size projections, Medusa uses multiple `lm_heads` to speculate ahead of the base model).

---------

Signed-off-by: eplatero <[email protected]>
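A minimal usage sketch of the flow this PR enables (the model name is hypothetical and the top-level import path is assumed; `qaic_config` with `speculative_model_type` replaces the old `is_tlm` flag, per the `modeling_auto.py` diff below):

```python
# Hedged sketch, not an official example: enable the turbo multi-projection
# speculative path via qaic_config instead of the removed is_tlm flag.
from QEfficient import QEffAutoModelForCausalLM  # import path assumed

model = QEffAutoModelForCausalLM.from_pretrained(
    "JackFram/llama-160m",  # hypothetical base model
    qaic_config={"speculative_model_type": "turbo"},
)
# For turbo models, num_speculative_tokens is pinned by the speculator config,
# so compile() can resolve it via check_and_get_num_speculative_tokens().
model.compile(prefill_seq_len=128)
```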
1 parent 4695485 commit 5c471b6

File tree: 13 files changed, +389 −45 lines changed

QEfficient/transformers/models/modeling_auto.py

Lines changed: 41 additions & 17 deletions
@@ -1298,7 +1298,7 @@ def __init__(
         self,
         model: nn.Module,
         continuous_batching: bool = False,
-        is_tlm: bool = False,
+        qaic_config: Optional[dict] = None,
         **kwargs,
     ):
         model_class_name = model.__class__.__name__
@@ -1324,11 +1324,8 @@ def __init__(
         self.model.config.use_cache = True
         self.num_layers = model.config.num_hidden_layers
         self.continuous_batching = continuous_batching
-
-        if is_tlm:
-            # TODO: It is possible to always apply this transform and make value of indices as last indices by default in PyTorch
-            self.model, transformed = SpDTransform.apply(self.model)
-            self.is_tlm = is_tlm
+        self.model, transformed = SpDTransform.apply(self.model, qaic_config, **kwargs)
+        self.is_tlm = transformed

     @property
     def model_name(self) -> str:
@@ -1343,7 +1340,12 @@ def __repr__(self) -> str:
     @classmethod
     @with_replaced_quantizers
     def from_pretrained(
-        cls, pretrained_model_name_or_path, continuous_batching: bool = False, is_tlm: bool = False, *args, **kwargs
+        cls,
+        pretrained_model_name_or_path,
+        continuous_batching: bool = False,
+        qaic_config: Optional[dict] = None,
+        *args,
+        **kwargs,
     ):
         """
         This method serves as the easiest entry point into using QEfficient. The interface is designed to be similar to transformers.AutoModelForCausalLM.
@@ -1388,6 +1390,8 @@ def from_pretrained(

         kwargs.update({"attn_implementation": "eager", "low_cpu_mem_usage": False})
         model = cls._hf_auto_class.from_pretrained(pretrained_model_name_or_path, *args, **kwargs)
+        if qaic_config is not None:
+            qaic_config["pretrained_model_name_or_path"] = pretrained_model_name_or_path

         # This is to support models that should be classified into a different auto class but transformers loads them via this class

@@ -1396,7 +1400,12 @@ def from_pretrained(
                 model, kv_offload=kv_offload
             )

-        return cls(model, is_tlm=is_tlm, continuous_batching=continuous_batching)
+        return cls(
+            model,
+            continuous_batching=continuous_batching,
+            qaic_config=qaic_config,
+            **kwargs,
+        )

     @property
     def model_hash(self) -> str:
@@ -1571,15 +1580,7 @@ def compile(
             raise TypeError("`prefill_only` must be a boolean.")

         if self.is_tlm:
-            if num_speculative_tokens is None:
-                raise TypeError("`num_speculative_tokens` is required when `is_tlm=True`.")
-            if not isinstance(num_speculative_tokens, int) or num_speculative_tokens < 2:
-                raise ValueError("`num_speculative_tokens` must be an integer >= 2.")
-            if prefill_seq_len < (num_speculative_tokens + 1):
-                raise ValueError(
-                    f"`prefill_seq_len` must be at least `num_speculative_tokens + 1` "
-                    f"({num_speculative_tokens + 1}), got {prefill_seq_len}."
-                )
+            num_speculative_tokens = self.check_and_get_num_speculative_tokens(num_speculative_tokens, prefill_seq_len)

         if self.continuous_batching and full_batch_size is None:
             raise TypeError("`full_batch_size` is required when `continuous_batching=True`.")
@@ -1674,6 +1675,29 @@ def generate(
         else:
             raise NotImplementedError("Only AI_100 runtime is supported right now via generate API")

+    def check_and_get_num_speculative_tokens(self, num_speculative_tokens: Optional[int], prefill_seq_len: int):
+        if hasattr(self.model.config, "speculative_config"):
+            num_speculative_tokens_ = self.model.config.speculative_config["num_speculative_tokens"]
+            if num_speculative_tokens is not None:
+                logger.warning(
+                    f"arg `num_speculative_tokens` is a fixed value of {num_speculative_tokens_} for this model."
+                    f" Passed value of {num_speculative_tokens} will be ignored."
+                )
+            num_speculative_tokens = num_speculative_tokens_
+        elif num_speculative_tokens is None:
+            raise TypeError("missing required argument `num_speculative_tokens` as `is_tlm` is True.")
+
+        if not isinstance(num_speculative_tokens, int) or num_speculative_tokens < 2:
+            raise ValueError(
+                f"`num_speculative_tokens` arg should be an integer greater than 1, got {num_speculative_tokens}"
+            )
+        num_logits_to_keep = num_speculative_tokens + 1
+        if prefill_seq_len < num_logits_to_keep:
+            raise ValueError(
+                f"sequence length ({prefill_seq_len}) must be at least `num_speculative_tokens+1` ({num_logits_to_keep})"
+            )
+        return num_speculative_tokens


 class QEFFAutoModelForSpeechSeq2Seq(QEFFTransformersBase, MultimodalUtilityMixin):
     """

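The validation that `compile` now delegates to `check_and_get_num_speculative_tokens` reduces to a simple precedence rule. A runnable standalone sketch (hypothetical helper, not part of the diff):

```python
from typing import Optional


def resolve_num_speculative_tokens(
    config_value: Optional[int], user_value: Optional[int], prefill_seq_len: int
) -> int:
    """Mirror of the rule above: a speculator config pins the value, else the caller must supply it."""
    if config_value is not None:
        num = config_value  # the model's speculative_config wins; any user value is ignored
    elif user_value is None:
        raise TypeError("missing required argument `num_speculative_tokens`")
    else:
        num = user_value
    if not isinstance(num, int) or num < 2:
        raise ValueError(f"`num_speculative_tokens` must be an integer >= 2, got {num}")
    if prefill_seq_len < num + 1:
        # prefill must be long enough to produce num_speculative_tokens + 1 logits
        raise ValueError(f"prefill_seq_len ({prefill_seq_len}) must be >= {num + 1}")
    return num


assert resolve_num_speculative_tokens(None, 4, prefill_seq_len=128) == 4
assert resolve_num_speculative_tokens(3, 8, prefill_seq_len=128) == 3  # config pins it
```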
QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 22 additions & 5 deletions
@@ -6,7 +6,7 @@
 # -----------------------------------------------------------------------------

 from types import MethodType
-from typing import Tuple
+from typing import Optional, Tuple

 from torch import nn
 from transformers.models.codegen.modeling_codegen import (
@@ -266,7 +266,10 @@
     QEffWhisperModel,
     QEffWhisperPositionalEmbedding,
 )
-from QEfficient.transformers.spd.causal_lm_forward import tlm_forward
+from QEfficient.transformers.post_processing import build_and_attach_mlp, model_type_registry
+from QEfficient.transformers.spd.spd_transform_forward import tlm_forward
+
+SPD_TARGET = "target"


 class CustomOpsTransform(ModuleMappingTransform):
@@ -423,19 +426,33 @@ class SpDTransform:
     _module_mapping = {
         # Llama
         QEffLlamaForCausalLM,
+        QEffQwen2ForCausalLM,
     }

     @classmethod
-    def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
+    def apply(cls, model: nn.Module, qaic_config: Optional[dict] = None, **kwargs) -> Tuple[nn.Module, bool]:
         transformed = False
-        if (model_class := model.__class__) in cls._module_mapping:
+        if qaic_config is None or (speculative_model_type := qaic_config.get("speculative_model_type")) is None:
+            return model, transformed
+        elif speculative_model_type not in (
+            supported_spd_model_types := [SPD_TARGET] + list(model_type_registry.keys())
+        ):
+            raise ValueError(
+                f"Speculative model type {speculative_model_type} is not supported. We currently only support {supported_spd_model_types}"
+            )
+        elif (model_class := model.__class__) in cls._module_mapping:
             model.forward = MethodType(tlm_forward, model)
+            if speculative_model_type != SPD_TARGET:
+                # build and attach draft mlp
+                pretrained_model_name_or_path = qaic_config["pretrained_model_name_or_path"]
+                model = build_and_attach_mlp(
+                    model, pretrained_model_name_or_path, speculative_model_type=speculative_model_type, **kwargs
+                )
             transformed = True
         else:
             raise NotImplementedError(
                 f"model class {model_class} does not yet support returning multiple logits to keep."
             )
-
         return model, transformed


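For review, the gate added to `SpDTransform.apply` reduces to a small decision function. A self-contained re-creation (stand-in registry contents; not the real QEfficient objects):

```python
from typing import Optional

SPD_TARGET = "target"
model_type_registry = {"turbo": None}  # stand-in: real registry maps "turbo" to build_and_attach_turbo


def spd_gate(qaic_config: Optional[dict]) -> Optional[str]:
    """Return the speculative_model_type to apply, or None when the transform is a no-op."""
    if qaic_config is None or (spec_type := qaic_config.get("speculative_model_type")) is None:
        return None  # model untouched, is_tlm stays False
    supported = [SPD_TARGET] + list(model_type_registry.keys())
    if spec_type not in supported:
        raise ValueError(f"Speculative model type {spec_type} is not supported. Supported: {supported}")
    # "target" only patches tlm_forward; other types also attach the draft MLP.
    return spec_type


assert spd_gate(None) is None
assert spd_gate({"speculative_model_type": "target"}) == SPD_TARGET
assert spd_gate({"speculative_model_type": "turbo"}) == "turbo"
```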
QEfficient/transformers/post_processing.py

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from QEfficient.transformers.spd.turbo import build_and_attach_turbo
+from QEfficient.utils.spd_utils import get_speculative_config, get_speculative_weights
+
+model_type_registry = dict(turbo=build_and_attach_turbo)
+
+
+def build_and_attach_mlp(model, pretrained_model_name_or_path, speculative_model_type: str, **kwargs):
+    speculative_config: dict = get_speculative_config(pretrained_model_name_or_path, **kwargs)
+    speculative_weights: str = get_speculative_weights(pretrained_model_name_or_path, **kwargs)
+
+    if (model_type := speculative_config.get("model_type")) is None:
+        speculative_config["model_type"] = speculative_model_type
+    else:
+        if model_type != speculative_model_type:
+            raise ValueError(
+                f"`model_type` key from speculator config ({model_type}) does not match input model type ({speculative_model_type})."
+            )
+    func = model_type_registry[speculative_model_type]
+    model = func(model, speculative_config, speculative_weights)
+    model.config.speculative_config = speculative_config
+    return model

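The registry keeps builders keyed by `speculative_model_type`, each with the signature `func(model, speculative_config, speculative_weights)`. Following the Medusa note in the commit message, a new head type could plug in like this (purely hypothetical sketch; no `medusa` builder exists in this PR):

```python
def build_and_attach_medusa(model, speculative_config: dict, speculative_weights: str):
    # Hypothetical: build multiple lm_heads and load their weights here.
    ...
    return model


model_type_registry["medusa"] = build_and_attach_medusa  # illustrative only
```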
QEfficient/transformers/spd/causal_lm_forward.py renamed to QEfficient/transformers/spd/spd_transform_forward.py

Lines changed: 25 additions & 2 deletions
@@ -21,7 +21,7 @@ def filter_hidden_states(
     Filter hidden states based on whether this is a TLM SpD model

     ``Mandatory`` Args:
-        :hidden_states (torch.Tensor): Hidden states tensor.
+        :hidden_states (torch.Tensor): Last hidden state tensor.
         :position_ids (torch.Tensor): Position ids tensor.
     ``Optional`` Args:
         :num_logits_to_keep (int, optional): Number of speculative tokens, specified only for TLM SpD model
@@ -50,6 +50,26 @@ def filter_hidden_states(
     return hidden_states


+def project_hidden_states(hidden_states: torch.Tensor, hidden_size_projections: torch.nn.ModuleList) -> torch.Tensor:
+    """
+    Project the last hidden state through each hidden-size projection head and stack the results
+    ``Mandatory`` Args:
+        :hidden_states (torch.Tensor): Last hidden state tensor.
+        :hidden_size_projections (torch.nn.ModuleList): Post-attention hidden-size projection heads.
+    Returns:
+        :torch.Tensor: Stacked base and projected hidden states.
+    """
+    proj_hidden_states = [hidden_states]
+    num_projs = len(hidden_size_projections)
+    for i in range(num_projs):
+        hidden_states_i = hidden_size_projections[i](hidden_states)
+        proj_hidden_states.append(hidden_states_i)
+    hidden_states = torch.stack(proj_hidden_states, dim=2)  # shape: [bsz, seq_len, num_projs + 1, d_model]
+    return hidden_states
+
+
 def tlm_forward(
     self,
     input_ids: torch.LongTensor = None,
@@ -113,7 +133,10 @@ def tlm_forward(
     )

     hidden_states = filter_hidden_states(outputs[0], position_ids, num_logits_to_keep)
-    if self.config.pretraining_tp > 1:
+    hidden_size_projections = getattr(self, "projections", None)
+    if hidden_size_projections:
+        hidden_states = project_hidden_states(hidden_states, hidden_size_projections)
+    if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1:
         lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
         logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
         logits = torch.cat(logits, dim=-1)

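A runnable shape check of the stacking behavior in `project_hidden_states` (dummy sizes; plain `torch.nn.Linear` heads stand in for the turbo projections):

```python
import torch

bsz, seq_len, d_model, num_projs = 2, 4, 8, 3
hidden_states = torch.randn(bsz, seq_len, d_model)
projections = torch.nn.ModuleList([torch.nn.Linear(d_model, d_model) for _ in range(num_projs)])

# Base hidden state plus one projected copy per head, stacked on a new axis:
stacked = torch.stack([hidden_states] + [p(hidden_states) for p in projections], dim=2)
assert stacked.shape == (bsz, seq_len, num_projs + 1, d_model)
```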
QEfficient/transformers/spd/turbo.py

Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+import torch
+
+from QEfficient.utils.checkpoint_utils import load_checkpoint
+
+
+class ResBlock(torch.nn.Module):
+    """
+    A Residual Block module.
+    This module performs a linear transformation followed by a SiLU activation,
+    and then adds the result to the original input, creating a residual connection.
+    Args:
+        hidden_size (int): The size of the hidden layers in the block.
+    """
+
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.linear = torch.nn.Linear(hidden_size, hidden_size)
+        # Initialize as an identity mapping
+        torch.nn.init.zeros_(self.linear.weight)
+        # Use SiLU activation to keep consistent with the Llama model
+        self.act = torch.nn.SiLU()
+
+    def forward(self, x):
+        """
+        Forward pass of the ResBlock.
+        Args:
+            x (torch.Tensor): Input tensor.
+        Returns:
+            torch.Tensor: Output after the residual connection and activation.
+        """
+        return x + self.act(self.linear(x))
+
+
+def post_process_turbo_state_dict(state_dict: dict) -> dict:
+    """Normalize turbo state dict keys
+    Args:
+        state_dict (dict): turbo state dict
+    Returns:
+        dict: normalized state dict
+    """
+    new_state_dict = dict()
+    for name, weights in state_dict.items():
+        new_name = name.replace("projections.", "")
+        new_state_dict[new_name] = weights
+    return new_state_dict
+
+
+def build_and_attach_turbo(model, speculative_config: dict, speculative_weights: str):
+    """Build and attach turbo projections
+    Args:
+        model: model to attach projections to
+        speculative_config (dict): speculative config used to build projections
+        speculative_weights (str): path to the projection weights
+    Returns:
+        model: model with turbo projections
+    """
+    hidden_size = model.config.hidden_size
+    num_layers = speculative_config["turbo_num_layers"]
+    num_heads = speculative_config["turbo_num_heads"]
+    projections = torch.nn.ModuleList(
+        [
+            torch.nn.Sequential(
+                *([ResBlock(hidden_size)] * num_layers),
+            )
+            for _ in range(num_heads)
+        ],
+    )
+    load_checkpoint(projections, speculative_weights, strict=True, post_process_func=post_process_turbo_state_dict)
+    model.projections = projections
+    speculative_config["num_speculative_tokens"] = num_heads
+    return model

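A quick sanity sketch of the turbo head construction (toy sizes, no checkpoint loading), using the `ResBlock` defined in the new module above:

```python
import torch

from QEfficient.transformers.spd.turbo import ResBlock

hidden_size, num_layers, num_heads = 8, 2, 3
projections = torch.nn.ModuleList(
    [torch.nn.Sequential(*([ResBlock(hidden_size)] * num_layers)) for _ in range(num_heads)]
)
x = torch.randn(1, 4, hidden_size)
out = projections[0](x)
# Weights are zero-initialized, so each block adds only act(bias) to its input.
print((out - x).abs().max())
```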
QEfficient/utils/_utils.py

Lines changed: 14 additions & 0 deletions
@@ -5,6 +5,7 @@
 #
 # -----------------------------------------------------------------------------

+import inspect
 import json
 import os
 import subprocess
@@ -626,3 +627,16 @@ def make_serializable(obj):
     qconfigs["qpc_config"]["aic_compiler_config"] = aic_compiler_config

     create_json(qconfig_file_path, qconfigs)
+
+
+def filter_kwargs(func, kwargs):
+    """
+    Filter a dictionary of keyword arguments to only include the valid arguments of a function.
+    Args:
+        func: The function to check the arguments for.
+        kwargs: The dictionary of keyword arguments to filter.
+    Returns:
+        A new dictionary containing only the valid keyword arguments.
+    """
+    valid_args = inspect.signature(func).parameters
+    return {key: value for key, value in kwargs.items() if key in valid_args}

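`filter_kwargs` in action (standalone demo of the new helper):

```python
import inspect


def filter_kwargs(func, kwargs):
    valid_args = inspect.signature(func).parameters
    return {key: value for key, value in kwargs.items() if key in valid_args}


def connect(host, port=80):
    return f"{host}:{port}"


kwargs = {"host": "localhost", "port": 8080, "timeout": 5}
print(connect(**filter_kwargs(connect, kwargs)))  # localhost:8080; `timeout` is dropped
```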
QEfficient/utils/checkpoint_utils.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+# -----------------------------------------------------------------------------
+#
+# Copyright (c) 2025 Qualcomm Innovation Center, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# -----------------------------------------------------------------------------
+
+from safetensors.torch import load_file
+
+
+def load_checkpoint(model, checkpoint: str, strict=False, post_process_func=None):
+    """Load weights ending with the `.safetensors` extension
+    Args:
+        model: model to load weights into
+        checkpoint (str): checkpoint path
+        strict (bool, optional): strictness of loading weights. Defaults to False.
+        post_process_func (optional): Optional post-processing of loaded state dict. Defaults to None.
+    Returns:
+        model: model with applied weights
+    """
+    state_dict: dict = load_file(checkpoint)
+    if post_process_func is not None:
+        state_dict = post_process_func(state_dict)
+    model.load_state_dict(state_dict, strict=strict)
+    return model

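A round-trip sketch for the new loader (toy module; the file path is illustrative):

```python
import torch
from safetensors.torch import save_file

from QEfficient.utils.checkpoint_utils import load_checkpoint

model = torch.nn.Linear(4, 4)
save_file(model.state_dict(), "/tmp/demo.safetensors")

# post_process_func lets callers rewrite keys before loading,
# as post_process_turbo_state_dict does for the turbo projections.
reloaded = load_checkpoint(torch.nn.Linear(4, 4), "/tmp/demo.safetensors", strict=True)
```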