Minor Changes to AutoModelForSpeechSeq2Seq to better align with other models (#286)

kdulla · web-flow · commit 5520757f30ad · 2025-03-12T19:20:12.000+05:30
Minor fixes to `generate` and `compile` of AutoModelForSpeechSeq2Seq so
that they are called consistently with the other model classes.

---------

Signed-off-by: Kushal Dulla <quic_kdulla@quicinc.com>
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -579,6 +579,7 @@ def export(
         )
 
         self.lang_model.export(inputs["lang"], output_names["lang"], dynamic_axes["lang"], export_dir)
+        return self.onnx_path
 
     def compile(
         self,
@@ -676,6 +677,7 @@ def compile(
             custom_io=custom_io_lang,
             **compiler_options,
         )
+        return self.qpc_path
 
     def generate(
         self,
@@ -895,7 +897,7 @@ def export(
         inputs = self.model.get_dummy_inputs()
         dynamic_axes = self.model.get_onnx_dynamic_axes()
         output_names = self.model.get_output_names()
-        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
 
     def compile(
         self,
@@ -1727,20 +1729,26 @@ def export(self, export_dir: Optional[str] = None) -> str:
         inputs = self.model.get_dummy_inputs()
         dynamic_axes = self.model.get_onnx_dynamic_axes()
         output_names = self.model.get_output_names()
-        self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
+        return self._export(inputs, output_names, dynamic_axes, export_dir=export_dir)
 
     def compile(
         self,
         onnx_path: Optional[str] = None,
         compile_dir: Optional[str] = None,
         *,
-        encoder_ctx_len: int = 1500,
-        decoder_ctx_len: int = 150,
-        feature_len: int = 3000,
+        prefill_seq_len: Optional[int] = 1,
+        encoder_ctx_len: Optional[int] = None,
+        ctx_len: int = 150,
+        full_batch_size: Optional[int] = None,
+        kv_cache_batch_size: Optional[int] = None,
         batch_size: int = 1,
         num_devices: int = 1,
         num_cores: int = 16,  # FIXME: Make this mandatory arg
         mxfp6_matmul: bool = False,
+        mxint8_kv_cache: bool = False,
+        num_speculative_tokens: Optional[int] = None,
+        enable_qnn: bool = False,
+        qnn_config: Optional[str] = None,
         **compiler_options,
     ) -> str:
         """
@@ -1751,19 +1759,41 @@ def compile(
         ``Optional`` Args:
             :onnx_path (str, optional): Path to pre-exported onnx model.
             :compile_dir (str, optional): Path for saving the qpc generated.
-            :seq_len (int, optional): The length of the prompt should be less that ``seq_len``. ``Defaults to 32``.
+            :encoder_ctx_len (int, optional): The maximum length of context for encoder, based on the AutoProcessor output. ``Defaults to checking config, if None in config then 1500``
+            :ctx_len (int, optional): The maximum length of context to keep for decoding. ``Defaults to 150``.
             :batch_size (int, optional): Batch size. ``Defaults to 1``.
             :num_devices (int): Number of devices the model needs to be compiled for. Defaults to 1.
             :num_cores (int): Number of cores used to compile the model.
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
-            :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+
+            Other args are not yet implemented for AutoModelForSpeechSeq2Seq
         Returns:
             :str: Path of the compiled ``qpc`` package.
         """
-        specializations = self.model.get_specializations(batch_size, encoder_ctx_len, decoder_ctx_len, feature_len)
+        specializations, compiler_options = self.model.get_specializations(
+            batch_size,
+            encoder_ctx_len,
+            ctx_len,
+            **compiler_options,
+        )
 
-        self._compile(
+        if full_batch_size:
+            logger.warning("Continuous batching is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if kv_cache_batch_size:
+            logger.warning("Prefix caching is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if mxint8_kv_cache:
+            logger.warning("mxint8 cache is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if num_speculative_tokens:
+            logger.warning("Speculative decoding is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        if enable_qnn or qnn_config:
+            logger.warning("QNN compile is not yet enabled for AutoModelForSpeechSeq2Seq")
+
+        return self._compile(
             onnx_path,
             compile_dir,
             compile_only=True,
@@ -1781,7 +1811,6 @@ def generate(
         inputs: torch.Tensor,
         generation_len: int,
         streamer: Optional[TextStreamer] = None,
-        enable_debug_logs: bool = False,
         device_ids: List[int] = None,
     ) -> Union[torch.Tensor, np.ndarray]:
         """
@@ -1790,9 +1819,8 @@ def generate(
 
         ``Mandatory`` Args:
             :processor: autoprocessor to process inputs and decode logits
-            :inputs (np.ndarray): inputs to run the execution.
+            :inputs (torch.Tensor): inputs to run the execution.
             :generation_len (int): length upto which to generate
-            :sample_rate (int): sampling rate at which input audio is stored in inputs (needed for processor)
             :device_id (List[int]): Ids of devices for running the qpc pass as [0] in case of normal model / [0, 1, 2, 3] in case of tensor slicing model
         Returns:
             :dict: Output from the ``AI_100`` or ``PyTorch`` runtime.
@@ -1803,9 +1831,20 @@ def generate(
         inputs = self.auto_correct_inputs(inputs)
 
         if self.qpc_session is None:
-            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids, enable_debug_logs=enable_debug_logs)
+            self.qpc_session = QAICInferenceSession(str(self.qpc_path), device_ids)
             self.batch_size = self.qpc_session.bindings[0].dims[0]
 
+        inputs["input_features"] = inputs["input_features"].numpy().astype(np.float32)
+
+        # add start token id and initial position ids to inputs
+        seq_len = 1
+        inputs["decoder_input_ids"] = (
+            torch.ones((self.batch_size, seq_len), dtype=torch.int64) * self.model.config.decoder_start_token_id
+        ).numpy()
+        inputs["decoder_position_ids"] = (
+            torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(self.batch_size, 1).numpy()
+        )
+
         self.qpc_session.skip_buffers(
             [x for x in self.qpc_session.input_names + self.qpc_session.output_names if x.startswith("past_")]
         )
diff --git a/QEfficient/transformers/models/whisper/modeling_whisper.py b/QEfficient/transformers/models/whisper/modeling_whisper.py
@@ -8,7 +8,6 @@
 import random
 from typing import Optional, Tuple
 
-import numpy as np
 import torch
 from torch import nn
 from transformers.cache_utils import Cache, EncoderDecoderCache, StaticCache
@@ -812,28 +811,33 @@ def get_dummy_inputs(
 
         return inputs
 
-    def get_specializations(
-        self, batch_size: int, encoder_ctx_len: int, decoder_ctx_len: int, feature_len: int, **compiler_options
-    ):
+    def get_specializations(self, batch_size: int, encoder_ctx_len, ctx_len, **compiler_options):
+        if encoder_ctx_len is None and hasattr(self.config, "max_source_positions"):
+            encoder_ctx_len = self.config.max_source_positions
+        elif encoder_ctx_len is None:
+            encoder_ctx_len = 1500
+            logger.warning("Setting `encoder_ctx_len=1500` as it was neither passed nor found in config")
+        feature_len = encoder_ctx_len * 2
+
         encoder_specializations = {
             "batch_size": batch_size,
             "seq_len": 1,
             "encoder_ctx_len": encoder_ctx_len,
-            "decoder_ctx_len": decoder_ctx_len,
+            "decoder_ctx_len": ctx_len,
             "feature_len": feature_len,
         }
 
         decoder_specializations = {
             "batch_size": batch_size,
             "seq_len": 1,
             "encoder_ctx_len": encoder_ctx_len,
-            "decoder_ctx_len": decoder_ctx_len,
+            "decoder_ctx_len": ctx_len,
             "feature_len": 1,  # important dummy feature so that torch.where knows whether to run cross attention or not
         }
 
         specializations = [encoder_specializations, decoder_specializations]
 
-        return specializations
+        return specializations, compiler_options
 
     def get_onnx_dynamic_axes(
         self,
@@ -874,7 +878,5 @@ def get_output_names(
 
     def get_inputs_info(self):
         return [
-            IOInfo(name="input_features", datatype=np.float32, shape=("batch_size", "num_mel_bins", "feature_len")),
-            IOInfo(name="decoder_input_ids", datatype=np.int64, shape=("batch_size", "seq_len")),
-            IOInfo(name="decoder_position_ids", datatype=np.int64, shape=("batch_size", "seq_len")),
+            IOInfo(name="input_features", datatype=torch.float32, shape=("batch_size", "num_mel_bins", "feature_len")),
         ]
diff --git a/examples/speech_to_text/run_whisper_speech_to_text.py b/examples/speech_to_text/run_whisper_speech_to_text.py
@@ -5,8 +5,6 @@
 #
 # -----------------------------------------------------------------------------
 
-import numpy as np
-import torch
 from datasets import load_dataset
 from transformers import AutoProcessor
 
@@ -29,24 +27,10 @@
 ## STEP 3 -- export and compile model
 qeff_model.compile()
 
-## STEP 4 -- prepare generate inputs
-bs = 1
-seq_len = 1
-input_features = (
-    processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32)
+## STEP 4 -- generate output for loaded input and processor
+exec_info = qeff_model.generate(
+    inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len
 )
-decoder_input_ids = (
-    torch.ones((bs, seq_len), dtype=torch.int64) * qeff_model.model.config.decoder_start_token_id
-).numpy()
-decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1).numpy()
-inputs = dict(
-    input_features=input_features,
-    decoder_input_ids=decoder_input_ids,
-    decoder_position_ids=decoder_position_ids,
-)
-
-## STEP 5 -- generate output for loaded input and processor
-exec_info = qeff_model.generate(inputs=inputs, generation_len=ctx_len)
 
-## STEP 6 (optional) -- use processor to decode output
+## STEP 5 (optional) -- use processor to decode output
 print(processor.batch_decode(exec_info.generated_ids)[0])
diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py
@@ -334,28 +334,14 @@ def check_seq2seq_pytorch_vs_kv_vs_ort_vs_ai100(
         pytest.skip("No available devices to run model on Cloud AI 100")
 
     qeff_model.compile(
-        encoder_ctx_len=qeff_model.model.config.max_source_positions,
-        decoder_ctx_len=ctx_len,
+        ctx_len=ctx_len,
         num_cores=16,
         batch_size=batch_size,
     )
 
-    bs = 1
-    seq_len = 1
-    input_features = (
-        processor(data, sampling_rate=sample_rate, return_tensors="pt").input_features.numpy().astype(np.float32)
-    )
-    decoder_input_ids = (
-        torch.ones((bs, seq_len), dtype=torch.int64) * qeff_model.model.config.decoder_start_token_id
-    ).numpy()
-    decoder_position_ids = torch.arange(seq_len, dtype=torch.int64).view(1, seq_len).repeat(bs, 1).numpy()
-    inputs = dict(
-        input_features=input_features,
-        decoder_input_ids=decoder_input_ids,
-        decoder_position_ids=decoder_position_ids,
+    exec_info = qeff_model.generate(
+        inputs=processor(data, sampling_rate=sample_rate, return_tensors="pt"), generation_len=ctx_len
     )
-
-    exec_info = qeff_model.generate(inputs=inputs, generation_len=ctx_len)
     cloud_ai_100_tokens = exec_info.generated_ids[0]  # Because we always run for single input and single batch size
     assert (pytorch_kv_tokens == cloud_ai_100_tokens).all(), (
         "Tokens don't match for pytorch output and Cloud AI 100 output."