openMF · itsPronay · Mar 17, 2026 · Mar 17, 2026 · Mar 18, 2026
diff --git a/faster-whisper-benchmark/Faster_whisper_run_colab.ipynb b/faster-whisper-benchmark/Faster_whisper_run_colab.ipynb
diff --git a/faster-whisper-benchmark/README.md b/faster-whisper-benchmark/README.md
@@ -0,0 +1,49 @@
+# Faster Whisper Benchmark
+
+This repository provides benchmarking utilities for the [faster-whisper](https://github.com/SYSTRAN/faster-whisper) model, profiling both encoder and decoder performance on various devices. It supports logging to Weights & Biases (wandb) and outputs results in a pandas table for easy analysis.
+
+## Features
+- Benchmarks Whisper encoder and decoder separately
+- Logs detailed metrics (latency, memory, utilization, etc.)
+- Supports multiple devices and model variants
+- Outputs results as a pandas DataFrame
+- Optional logging to wandb
+
+## Setup
+
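+Qualcomm AI Hub must be installed and configured for every run, because all compile and profile jobs are submitted through it. Weights & Biases (wandb) only needs to be set up if you want to log results online.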
+
+**Important:** You must use PyTorch version 2.8.0 (see [PyTorch previous versions](https://pytorch.org/get-started/previous-versions/)); otherwise, decoder profiling will fail.
+
+## Usage
+
+Run the main benchmark script:
+
+```bash
+python main.py [--model_id MODEL_ID] [--batch_size N] [--decoder_len N] [--tokens N] [--feature_length N] [--wandb_project NAME] [--wandb_mode online|offline|disabled]
+```
+
+### Example
+
+```bash
+python main.py --model_id openai/whisper-small --batch_size 1 --feature_length 3000 --wandb_mode online
+```
+
+## Output
+- Prints a summary table of encoder and decoder metrics using pandas
+- Logs metrics to wandb if enabled
+- Shows total estimated latency and combined memory usage
+
+## File Structure
+- `main.py` — Main benchmarking script
+- `scripts/setup_env.py` — Environment and dependency setup
+- `models/whisper_wrappers.py` — Model wrapper classes
+- `utils/benchmark.py` — Model tracing/export helpers and Qualcomm AI Hub compile/profile job submission
+- `metrics/extractor.py` — Metric extraction and logging
+
+## Customization
+- Edit `devices_list` in `main.py` to benchmark on different devices
+- Adjust arguments to benchmark different model sizes or input shapes
+
diff --git a/faster-whisper-benchmark/main.py b/faster-whisper-benchmark/main.py
@@ -0,0 +1,106 @@
+import torch
+import qai_hub as hub
+from transformers import WhisperForConditionalGeneration
+import wandb
+import argparse
+import pandas as pd
+
+from models import WhisperEncoderWrapper, WhisperDecoderStepWrapper
+from utils import get_traced_model, get_traced_model_multi, profile_model
+from metrics.extractor import extract_metrics_from_profile
+
# Command-line interface for the benchmark. Parsing happens at import time and
# uses parse_known_args so that unrelated arguments (e.g. the ones injected by
# notebook kernels) are ignored instead of aborting the run.
parser = argparse.ArgumentParser('Faster Whisper Benchmark')

parser.add_argument('--model_id', type=str, default="openai/whisper-small", help='Model ID to benchmark')
parser.add_argument('--batch_size', type=int, default=1, help='Batch size for benchmarking')
parser.add_argument('--decoder_len', type=int, default=16, help='Decoder input length for benchmarking')
parser.add_argument('--tokens', type=int, default=120, help='Estimated number of output tokens to calculate total latency')
parser.add_argument('--feature_length', type=int, default=3000, help='Feature length to benchmark')

parser.add_argument('--wandb_project', type=str, default="faster-whisper-benchmark", help='Weights & Biases project name')
# 'disabled' must be an accepted choice: main() checks for it to skip all
# wandb calls, so rejecting it here made that code path unreachable.
parser.add_argument(
    '--wandb_mode',
    type=str,
    choices=['online', 'offline', 'disabled'],
    default="online",
    help='Weights & Biases mode (online/offline/disabled)',
)

args, _ = parser.parse_known_args()

# Keep the devices you want to run the benchmark on
devices_list = [
    # "Google Pixel 3a",
    # "Samsung Galaxy S24 (Family)",
    "Samsung Galaxy S25 Ultra"
]
+
def main():
    """Profile the Whisper encoder and decoder step on every device in ``devices_list``.

    The model is loaded, run once on random features (to obtain the encoder
    output fed to the decoder), and traced/exported a single time; those
    artifacts are then compiled and profiled on each selected device.

    For every device the function prints a pandas summary table plus the
    estimated end-to-end latency (``encoder + tokens * decoder_step``) and the
    summed encoder/decoder peak memory, and logs the same metrics to Weights &
    Biases unless ``--wandb_mode disabled`` was given.
    """
    if not devices_list:
        print("No devices selected; nothing to benchmark.")
        return

    wandb_enabled = args.wandb_mode != 'disabled'

    # Everything below is independent of the target device, so it is done once
    # instead of once per device.
    base_model = WhisperForConditionalGeneration.from_pretrained(args.model_id).eval().cpu()
    encoder_model = WhisperEncoderWrapper(base_model).eval().cpu()
    decoder_model = WhisperDecoderStepWrapper(base_model).eval().cpu()

    # Read the mel-bin count from the checkpoint config rather than assuming 80,
    # so checkpoints with a different feature size get a valid input shape.
    num_mel_bins = getattr(base_model.config, "num_mel_bins", 80)
    encoder_shape = (args.batch_size, num_mel_bins, args.feature_length)
    print(f"Benchmarking feature shape: {encoder_shape}")

    # Run the encoder once to obtain a real hidden-state tensor (and its shape)
    # to use as the decoder's example input.
    with torch.no_grad():
        dummy_features = torch.rand(encoder_shape, dtype=torch.float32)
        encoder_hidden = encoder_model(dummy_features)
    encoder_hidden_shape = tuple(encoder_hidden.shape)

    decoder_input = torch.ones((args.batch_size, args.decoder_len), dtype=torch.int32)

    traced_encoder = get_traced_model(encoder_shape, encoder_model)
    exported_decoder = get_traced_model_multi((decoder_input, encoder_hidden), decoder_model)

    encoder_specs = {"input_features": encoder_shape}
    decoder_specs = {
        "decoder_input_ids": (tuple(decoder_input.shape), "int32"),
        "encoder_hidden_states": (encoder_hidden_shape, "float32"),
    }

    for device_name in devices_list:

        if wandb_enabled:
            wandb.init(
                project=args.wandb_project,
                name=f"Model: {args.model_id}, Device: {device_name},",
                mode=args.wandb_mode,
                config=vars(args),
            )

        try:
            device = hub.Device(device_name)

            # Perform benchmarking for encoder
            encoder_profile = profile_model(traced_encoder, device, encoder_specs)
            encoder_metrics = extract_metrics_from_profile(encoder_profile)

            # Perform benchmarking for decoder
            decoder_profile = profile_model(exported_decoder, device, decoder_specs)
            decoder_metrics = extract_metrics_from_profile(decoder_profile)
        except Exception:
            # Close the run as failed so the next device does not inherit a
            # dangling wandb run, then let the error propagate.
            if wandb_enabled:
                wandb.finish(exit_code=1)
            raise

        enc_ms = encoder_metrics.get("estimated_inference_time_ms")
        dec_ms = decoder_metrics.get("estimated_inference_time_ms")
        est_total_ms = None
        if enc_ms is not None and dec_ms is not None:
            # One encoder pass plus one decoder step per generated token.
            est_total_ms = enc_ms + args.tokens * dec_ms

        # Combine total memory usage (sum encoder and decoder peak memory)
        encoder_mem = encoder_metrics.get("estimated_inference_peak_memory", 0)
        decoder_mem = decoder_metrics.get("estimated_inference_peak_memory", 0)
        total_memory_mb = encoder_mem + decoder_mem

        all_metrics = {
            "estimated_total_latency_ms": est_total_ms,
            "total_memory_mb": total_memory_mb,
            "encoder": encoder_metrics,
            "decoder": decoder_metrics,
        }

        # Always show the results locally; otherwise a run with wandb disabled
        # would compute the metrics and silently throw them away.
        summary = pd.DataFrame({"encoder": encoder_metrics, "decoder": decoder_metrics})
        print(f"Results for device: {device_name}")
        print(summary.to_string())
        print(f"Estimated total latency for {args.tokens} tokens (ms): {est_total_ms}")
        print(f"Combined peak memory (MB): {total_memory_mb}")

        if wandb_enabled:
            wandb.log(all_metrics)
            wandb.finish()


if __name__ == "__main__":
    main()
diff --git a/faster-whisper-benchmark/metrics/__init__.py b/faster-whisper-benchmark/metrics/__init__.py
@@ -0,0 +1 @@
+from .extractor import extract_metrics_from_profile
diff --git a/faster-whisper-benchmark/metrics/extractor.py b/faster-whisper-benchmark/metrics/extractor.py
@@ -0,0 +1,81 @@
+import numpy as np
+
+
def us_to_ms(x):
    """Convert a duration from microseconds to milliseconds."""
    return x / 1e3


def bytes_to_mb(x):
    """Convert a size from bytes to megabytes (1 MB = 1024 ** 2 bytes)."""
    return x / (1024 ** 2)


def _number(mapping, key):
    """Return ``mapping[key]``, treating a missing or ``None`` value as 0."""
    value = mapping.get(key)
    return 0 if value is None else value


def extract_metrics_from_profile(profile: dict):
    """Flatten a downloaded profile into a dict of summary metrics.

    Args:
        profile: Mapping with an ``"execution_summary"`` dict (times read as
            microseconds, memory read as bytes) and an ``"execution_detail"``
            list of per-op dicts carrying ``"execution_time"`` and
            ``"compute_unit"``. Missing or ``None`` entries are treated as
            empty / zero.

    Returns:
        A dict of rounded metrics. Times are reported in milliseconds and
        memory in MB; note that ``"estimated_inference_peak_memory"`` is in MB
        even though its key carries no unit suffix. Metrics that cannot be
        computed (no latency samples, zero denominators) are omitted, and the
        per-op metrics are only present when ``execution_detail`` is non-empty.
    """
    exec_sum = profile.get("execution_summary") or {}
    exec_detail = profile.get("execution_detail") or []

    # ── End-to-End Performance ────────────────────────────────────
    raw_times = exec_sum.get("all_inference_times")
    times = np.asarray([] if raw_times is None else raw_times, dtype=float)
    first_load = _number(exec_sum, "first_load_time")
    warm_load = _number(exec_sum, "warm_load_time")

    metrics = {
        "estimated_inference_time_ms": round(
            us_to_ms(_number(exec_sum, "estimated_inference_time")), 4
        ),
    }

    if times.size:
        mean_us = float(times.mean())
        std_us = float(times.std())
        metrics.update({
            "mean_latency_ms": round(us_to_ms(mean_us), 4),
            "min_latency_ms": round(us_to_ms(float(times.min())), 4),
            "max_latency_ms": round(us_to_ms(float(times.max())), 4),
            "std_dev_ms": round(us_to_ms(std_us), 4),
        })
        # A zero mean would make both ratios NaN/inf, so they are left out
        # rather than reported as meaningless numbers.
        if mean_us > 0:
            metrics.update({
                "coeff_of_variation": round((std_us / mean_us) * 100, 4),
                "throughput_fps": round(1000 / us_to_ms(mean_us), 4),
            })

    metrics.update({
        "cold_start_ms": round(us_to_ms(first_load), 4),
        "warm_start_ms": round(us_to_ms(warm_load), 4),
    })
    if warm_load:
        metrics["speedup_cold_warm"] = round(first_load / warm_load, 4)

    # ── Memory Footprint ──────────────────────────────────────────
    inf_mem = _number(exec_sum, "estimated_inference_peak_memory")
    cold_mem = _number(exec_sum, "first_load_peak_memory")
    warm_mem = _number(exec_sum, "warm_load_peak_memory")

    metrics.update({
        "estimated_inference_peak_memory": round(bytes_to_mb(inf_mem), 4),
        "cold_start_peak_mb": round(bytes_to_mb(cold_mem), 4),
        "warm_start_peak_mb": round(bytes_to_mb(warm_mem), 4),
    })
    if cold_mem:
        metrics["memory_reduction_cold_warm_pct"] = round((1 - warm_mem / cold_mem) * 100, 4)
    if warm_mem:
        metrics["memory_reduction_warm_inf_pct"] = round((1 - inf_mem / warm_mem) * 100, 4)
    if cold_mem:
        metrics["memory_efficiency_ratio"] = round(inf_mem / cold_mem, 4)

    # ── Accelerator Utilization ───────────────────────────────────
    if exec_detail:
        op_times = [_number(op, "execution_time") for op in exec_detail]
        total_time = sum(op_times)
        total_op_count = len(op_times)
        zero_op_count = sum(1 for op_time in op_times if op_time == 0)
        nonzero_op_count = total_op_count - zero_op_count

        # Accumulate execution time per compute unit.
        unit_times = {}
        for op, op_time in zip(exec_detail, op_times):
            unit = op.get("compute_unit", "UNKNOWN")
            unit_times[unit] = unit_times.get(unit, 0) + op_time

        def _share_pct(unit):
            """Percentage of the total op time spent on ``unit``."""
            if not total_time:
                return 0.0
            return round(unit_times.get(unit, 0.0) / total_time * 100, 4)

        metrics.update({
            "total_op_count": total_op_count,
            "nonzero_op_count": nonzero_op_count,
            "zero_op_count": zero_op_count,
            "zero_op_percentage": round(zero_op_count / total_op_count * 100, 4),
            "avg_op_time_ms": (
                round(us_to_ms(total_time / nonzero_op_count), 4) if nonzero_op_count else 0.0
            ),
            "total_op_time_ms": round(us_to_ms(total_time), 4),
            "dominant_compute_unit": max(unit_times, key=unit_times.get),
            "cpu_utilization_percentage": _share_pct("CPU"),
            "gpu_utilization_percentage": _share_pct("GPU"),
            "npu_utilization_percentage": _share_pct("NPU"),
        })

        # ── Performance Bottlenecks ───────────────────────────────
        # Time spent in the 15 slowest ops.
        top_ops_total = sum(sorted(op_times, reverse=True)[:15])

        metrics.update({
            "top15_ops_time_ms": round(us_to_ms(top_ops_total), 4),
            "top15_ops_pct_of_total": round(top_ops_total / total_time * 100, 4) if total_time else 0.0,
            "effective_op_time_ratio": round(top_ops_total / total_time, 4) if total_time else 0.0,
        })

    return metrics
diff --git a/faster-whisper-benchmark/models/__init__.py b/faster-whisper-benchmark/models/__init__.py
@@ -0,0 +1 @@
+from .whisper_wrappers import WhisperEncoderWrapper, WhisperDecoderStepWrapper
diff --git a/faster-whisper-benchmark/models/whisper_wrappers.py b/faster-whisper-benchmark/models/whisper_wrappers.py
@@ -0,0 +1,32 @@
+import torch
+import torch.nn as nn
+from transformers import WhisperForConditionalGeneration
+
class WhisperEncoderWrapper(nn.Module):
    """Expose only the Whisper encoder with a single-tensor input and output."""

    def __init__(self, model: "WhisperForConditionalGeneration") -> None:
        """Keep a reference to the encoder of ``model``."""
        super().__init__()
        self.encoder = model.model.encoder

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        """Return the encoder's last hidden state for ``input_features``."""
        # Request the attribute-style output explicitly (as the decoder wrapper
        # does) so `.last_hidden_state` works even if the model config is set
        # up to return plain tuples.
        encoded = self.encoder(input_features=input_features, return_dict=True)
        return encoded.last_hidden_state
+
class WhisperDecoderStepWrapper(nn.Module):
    """Expose one decoder step: token ids + encoder states -> last-position logits."""

    def __init__(self, model: "WhisperForConditionalGeneration") -> None:
        """Keep references to the decoder and output projection of ``model``."""
        super().__init__()
        self.decoder = model.model.decoder
        self.proj_out = model.proj_out

    def forward(
        self,
        decoder_input_ids: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Return the logits for the last position of ``decoder_input_ids``.

        The ids are cast to int32 before being passed to the decoder, and the
        decoder is called with caching disabled.
        """
        token_ids = decoder_input_ids.to(torch.int32)
        out = self.decoder(
            input_ids=token_ids,
            encoder_hidden_states=encoder_hidden_states,
            use_cache=False,
            return_dict=True,
        )
        # Only the last position is returned, so slice before projecting:
        # same logits, without projecting every earlier position first.
        last_hidden = out.last_hidden_state[:, -1, :]
        return self.proj_out(last_hidden)
diff --git a/faster-whisper-benchmark/utils/__init__.py b/faster-whisper-benchmark/utils/__init__.py
@@ -0,0 +1,3 @@
+from .benchmark import (
+    profile_model, get_exported_model_multi, get_traced_model, get_traced_model_multi,
+)
diff --git a/faster-whisper-benchmark/utils/benchmark.py b/faster-whisper-benchmark/utils/benchmark.py
@@ -0,0 +1,56 @@
+import torch
+import qai_hub as hub
+
def run_compile(traced_model, device, input_specs_or_shape):
    """Submit a compile job for ``traced_model`` targeting ``device``.

    Args:
        traced_model: Model to compile, passed through to
            ``hub.submit_compile_job``.
        device: Target device for the compile job.
        input_specs_or_shape: Either a ready-made input-spec dict, or a bare
            shape, which is wrapped as a single input named ``"image"``.

    Returns:
        The submitted ``hub.CompileJob``.

    Raises:
        TypeError: If the submission does not return a single ``CompileJob``.
    """
    if isinstance(input_specs_or_shape, dict):
        input_specs = input_specs_or_shape
    else:
        input_specs = {"image": input_specs_or_shape}

    compile_job = hub.submit_compile_job(
        model=traced_model,
        device=device,
        input_specs=input_specs,
    )

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(compile_job, hub.CompileJob):
        raise TypeError(
            f"Expected a single hub.CompileJob, got {type(compile_job).__name__}"
        )
    return compile_job
+
+
def run_profile(compiled_job, device):
    """Submit a profile job for the model produced by ``compiled_job``.

    Args:
        compiled_job: Compile job whose target model should be profiled.
        device: Device to profile on.

    Returns:
        The submitted ``hub.ProfileJob``, named ``<compile job name>_profiling``.

    Raises:
        RuntimeError: If the compile job produced no target model.
        TypeError: If the submission does not return a single ``ProfileJob``.
    """
    target_model = compiled_job.get_target_model()
    if target_model is None:
        # Fail here with a clear message rather than submitting `None`.
        raise RuntimeError(
            f"Compile job '{compiled_job.name}' did not produce a target model; "
            "cannot submit a profile job."
        )

    profile_job = hub.submit_profile_job(
        model=target_model,
        device=device,
        name=compiled_job.name + "_profiling",
    )

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not isinstance(profile_job, hub.ProfileJob):
        raise TypeError(
            f"Expected a single hub.ProfileJob, got {type(profile_job).__name__}"
        )
    return profile_job
+
+
def get_traced_model(input_shape, model, dtype=torch.float32):
    """Trace ``model`` with ``torch.jit.trace`` on one random example input.

    Args:
        input_shape: Shape of the random example tensor.
        model: Module to trace.
        dtype: Dtype of the example tensor.

    Returns:
        The traced module.
    """
    sample = torch.rand(input_shape, dtype=dtype)
    # Tracing runs the model once; gradients are not needed for that pass.
    with torch.no_grad():
        return torch.jit.trace(model, sample)
+
+
def get_traced_model_multi(example_inputs, model):
    """Export a multi-input ``model`` and return the path of the saved program.

    Despite the name, this does not use ``torch.jit.trace``: it is an alias of
    ``get_exported_model_multi`` (``torch.export``), kept for existing callers.
    Delegating avoids maintaining two identical implementations.

    Args:
        example_inputs: Tuple of example input tensors for ``model``.
        model: Module to export.

    Returns:
        Path of the saved exported program.
    """
    return get_exported_model_multi(example_inputs, model)
+
def profile_model(traced_model: torch.jit.ScriptModule, device: hub.Device, input_specs: dict) -> dict:
    """Compile ``traced_model`` for ``device``, profile it, and return the profile.

    Args:
        traced_model: Model handed to the compile step.
        device: Device used for both compilation and profiling.
        input_specs: Input specification passed to the compile step.

    Returns:
        The profile downloaded from the finished profile job.
    """
    compile_job = run_compile(traced_model, device, input_specs)
    return run_profile(compile_job, device).download_profile()
+
+def get_exported_model_multi(example_inputs, model):
+    """Use torch.export for decoder (avoids aten::diff ONNX issue)."""
+    with torch.no_grad():
+        exported = torch.export.export(model, example_inputs)
+    tmp_path = "/tmp/decoder_exported.pt2"
+    torch.export.save(exported, tmp_path)
+    return tmp_path
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .extractor import extract_metrics_from_profile
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		from .whisper_wrappers import WhisperEncoderWrapper, WhisperDecoderStepWrapper