Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
399 changes: 399 additions & 0 deletions faster-whisper-benchmark/Faster_whisper_run_colab.ipynb

Large diffs are not rendered by default.

49 changes: 49 additions & 0 deletions faster-whisper-benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Faster Whisper Benchmark

This repository provides benchmarking utilities for the [faster-whisper](https://github.com/SYSTRAN/faster-whisper) model, profiling both encoder and decoder performance on various devices. It supports logging to Weights & Biases (wandb) and outputs results in a pandas table for easy analysis.

## Features
- Benchmarks Whisper encoder and decoder separately
- Logs detailed metrics (latency, memory, utilization, etc.)
- Supports multiple devices and model variants
- Outputs results as a pandas DataFrame
- Optional logging to wandb

## Setup

You must set up Qualcomm AI Hub. If you want to log results online, you must also set up wandb (Weights & Biases).

**Important:** You must use PyTorch version 2.8.0; otherwise, decoder profiling will fail.
See [PyTorch previous versions](https://pytorch.org/get-started/previous-versions/) for installation instructions.

## Usage

Run the main benchmark script:

```bash
python main.py [--model_id MODEL_ID] [--batch_size N] [--decoder_len N] [--tokens N] [--feature_length N] [--wandb_project NAME] [--wandb_mode online|offline|disabled]
```

### Example

```bash
python main.py --model_id openai/whisper-small --batch_size 1 --feature_length 3000 --wandb_mode online
```

## Output
- Prints a summary table of encoder and decoder metrics using pandas
- Logs metrics to wandb if enabled
- Shows total estimated latency and combined memory usage

## File Structure
- `main.py` — Main benchmarking script
- `scripts/setup_env.py` — Environment and dependency setup
- `models/whisper_wrappers.py` — Model wrapper classes
- `utils/benchmark.py` — Torch utility functions
- `metrics/extractor.py` — Metric extraction and logging

## Customization
- Edit `devices_list` in `main.py` to benchmark on different devices
- Adjust arguments to benchmark different model sizes or input shapes

106 changes: 106 additions & 0 deletions faster-whisper-benchmark/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
import torch
import qai_hub as hub
from transformers import WhisperForConditionalGeneration
import wandb
import argparse
import pandas as pd

from models import WhisperEncoderWrapper, WhisperDecoderStepWrapper
from utils import get_traced_model, get_traced_model_multi, profile_model
from metrics.extractor import extract_metrics_from_profile

# Command-line configuration for the benchmark. Parsed at import time with
# parse_known_args() so that unrelated arguments (e.g. those injected by a
# notebook kernel) are ignored instead of aborting the run.
parser = argparse.ArgumentParser('Faster Whisper Benchmark')

parser.add_argument('--model_id', type=str, default="openai/whisper-small", help='Model ID to benchmark')
parser.add_argument('--batch_size', type=int, default=1, help='Batch size for benchmarking')
parser.add_argument('--decoder_len', type=int, default=16, help='Decoder input length for benchmarking')
parser.add_argument('--tokens', type=int, default=120, help='Estimated number of output tokens to calculate total latency')
parser.add_argument('--feature_length', type=int, default=3000, help='Feature length to benchmark')

parser.add_argument('--wandb_project', type=str, default="faster-whisper-benchmark", help='Weights & Biases project name')
# 'disabled' has to be an accepted choice: main() tests for it to skip all
# wandb calls, and without it argparse rejects `--wandb_mode disabled`.
parser.add_argument(
    '--wandb_mode',
    type=str,
    choices=['online', 'offline', 'disabled'],
    default="online",
    help='Weights & Biases mode (online/offline/disabled)',
)

args, _ = parser.parse_known_args()

# Keep the devices you want to run the benchmark on
devices_list = [
    # "Google Pixel 3a",
    # "Samsung Galaxy S24 (Family)",
    "Samsung Galaxy S25 Ultra",
]

def main():
    """Profile the Whisper encoder and decoder step on each device in ``devices_list``.

    The model is loaded, wrapped, traced (encoder) and exported (decoder step)
    once, because none of that depends on the target device. For every device
    both parts are then compiled and profiled through ``profile_model`` and the
    metrics are extracted with ``extract_metrics_from_profile``. Two combined
    figures are derived per device:

    * estimated total latency = encoder time + ``args.tokens`` * decoder-step time
    * total memory = encoder peak memory + decoder peak memory

    A summary table is printed for each device and, unless ``args.wandb_mode``
    is ``'disabled'``, the metrics are logged to one Weights & Biases run per
    device.
    """
    if not devices_list:
        # Nothing to benchmark; avoid loading the model for no reason.
        return

    wandb_enabled = args.wandb_mode != 'disabled'

    # ── Device-independent preparation (done once, not once per device) ──
    base_model = WhisperForConditionalGeneration.from_pretrained(args.model_id).eval().cpu()
    encoder_model = WhisperEncoderWrapper(base_model).eval().cpu()
    decoder_model = WhisperDecoderStepWrapper(base_model).eval().cpu()

    # Read the mel-bin count from the model config instead of assuming 80, so
    # checkpoints configured with a different number of mel bins get a
    # matching input shape. Falls back to the previous constant if absent.
    num_mel_bins = getattr(base_model.config, "num_mel_bins", 80)
    encoder_shape = (args.batch_size, num_mel_bins, args.feature_length)
    traced_encoder = get_traced_model(encoder_shape, encoder_model)

    # Run the encoder once on random features: the decoder export needs an
    # example tensor for the encoder hidden states, and its shape is reused
    # for the decoder input specification.
    with torch.no_grad():
        dummy_features = torch.rand(encoder_shape, dtype=torch.float32)
        encoder_hidden = encoder_model(dummy_features)
    encoder_hidden_shape = tuple(encoder_hidden.shape)

    decoder_input = torch.ones((args.batch_size, args.decoder_len), dtype=torch.int32)
    exported_decoder = get_traced_model_multi((decoder_input, encoder_hidden), decoder_model)
    decoder_specs = {
        "decoder_input_ids": (tuple(decoder_input.shape), "int32"),
        "encoder_hidden_states": (encoder_hidden_shape, "float32"),
    }

    # ── Per-device profiling ──
    for device_name in devices_list:
        if wandb_enabled:
            wandb.init(
                project=args.wandb_project,
                name=f"Model: {args.model_id}, Device: {device_name},",
                mode=args.wandb_mode,
                config=vars(args),
            )

        # Separate names for the device label and the hub.Device handle; the
        # original rebound the loop variable.
        device = hub.Device(device_name)
        print(f"Benchmarking feature shape: {encoder_shape}")

        # Perform benchmarking for encoder
        encoder_profile = profile_model(
            traced_encoder,
            device,
            {"input_features": encoder_shape},
        )
        encoder_metrics = extract_metrics_from_profile(encoder_profile)

        # Perform benchmarking for decoder
        decoder_profile = profile_model(exported_decoder, device, decoder_specs)
        decoder_metrics = extract_metrics_from_profile(decoder_profile)

        # One encoder pass plus one decoder step per generated token.
        enc_ms = encoder_metrics.get("estimated_inference_time_ms")
        dec_ms = decoder_metrics.get("estimated_inference_time_ms")
        est_total_ms = None
        if enc_ms is not None and dec_ms is not None:
            est_total_ms = enc_ms + args.tokens * dec_ms

        # Combine total memory usage (sum encoder and decoder peak memory)
        encoder_mem = encoder_metrics.get("estimated_inference_peak_memory", 0)
        decoder_mem = decoder_metrics.get("estimated_inference_peak_memory", 0)
        total_memory_mb = encoder_mem + decoder_mem

        all_metrics = {
            "estimated_total_latency_ms": est_total_ms,
            "total_memory_mb": total_memory_mb,
            "encoder": encoder_metrics,
            "decoder": decoder_metrics,
        }

        # Summary table: one row per metric, one column per model part.
        summary = pd.DataFrame({"encoder": encoder_metrics, "decoder": decoder_metrics})
        print(f"\nResults for {device_name}:")
        print(summary.to_string())
        print(f"Estimated total latency (ms): {est_total_ms}")
        print(f"Combined peak memory (MB): {total_memory_mb}")

        if wandb_enabled:
            wandb.log(all_metrics)
            wandb.finish()


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions faster-whisper-benchmark/metrics/___init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .extractor import extract_metrics_from_profile
81 changes: 81 additions & 0 deletions faster-whisper-benchmark/metrics/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import numpy as np


def us_to_ms(x):
    """Convert a duration from microseconds to milliseconds."""
    microseconds_per_millisecond = 1000.0
    return x / microseconds_per_millisecond

def bytes_to_mb(x):
    """Convert a size in bytes to megabytes, using 1 MB = 1024 * 1024 bytes."""
    bytes_per_mb = 1024 * 1024
    return x / bytes_per_mb

def extract_metrics_from_profile(profile: dict):
    """Flatten a profile dictionary into a flat dict of summary metrics.

    Reads ``profile["execution_summary"]`` (timings and peak-memory figures)
    and ``profile["execution_detail"]`` (a sequence of per-op records with
    ``execution_time`` and ``compute_unit``). Missing keys fall back to zero
    or an empty collection. Times are converted with ``us_to_ms`` and memory
    with ``bytes_to_mb``; every numeric result is rounded to 4 decimals.

    Metrics whose denominator or input is missing or zero are left out of the
    result, and any entry whose value is ``None`` is dropped before returning.

    Returns:
        dict mapping metric name to value.
    """
    summary = profile.get("execution_summary", {})
    op_records = profile.get("execution_detail", [])

    def _ms(value_us):
        # Microseconds -> milliseconds, rounded for reporting.
        return round(us_to_ms(value_us), 4)

    def _mb(value_bytes):
        # Bytes -> megabytes, rounded for reporting.
        return round(bytes_to_mb(value_bytes), 4)

    # ── End-to-End Performance ────────────────────────────────────
    times = np.array(summary.get("all_inference_times", []))
    first_load = summary.get("first_load_time", 0)
    warm_load = summary.get("warm_load_time", 0)

    metrics = {
        "estimated_inference_time_ms": _ms(summary.get("estimated_inference_time", 0)),
    }
    if len(times):
        # Latency statistics only exist when individual timings were recorded.
        mean_us = times.mean()
        std_us = times.std()
        metrics["mean_latency_ms"] = _ms(mean_us)
        metrics["min_latency_ms"] = _ms(times.min())
        metrics["max_latency_ms"] = _ms(times.max())
        metrics["std_dev_ms"] = _ms(std_us)
        metrics["coeff_of_variation"] = round((std_us / mean_us) * 100, 4)
        metrics["throughput_fps"] = round(1000 / us_to_ms(mean_us), 4)
    metrics["cold_start_ms"] = _ms(first_load)
    metrics["warm_start_ms"] = _ms(warm_load)
    if warm_load:
        metrics["speedup_cold_warm"] = round(first_load / warm_load, 4)

    # ── Memory Footprint ──────────────────────────────────────────
    inf_mem = summary.get("estimated_inference_peak_memory", 0)
    cold_mem = summary.get("first_load_peak_memory", 0)
    warm_mem = summary.get("warm_load_peak_memory", 0)

    metrics["estimated_inference_peak_memory"] = _mb(inf_mem)
    metrics["cold_start_peak_mb"] = _mb(cold_mem)
    metrics["warm_start_peak_mb"] = _mb(warm_mem)
    # Three separate guards keep the key order of the result unchanged.
    if cold_mem:
        metrics["memory_reduction_cold_warm_pct"] = round((1 - warm_mem / cold_mem) * 100, 4)
    if warm_mem:
        metrics["memory_reduction_warm_inf_pct"] = round((1 - inf_mem / warm_mem) * 100, 4)
    if cold_mem:
        metrics["memory_efficiency_ratio"] = round(inf_mem / cold_mem, 4)

    # ── Accelerator Utilization ───────────────────────────────────
    if op_records:
        op_times = [op.get("execution_time", 0) for op in op_records]
        total_time = sum(op_times)
        total_op_count = len(op_records)
        zero_op_count = op_times.count(0)
        nonzero_op_count = total_op_count - zero_op_count

        # Accumulated execution time per compute unit, in first-seen order.
        unit_times = {}
        for op, elapsed in zip(op_records, op_times):
            unit = op.get("compute_unit", "UNKNOWN")
            unit_times[unit] = unit_times.get(unit, 0) + elapsed

        def _pct_of_total(part):
            # Share of the summed op time as a percentage; 0.0 when no time was recorded.
            return round(part / total_time * 100, 4) if total_time else 0.0

        metrics["total_op_count"] = total_op_count
        metrics["nonzero_op_count"] = nonzero_op_count
        metrics["zero_op_count"] = zero_op_count
        metrics["zero_op_percentage"] = (
            round(zero_op_count / total_op_count * 100, 4) if total_op_count else 0.0
        )
        metrics["avg_op_time_ms"] = (
            round(us_to_ms(total_time / nonzero_op_count), 4) if nonzero_op_count else 0.0
        )
        metrics["total_op_time_ms"] = _ms(total_time)
        metrics["dominant_compute_unit"] = (
            max(unit_times, key=unit_times.get) if unit_times else "N/A"
        )
        metrics["cpu_utilization_percentage"] = _pct_of_total(unit_times.get("CPU", 0.0))
        metrics["gpu_utilization_percentage"] = _pct_of_total(unit_times.get("GPU", 0.0))
        metrics["npu_utilization_percentage"] = _pct_of_total(unit_times.get("NPU", 0.0))

        # ── Performance Bottlenecks ───────────────────────────────
        # Summed time of the 15 slowest ops.
        top_ops_total = sum(sorted(op_times, reverse=True)[:15])

        metrics["top15_ops_time_ms"] = _ms(top_ops_total)
        metrics["top15_ops_pct_of_total"] = _pct_of_total(top_ops_total)
        metrics["effective_op_time_ratio"] = (
            round(top_ops_total / total_time, 4) if total_time else 0.0
        )

    # Drop any entry that ended up as None (e.g. a None compute-unit label).
    return {name: value for name, value in metrics.items() if value is not None}
1 change: 1 addition & 0 deletions faster-whisper-benchmark/models/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .whisper_wrappers import WhisperEncoderWrapper, WhisperDecoderStepWrapper
32 changes: 32 additions & 0 deletions faster-whisper-benchmark/models/whisper_wrappers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import torch
import torch.nn as nn
from transformers import WhisperForConditionalGeneration

class WhisperEncoderWrapper(nn.Module):
    """Expose only the encoder of a Whisper model as a tensor-in / tensor-out module."""

    def __init__(self, model: WhisperForConditionalGeneration) -> None:
        """Keep a reference to ``model.model.encoder``."""
        super().__init__()
        whisper_core = model.model
        self.encoder = whisper_core.encoder

    def forward(self, input_features: torch.Tensor) -> torch.Tensor:
        """Run the encoder on ``input_features`` and return its ``last_hidden_state``."""
        encoder_output = self.encoder(input_features=input_features)
        return encoder_output.last_hidden_state

class WhisperDecoderStepWrapper(nn.Module):
    """Wrap the Whisper decoder and output projection as one cache-free decoding step."""

    def __init__(self, model: WhisperForConditionalGeneration) -> None:
        """Keep references to ``model.model.decoder`` and ``model.proj_out``."""
        super().__init__()
        self.decoder = model.model.decoder
        self.proj_out = model.proj_out

    def forward(
        self,
        decoder_input_ids: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
    ) -> torch.Tensor:
        """Return the projected decoder output at the last index of dimension 1.

        ``decoder_input_ids`` is cast to ``int32`` before being fed to the
        decoder, which is called with ``use_cache=False``, so each call
        processes the full ``decoder_input_ids`` tensor.
        """
        decoder_output = self.decoder(
            input_ids=decoder_input_ids.to(torch.int32),
            encoder_hidden_states=encoder_hidden_states,
            use_cache=False,
            return_dict=True,
        )
        hidden_states = decoder_output.last_hidden_state
        all_logits = self.proj_out(hidden_states)
        # Keep only the final entry along dim 1 (presumably the sequence axis).
        return all_logits[:, -1, :]
3 changes: 3 additions & 0 deletions faster-whisper-benchmark/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .benchmark import (
profile_model, get_exported_model_multi, get_traced_model, get_traced_model_multi,
)
56 changes: 56 additions & 0 deletions faster-whisper-benchmark/utils/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import torch
import qai_hub as hub

def run_compile(traced_model, device, input_specs_or_shape):
    """Submit a compile job for ``traced_model`` on ``device`` and return it.

    ``input_specs_or_shape`` is forwarded unchanged when it is a dict;
    any other value is wrapped as the spec of a single input named ``image``.
    """
    input_specs = (
        input_specs_or_shape
        if isinstance(input_specs_or_shape, dict)
        else dict(image=input_specs_or_shape)
    )

    job = hub.submit_compile_job(
        model=traced_model,
        device=device,
        input_specs=input_specs,
    )

    # Guard (and type narrowing): a single CompileJob is expected here.
    assert isinstance(job, hub.CompileJob)
    return job


def run_profile(compiled_job, device):
    """Submit a profile job for the target model of ``compiled_job`` on ``device``.

    The job is named after the compile job with a ``_profiling`` suffix.
    """
    target_model = compiled_job.get_target_model()
    job_name = compiled_job.name + "_profiling"

    job = hub.submit_profile_job(model=target_model, device=device, name=job_name)

    # Guard (and type narrowing): a single ProfileJob is expected here.
    assert isinstance(job, hub.ProfileJob)
    return job


def get_traced_model(input_shape, model, dtype=torch.float32):
    """Trace ``model`` with ``torch.jit.trace`` on one random example input.

    The example tensor has shape ``input_shape`` and the given ``dtype``;
    tracing runs with gradient tracking disabled.
    """
    sample_input = torch.rand(input_shape, dtype=dtype)
    with torch.no_grad():
        return torch.jit.trace(model, sample_input)


def get_traced_model_multi(example_inputs, model):
    """Backward-compatible alias of ``get_exported_model_multi``.

    Despite the name, nothing is traced here: the model is exported with
    ``torch.export`` and saved to disk, and the path of the saved file is
    returned. This function previously carried a verbatim copy of
    ``get_exported_model_multi``; it now delegates so the export logic lives
    in one place. The name is kept for existing callers.
    """
    return get_exported_model_multi(example_inputs, model)

def profile_model(traced_model: torch.jit.ScriptModule, device: hub.Device, input_specs: dict) -> dict:
    """Compile ``traced_model`` for ``device``, profile it, and return the downloaded profile.

    The model is passed to ``run_compile`` with ``input_specs``, the
    resulting compile job is passed to ``run_profile``, and the profile
    job's ``download_profile()`` result is returned.
    """
    compile_job = run_compile(traced_model, device, input_specs)
    profile_job = run_profile(compile_job, device)
    profile = profile_job.download_profile()
    return profile

def get_exported_model_multi(example_inputs, model):
"""Use torch.export for decoder (avoids aten::diff ONNX issue)."""
with torch.no_grad():
exported = torch.export.export(model, example_inputs)
tmp_path = "/tmp/decoder_exported.pt2"
torch.export.save(exported, tmp_path)
return tmp_path