Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions benchmarking_whisper/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
__pycache__/
*.pyc
*.pyo
.env
venv/
111 changes: 111 additions & 0 deletions benchmarking_whisper/evaluate_multilingual.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
import os
import time
import io
import pandas as pd
import torch
import librosa
import numpy as np
import psutil
from huggingface_hub import hf_hub_download, list_repo_files
from providers.hf_whisper import transcribe_hf
from results.metrics import calculate_wer, calculate_cer, calculate_bleu

# Benchmark configuration: dataset repository, per-language folders, models.
REPO_ID = "bhoomi16/mifos-banking-stt"

# Language code -> folder prefix inside the dataset repository.
DIR_MAPPING = dict(
    en="english_audio/",
    hi="hindi_audio/",
    fr="french_audio/",
    es="spanish_audio/",
    pt="portuguese_audio/",
)

# Checkpoints to benchmark, and the order in which languages are evaluated.
MODELS = ["openai/whisper-small", "openai/whisper-small.en"]
LANGUAGES = ["hi", "en", "fr", "es", "pt"]

# Benchmark one (model, language) pair over every matching .wav file in the
# dataset repository and return a list of per-file metric dicts.
# Returns an empty list when listing the repository fails or no file matches.
def run_benchmark_pass(model_name, lang_code):
print(f"Benchmarking {model_name} - {lang_code}...")

# Language codes missing from DIR_MAPPING fall back to the English folder.
folder_prefix = DIR_MAPPING.get(lang_code, "english_audio/")

try:
# NOTE(review): the full repository listing is fetched on every call, i.e.
# once per (model, language) pair; it could be fetched once and shared.
remote_files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
# Filter for audio files in the matching language folder
target_files = [f for f in remote_files if f.startswith(folder_prefix) and f.endswith(".wav")]

if not target_files:
return []

except Exception as e:
print(f"Error listing files: {e}")
return []

results = []
# Handle on the current process; used below to sample resident memory (RSS).
pid = psutil.Process(os.getpid())

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since our final goal is to deploy the model in the Mifos mobile app, we should consider the memory constraints of a mobile device rather than those of the machine we are currently using, as they differ significantly.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as well as the latency

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@itsPronay as of now we haven't decided whether we want to host a model on the client side or not

@itsPronay itsPronay Apr 2, 2026

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@staru09 , In the ticket, it’s mentioned that for local models we should measure metrics like memory usage and latency. Could you please clarify which device we should base these measurements on?

If the intention is to run these models locally on a mobile device, the measurements would differ significantly compared to running them on a server (self-hosted). The approach to evaluating memory usage and latency would vary depending on the deployment environment.

So, when we are talking about 'Benchmark local-models (memory and latency)', what are we evaluating against?

  1. Server computer?
  2. Physical Mobile device?
  3. Or is it not decided yet?


# 📊 Now evaluating the FULL dataset for each language
# One result row is appended per successfully processed file; a file that
# raises anywhere in the loop body is reported and skipped.
for file_path in target_files:
try:
temp_local_file = hf_hub_download(repo_id=REPO_ID, filename=file_path, repo_type="dataset")
# Audio is loaded with sr=16000, matching transcribe_hf's default sampling_rate.
audio, _ = librosa.load(temp_local_file, sr=16000)

# Ground truth text from filename
# (basename without ".wav", underscores turned into spaces, lower-cased)
reference = str(file_path.split("/")[-1].replace(".wav", "").replace("_", " ")).lower()

# Transcription cycle
# NOTE(review): this wall-clock span wraps the whole transcribe_hf call, which
# loads the model lazily on the first call for a model_name, so the first
# sample's latency includes model loading. The latency value returned by
# transcribe_hf itself is discarded here.
start = time.time()
hypothesis, _ = transcribe_hf(audio, model_name=model_name, language=lang_code)
latency = time.time() - start

results.append({
"Model": model_name,
"Language": lang_code,
"WER": calculate_wer(reference, hypothesis),
"CER": calculate_cer(reference, hypothesis),
"BLEU": calculate_bleu(reference, hypothesis),
"Latency": latency,
# Resident set size of this process in MiB, sampled after transcription.
"Memory_MB": pid.memory_info().rss / (1024 * 1024)
})

# NOTE(review): the path presumably points into the local Hub cache, so
# removing it would force a re-download on the next pass — TODO confirm.
# It is also skipped whenever an earlier statement in this try block raises.
if os.path.exists(temp_local_file): os.remove(temp_local_file)

except Exception as e:
print(f"Processing error ({file_path}): {e}")
continue

return results

def main():
    """Run every model/language pass and write the averaged metrics to results.md.

    Nothing is written when no sample was processed.
    """
    print("Mifos AI Whisper Benchmarking Suite (Portable Edition) - FULL RUN")
    print("-" * 65)

    all_rows = []
    for model in MODELS:
        english_only = ".en" in model
        for lang in LANGUAGES:
            # English-only checkpoints are evaluated on English alone to
            # avoid redundant compute.
            if english_only and lang != "en":
                continue
            all_rows.extend(run_benchmark_pass(model, lang))

    if not all_rows:
        return

    # Average every numeric metric per (model, language) pair.
    frame = pd.DataFrame(all_rows)
    summary = frame.groupby(["Model", "Language"]).mean(numeric_only=True).reset_index()

    # Assemble the markdown report written to results.md in the working directory.
    report = "".join([
        "# Whisper Benchmark Results (Full Dataset)\n\n",
        f"Cloud Data Source: {REPO_ID}\n",
        f"Total Samples Evaluated: {len(frame)}\n\n",
        "### Performance Summary\n",
        summary.to_markdown(index=False),
    ])

    with open("results.md", "w", encoding="utf-8") as out:
        out.write(report)

    print(f"\nProcess finished. {len(frame)} samples processed. Results available in results.md")


if __name__ == "__main__":
    main()
52 changes: 52 additions & 0 deletions benchmarking_whisper/providers/hf_whisper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import time
import os

# Single-slot cache: only one processor/model pair is kept resident at a time.
_processor = None
_model = None
# Name the cached pair was loaded from; tracked explicitly instead of reading
# the private ``config._name_or_path`` attribute back from the model.
_loaded_model_name = None


def get_hf_model(model_name="openai/whisper-small"):
    """
    Return the cached ``(processor, model)`` pair for ``model_name``.

    The pair is loaded on first use and whenever a different ``model_name``
    is requested. The model is moved to CUDA when available, otherwise CPU.

    Before a different checkpoint is loaded, the previous one is released so
    that two checkpoints are never resident at once; otherwise the old model
    stays referenced during the load and inflates process memory readings.
    """
    global _processor, _model, _loaded_model_name
    if _model is not None and _loaded_model_name == model_name:
        return _processor, _model

    import gc

    print(f"Loading Hugging Face model: {model_name}...")

    # Drop the previous checkpoint first so its memory can be reclaimed
    # before the new weights are allocated.
    _processor = None
    _model = None
    _loaded_model_name = None
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    device = "cuda" if torch.cuda.is_available() else "cpu"
    _processor = WhisperProcessor.from_pretrained(model_name)
    _model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)
    # Recorded only after a successful load, so a failed load is retried.
    _loaded_model_name = model_name
    return _processor, _model

def transcribe_hf(audio_array, sampling_rate=16000, model_name="openai/whisper-small", language="hi", task="transcribe"):
    """
    Transcribe audio using Hugging Face Whisper model.

    Args:
        audio_array: Audio samples passed to the Whisper processor.
        sampling_rate: Sampling rate of ``audio_array``, forwarded to the processor.
        model_name: Checkpoint to use; loaded and cached via ``get_hf_model``.
        language: Language used to build the decoder prompt. Ignored for
            English-only (".en") and "distil" checkpoints.
        task: Task used to build the decoder prompt.

    Returns:
        ``(transcription, latency)``: the decoded text of the first sequence
        and the seconds spent in the ``model.generate`` call that produced it.
        A failed first attempt is not counted in the latency.

    Raises:
        Exception: whatever ``model.generate`` raises when there are no extra
            kwargs to drop, or when the retry without kwargs also fails.
    """
    processor, model = get_hf_model(model_name)
    device = model.device

    # Preprocess audio
    input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)

    # Logic to handle forced_decoder_ids safely
    # If it's English-only or certain distilled variants, they don't support forced_decoder_ids
    generate_kwargs = {}
    if ".en" not in model_name and "distil" not in model_name.lower():
        try:
            generate_kwargs["forced_decoder_ids"] = processor.get_decoder_prompt_ids(language=language, task=task)
        except Exception as e:
            # Best effort: fall back to the model's defaults, but report it.
            # Otherwise results would be attributed to `language` even though
            # that language was never requested from the model.
            print(f"Warning: could not build decoder prompt for language={language!r}, task={task!r}; using model defaults. Error: {e}")

    # Standard inference cycle for the Whisper model
    with torch.no_grad():
        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments.
        start_time = time.perf_counter()
        try:
            predicted_ids = model.generate(input_features, **generate_kwargs)
        except Exception as e:
            if not generate_kwargs:
                # Nothing to drop: a retry would be the identical call.
                raise
            print(f"Warning: Specialist model {model_name} require native defaults. Retrying without kwargs. Error: {e}")
            # Restart the clock so the failed attempt is not counted.
            start_time = time.perf_counter()
            predicted_ids = model.generate(input_features)
        latency = time.perf_counter() - start_time

    # Decode
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    return transcription, latency
9 changes: 9 additions & 0 deletions benchmarking_whisper/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
torch
transformers
librosa
numpy
pandas
psutil
jiwer
huggingface_hub
tabulate
nltk
14 changes: 14 additions & 0 deletions benchmarking_whisper/results.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Whisper Benchmark Results (Full Dataset)

Cloud Data Source: bhoomi16/mifos-banking-stt
Total Samples Evaluated: 59

### Performance Summary
| Model | Language | WER | CER | BLEU | Latency | Memory_MB |
|:------------------------|:-----------|---------:|---------:|---------:|----------:|------------:|
| openai/whisper-small | en | 0.615238 | 0.319008 | 0.18991 | 2.82601 | 1404.94 |
| openai/whisper-small | es | 0.434683 | 0.1307 | 0.460203 | 3.03096 | 1408.85 |
| openai/whisper-small | fr | 0.559127 | 0.268749 | 0.338119 | 2.99419 | 1404.6 |
| openai/whisper-small | hi | 0.488333 | 0.289248 | 0.221635 | 5.1376 | 1422.5 |
| openai/whisper-small | pt | 0.471252 | 0.16247 | 0.405052 | 4.89976 | 1400.37 |
| openai/whisper-small.en | en | 0.615238 | 0.34123 | 0.18991 | 3.92016 | 1195.83 |
38 changes: 38 additions & 0 deletions benchmarking_whisper/results/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from jiwer import wer, cer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def calculate_wer(reference, hypothesis):
    """
    Word Error Rate of ``hypothesis`` measured against ``reference``.

    A missing or empty hypothesis is scored as 1.0 (100% error) without
    calling jiwer; otherwise the value comes from ``jiwer.wer``.
    """
    return wer(reference, hypothesis) if hypothesis else 1.0

def calculate_cer(reference, hypothesis):
    """
    Character Error Rate of ``hypothesis`` measured against ``reference``.

    A missing or empty hypothesis is scored as 1.0 without calling jiwer;
    otherwise the value comes from ``jiwer.cer``.
    """
    if hypothesis:
        return cer(reference, hypothesis)
    return 1.0

def calculate_bleu(reference, hypothesis):
    """
    Sentence-level BLEU of ``hypothesis`` against a single ``reference``.

    Both strings are tokenized by whitespace splitting and scored with NLTK's
    ``sentence_bleu`` using ``SmoothingFunction().method1``, so that short
    sentences with no higher-order n-gram overlap are still handled.

    Returns 0.0 for a missing or empty hypothesis, and 0.0 if scoring raises.
    """
    if not hypothesis:
        return 0.0

    # sentence_bleu expects a list of tokenized references and one tokenized candidate.
    references = [reference.split()]
    candidate = hypothesis.split()
    smoothing = SmoothingFunction()

    try:
        return sentence_bleu(references, candidate, smoothing_function=smoothing.method1)
    except Exception:
        return 0.0