openMF · ibhoomi16 · Mar 20, 2026 · itsPronay · Apr 2, 2026 · itsPronay
diff --git a/benchmarking_whisper/.gitignore b/benchmarking_whisper/.gitignore
@@ -0,0 +1,5 @@
+__pycache__/
+*.pyc
+*.pyo
+.env
+venv/
diff --git a/benchmarking_whisper/evaluate_multilingual.py b/benchmarking_whisper/evaluate_multilingual.py
@@ -0,0 +1,111 @@
+import os
+import time
+import io
+import pandas as pd
+import torch
+import librosa
+import numpy as np
+import psutil
+from huggingface_hub import hf_hub_download, list_repo_files
+from providers.hf_whisper import transcribe_hf
+from results.metrics import calculate_wer, calculate_cer, calculate_bleu
+
+# Configuration for the Whisper Benchmarking Suite: the Hub repo that run_benchmark_pass lists and downloads from
+REPO_ID = "bhoomi16/mifos-banking-stt"
+
+# Maps each language code to the folder prefix its .wav files must start with inside REPO_ID
+DIR_MAPPING = {
+    "en": "english_audio/",
+    "hi": "hindi_audio/",
+    "fr": "french_audio/",
+    "es": "spanish_audio/",
+    "pt": "portuguese_audio/"
+}
+
+MODELS = ["openai/whisper-small", "openai/whisper-small.en"]  # checkpoints looped over by main()
+LANGUAGES = ["hi", "en", "fr", "es", "pt"]  # language codes looped over by main(); each is a DIR_MAPPING key
+
+def run_benchmark_pass(model_name, lang_code):
+    """Benchmark one Whisper checkpoint on every .wav clip of one language.
+
+    Args:
+        model_name: Hugging Face model id forwarded to ``transcribe_hf``.
+        lang_code: Key of ``DIR_MAPPING``; selects the dataset folder and the decoding language.
+
+    Returns:
+        One dict per processed clip (Model, Language, WER, CER, BLEU, Latency in
+        seconds, Memory_MB of process RSS). Empty when the language is unmapped,
+        listing fails or no clip matches; clips that raise are reported and skipped.
+    """
+    print(f"Benchmarking {model_name} - {lang_code}...")
+    # An unmapped code must not silently fall back to the English folder.
+    folder_prefix = DIR_MAPPING.get(lang_code)
+    if folder_prefix is None:
+        print(f"No dataset folder mapped for language '{lang_code}'")
+        return []
+    try:
+        remote_files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
+    except Exception as e:
+        print(f"Error listing files: {e}")
+        return []
+    target_files = [f for f in remote_files if f.startswith(folder_prefix) and f.endswith(".wav")]
+    results = []
+    pid = psutil.Process(os.getpid())
+    for file_path in target_files:
+        try:
+            # The returned path points into the shared Hub cache: read it, never delete it.
+            cached_file = hf_hub_download(repo_id=REPO_ID, filename=file_path, repo_type="dataset")
+            audio, _ = librosa.load(cached_file, sr=16000)
+            # Ground truth is the file stem; both sides get the same casing so
+            # capitalisation alone is never counted as an error.
+            stem = os.path.splitext(file_path.rsplit("/", 1)[-1])[0]
+            reference = stem.replace("_", " ").lower()
+            # Use the latency transcribe_hf reports: timing the call from out here
+            # would also count the model load it performs on a pass's first clip.
+            hypothesis, latency = transcribe_hf(audio, model_name=model_name, language=lang_code)
+            hypothesis = hypothesis.strip().lower()
+            results.append({
+                "Model": model_name,
+                "Language": lang_code,
+                "WER": calculate_wer(reference, hypothesis),
+                "CER": calculate_cer(reference, hypothesis),
+                "BLEU": calculate_bleu(reference, hypothesis),
+                "Latency": latency,
+                "Memory_MB": pid.memory_info().rss / (1024 * 1024)
+            })
+        except Exception as e:
+            print(f"Processing error ({file_path}): {e}")
+    return results
+
+def main():
+    """Run every model/language pass and write the averaged metrics to ./results.md."""
+    print("Mifos AI Whisper Benchmarking Suite (Portable Edition) - FULL RUN")
+    print("-" * 65)
+    consolidated_metrics = []
+    for model in MODELS:
+        for lang in LANGUAGES:
+            # English-only (".en") checkpoints are only evaluated on English audio.
+            if ".en" in model and lang != "en":
+                continue
+            consolidated_metrics.extend(run_benchmark_pass(model, lang))
+
+    if not consolidated_metrics:
+        # Say so explicitly: exiting silently here looks like a successful run.
+        print("\nNo samples were evaluated; results.md was not written.")
+        return
+
+    # Average each numeric metric per (Model, Language) pair.
+    df = pd.DataFrame(consolidated_metrics)
+    final_summary = df.groupby(["Model", "Language"]).mean(numeric_only=True).reset_index()
+    output_template = (
+        "# Whisper Benchmark Results (Full Dataset)\n\n"
+        f"Cloud Data Source: {REPO_ID}\n"
+        f"Total Samples Evaluated: {len(df)}\n\n"
+        "### Performance Summary\n"
+    ) + final_summary.to_markdown(index=False)
+    with open("results.md", "w", encoding="utf-8") as f:
+        f.write(output_template)
+    print(f"\nProcess finished. {len(df)} samples processed. Results available in results.md")
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarking_whisper/providers/hf_whisper.py b/benchmarking_whisper/providers/hf_whisper.py
@@ -0,0 +1,52 @@
+import torch
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+import time
+import os
+
+_processor = _model = None  # module-level cache of the last loaded pair, filled by get_hf_model()
+
+def get_hf_model(model_name="openai/whisper-small"):
+    """Return the cached (processor, model) pair, reloading it when model_name differs from the cached checkpoint."""
+    global _processor, _model
+    if _model is None or _model.config._name_or_path != model_name:
+        print(f"Loading Hugging Face model: {model_name}...")
+        _processor = _model = None  # drop the stale pair first: frees it before the new load, and a failed load cannot leave a mismatched pair
+        _processor = WhisperProcessor.from_pretrained(model_name)
+        _model = WhisperForConditionalGeneration.from_pretrained(model_name).to("cuda" if torch.cuda.is_available() else "cpu")
+    return _processor, _model
+
+def transcribe_hf(audio_array, sampling_rate=16000, model_name="openai/whisper-small", language="hi", task="transcribe"):
+    """Transcribe one audio clip with a Hugging Face Whisper checkpoint.
+
+    Args:
+        audio_array: Waveform handed to the Whisper processor.
+        sampling_rate: Sampling rate of ``audio_array``, passed to the processor.
+        model_name: Checkpoint id, loaded and cached through ``get_hf_model``.
+        language, task: Forced on the decoder, except for ".en" and distilled checkpoints.
+
+    Returns:
+        ``(transcription, latency)``: text of the first decoded sequence and the
+        seconds spent generating, including the retry when one was needed.
+    """
+    processor, model = get_hf_model(model_name)
+    device = model.device
+    input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)
+    # English-only and certain distilled variants don't support forced_decoder_ids, so only other checkpoints get them.
+    generate_kwargs = {}
+    if ".en" not in model_name and "distil" not in model_name.lower():
+        try:
+            generate_kwargs["forced_decoder_ids"] = processor.get_decoder_prompt_ids(language=language, task=task)
+        except Exception as e:
+            # Still best effort, but no longer silent: generate() then runs on its own defaults.
+            print(f"Warning: could not build decoder prompt ids (language={language}, task={task}): {e}")
+    start_time = time.perf_counter()  # monotonic clock, immune to wall-clock adjustments
+    try:
+        with torch.no_grad():
+            predicted_ids = model.generate(input_features, **generate_kwargs)
+    except Exception as e:
+        print(f"Warning: Specialist model {model_name} require native defaults. Retrying without kwargs. Error: {e}")
+        with torch.no_grad():
+            predicted_ids = model.generate(input_features)
+    latency = time.perf_counter() - start_time
+    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    return transcription, latency
diff --git a/benchmarking_whisper/requirements.txt b/benchmarking_whisper/requirements.txt
@@ -0,0 +1,10 @@
+torch
+transformers
+librosa
+numpy
+pandas
+psutil
+jiwer
+nltk
+huggingface_hub
+tabulate
diff --git a/benchmarking_whisper/results.md b/benchmarking_whisper/results.md
@@ -0,0 +1,14 @@
+# Whisper Benchmark Results (Full Dataset)
+
+Cloud Data Source: bhoomi16/mifos-banking-stt
+Total Samples Evaluated: 59
+
+### Performance Summary
+| Model                   | Language   |      WER |      CER |     BLEU |   Latency |   Memory_MB |
+|:------------------------|:-----------|---------:|---------:|---------:|----------:|------------:|
+| openai/whisper-small    | en         | 0.615238 | 0.319008 | 0.18991  |   2.82601 |     1404.94 |
+| openai/whisper-small    | es         | 0.434683 | 0.1307   | 0.460203 |   3.03096 |     1408.85 |
+| openai/whisper-small    | fr         | 0.559127 | 0.268749 | 0.338119 |   2.99419 |     1404.6  |
+| openai/whisper-small    | hi         | 0.488333 | 0.289248 | 0.221635 |   5.1376  |     1422.5  |
+| openai/whisper-small    | pt         | 0.471252 | 0.16247  | 0.405052 |   4.89976 |     1400.37 |
+| openai/whisper-small.en | en         | 0.615238 | 0.34123  | 0.18991  |   3.92016 |     1195.83 |
diff --git a/benchmarking_whisper/results/metrics.py b/benchmarking_whisper/results/metrics.py
@@ -0,0 +1,38 @@
+from jiwer import wer, cer
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
+
+def calculate_wer(reference, hypothesis):
+    """Return the Word Error Rate of hypothesis against reference (0.0 means identical)."""
+    has_ref, has_hyp = bool(reference and reference.strip()), bool(hypothesis and hypothesis.strip())
+    if not (has_ref and has_hyp):
+        # Nothing to align (and an empty reference may be rejected by jiwer): both empty is a match, one empty is 100% error.
+        return 0.0 if has_ref == has_hyp else 1.0
+    return wer(reference, hypothesis)
+
+def calculate_cer(reference, hypothesis):
+    """Return the Character Error Rate of hypothesis against reference (0.0 means identical)."""
+    has_ref, has_hyp = bool(reference and reference.strip()), bool(hypothesis and hypothesis.strip())
+    if not (has_ref and has_hyp):
+        # Nothing to align (and an empty reference may be rejected by jiwer): both empty is a match, one empty is 100% error.
+        return 0.0 if has_ref == has_hyp else 1.0
+    return cer(reference, hypothesis)
+
+def calculate_bleu(reference, hypothesis):
+    """
+    Score hypothesis against reference with smoothed sentence-level BLEU.
+
+    Both strings are split on whitespace. The score lies between 0.0 and 1.0;
+    0.0 is also returned for an empty hypothesis or when scoring raises.
+    """
+    if not hypothesis:
+        return 0.0
+
+    # sentence_bleu takes a list of tokenized references and one tokenized candidate
+    references = [reference.split()]
+    candidate = hypothesis.split()
+    # Smoothing copes with short sentences / no n-gram overlap
+    smoother = SmoothingFunction()
+    try:
+        return sentence_bleu(references, candidate, smoothing_function=smoother.method1)
+    except Exception:
+        return 0.0