# Configuration for the Whisper Benchmarking Suite
REPO_ID = "bhoomi16/mifos-banking-stt"

# Mapping language codes to Hub directories
DIR_MAPPING = {
    "en": "english_audio/",
    "hi": "hindi_audio/",
    "fr": "french_audio/",
    "es": "spanish_audio/",
    "pt": "portuguese_audio/",
}

MODELS = ["openai/whisper-small", "openai/whisper-small.en"]
LANGUAGES = ["hi", "en", "fr", "es", "pt"]


def _normalize_text(text):
    """Return *text* lower-cased, without punctuation, with single spaces.

    The same normalisation is applied to the reference and the hypothesis so
    that casing and punctuation differences are not counted as recognition
    errors. Only characters in a Unicode punctuation category are replaced;
    combining marks are kept so that scripts relying on them stay intact.
    """
    # Local import: the module-level import block is not touched by this change.
    import unicodedata

    composed = unicodedata.normalize("NFC", text).lower()
    spaced = "".join(
        " " if unicodedata.category(ch).startswith("P") else ch
        for ch in composed
    )
    return " ".join(spaced.split())


def _reference_from_path(file_path):
    """Derive the ground-truth transcript from an audio file's name.

    The file stem is used as the transcript; underscores count as punctuation
    and therefore become spaces during normalisation.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    return _normalize_text(stem)


def run_benchmark_pass(model_name, lang_code):
    """Benchmark one model on every ``.wav`` file of one language folder.

    Args:
        model_name: Hugging Face model identifier passed to ``transcribe_hf``.
        lang_code: Key of ``DIR_MAPPING`` selecting the dataset folder; it is
            also forwarded to ``transcribe_hf`` as the language.

    Returns:
        A list with one dict per successfully processed file, holding the keys
        ``Model``, ``Language``, ``WER``, ``CER``, ``BLEU``, ``Latency`` and
        ``Memory_MB``. The list is empty when the language is not configured,
        the file listing fails, or no matching files exist.
    """
    print(f"Benchmarking {model_name} - {lang_code}...")

    folder_prefix = DIR_MAPPING.get(lang_code)
    if folder_prefix is None:
        # Falling back to another language's audio would report its scores
        # under the wrong language label, so skip instead.
        print(f"No audio directory configured for language '{lang_code}'; skipping.")
        return []

    try:
        remote_files = list_repo_files(repo_id=REPO_ID, repo_type="dataset")
    except Exception as e:
        print(f"Error listing files: {e}")
        return []

    # Filter for audio files in the matching language folder
    target_files = [
        f for f in remote_files
        if f.startswith(folder_prefix) and f.endswith(".wav")
    ]
    if not target_files:
        return []

    results = []
    process = psutil.Process(os.getpid())

    for file_path in target_files:
        local_file = None
        try:
            local_file = hf_hub_download(repo_id=REPO_ID, filename=file_path, repo_type="dataset")
            audio, _ = librosa.load(local_file, sr=16000)

            reference = _reference_from_path(file_path)

            # Use the latency measured by the provider: timing the call from
            # here would also include the lazy model load that happens on the
            # first sample of every model.
            raw_hypothesis, latency = transcribe_hf(audio, model_name=model_name, language=lang_code)
            hypothesis = _normalize_text(raw_hypothesis)

            results.append({
                "Model": model_name,
                "Language": lang_code,
                "WER": calculate_wer(reference, hypothesis),
                "CER": calculate_cer(reference, hypothesis),
                "BLEU": calculate_bleu(reference, hypothesis),
                "Latency": latency,
                "Memory_MB": process.memory_info().rss / (1024 * 1024),
            })
        except Exception as e:
            # One bad sample must not abort the whole pass.
            print(f"Processing error ({file_path}): {e}")
        finally:
            # Remove the local copy even when processing failed.
            if local_file is not None and os.path.exists(local_file):
                try:
                    os.remove(local_file)
                except OSError as e:
                    print(f"Processing error ({file_path}): {e}")

    return results


def main():
    """Run every model/language combination and write ``results.md``.

    Per-sample metrics are averaged per (Model, Language) pair and written as
    a markdown table to ``results.md`` in the current working directory. When
    no sample could be processed, nothing is written and a message is printed.
    """
    print("Mifos AI Whisper Benchmarking Suite (Portable Edition) - FULL RUN")
    print("-" * 65)

    consolidated_metrics = []

    for model in MODELS:
        for lang in LANGUAGES:
            # Only run Whisper-Small.en for English to save time and redundant compute
            if ".en" in model and lang != "en":
                continue
            consolidated_metrics.extend(run_benchmark_pass(model, lang))

    if not consolidated_metrics:
        print("\nNo samples were processed; results.md was not written.")
        return

    # Aggregation: mean of every numeric metric per model and language
    df = pd.DataFrame(consolidated_metrics)
    final_summary = df.groupby(["Model", "Language"]).mean(numeric_only=True).reset_index()

    report = "".join([
        "# Whisper Benchmark Results (Full Dataset)\n\n",
        f"Cloud Data Source: {REPO_ID}\n",
        f"Total Samples Evaluated: {len(df)}\n\n",
        "### Performance Summary\n",
        final_summary.to_markdown(index=False),
    ])

    with open("results.md", "w", encoding="utf-8") as f:
        f.write(report)

    print(f"\nProcess finished. {len(df)} samples processed. Results available in results.md")
# Module-level cache holding the most recently loaded processor/model pair.
_processor = None
_model = None
_loaded_model_name = None


def get_hf_model(model_name="openai/whisper-small"):
    """Return the cached ``(processor, model)`` pair for *model_name*.

    The pair is loaded on first use and whenever a different model name is
    requested. The model is moved to CUDA when ``torch.cuda.is_available()``
    is true, otherwise it stays on the CPU.

    Args:
        model_name: Identifier passed to ``from_pretrained``.

    Returns:
        Tuple ``(processor, model)``.
    """
    global _processor, _model, _loaded_model_name
    if _model is None or _loaded_model_name != model_name:
        print(f"Loading Hugging Face model: {model_name}...")
        # Drop the previous checkpoint before loading the next one so that two
        # models are never referenced by the cache at the same time.
        _processor = None
        _model = None
        _loaded_model_name = None

        device = "cuda" if torch.cuda.is_available() else "cpu"
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)

        # Publish only after both loads succeeded, so a failed load leaves the
        # cache empty instead of half-populated.
        _processor, _model, _loaded_model_name = processor, model, model_name
    return _processor, _model


def transcribe_hf(audio_array, sampling_rate=16000, model_name="openai/whisper-small", language="hi", task="transcribe"):
    """
    Transcribe audio using Hugging Face Whisper model.

    Args:
        audio_array: Audio samples handed to the Whisper processor.
        sampling_rate: Sampling rate of ``audio_array`` passed to the processor.
        model_name: Model identifier, loaded (or reused) via ``get_hf_model``.
        language: Language used to build the forced decoder prompt.
        task: Task used to build the forced decoder prompt.

    Returns:
        Tuple ``(transcription, latency)`` where ``latency`` is the duration in
        seconds of the ``generate`` call that produced the transcription.
        Model loading, feature extraction, decoding and any failed first
        attempt are not included.
    """
    processor, model = get_hf_model(model_name)
    device = model.device

    # Preprocess audio
    input_features = processor(audio_array, sampling_rate=sampling_rate, return_tensors="pt").input_features.to(device)

    # English-only and distilled variants are run with their native defaults;
    # every other model gets the language/task prompt forced.
    generate_kwargs = {}
    if ".en" not in model_name and "distil" not in model_name.lower():
        try:
            generate_kwargs["forced_decoder_ids"] = processor.get_decoder_prompt_ids(language=language, task=task)
        except Exception as e:
            # Still best-effort, but no longer silent: without the prompt the
            # model runs with its own defaults for language and task.
            print(
                f"Warning: could not build decoder prompt for language={language!r}, "
                f"task={task!r}; using model defaults. Error: {e}"
            )

    # perf_counter is monotonic, so the interval cannot be skewed by wall-clock
    # adjustments.
    start_time = time.perf_counter()
    try:
        with torch.no_grad():
            predicted_ids = model.generate(input_features, **generate_kwargs)
    except Exception as e:
        if not generate_kwargs:
            # Nothing to drop: an identical retry would fail the same way.
            raise
        print(f"Warning: Specialist model {model_name} require native defaults. Retrying without kwargs. Error: {e}")
        # Restart the clock so the failed attempt is not counted as latency.
        start_time = time.perf_counter()
        with torch.no_grad():
            predicted_ids = model.generate(input_features)
    latency = time.perf_counter() - start_time

    # Decode
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    return transcription, latency
def _is_blank(text):
    """Return True when *text* is None, empty, or only whitespace."""
    return not text or not text.strip()


def calculate_wer(reference, hypothesis):
    """
    Calculate Word Error Rate (WER).

    Returns 1.0 without calling jiwer when the hypothesis is blank (no
    prediction) or when the reference is blank (the rate is undefined, and
    jiwer rejects empty references).
    """
    if _is_blank(hypothesis):
        return 1.0  # 100% error if no prediction
    if _is_blank(reference):
        return 1.0
    return wer(reference, hypothesis)


def calculate_cer(reference, hypothesis):
    """
    Calculate Character Error Rate (CER).

    Blank inputs are handled exactly as in ``calculate_wer``: 1.0 is returned
    when either the hypothesis or the reference is blank.
    """
    if _is_blank(hypothesis):
        return 1.0
    if _is_blank(reference):
        return 1.0
    return cer(reference, hypothesis)


def calculate_bleu(reference, hypothesis):
    """
    Calculate BLEU Score (Bilingual Evaluation Understudy).
    Returns a score from 0.0 to 1.0.

    Returns 0.0 when either text is blank. If the BLEU computation itself
    fails, 0.0 is returned as well, and a ``RuntimeWarning`` is emitted so the
    failure is visible instead of silently looking like a zero score.
    """
    # NOTE(review): this relies on nltk, which is not listed in the
    # requirements.txt shipped with this change - confirm and add it.
    if _is_blank(hypothesis) or _is_blank(reference):
        return 0.0

    # BLEU requires tokenized lists
    ref_tokens = [reference.split()]
    hyp_tokens = hypothesis.split()

    try:
        # Using smoothing to handle short sentences / no n-gram overlap
        smoothing = SmoothingFunction().method1
        return sentence_bleu(ref_tokens, hyp_tokens, smoothing_function=smoothing)
    except Exception as exc:
        # Deliberately best-effort: one failing sample should not abort a run.
        import warnings

        warnings.warn(f"BLEU computation failed, scoring 0.0: {exc}", RuntimeWarning)
        return 0.0