openMF · hopessugar · Apr 3, 2026
diff --git a/benchmarking_experiments/providers/elevenlabs.py b/benchmarking_experiments/providers/elevenlabs.py
@@ -0,0 +1,100 @@
+# benchmarking_experiments/providers/elevenlabs.py
+
+import os
+import io
+import time
+from typing import AsyncGenerator, Dict, Any
+from dotenv import load_dotenv
+from elevenlabs.client import AsyncElevenLabs
+from elevenlabs import VoiceSettings
+import httpx
+from .base import TTSProvider, STTProvider
+
+load_dotenv()  # runs at import time, before the providers below read ELEVENLABS_API_KEY via os.getenv
+# NOTE: all three languages currently map to the same voice id.
+VOICE_IDS = {  # language key -> ElevenLabs voice id used for TTS
+    "en": "JBFqnCBsd6RMkjVDRZzb",  # George - English
+    "hi": "JBFqnCBsd6RMkjVDRZzb",  # George - supports Hindi
+    "es": "JBFqnCBsd6RMkjVDRZzb",  # George - supports Spanish
+}
+
+LANGUAGE_CODES = {  # language key -> language_code passed to the speech-to-text request
+    "en": "en",
+    "hi": "hi",
+    "es": "es",
+}
+
+
+class ElevenLabsTTSProvider(TTSProvider):
+    """
+    TTSProvider implementation backed by the ElevenLabs API.
+    Voices are configured for English (en), Hindi (hi) and Spanish (es);
+    synthesis uses the eleven_flash_v2_5 model.
+    """
+
+    def __init__(self, language: str = "en"):
+        key = os.getenv("ELEVENLABS_API_KEY")
+        self.language, self.api_key = language, key
+        if not key:
+            raise ValueError("ELEVENLABS_API_KEY not set in environment")
+        fallback_voice = VOICE_IDS["en"]  # used when the language has no entry
+        self.voice_id = VOICE_IDS.get(language, fallback_voice)
+        self.client = AsyncElevenLabs(api_key=key)
+
+    async def synthesize_stream(self, text: str) -> AsyncGenerator[bytes, None]:
+        """
+        Yield the non-empty MP3 audio chunks synthesized for ``text``.
+        """
+        settings = VoiceSettings(stability=0.5, similarity_boost=0.75)
+        audio = self.client.text_to_speech.convert(
+            voice_id=self.voice_id,
+            text=text,
+            model_id="eleven_flash_v2_5",
+            voice_settings=settings,
+            output_format="mp3_44100_128",
+        )
+        async for piece in audio:
+            if not piece:
+                continue
+            yield piece
+
+
+class ElevenLabsSTTProvider(STTProvider):
+    """
+    ElevenLabs STT Provider implementing STTProvider base class.
+    Uses ElevenLabs Scribe model — supports 90+ languages including Hindi.
+    """
+
+    def __init__(self, language: str = "en"):
+        self.language = language
+        self.api_key = os.getenv("ELEVENLABS_API_KEY")
+        if not self.api_key:
+            raise ValueError("ELEVENLABS_API_KEY not set in environment")
+        self.client = AsyncElevenLabs(api_key=self.api_key)
+
+    async def transcribe_stream(
+        self, audio_generator: AsyncGenerator[bytes, None]
+    ) -> Dict[str, Any]:
+        """
+        Collects all audio chunks, then transcribes them in one Scribe STT request.
+        Returns a dict with "text", "language", "model" and "latency_ms"; the
+        latency covers only the API call, not draining audio_generator.
+        """
+        # Join once; bytes += re-copies the growing buffer on every chunk.
+        audio_data = b"".join([chunk async for chunk in audio_generator])
+
+        # perf_counter is monotonic, unlike time.time(), so intervals stay valid.
+        start = time.perf_counter()
+        result = await self.client.speech_to_text.convert(
+            file=("audio.mp3", io.BytesIO(audio_data), "audio/mpeg"),
+            model_id="scribe_v1",
+            language_code=LANGUAGE_CODES.get(self.language, "en"),  # unmapped -> "en"
+        )
+        latency_ms = round((time.perf_counter() - start) * 1000, 2)
+
+        return {
+            "text": result.text,
+            "language": self.language,
+            "latency_ms": latency_ms,
+            "model": "scribe_v1",
+        }
diff --git a/benchmarking_experiments/requirements.txt b/benchmarking_experiments/requirements.txt
@@ -13,3 +13,4 @@ pydub
 datasets
 librosa
 scipy
+elevenlabs
diff --git a/benchmarking_experiments/results/elevenlabs/README.md b/benchmarking_experiments/results/elevenlabs/README.md
@@ -0,0 +1,64 @@
+# ElevenLabs Multilingual Evaluation
+
+## Overview
+Benchmarks ElevenLabs **Flash v2.5** (TTS) and **Scribe v1** (STT) models
+across English (en), Hindi (hi), and Spanish (es).
+
+## TTS Results
+
+| Language | Avg Latency |
+|----------|-------------|
+| English  | 1762.63 ms  |
+| Hindi    | 647.62 ms   |
+| Spanish  | 682.80 ms   |
+
+**Findings:**
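+- Hindi and Spanish TTS averaged lower latency than English in this run; the English average is skewed by a single 3534 ms sample (the other two were 1179 ms and 574 ms), so with 3 samples per language the gap is not conclusive
+- All 3 languages produce natural-sounding audio
+- The Flash v2.5 model is optimized for low-latency multilingual output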
+
+## STT Results (Round-trip: TTS → STT)
+
+| Language | Avg WER | Quality |
+|----------|---------|---------|
+| English  | 0.0     | ✅ Perfect |
+| Hindi    | 0.089   | ✅ Very Good |
+| Spanish  | 0.0     | ✅ Perfect |
+
+**Findings:**
+- English and Spanish STT scored WER 0.0 on all three samples each
+- Hindi STT performs very well (WER 0.089) — the only differences observed are
+  spelling variants (e.g. "हूँ" vs "हूं", "लेन-देन" vs "लेनदेन"), not punctuation
+- Massive improvement over Cartesia's Hindi STT (WER 1.04 → 0.089)
+
+## Comparison with Cartesia
+
+| Metric | Cartesia | ElevenLabs | Improvement |
+|--------|----------|------------|-------------|
+| Hindi STT WER | 1.04 ⚠️ | 0.089 ✅ | 91% better |
+| Hindi TTS Latency | 1773ms | 647ms | 63% faster |
+| EN STT WER | 0.0 | 0.0 | Same |
+| ES STT WER | 0.0 | 0.0 | Same |
+
+## Key Finding
+ElevenLabs Scribe v1 handles Hindi significantly better than
+Cartesia's ink-whisper model. The minor WER (0.089) is due to
+spelling variants in Devanagari script (e.g. "हूँ" vs "हूं"), not misrecognized words.
+
+## How to Run
+```bash
+pip install elevenlabs python-dotenv
+# Add ELEVENLABS_API_KEY to .env
+cd benchmarking_experiments
+python tests/test_elevenlabs_multilingual.py
+```
+
+## Files
+| File | Description |
+|------|-------------|
+| `tts_en_results.json` | English TTS results |
+| `tts_hi_results.json` | Hindi TTS results |
+| `tts_es_results.json` | Spanish TTS results |
+| `stt_en_results.json` | English STT results |
+| `stt_hi_results.json` | Hindi STT results |
+| `stt_es_results.json` | Spanish STT results |
diff --git a/benchmarking_experiments/results/elevenlabs/stt_en_results.json b/benchmarking_experiments/results/elevenlabs/stt_en_results.json
@@ -0,0 +1,32 @@
+{
+  "language": "en",
+  "avg_wer": 0.0,
+  "avg_latency_ms": 748.07,
+  "total_samples": 3,
+  "samples": [
+    {
+      "text": "Hello, how can I help you today?",
+      "language": "en",
+      "latency_ms": 818.65,
+      "model": "scribe_v1",
+      "reference": "Hello, how can I help you today?",
+      "wer": 0.0
+    },
+    {
+      "text": "Please transfer funds to the specified account.",
+      "language": "en",
+      "latency_ms": 685.3,
+      "model": "scribe_v1",
+      "reference": "Please transfer funds to the specified account.",
+      "wer": 0.0
+    },
+    {
+      "text": "Your transaction has been processed successfully.",
+      "language": "en",
+      "latency_ms": 740.25,
+      "model": "scribe_v1",
+      "reference": "Your transaction has been processed successfully.",
+      "wer": 0.0
+    }
+  ]
+}
diff --git a/benchmarking_experiments/results/elevenlabs/stt_es_results.json b/benchmarking_experiments/results/elevenlabs/stt_es_results.json
@@ -0,0 +1,32 @@
+{
+  "language": "es",
+  "avg_wer": 0.0,
+  "avg_latency_ms": 740.59,
+  "total_samples": 3,
+  "samples": [
+    {
+      "text": "Hola, ¿cómo puedo ayudarte hoy?",
+      "language": "es",
+      "latency_ms": 812.59,
+      "model": "scribe_v1",
+      "reference": "Hola, ¿cómo puedo ayudarte hoy?",
+      "wer": 0.0
+    },
+    {
+      "text": "Por favor, transfiera fondos a la cuenta especificada.",
+      "language": "es",
+      "latency_ms": 712.57,
+      "model": "scribe_v1",
+      "reference": "Por favor, transfiera fondos a la cuenta especificada.",
+      "wer": 0.0
+    },
+    {
+      "text": "Su transacción ha sido procesada exitosamente.",
+      "language": "es",
+      "latency_ms": 696.62,
+      "model": "scribe_v1",
+      "reference": "Su transacción ha sido procesada exitosamente.",
+      "wer": 0.0
+    }
+  ]
+}
diff --git a/benchmarking_experiments/results/elevenlabs/stt_hi_results.json b/benchmarking_experiments/results/elevenlabs/stt_hi_results.json
@@ -0,0 +1,32 @@
+{
+  "language": "hi",
+  "avg_wer": 0.0893,
+  "avg_latency_ms": 951.68,
+  "total_samples": 3,
+  "samples": [
+    {
+      "text": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूं?",
+      "language": "hi",
+      "latency_ms": 1011.96,
+      "model": "scribe_v1",
+      "reference": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?",
+      "wer": 0.125
+    },
+    {
+      "text": "कृपया निर्दिष्ट खाते में धनराशि स्थानांतरित करें।",
+      "language": "hi",
+      "latency_ms": 868.24,
+      "model": "scribe_v1",
+      "reference": "कृपया निर्दिष्ट खाते में धनराशि स्थानांतरित करें।",
+      "wer": 0.0
+    },
+    {
+      "text": "आपका लेनदेन सफलतापूर्वक संसाधित किया गया है।",
+      "language": "hi",
+      "latency_ms": 974.85,
+      "model": "scribe_v1",
+      "reference": "आपका लेन-देन सफलतापूर्वक संसाधित किया गया है।",
+      "wer": 0.1429
+    }
+  ]
+}
diff --git a/benchmarking_experiments/results/elevenlabs/tts_en_results.json b/benchmarking_experiments/results/elevenlabs/tts_en_results.json
@@ -0,0 +1,28 @@
+{
+  "language": "en",
+  "avg_latency_ms": 1762.63,
+  "total_samples": 3,
+  "samples": [
+    {
+      "text": "Hello, how can I help you today?",
+      "latency_ms": 1179.06,
+      "audio_size_bytes": 30973,
+      "language": "en",
+      "model": "eleven_flash_v2_5"
+    },
+    {
+      "text": "Please transfer funds to the specified account.",
+      "latency_ms": 3534.34,
+      "audio_size_bytes": 47691,
+      "language": "en",
+      "model": "eleven_flash_v2_5"
+    },
+    {
+      "text": "Your transaction has been processed successfully.",
+      "latency_ms": 574.48,
+      "audio_size_bytes": 42675,
+      "language": "en",
+      "model": "eleven_flash_v2_5"
+    }
+  ]
+}
diff --git a/benchmarking_experiments/results/elevenlabs/tts_es_results.json b/benchmarking_experiments/results/elevenlabs/tts_es_results.json
@@ -0,0 +1,28 @@
+{
+  "language": "es",
+  "avg_latency_ms": 682.8,
+  "total_samples": 3,
+  "samples": [
+    {
+      "text": "Hola, ¿cómo puedo ayudarte hoy?",
+      "latency_ms": 791.74,
+      "audio_size_bytes": 33898,
+      "language": "es",
+      "model": "eleven_flash_v2_5"
+    },
+    {
+      "text": "Por favor, transfiera fondos a la cuenta especificada.",
+      "latency_ms": 621.14,
+      "audio_size_bytes": 53960,
+      "language": "es",
+      "model": "eleven_flash_v2_5"
+    },
+    {
+      "text": "Su transacción ha sido procesada exitosamente.",
+      "latency_ms": 635.53,
+      "audio_size_bytes": 51035,
+      "language": "es",
+      "model": "eleven_flash_v2_5"
+    }
+  ]
+}
diff --git a/benchmarking_experiments/results/elevenlabs/tts_hi_results.json b/benchmarking_experiments/results/elevenlabs/tts_hi_results.json
@@ -0,0 +1,28 @@
+{
+  "language": "hi",
+  "avg_latency_ms": 647.62,
+  "total_samples": 3,
+  "samples": [
+    {
+      "text": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?",
+      "latency_ms": 704.87,
+      "audio_size_bytes": 41839,
+      "language": "hi",
+      "model": "eleven_flash_v2_5"
+    },
+    {
+      "text": "कृपया निर्दिष्ट खाते में धनराशि स्थानांतरित करें।",
+      "latency_ms": 622.14,
+      "audio_size_bytes": 53124,
+      "language": "hi",
+      "model": "eleven_flash_v2_5"
+    },
+    {
+      "text": "आपका लेन-देन सफलतापूर्वक संसाधित किया गया है।",
+      "latency_ms": 615.86,
+      "audio_size_bytes": 51453,
+      "language": "hi",
+      "model": "eleven_flash_v2_5"
+    }
+  ]
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -13,3 +13,4 @@ pydub @@
     datasets
     librosa
     scipy
+    elevenlabs