Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions benchmarking_experiments/providers/elevenlabs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# benchmarking_experiments/providers/elevenlabs.py

import os
import io
import time
from typing import AsyncGenerator, Dict, Any
from dotenv import load_dotenv
from elevenlabs.client import AsyncElevenLabs
from elevenlabs import VoiceSettings
import httpx
from .base import TTSProvider, STTProvider

load_dotenv()

# Languages covered by this benchmark: English, Hindi, Spanish.
_BENCHMARK_LANGUAGES = ("en", "hi", "es")

# "George" voice; the same voice ID is used for every benchmarked language
# (noted in the original mapping as supporting Hindi and Spanish as well).
_GEORGE_VOICE_ID = "JBFqnCBsd6RMkjVDRZzb"

# Maps a language key to the ElevenLabs voice ID used for TTS.
VOICE_IDS = dict.fromkeys(_BENCHMARK_LANGUAGES, _GEORGE_VOICE_ID)

# Maps a language key to the language code sent to the ElevenLabs API.
# The keys and the API codes currently coincide.
LANGUAGE_CODES = {code: code for code in _BENCHMARK_LANGUAGES}


class ElevenLabsTTSProvider(TTSProvider):
    """
    ElevenLabs TTS Provider implementing TTSProvider base class.

    Supports English (en), Hindi (hi), and Spanish (es).
    Uses ElevenLabs Flash v2.5 model for low latency.
    """

    def __init__(self, language: str = "en"):
        """
        Set up the async ElevenLabs client and select the voice for ``language``.

        Args:
            language: Language key looked up in VOICE_IDS. Unknown keys fall
                back to the "en" voice.

        Raises:
            ValueError: If ELEVENLABS_API_KEY is not set in the environment.
        """
        self.language = language
        self.api_key = os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ELEVENLABS_API_KEY not set in environment")
        self.voice_id = VOICE_IDS.get(language, VOICE_IDS["en"])
        self.client = AsyncElevenLabs(api_key=self.api_key)

    async def synthesize_stream(self, text: str) -> AsyncGenerator[bytes, None]:
        """
        Yield synthesized audio chunks from ElevenLabs TTS.

        Uses Flash v2.5 and requests MP3 output (44.1 kHz, 128 kbps).

        Args:
            text: The text to synthesize.

        Yields:
            Non-empty audio chunks as returned by the ElevenLabs client.
        """
        # Pin the synthesis language for known languages so the model does not
        # have to auto-detect it, mirroring what the STT provider does.
        # Unknown languages keep the previous behavior (no language enforced),
        # so they are not forced into a wrong language.
        request_options = {}
        language_code = LANGUAGE_CODES.get(self.language)
        if language_code is not None:
            request_options["language_code"] = language_code

        async for chunk in self.client.text_to_speech.convert(
            voice_id=self.voice_id,
            text=text,
            model_id="eleven_flash_v2_5",
            voice_settings=VoiceSettings(
                stability=0.5,
                similarity_boost=0.75,
            ),
            output_format="mp3_44100_128",
            **request_options,
        ):
            # Drop empty chunks so consumers only ever see real audio bytes.
            if chunk:
                yield chunk


class ElevenLabsSTTProvider(STTProvider):
    """
    ElevenLabs STT Provider implementing STTProvider base class.

    Uses ElevenLabs Scribe model — supports 90+ languages including Hindi.
    """

    def __init__(self, language: str = "en"):
        """
        Set up the async ElevenLabs client for ``language``.

        Args:
            language: Language key; translated to an API language code via
                LANGUAGE_CODES at transcription time.

        Raises:
            ValueError: If ELEVENLABS_API_KEY is not set in the environment.
        """
        self.language = language
        self.api_key = os.getenv("ELEVENLABS_API_KEY")
        if not self.api_key:
            raise ValueError("ELEVENLABS_API_KEY not set in environment")
        self.client = AsyncElevenLabs(api_key=self.api_key)

    async def transcribe_stream(
        self, audio_generator: AsyncGenerator[bytes, None]
    ) -> Dict[str, Any]:
        """
        Collect audio chunks and transcribe them via ElevenLabs Scribe STT.

        The whole stream is buffered first and then sent as one request; the
        upload is labelled as MP3 ("audio.mp3", "audio/mpeg").

        Args:
            audio_generator: Async generator yielding audio byte chunks.

        Returns:
            A dict with keys "text" (the transcript), "language" (this
            provider's language key), "latency_ms" (duration of the STT
            request only, excluding the time spent collecting audio), and
            "model".
        """
        # Gather chunks and join once: repeated `bytes +=` re-copies the whole
        # buffer on every chunk, which is quadratic for long streams.
        chunks = [chunk async for chunk in audio_generator]
        audio_data = b"".join(chunks)

        # perf_counter is monotonic and high-resolution, so the measured
        # latency cannot be distorted by wall-clock adjustments.
        start = time.perf_counter()

        result = await self.client.speech_to_text.convert(
            file=("audio.mp3", io.BytesIO(audio_data), "audio/mpeg"),
            model_id="scribe_v1",
            # Unknown language keys fall back to English.
            language_code=LANGUAGE_CODES.get(self.language, "en"),
        )

        latency_ms = round((time.perf_counter() - start) * 1000, 2)

        return {
            "text": result.text,
            "language": self.language,
            "latency_ms": latency_ms,
            "model": "scribe_v1",
        }
1 change: 1 addition & 0 deletions benchmarking_experiments/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ pydub
datasets
librosa
scipy
elevenlabs
64 changes: 64 additions & 0 deletions benchmarking_experiments/results/elevenlabs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# ElevenLabs Multilingual Evaluation

## Overview
Benchmarks ElevenLabs **Flash v2.5** (TTS) and **Scribe v1** (STT) models
across English (en), Hindi (hi), and Spanish (es).

## TTS Results

| Language | Avg Latency |
|----------|-------------|
| English | 1762.63 ms |
| Hindi | 647.62 ms |
| Spanish | 682.80 ms |

**Findings:**
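- Hindi and Spanish TTS averaged lower latency than English, but the English mean is skewed by a single 3534 ms outlier in a 3-sample run (the other two English samples took 1179 ms and 574 ms)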
- All 3 languages produce natural sounding audio
- Flash v2.5 model optimized for low latency multilingual output

## STT Results (Round-trip: TTS → STT)

| Language | Avg WER | Quality |
|----------|---------|---------|
| English | 0.0 | ✅ Perfect |
| Hindi | 0.089 | ✅ Very Good |
| Spanish | 0.0 | ✅ Perfect |

**Findings:**
- English and Spanish STT is flawless (WER 0.0)
- Hindi STT performs very well (WER 0.089) — the only differences observed are
orthographic variants (e.g. "हूँ" vs "हूं", "लेन-देन" vs "लेनदेन")
- Massive improvement over Cartesia's Hindi STT (WER 1.04 → 0.089)

## Comparison with Cartesia

| Metric | Cartesia | ElevenLabs | Improvement |
|--------|----------|------------|-------------|
| Hindi STT WER | 1.04 ⚠️ | 0.089 ✅ | 91% better |
| Hindi TTS Latency | 1773ms | 647ms | 63% faster |
| EN STT WER | 0.0 | 0.0 | Same |
| ES STT WER | 0.0 | 0.0 | Same |

## Key Finding
ElevenLabs Scribe v1 handles Hindi significantly better than
Cartesia's ink-whisper model. The minor WER (0.089) is due to
orthographic variants in Devanagari script (chandrabindu vs. anusvara, hyphenation), not actual word errors.

## How to Run
```bash
pip install elevenlabs python-dotenv
# Add ELEVENLABS_API_KEY to .env
cd benchmarking_experiments
python tests/test_elevenlabs_multilingual.py
```

## Files
| File | Description |
|------|-------------|
| `tts_en_results.json` | English TTS results |
| `tts_hi_results.json` | Hindi TTS results |
| `tts_es_results.json` | Spanish TTS results |
| `stt_en_results.json` | English STT results |
| `stt_hi_results.json` | Hindi STT results |
| `stt_es_results.json` | Spanish STT results |
32 changes: 32 additions & 0 deletions benchmarking_experiments/results/elevenlabs/stt_en_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"language": "en",
"avg_wer": 0.0,
"avg_latency_ms": 748.07,
"total_samples": 3,
"samples": [
{
"text": "Hello, how can I help you today?",
"language": "en",
"latency_ms": 818.65,
"model": "scribe_v1",
"reference": "Hello, how can I help you today?",
"wer": 0.0
},
{
"text": "Please transfer funds to the specified account.",
"language": "en",
"latency_ms": 685.3,
"model": "scribe_v1",
"reference": "Please transfer funds to the specified account.",
"wer": 0.0
},
{
"text": "Your transaction has been processed successfully.",
"language": "en",
"latency_ms": 740.25,
"model": "scribe_v1",
"reference": "Your transaction has been processed successfully.",
"wer": 0.0
}
]
}
32 changes: 32 additions & 0 deletions benchmarking_experiments/results/elevenlabs/stt_es_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"language": "es",
"avg_wer": 0.0,
"avg_latency_ms": 740.59,
"total_samples": 3,
"samples": [
{
"text": "Hola, ¿cómo puedo ayudarte hoy?",
"language": "es",
"latency_ms": 812.59,
"model": "scribe_v1",
"reference": "Hola, ¿cómo puedo ayudarte hoy?",
"wer": 0.0
},
{
"text": "Por favor, transfiera fondos a la cuenta especificada.",
"language": "es",
"latency_ms": 712.57,
"model": "scribe_v1",
"reference": "Por favor, transfiera fondos a la cuenta especificada.",
"wer": 0.0
},
{
"text": "Su transacción ha sido procesada exitosamente.",
"language": "es",
"latency_ms": 696.62,
"model": "scribe_v1",
"reference": "Su transacción ha sido procesada exitosamente.",
"wer": 0.0
}
]
}
32 changes: 32 additions & 0 deletions benchmarking_experiments/results/elevenlabs/stt_hi_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
{
"language": "hi",
"avg_wer": 0.0893,
"avg_latency_ms": 951.68,
"total_samples": 3,
"samples": [
{
"text": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूं?",
"language": "hi",
"latency_ms": 1011.96,
"model": "scribe_v1",
"reference": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?",
"wer": 0.125
},
{
"text": "कृपया निर्दिष्ट खाते में धनराशि स्थानांतरित करें।",
"language": "hi",
"latency_ms": 868.24,
"model": "scribe_v1",
"reference": "कृपया निर्दिष्ट खाते में धनराशि स्थानांतरित करें।",
"wer": 0.0
},
{
"text": "आपका लेनदेन सफलतापूर्वक संसाधित किया गया है।",
"language": "hi",
"latency_ms": 974.85,
"model": "scribe_v1",
"reference": "आपका लेन-देन सफलतापूर्वक संसाधित किया गया है।",
"wer": 0.1429
}
]
}
28 changes: 28 additions & 0 deletions benchmarking_experiments/results/elevenlabs/tts_en_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"language": "en",
"avg_latency_ms": 1762.63,
"total_samples": 3,
"samples": [
{
"text": "Hello, how can I help you today?",
"latency_ms": 1179.06,
"audio_size_bytes": 30973,
"language": "en",
"model": "eleven_flash_v2_5"
},
{
"text": "Please transfer funds to the specified account.",
"latency_ms": 3534.34,
"audio_size_bytes": 47691,
"language": "en",
"model": "eleven_flash_v2_5"
},
{
"text": "Your transaction has been processed successfully.",
"latency_ms": 574.48,
"audio_size_bytes": 42675,
"language": "en",
"model": "eleven_flash_v2_5"
}
]
}
28 changes: 28 additions & 0 deletions benchmarking_experiments/results/elevenlabs/tts_es_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"language": "es",
"avg_latency_ms": 682.8,
"total_samples": 3,
"samples": [
{
"text": "Hola, ¿cómo puedo ayudarte hoy?",
"latency_ms": 791.74,
"audio_size_bytes": 33898,
"language": "es",
"model": "eleven_flash_v2_5"
},
{
"text": "Por favor, transfiera fondos a la cuenta especificada.",
"latency_ms": 621.14,
"audio_size_bytes": 53960,
"language": "es",
"model": "eleven_flash_v2_5"
},
{
"text": "Su transacción ha sido procesada exitosamente.",
"latency_ms": 635.53,
"audio_size_bytes": 51035,
"language": "es",
"model": "eleven_flash_v2_5"
}
]
}
28 changes: 28 additions & 0 deletions benchmarking_experiments/results/elevenlabs/tts_hi_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"language": "hi",
"avg_latency_ms": 647.62,
"total_samples": 3,
"samples": [
{
"text": "नमस्ते, मैं आपकी कैसे मदद कर सकता हूँ?",
"latency_ms": 704.87,
"audio_size_bytes": 41839,
"language": "hi",
"model": "eleven_flash_v2_5"
},
{
"text": "कृपया निर्दिष्ट खाते में धनराशि स्थानांतरित करें।",
"latency_ms": 622.14,
"audio_size_bytes": 53124,
"language": "hi",
"model": "eleven_flash_v2_5"
},
{
"text": "आपका लेन-देन सफलतापूर्वक संसाधित किया गया है।",
"latency_ms": 615.86,
"audio_size_bytes": 51453,
"language": "hi",
"model": "eleven_flash_v2_5"
}
]
}
Loading