diff --git a/README.md b/README.md index 269868c24..3a07fdc1c 100644 --- a/README.md +++ b/README.md @@ -62,11 +62,11 @@ pnpm install @livekit/agents Currently, only the following plugins are supported: | Plugin | Features | -| ---------------------------------------------------------------------------------------------------- | ------------- | +| ---------------------------------------------------------------------------------------------------- |---------------| | [@livekit/agents-plugin-openai](https://www.npmjs.com/package/@livekit/agents-plugin-openai) | LLM, TTS, STT | | [@livekit/agents-plugin-google](https://www.npmjs.com/package/@livekit/agents-plugin-google) | LLM, TTS | | [@livekit/agents-plugin-deepgram](https://www.npmjs.com/package/@livekit/agents-plugin-deepgram) | STT, TTS | -| [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS | +| [@livekit/agents-plugin-elevenlabs](https://www.npmjs.com/package/@livekit/agents-plugin-elevenlabs) | TTS, STT | | [@livekit/agents-plugin-cartesia](https://www.npmjs.com/package/@livekit/agents-plugin-cartesia) | TTS | | [@livekit/agents-plugin-neuphonic](https://www.npmjs.com/package/@livekit/agents-plugin-neuphonic) | TTS | | [@livekit/agents-plugin-resemble](https://www.npmjs.com/package/@livekit/agents-plugin-resemble) | TTS | diff --git a/plugins/elevenlabs/README.md b/plugins/elevenlabs/README.md index 77764172b..7fd5e59f0 100644 --- a/plugins/elevenlabs/README.md +++ b/plugins/elevenlabs/README.md @@ -3,15 +3,218 @@ SPDX-FileCopyrightText: 2024 LiveKit, Inc. SPDX-License-Identifier: Apache-2.0 --> -# ElevenLabs plugin for LiveKit Agents +# ElevenLabs Plugin for LiveKit Agents The Agents Framework is designed for building realtime, programmable participants that run on servers. Use it to create conversational, multi-modal voice agents that can see, hear, and understand. -This package contains the ElevenLabs plugin, which allows for voice synthesis. +This package contains the ElevenLabs plugin, which provides: +- **Text-to-Speech (TTS)**: High-quality voice synthesis with multiple voices and models +- **Speech-to-Text (STT)**: Real-time and batch transcription with Scribe API + Refer to the [documentation](https://docs.livekit.io/agents/overview/) for information on how to use it, or browse the [API reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_elevenlabs.html). See the [repository](https://github.com/livekit/agents-js) for more information about the framework as a whole. + +## Installation + +```bash +pnpm add @livekit/agents-plugin-elevenlabs +``` + +Set your ElevenLabs API key: +```bash +export ELEVEN_API_KEY=your_api_key_here +``` + +--- + +## Text-to-Speech (TTS) + +For TTS documentation, refer to the [API reference](https://docs.livekit.io/agents-js/modules/plugins_agents_plugin_elevenlabs.html). + +### Quick Example + +```typescript +import { TTS } from '@livekit/agents-plugin-elevenlabs'; + +const tts = new TTS(); +// Use tts for voice synthesis +``` + +--- + +## Speech-to-Text (STT) + +### Features + +- **Multiple Model Support**: Choose between Scribe v1, v2, and v2 realtime +- **Streaming & Non-Streaming**: Support for both batch and real-time transcription +- **Multi-Language**: Supports 35+ languages with automatic language detection +- **Audio Event Tagging**: Optional tagging of non-speech audio events (laughter, footsteps, etc.) +- **VAD Configuration**: Customizable voice activity detection for streaming mode + +### Supported Models + +#### Scribe v1 (`scribe_v1`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Batch transcription of pre-recorded audio +- **Features**: Audio event tagging, language detection + +#### Scribe v2 (`scribe_v2`) +- **Type**: Non-streaming +- **Method**: HTTP POST +- **Use Case**: Improved accuracy for batch transcription +- **Features**: Enhanced model, language detection + +#### Scribe v2 Realtime (`scribe_v2_realtime`) +- **Type**: Streaming (default) +- **Method**: WebSocket +- **Use Case**: Real-time conversation transcription +- **Features**: Interim results, VAD-based segmentation, manual commit support + +### Quick Start + +#### Non-Streaming (Scribe v1) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; + +const stt = new STT({ + apiKey: process.env.ELEVEN_API_KEY, // or set ELEVEN_API_KEY env var + model: 'scribe_v1', + languageCode: 'en', + tagAudioEvents: true, +}); +``` + +#### Streaming (Scribe v2 Realtime) + +```typescript +import { STT } from '@livekit/agents-plugin-elevenlabs'; +import { SpeechEventType } from '@livekit/agents'; + +const stt = new STT({ + model: 'scribe_v2_realtime', // default + sampleRate: 16000, + languageCode: 'en', + commitStrategy: 'vad', // auto-commit on speech end + vadSilenceThresholdSecs: 1.0, +}); +``` + +### Configuration Options + +#### Common Options + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `apiKey` | `string` | `process.env.ELEVEN_API_KEY` | ElevenLabs API key | +| `baseURL` | `string` | `https://api.elevenlabs.io/v1` | API base URL | +| `model` | `STTModels` | `'scribe_v2_realtime'` | Model to use | +| `languageCode` | `string` | `undefined` | Language code (auto-detected if not set) | + +#### Non-Streaming Options (v1, v2) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `tagAudioEvents` | `boolean` | `true` | Tag non-speech events like (laughter) | + +#### Streaming Options (v2_realtime) + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| `sampleRate` | `number` | `16000` | Audio sample rate in Hz (16000, 22050, or 44100) | +| `numChannels` | `number` | `1` | Number of audio channels | +| `commitStrategy` | `'vad' \| 'manual'` | `'vad'` | How to commit transcripts | +| `vadSilenceThresholdSecs` | `number` | `undefined` | VAD silence threshold (0.3-3.0 seconds) | +| `vadThreshold` | `number` | `undefined` | VAD threshold (0.1-0.9) | +| `minSpeechDurationMs` | `number` | `undefined` | Minimum speech duration (50-2000 ms) | +| `minSilenceDurationMs` | `number` | `undefined` | Minimum silence duration (50-2000 ms) | + +### Supported Languages + +The STT plugin supports 35+ languages including: + +English (`en`), Spanish (`es`), French (`fr`), German (`de`), Italian (`it`), Portuguese (`pt`), Polish (`pl`), Dutch (`nl`), Swedish (`sv`), Finnish (`fi`), Danish (`da`), Norwegian (`no`), Czech (`cs`), Romanian (`ro`), Slovak (`sk`), Ukrainian (`uk`), Greek (`el`), Turkish (`tr`), Russian (`ru`), Bulgarian (`bg`), Croatian (`hr`), Serbian (`sr`), Hungarian (`hu`), Lithuanian (`lt`), Latvian (`lv`), Estonian (`et`), Japanese (`ja`), Chinese (`zh`), Korean (`ko`), Hindi (`hi`), Arabic (`ar`), Persian (`fa`), Hebrew (`he`), Indonesian (`id`), Malay (`ms`), Thai (`th`), Vietnamese (`vi`), Tamil (`ta`), Urdu (`ur`) + +### Advanced Usage + +#### Custom VAD Parameters + +Fine-tune voice activity detection for your use case: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + commitStrategy: 'vad', + + // Longer silence before committing (good for thoughtful speakers) + vadSilenceThresholdSecs: 2.0, + + // Higher threshold = more strict about what's considered speech + vadThreshold: 0.7, + + // Ignore very short speech bursts (reduce false positives) + minSpeechDurationMs: 200, + + // Require longer silence to end speech (reduce fragmentation) + minSilenceDurationMs: 500, +}); +``` + +#### Multi-Language Support + +Let ElevenLabs auto-detect the language: + +```typescript +const stt = new STT({ + model: 'scribe_v1', + // Don't set languageCode - will auto-detect +}); + +const event = await stt.recognize(audioBuffer); +console.log('Detected language:', event.alternatives[0].language); +console.log('Text:', event.alternatives[0].text); +``` + +Or specify a language: + +```typescript +const stt = new STT({ + model: 'scribe_v2_realtime', + languageCode: 'es', // Spanish +}); +``` + +### Model Comparison + +| Feature | Scribe v1 | Scribe v2 | Scribe v2 Realtime | +|---------|-----------|-----------|-------------------| +| **Type** | Non-streaming | Non-streaming | Streaming | +| **Latency** | High (batch) | High (batch) | Low (real-time) | +| **Interim Results** | ❌ | ❌ | ✅ | +| **Audio Event Tagging** | ✅ | ❌ | ❌ | +| **VAD Configuration** | ❌ | ❌ | ✅ | +| **Manual Commit** | ❌ | ❌ | ✅ | +| **Best For** | Batch jobs with event detection | High-accuracy batch | Real-time conversations | + +--- + +## Resources + +- [ElevenLabs TTS Documentation](https://elevenlabs.io/docs/api-reference/text-to-speech) +- [ElevenLabs STT Documentation](https://elevenlabs.io/docs/api-reference/speech-to-text) +- [Scribe v2 Streaming Guide](https://elevenlabs.io/docs/cookbooks/speech-to-text/streaming) +- [LiveKit Agents Documentation](https://docs.livekit.io/agents/) +- [LiveKit Agents JS Repository](https://github.com/livekit/agents-js) + +## License + +Copyright 2025 LiveKit, Inc. + +Licensed under the Apache License, Version 2.0. diff --git a/plugins/elevenlabs/package.json b/plugins/elevenlabs/package.json index 12a0e3f15..a7431974f 100644 --- a/plugins/elevenlabs/package.json +++ b/plugins/elevenlabs/package.json @@ -44,7 +44,7 @@ "typescript": "^5.0.0" }, "dependencies": { - "ws": "^8.16.0" + "ws": "^8.18.3" }, "peerDependencies": { "@livekit/agents": "workspace:*", diff --git a/plugins/elevenlabs/src/index.ts b/plugins/elevenlabs/src/index.ts index 66c4eeff6..29a7ec457 100644 --- a/plugins/elevenlabs/src/index.ts +++ b/plugins/elevenlabs/src/index.ts @@ -3,6 +3,8 @@ // SPDX-License-Identifier: Apache-2.0 import { Plugin } from '@livekit/agents'; +export * from './models.js'; +export * from './stt.js'; export * from './tts.js'; class ElevenLabsPlugin extends Plugin { diff --git a/plugins/elevenlabs/src/models.ts b/plugins/elevenlabs/src/models.ts index 59e419145..573844d2f 100644 --- a/plugins/elevenlabs/src/models.ts +++ b/plugins/elevenlabs/src/models.ts @@ -21,3 +21,50 @@ export type TTSEncoding = // | 'mp3_44100_128' // | 'mp3_44100_192' 'pcm_16000' | 'pcm_22050' | 'pcm_44100'; + +export type STTModels = 'scribe_v1' | 'scribe_v2' | 'scribe_v2_realtime'; + +export type STTAudioFormat = 'pcm_16000' | 'pcm_22050' | 'pcm_44100'; + +export type STTCommitStrategy = 'vad' | 'manual'; + +export type STTLanguages = + | 'en' + | 'es' + | 'fr' + | 'de' + | 'it' + | 'pt' + | 'pl' + | 'nl' + | 'sv' + | 'fi' + | 'da' + | 'no' + | 'cs' + | 'ro' + | 'sk' + | 'uk' + | 'el' + | 'tr' + | 'ru' + | 'bg' + | 'hr' + | 'sr' + | 'hu' + | 'lt' + | 'lv' + | 'et' + | 'ja' + | 'zh' + | 'ko' + | 'hi' + | 'ar' + | 'fa' + | 'he' + | 'id' + | 'ms' + | 'th' + | 'vi' + | 'ta' + | 'ur'; diff --git a/plugins/elevenlabs/src/stt.ts b/plugins/elevenlabs/src/stt.ts new file mode 100644 index 000000000..1282b915c --- /dev/null +++ b/plugins/elevenlabs/src/stt.ts @@ -0,0 +1,517 @@ +// SPDX-FileCopyrightText: 2025 LiveKit, Inc. +// +// SPDX-License-Identifier: Apache-2.0 +import { + APIConnectionError, + APIStatusError, + APITimeoutError, + type AudioBuffer, + AudioByteStream, + Future, + Task, + log, + mergeFrames, + stt, + waitForAbort, +} from '@livekit/agents'; +import type { AudioFrame } from '@livekit/rtc-node'; +import { type RawData, WebSocket } from 'ws'; +import type { STTAudioFormat, STTCommitStrategy, STTLanguages, STTModels } from './models.js'; + +const API_BASE_URL_V1 = 'https://api.elevenlabs.io/v1'; +const AUTHORIZATION_HEADER = 'xi-api-key'; + +export interface STTOptions { + apiKey?: string; + baseURL: string; + model: STTModels; + languageCode?: STTLanguages | string; + tagAudioEvents: boolean; + sampleRate: number; + numChannels: number; + // Streaming-specific options (only used for scribe_v2_realtime) + commitStrategy: STTCommitStrategy; + vadSilenceThresholdSecs?: number; + vadThreshold?: number; + minSpeechDurationMs?: number; + minSilenceDurationMs?: number; +} + +const defaultSTTOptions: STTOptions = { + apiKey: process.env.ELEVEN_API_KEY, + baseURL: API_BASE_URL_V1, + model: 'scribe_v2_realtime', + tagAudioEvents: false, + sampleRate: 16000, + numChannels: 1, + commitStrategy: 'vad', +}; + +export class STT extends stt.STT { + #opts: STTOptions; + #logger = log(); + label = 'elevenlabs.STT'; + + /** + * Create a new instance of ElevenLabs STT. + * + * @remarks + * `apiKey` must be set to your ElevenLabs API key, either using the argument or by setting the + * `ELEVEN_API_KEY` environment variable. + * + * @param opts - Configuration options for the STT service + * @param opts.apiKey - ElevenLabs API key (defaults to ELEVEN_API_KEY env var) + * @param opts.baseURL - Base URL for the API (defaults to https://api.elevenlabs.io/v1) + * @param opts.model - Model to use: 'scribe_v1' (non-streaming), 'scribe_v2' (non-streaming), or 'scribe_v2_realtime' (streaming) + * @param opts.languageCode - Language code for transcription (optional, auto-detected if not set) + * @param opts.tagAudioEvents - Whether to tag audio events like (laughter), (footsteps), etc. (defaults to false) + * @param opts.sampleRate - Sample rate for audio (defaults to 16000) + * @param opts.numChannels - Number of audio channels (defaults to 1) + * @param opts.commitStrategy - Commit strategy: 'vad' (auto) or 'manual' (defaults to 'vad', scribe_v2_realtime only) + * @param opts.vadSilenceThresholdSecs - VAD silence threshold in seconds, 0.3-3.0 (scribe_v2_realtime only) + * @param opts.vadThreshold - VAD threshold, 0.1-0.9 (scribe_v2_realtime only) + * @param opts.minSpeechDurationMs - Minimum speech duration in ms, 50-2000 (scribe_v2_realtime only) + * @param opts.minSilenceDurationMs - Minimum silence duration in ms, 50-2000 (scribe_v2_realtime only) + */ + constructor(opts: Partial = defaultSTTOptions) { + const mergedOpts = { ...defaultSTTOptions, ...opts }; + const isStreaming = mergedOpts.model === 'scribe_v2_realtime'; + + super({ + streaming: isStreaming, + interimResults: isStreaming, + }); + + this.#opts = mergedOpts; + + if (this.#opts.apiKey === undefined) { + throw new Error( + 'ElevenLabs API key is required, whether as an argument or as $ELEVEN_API_KEY', + ); + } + } + + #createWav(frame: AudioFrame): Buffer { + const bitsPerSample = 16; + const byteRate = (frame.sampleRate * frame.channels * bitsPerSample) / 8; + const blockAlign = (frame.channels * bitsPerSample) / 8; + + const header = Buffer.alloc(44); + header.write('RIFF', 0); + header.writeUInt32LE(36 + frame.data.byteLength, 4); + header.write('WAVE', 8); + header.write('fmt ', 12); + header.writeUInt32LE(16, 16); + header.writeUInt16LE(1, 20); + header.writeUInt16LE(frame.channels, 22); + header.writeUInt32LE(frame.sampleRate, 24); + header.writeUInt32LE(byteRate, 28); + header.writeUInt16LE(blockAlign, 32); + header.writeUInt16LE(16, 34); + header.write('data', 36); + header.writeUInt32LE(frame.data.byteLength, 40); + return Buffer.concat([header, Buffer.from(frame.data.buffer)]); + } + + async _recognize(buffer: AudioBuffer, language?: string): Promise { + if (this.#opts.model === 'scribe_v2_realtime') { + throw new Error( + 'scribe_v2_realtime requires streaming. Use stream() method instead, or use scribe_v1/scribe_v2 for non-streaming recognize()', + ); + } + + const mergedBuffer = mergeFrames(buffer); + const wavBytes = this.#createWav(mergedBuffer); + + // Create form data for the request + const form = new FormData(); + form.append('file', new Blob([wavBytes], { type: 'audio/wav' }), 'audio.wav'); + form.append('model_id', this.#opts.model); + form.append('tag_audio_events', this.#opts.tagAudioEvents.toString()); + + // Add language code if provided (either from options or recognize call) + const languageCode = language || this.#opts.languageCode; + if (languageCode) { + form.append('language_code', languageCode); + } + + try { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), 30000); // 30 second timeout + + const response = await fetch(`${this.#opts.baseURL}/speech-to-text`, { + method: 'POST', + headers: { + [AUTHORIZATION_HEADER]: this.#opts.apiKey!, + }, + body: form, + signal: controller.signal, + }); + + clearTimeout(timeout); + + if (!response.ok) { + const errorText = await response.text(); + throw new APIStatusError({ + message: `ElevenLabs API error: ${response.statusText} - ${errorText}`, + options: { + statusCode: response.status, + requestId: null, + body: null, + retryable: response.status >= 500, + }, + }); + } + + const responseJson = await response.json(); + const extractedText = responseJson.text || ''; + const detectedLanguage = responseJson.language_code || languageCode || 'en'; + const words = responseJson.words || []; + + let startTime = 0; + let endTime = 0; + + if (words.length > 0) { + startTime = Math.min(...words.map((w: any) => w.start || 0)); + endTime = Math.max(...words.map((w: any) => w.end || 0)); + } + + return { + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text: extractedText, + language: detectedLanguage, + startTime, + endTime, + confidence: 1.0, // ElevenLabs doesn't provide confidence scores + }, + ], + }; + } catch (error) { + if (error instanceof APIStatusError) { + throw error; + } + if ((error as any).name === 'AbortError') { + throw new APITimeoutError({ + message: 'ElevenLabs API request timed out', + options: { retryable: true }, + }); + } + throw new APIConnectionError({ + message: `Failed to connect to ElevenLabs: ${error}`, + options: { retryable: true }, + }); + } + } + + updateOptions(opts: Partial) { + this.#opts = { ...this.#opts, ...opts }; + } + + stream(): SpeechStream { + if (this.#opts.model !== 'scribe_v2_realtime') { + throw new Error( + 'Streaming is only supported with scribe_v2_realtime model. For non-streaming, use recognize() method.', + ); + } + return new SpeechStream(this, this.#opts); + } +} + +export class SpeechStream extends stt.SpeechStream { + #opts: STTOptions; + #logger = log(); + #speaking = false; + #lastCommittedText = ''; + label = 'elevenlabs.SpeechStream'; + + constructor(stt: STT, opts: STTOptions) { + super(stt, opts.sampleRate); + this.#opts = opts; + this.closed = false; + } + + protected async run() { + const maxRetry = 32; + let retries = 0; + let ws: WebSocket; + + while (!this.input.closed) { + // Build WebSocket URL + const audioFormat: STTAudioFormat = `pcm_${this.#opts.sampleRate}` as STTAudioFormat; + const baseUrl = this.#opts.baseURL.replace('https://', 'wss://').replace('http://', 'ws://'); + const streamURL = new URL(`${baseUrl}/speech-to-text/realtime`); + + const params = { + model_id: this.#opts.model, + encoding: audioFormat, + sample_rate: this.#opts.sampleRate, + commit_strategy: this.#opts.commitStrategy, + vad_silence_threshold_secs: this.#opts.vadSilenceThresholdSecs, + vad_threshold: this.#opts.vadThreshold, + min_speech_duration_ms: this.#opts.minSpeechDurationMs, + min_silence_duration_ms: this.#opts.minSilenceDurationMs, + language_code: this.#opts.languageCode, + }; + + Object.entries(params).forEach(([k, v]) => { + if (v !== undefined) { + streamURL.searchParams.append(k, String(v)); + } + }); + + ws = new WebSocket(streamURL.toString(), { + headers: { [AUTHORIZATION_HEADER]: `${this.#opts.apiKey}` }, + }); + + try { + const connFut = new Future(); + ws.once('open', () => connFut.resolve()); + ws.once('error', (error) => connFut.reject(error)); + ws.once('close', (code) => connFut.reject(new Error(`WebSocket returned ${code}`))); + + await connFut.await; + + // on success reset retries + retries = 0; + + await this.#runWS(ws); + } catch (e) { + if (retries >= maxRetry) { + throw new Error(`failed to connect to ElevenLabs after ${retries} attempts: ${e}`); + } + + ws.removeAllListeners(); + + const delay = Math.min(retries * 5, 10); + retries++; + + this.#logger.warn( + `STT: failed to connect to ElevenLabs, retrying in ${delay} seconds: ${e} (${retries}/${maxRetry})`, + ); + await new Promise((resolve) => setTimeout(resolve, delay * 1000)); + } + } + + this.closed = true; + } + + async #runWS(ws: WebSocket) { + let closing = false; + const abortController = new AbortController(); + + const keepalive = setInterval(() => { + try { + if (ws.readyState === WebSocket.OPEN) { + ws.ping(); + } + } catch { + clearInterval(keepalive); + return; + } + }, 5000); + + const sendTask = Task.from(async (controller) => { + const samples100Ms = Math.floor(this.#opts.sampleRate / 10); + const stream = new AudioByteStream( + this.#opts.sampleRate, + this.#opts.numChannels, + samples100Ms, + ); + + let frame_count = 0; + while (!controller.signal.aborted) { + const result = await Promise.race([this.input.next(), waitForAbort(controller.signal)]); + + if (result === undefined) break; // aborted + if (result.done) { + controller.abort(); + break; + } + + const data = result.value; + let frames: AudioFrame[]; + if (data === SpeechStream.FLUSH_SENTINEL) { + frames = stream.flush(); + + // Send any remaining frames + for (const frame of frames) { + const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: audioB64, + commit: false, + sample_rate: this.#opts.sampleRate, + }), + ); + } + + // Send commit message if using manual commit strategy + if (this.#opts.commitStrategy === 'manual') { + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: '', + commit: true, + sample_rate: this.#opts.sampleRate, + }), + ); + } + } else { + if ( + data.sampleRate !== this.#opts.sampleRate || + data.channels !== this.#opts.numChannels + ) { + throw new Error( + `sample rate or channel count of frame does not match (expected ${this.#opts.sampleRate}/${this.#opts.numChannels}, got ${data.sampleRate}/${data.channels})`, + ); + } + frames = stream.write(data.data.buffer); + frame_count += frames.length; + + if (frame_count % 100 == 0) { + this.#logger.debug(`STT: Sent ${frame_count} audio frames`); + } + + for (const frame of frames) { + const audioB64 = Buffer.from(frame.data.buffer).toString('base64'); + ws.send( + JSON.stringify({ + message_type: 'input_audio_chunk', + audio_base_64: audioB64, + commit: false, + sample_rate: this.#opts.sampleRate, + }), + ); + } + } + } + + this.#logger.info(`STT: Send task complete, sent ${frame_count} total frames`); + closing = true; + }, abortController); + + const wsMonitor = Task.from(async (controller) => { + const connectionClosed = new Promise((resolve, reject) => + ws.once('close', (code, reason) => { + if (!closing) { + this.#logger.error(`STT: WebSocket closed unexpectedly with code ${code}: ${reason}`); + reject(new Error('WebSocket closed')); + } else { + this.#logger.error(`STT: WebSocket closed normally ${code}: ${reason}`); + resolve(); + } + }), + ); + + await Promise.race([connectionClosed, waitForAbort(controller.signal)]); + }, abortController); + + const listenTask = Task.from(async (controller) => { + const listenMessage = new Promise((resolve, reject) => { + ws.on('message', (msg) => { + try { + const json = JSON.parse(msg.toString()); + this.#processStreamEvent(json); + + if (this.closed || closing) { + resolve(); + } + } catch (err) { + this.#logger.error(`STT: Error processing message: ${msg}`); + reject(err); + } + }); + }); + + await Promise.race([listenMessage, waitForAbort(controller.signal)]); + }, abortController); + + try { + await Promise.all([sendTask.result, listenTask.result, wsMonitor]); + } finally { + closing = true; + abortController.abort(); + ws.close(); + clearInterval(keepalive); + } + } + + #processStreamEvent(data: any) { + const messageType = data.message_type; + + if (messageType === 'partial_transcript') { + const text = data.text || ''; + + // Ignore stale partial transcripts that match the last committed text + if (text && text === this.#lastCommittedText) { + return; + } + + if (text) { + // Send START_OF_SPEECH if this is the first transcript in a new segment + if (!this.#speaking) { + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + this.#speaking = true; + this.#lastCommittedText = ''; + } + + this.queue.put({ + type: stt.SpeechEventType.INTERIM_TRANSCRIPT, + alternatives: [ + { + text, + language: this.#opts.languageCode || 'en', + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + } + } else if ( + messageType === 'committed_transcript' || + messageType === 'committed_transcript_with_timestamps' + ) { + const text = data.text || ''; + + if (text) { + // Send START_OF_SPEECH if we get a FINAL without any INTERIM first + if (!this.#speaking) { + this.queue.put({ type: stt.SpeechEventType.START_OF_SPEECH }); + } + + this.queue.put({ + type: stt.SpeechEventType.FINAL_TRANSCRIPT, + alternatives: [ + { + text, + language: this.#opts.languageCode || 'en', + startTime: 0, + endTime: 0, + confidence: 1.0, + }, + ], + }); + + // Send end of speech event + this.queue.put({ type: stt.SpeechEventType.END_OF_SPEECH }); + this.#speaking = false; + this.#lastCommittedText = text; + } else { + // Empty commit - just reset state + this.#speaking = false; + this.#lastCommittedText = ''; + } + } else if (messageType === 'session_started') { + const sessionId = data.session_id || 'unknown'; + this.#logger.info(`STT: ElevenLabs session started with ID: ${sessionId}`); + } else if (messageType === 'input_error') { + this.#logger.error(`STT: Input Error received: ${data.error}. We ignore this for now.`); + } else { + this.#logger.warn(`STT: Unknown message type: ${messageType}`); + } + } +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0ff5271e8..bacc5ef17 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -365,8 +365,8 @@ importers: plugins/elevenlabs: dependencies: ws: - specifier: ^8.16.0 - version: 8.17.0 + specifier: ^8.18.3 + version: 8.18.3 devDependencies: '@livekit/agents': specifier: workspace:* @@ -1368,92 +1368,78 @@ packages: resolution: {integrity: sha512-RXwd0CgG+uPRX5YYrkzKyalt2OJYRiJQ8ED/fi1tq9WQW2jsQIn0tqrlR5l5dr/rjqq6AHAxURhj2DVjyQWSOA==} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-arm@1.2.0': resolution: {integrity: sha512-mWd2uWvDtL/nvIzThLq3fr2nnGfyr/XMXlq8ZJ9WMR6PXijHlC3ksp0IpuhK6bougvQrchUAfzRLnbsen0Cqvw==} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-ppc64@1.2.0': resolution: {integrity: sha512-Xod/7KaDDHkYu2phxxfeEPXfVXFKx70EAFZ0qyUdOjCcxbjqyJOEUpDe6RIyaunGxT34Anf9ue/wuWOqBW2WcQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-s390x@1.2.0': resolution: {integrity: sha512-eMKfzDxLGT8mnmPJTNMcjfO33fLiTDsrMlUVcp6b96ETbnJmd4uvZxVJSKPQfS+odwfVaGifhsB07J1LynFehw==} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-libvips-linux-x64@1.2.0': resolution: {integrity: sha512-ZW3FPWIc7K1sH9E3nxIGB3y3dZkpJlMnkk7z5tu1nSkBoCgw2nSRTFHI5pB/3CQaJM0pdzMF3paf9ckKMSE9Tg==} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-libvips-linuxmusl-arm64@1.2.0': resolution: {integrity: sha512-UG+LqQJbf5VJ8NWJ5Z3tdIe/HXjuIdo4JeVNADXBFuG7z9zjoegpzzGIyV5zQKi4zaJjnAd2+g2nna8TZvuW9Q==} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-libvips-linuxmusl-x64@1.2.0': resolution: {integrity: sha512-SRYOLR7CXPgNze8akZwjoGBoN1ThNZoqpOgfnOxmWsklTGVfJiGJoC/Lod7aNMGA1jSsKWM1+HRX43OP6p9+6Q==} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-linux-arm64@0.34.3': resolution: {integrity: sha512-QdrKe3EvQrqwkDrtuTIjI0bu6YEJHTgEeqdzI3uWJOH6G1O8Nl1iEeVYRGdj1h5I21CqxSvQp1Yv7xeU3ZewbA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [glibc] '@img/sharp-linux-arm@0.34.3': resolution: {integrity: sha512-oBK9l+h6KBN0i3dC8rYntLiVfW8D8wH+NPNT3O/WBHeW0OQWCjfWksLUaPidsrDKpJgXp3G3/hkmhptAW0I3+A==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm] os: [linux] - libc: [glibc] '@img/sharp-linux-ppc64@0.34.3': resolution: {integrity: sha512-GLtbLQMCNC5nxuImPR2+RgrviwKwVql28FWZIW1zWruy6zLgA5/x2ZXk3mxj58X/tszVF69KK0Is83V8YgWhLA==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [ppc64] os: [linux] - libc: [glibc] '@img/sharp-linux-s390x@0.34.3': resolution: {integrity: sha512-3gahT+A6c4cdc2edhsLHmIOXMb17ltffJlxR0aC2VPZfwKoTGZec6u5GrFgdR7ciJSsHT27BD3TIuGcuRT0KmQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [s390x] os: [linux] - libc: [glibc] '@img/sharp-linux-x64@0.34.3': resolution: {integrity: sha512-8kYso8d806ypnSq3/Ly0QEw90V5ZoHh10yH0HnrzOCr6DKAPI6QVHvwleqMkVQ0m+fc7EH8ah0BB0QPuWY6zJQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [glibc] '@img/sharp-linuxmusl-arm64@0.34.3': resolution: {integrity: sha512-vAjbHDlr4izEiXM1OTggpCcPg9tn4YriK5vAjowJsHwdBIdx0fYRsURkxLG2RLm9gyBq66gwtWI8Gx0/ov+JKQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [arm64] os: [linux] - libc: [musl] '@img/sharp-linuxmusl-x64@0.34.3': resolution: {integrity: sha512-gCWUn9547K5bwvOn9l5XGAEjVTTRji4aPTqLzGXHvIr6bIDZKNTA34seMPgM0WmSf+RYBH411VavCejp3PkOeQ==} engines: {node: ^18.17.0 || ^20.3.0 || >=21.0.0} cpu: [x64] os: [linux] - libc: [musl] '@img/sharp-wasm32@0.34.3': resolution: {integrity: sha512-+CyRcpagHMGteySaWos8IbnXcHgfDn7pO2fiC2slJxvNq9gDipYBN42/RagzctVRKgxATmfqOSulgZv5e1RdMg==} @@ -1578,14 +1564,12 @@ packages: engines: {node: '>= 10'} cpu: [arm64] os: [linux] - libc: [glibc] '@livekit/rtc-node-linux-x64-gnu@0.13.13': resolution: {integrity: sha512-B/SgbeBRobpA5LqmDEoBJHpRXePpoF4RO4F0zJf9BdkDhOR0j77p6hD0ZiOuPTRoBzUqukpsTszp+lZnHoNmiA==} engines: {node: '>= 10'} cpu: [x64] os: [linux] - libc: [glibc] '@livekit/rtc-node-win32-x64-msvc@0.13.13': resolution: {integrity: sha512-ygVYV4eHczs3QdaW/p0ADhhm7InUDhFaCYk8OzzIn056ZibZPXzvPizCThZqs8VsDniA01MraZF3qhZZb8IyRg==} @@ -1726,121 +1710,101 @@ packages: resolution: {integrity: sha512-3reX2fUHqN7sffBNqmEyMQVj/CKhIHZd4y631duy0hZqI8Qoqf6lTtmAKvJFYa6bhU95B1D0WgzHkmTg33In0A==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-gnueabihf@4.40.0': resolution: {integrity: sha512-y/qUMOpJxBMy8xCXD++jeu8t7kzjlOCkoxxajL58G62PJGBZVl/Gwpm7JK9+YvlB701rcQTzjUZ1JgUoPTnoQA==} cpu: [arm] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.17.2': resolution: {integrity: sha512-uSqpsp91mheRgw96xtyAGP9FW5ChctTFEoXP0r5FAzj/3ZRv3Uxjtc7taRQSaQM/q85KEKjKsZuiZM3GyUivRg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm-musleabihf@4.40.0': resolution: {integrity: sha512-GoCsPibtVdJFPv/BOIvBKO/XmwZLwaNWdyD8TKlXuqp0veo2sHE+A/vpMQ5iSArRUz/uaoj4h5S6Pn0+PdhRjg==} cpu: [arm] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.17.2': resolution: {integrity: sha512-EMMPHkiCRtE8Wdk3Qhtciq6BndLtstqZIroHiiGzB3C5LDJmIZcSzVtLRbwuXuUft1Cnv+9fxuDtDxz3k3EW2A==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-gnu@4.40.0': resolution: {integrity: sha512-L5ZLphTjjAD9leJzSLI7rr8fNqJMlGDKlazW2tX4IUF9P7R5TMQPElpH82Q7eNIDQnQlAyiNVfRPfP2vM5Avvg==} cpu: [arm64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.17.2': resolution: {integrity: sha512-NMPylUUZ1i0z/xJUIx6VUhISZDRT+uTWpBcjdv0/zkp7b/bQDF+NfnfdzuTiB1G6HTodgoFa93hp0O1xl+/UbA==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-arm64-musl@4.40.0': resolution: {integrity: sha512-ATZvCRGCDtv1Y4gpDIXsS+wfFeFuLwVxyUBSLawjgXK2tRE6fnsQEkE4csQQYWlBlsFztRzCnBvWVfcae/1qxQ==} cpu: [arm64] os: [linux] - libc: [musl] '@rollup/rollup-linux-loongarch64-gnu@4.40.0': resolution: {integrity: sha512-wG9e2XtIhd++QugU5MD9i7OnpaVb08ji3P1y/hNbxrQ3sYEelKJOq1UJ5dXczeo6Hj2rfDEL5GdtkMSVLa/AOg==} cpu: [loong64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.17.2': resolution: {integrity: sha512-T19My13y8uYXPw/L/k0JYaX1fJKFT/PWdXiHr8mTbXWxjVF1t+8Xl31DgBBvEKclw+1b00Chg0hxE2O7bTG7GQ==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-powerpc64le-gnu@4.40.0': resolution: {integrity: sha512-vgXfWmj0f3jAUvC7TZSU/m/cOE558ILWDzS7jBhiCAFpY2WEBn5jqgbqvmzlMjtp8KlLcBlXVD2mkTSEQE6Ixw==} cpu: [ppc64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.17.2': resolution: {integrity: sha512-BOaNfthf3X3fOWAB+IJ9kxTgPmMqPPH5f5k2DcCsRrBIbWnaJCgX2ll77dV1TdSy9SaXTR5iDXRL8n7AnoP5cg==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-gnu@4.40.0': resolution: {integrity: sha512-uJkYTugqtPZBS3Z136arevt/FsKTF/J9dEMTX/cwR7lsAW4bShzI2R0pJVw+hcBTWF4dxVckYh72Hk3/hWNKvA==} cpu: [riscv64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.40.0': resolution: {integrity: sha512-rKmSj6EXQRnhSkE22+WvrqOqRtk733x3p5sWpZilhmjnkHkpeCgWsFFo0dGnUGeA+OZjRl3+VYq+HyCOEuwcxQ==} cpu: [riscv64] os: [linux] - libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.17.2': resolution: {integrity: sha512-W0UP/x7bnn3xN2eYMql2T/+wpASLE5SjObXILTMPUBDB/Fg/FxC+gX4nvCfPBCbNhz51C+HcqQp2qQ4u25ok6g==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-s390x-gnu@4.40.0': resolution: {integrity: sha512-SpnYlAfKPOoVsQqmTFJ0usx0z84bzGOS9anAC0AZ3rdSo3snecihbhFTlJZ8XMwzqAcodjFU4+/SM311dqE5Sw==} cpu: [s390x] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.17.2': resolution: {integrity: sha512-Hy7pLwByUOuyaFC6mAr7m+oMC+V7qyifzs/nW2OJfC8H4hbCzOX07Ov0VFk/zP3kBsELWNFi7rJtgbKYsav9QQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.40.0': resolution: {integrity: sha512-RcDGMtqF9EFN8i2RYN2W+64CdHruJ5rPqrlYw+cgM3uOVPSsnAQps7cpjXe9be/yDp8UC7VLoCoKC8J3Kn2FkQ==} cpu: [x64] os: [linux] - libc: [glibc] '@rollup/rollup-linux-x64-musl@4.17.2': resolution: {integrity: sha512-h1+yTWeYbRdAyJ/jMiVw0l6fOOm/0D1vNLui9iPuqgRGnXA0u21gAqOyB5iHjlM9MMfNOm9RHCQ7zLIzT0x11Q==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-linux-x64-musl@4.40.0': resolution: {integrity: sha512-HZvjpiUmSNx5zFgwtQAV1GaGazT2RWvqeDi0hV+AtC8unqqDSsaFjPxfsO6qPtKRRg25SisACWnJ37Yio8ttaw==} cpu: [x64] os: [linux] - libc: [musl] '@rollup/rollup-win32-arm64-msvc@4.17.2': resolution: {integrity: sha512-tmdtXMfKAjy5+IQsVtDiCfqbynAQE/TQRpWdVataHmhMb9DCoJxp9vLcCBjEQWMiUYxO1QprH/HbY9ragCEFLA==}