Skip to content

Commit 10ee1ca

Browse files
committed
implements background audio with mixer
depends on livekit/node-sdks#567
1 parent 9a58cd3 commit 10ee1ca

File tree

2 files changed

+96
-59
lines changed

2 files changed

+96
-59
lines changed

agents/src/voice/background_audio.ts

Lines changed: 86 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// SPDX-License-Identifier: Apache-2.0
44
import {
55
AudioFrame,
6+
AudioMixer,
67
AudioSource,
78
LocalAudioTrack,
89
type LocalTrackPublication,
@@ -57,7 +58,7 @@ export interface BackgroundAudioPlayerOptions {
5758

5859
/**
5960
* Sound to play when the agent is thinking.
60-
* TODO (Brian): Implement thinking sound when AudioMixer becomes available
61+
* Plays when agent state changes to 'thinking' and stops when it changes to other states.
6162
*/
6263
thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
6364

@@ -113,15 +114,16 @@ export class PlayHandle {
113114
* This class handles playing ambient sounds and manages audio track publishing.
114115
* It supports:
115116
* - Continuous ambient sound playback with looping
117+
* - Thinking sound playback during agent processing
118+
* - Multiple simultaneous audio streams via AudioMixer
116119
* - Volume control and probability-based sound selection
117120
* - Integration with LiveKit rooms and agent sessions
118121
*
119-
* Note: Thinking sound not yet supported
120-
*
121122
* @example
122123
* ```typescript
123124
* const player = new BackgroundAudioPlayer({
124125
* ambientSound: { source: BuiltinAudioClip.OFFICE_AMBIENCE, volume: 0.8 },
126+
* thinkingSound: { source: BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.6 },
125127
* });
126128
*
127129
* await player.start({ room, agentSession });
@@ -130,9 +132,12 @@ export class PlayHandle {
130132
export class BackgroundAudioPlayer {
131133
private ambientSound?: AudioSourceType | AudioConfig | AudioConfig[];
132134
private thinkingSound?: AudioSourceType | AudioConfig | AudioConfig[];
135+
private streamTimeoutMs: number;
133136

134137
private playTasks: Task<void>[] = [];
135138
private audioSource = new AudioSource(48000, 1, AUDIO_SOURCE_BUFFER_MS);
139+
private audioMixer: AudioMixer;
140+
private mixerTask?: Task<void>;
136141

137142
private room?: Room;
138143
private agentSession?: AgentSession;
@@ -148,15 +153,17 @@ export class BackgroundAudioPlayer {
148153
#logger = log();
149154

150155
constructor(options?: BackgroundAudioPlayerOptions) {
151-
const { ambientSound, thinkingSound } = options || {};
156+
const { ambientSound, thinkingSound, streamTimeoutMs = 200 } = options || {};
152157

153158
this.ambientSound = ambientSound;
154159
this.thinkingSound = thinkingSound;
160+
this.streamTimeoutMs = streamTimeoutMs;
155161

156-
if (this.thinkingSound) {
157-
this.#logger.warn('thinkingSound is not yet supported');
158-
// TODO: Implement thinking sound when AudioMixer becomes available
159-
}
162+
this.audioMixer = new AudioMixer(48000, 1, {
163+
blocksize: 4800,
164+
capacity: 1,
165+
streamTimeoutMs: this.streamTimeoutMs,
166+
});
160167
}
161168

162169
/**
@@ -282,7 +289,11 @@ export class BackgroundAudioPlayer {
282289

283290
// TODO (Brian): check job context is not fake
284291

285-
// TODO (Brian): start audio mixer task
292+
// Start audio mixer task
293+
this.mixerTask = Task.from(async () => {
294+
await this.runMixerTask();
295+
});
296+
286297
this.room.on('reconnected', this.onReconnected);
287298

288299
this.agentSession?.on(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged);
@@ -307,8 +318,12 @@ export class BackgroundAudioPlayer {
307318
await this.republishTask.cancelAndWait(TASK_TIMEOUT_MS);
308319
}
309320

310-
// TODO (Brian): cancel audio mixer task and close audio mixer
321+
// Cancel audio mixer task and close audio mixer
322+
if (this.mixerTask) {
323+
await this.mixerTask.cancelAndWait(TASK_TIMEOUT_MS);
324+
}
311325

326+
await this.audioMixer.aclose();
312327
await this.audioSource.close();
313328

314329
this.agentSession?.off(AgentSessionEventTypes.AgentStateChanged, this.onAgentStateChanged);
@@ -372,12 +387,43 @@ export class BackgroundAudioPlayer {
372387
return;
373388
}
374389

375-
// TODO (Brian): play thinking sound and assign to thinkingHandle
390+
const normalized = this.normalizeSoundSource(this.thinkingSound);
391+
if (normalized) {
392+
const { source, volume } = normalized;
393+
const selectedSound: AudioConfig = { source, volume, probability: 1.0 };
394+
this.thinkingHandle = this.play(selectedSound);
395+
}
376396
} else {
377397
this.thinkingHandle?.stop();
378398
}
379399
};
380400

401+
private applyVolumeToFrame(frame: AudioFrame, volume: number): AudioFrame {
402+
const int16Data = new Int16Array(
403+
frame.data.buffer,
404+
frame.data.byteOffset,
405+
frame.data.byteLength / 2,
406+
);
407+
const float32Data = new Float32Array(int16Data.length);
408+
409+
for (let i = 0; i < int16Data.length; i++) {
410+
float32Data[i] = int16Data[i]!;
411+
}
412+
413+
const volumeFactor = 10 ** Math.log10(volume);
414+
for (let i = 0; i < float32Data.length; i++) {
415+
float32Data[i]! *= volumeFactor;
416+
}
417+
418+
const outputData = new Int16Array(float32Data.length);
419+
for (let i = 0; i < float32Data.length; i++) {
420+
const clipped = Math.max(-32768, Math.min(32767, float32Data[i]!));
421+
outputData[i] = Math.round(clipped);
422+
}
423+
424+
return new AudioFrame(outputData, frame.sampleRate, frame.channels, frame.samplesPerChannel);
425+
}
426+
381427
private async playTask({
382428
playHandle,
383429
sound,
@@ -395,57 +441,44 @@ export class BackgroundAudioPlayer {
395441
sound = getBuiltinAudioPath(sound);
396442
}
397443

444+
let audioStream: AsyncIterable<AudioFrame>;
398445
if (typeof sound === 'string') {
399-
sound = loop
446+
audioStream = loop
400447
? loopAudioFramesFromFile(sound, { abortSignal: signal })
401448
: audioFramesFromFile(sound, { abortSignal: signal });
449+
} else {
450+
audioStream = sound;
402451
}
403452

404-
try {
405-
for await (const frame of sound) {
406-
if (signal.aborted || playHandle.done()) break;
453+
const applyVolume = this.applyVolumeToFrame.bind(this);
407454

408-
let processedFrame: AudioFrame;
409-
410-
if (volume !== 1.0) {
411-
const int16Data = new Int16Array(
412-
frame.data.buffer,
413-
frame.data.byteOffset,
414-
frame.data.byteLength / 2,
415-
);
416-
const float32Data = new Float32Array(int16Data.length);
417-
418-
for (let i = 0; i < int16Data.length; i++) {
419-
float32Data[i] = int16Data[i]!;
420-
}
421-
422-
const volumeFactor = 10 ** Math.log10(volume);
423-
for (let i = 0; i < float32Data.length; i++) {
424-
float32Data[i]! *= volumeFactor;
425-
}
426-
427-
const outputData = new Int16Array(float32Data.length);
428-
for (let i = 0; i < float32Data.length; i++) {
429-
const clipped = Math.max(-32768, Math.min(32767, float32Data[i]!));
430-
outputData[i] = Math.round(clipped);
431-
}
432-
433-
processedFrame = new AudioFrame(
434-
outputData,
435-
frame.sampleRate,
436-
frame.channels,
437-
frame.samplesPerChannel,
438-
);
439-
} else {
440-
processedFrame = frame;
441-
}
442-
443-
// TODO (Brian): use AudioMixer to add/remove frame streams
444-
await this.audioSource.captureFrame(processedFrame);
455+
async function* genWrapper(): AsyncGenerator<AudioFrame> {
456+
for await (const frame of audioStream) {
457+
if (signal.aborted || playHandle.done()) break;
458+
yield volume !== 1.0 ? applyVolume(frame, volume) : frame;
445459
}
460+
// TODO: the waitForPlayout() may be inaccurate by 400ms
461+
playHandle._markPlayoutDone();
462+
}
463+
464+
const gen = genWrapper();
465+
try {
466+
this.audioMixer.addStream(gen);
467+
await playHandle.waitForPlayout(); // wait for playout or interruption
446468
} finally {
447-
// TODO: the waitForPlayout() may be innaccurate by 400ms
469+
this.audioMixer.removeStream(gen);
448470
playHandle._markPlayoutDone();
471+
472+
// Close the generator if it was stopped early
473+
if (playHandle.done()) {
474+
await gen.return(undefined);
475+
}
476+
}
477+
}
478+
479+
private async runMixerTask(): Promise<void> {
480+
for await (const frame of this.audioMixer) {
481+
await this.audioSource.captureFrame(frame);
449482
}
450483
}
451484
}

examples/src/background_audio.ts

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,10 @@ import { z } from 'zod';
1212
* Background audio could make the agent feel more realistic, versus perfect silence
1313
* in the background.
1414
*
15-
* NOTE: Thinking sound is not yet supported (requires AudioMixer implementation)
15+
* This example demonstrates:
16+
* - Ambient background sound (office ambience) playing continuously
17+
* - Thinking sound (keyboard typing) that plays when the agent is processing/thinking
18+
* - Multiple sounds can play simultaneously via AudioMixer
1619
*/
1720

1821
export default defineAgent({
@@ -47,11 +50,12 @@ export default defineAgent({
4750

4851
const backgroundAudio = new voice.BackgroundAudioPlayer({
4952
ambientSound: voice.BuiltinAudioClip.OFFICE_AMBIENCE,
50-
// TODO: Thinking sound not yet supported
51-
// thinkingSound: [
52-
// { source: voice.BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.8 },
53-
// { source: voice.BuiltinAudioClip.KEYBOARD_TYPING2, volume: 0.7 },
54-
// ],
53+
// Thinking sound will play when the agent enters 'thinking' state (e.g., during tool calls)
54+
// Multiple sounds with different probabilities/volumes can be provided
55+
thinkingSound: [
56+
{ source: voice.BuiltinAudioClip.KEYBOARD_TYPING, volume: 0.8, probability: 0.6 },
57+
{ source: voice.BuiltinAudioClip.KEYBOARD_TYPING2, volume: 0.7, probability: 0.4 },
58+
],
5559
});
5660

5761
await backgroundAudio.start({ room: ctx.room, agentSession: session });

0 commit comments

Comments
 (0)