Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion livekit-agents/livekit/agents/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from rich.table import Column, Table
from rich.text import Text
from rich.theme import Theme
from rich.traceback import Traceback as RichTraceback

from livekit import rtc

Expand Down
21 changes: 17 additions & 4 deletions livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:

self._current_speech: SpeechHandle | None = None
self._speech_q: list[tuple[int, float, SpeechHandle]] = []
self._user_silence_event: asyncio.Event = asyncio.Event()
self._user_silence_event.set()

# for false interruption handling
self._paused_speech: SpeechHandle | None = None
Expand Down Expand Up @@ -1213,6 +1215,7 @@ def _interrupt_by_audio_activity(self) -> None:

def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
self._session._update_user_state("speaking")
self._user_silence_event.clear()

if self._false_interruption_timer:
# cancel the timer when user starts speaking but leave the paused state unchanged
Expand All @@ -1227,6 +1230,7 @@ def on_end_of_speech(self, ev: vad.VADEvent | None) -> None:
"listening",
last_speaking_time=speech_end_time,
)
self._user_silence_event.set()

if (
self._paused_speech
Expand All @@ -1243,6 +1247,11 @@ def on_vad_inference_done(self, ev: vad.VADEvent) -> None:
if ev.speech_duration >= self._session.options.min_interruption_duration:
self._interrupt_by_audio_activity()

if ev.speaking and ev.raw_accumulated_silence <= 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we may want a threshold of raw_accumulated_speech here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, we definitely don't want it to be speaking between user words.

self._user_silence_event.clear()
else:
self._user_silence_event.set()

def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -> None:
if isinstance(self.llm, llm.RealtimeModel) and self.llm.capabilities.user_transcription:
# skip stt transcription if user_transcription is enabled on the realtime model
Expand Down Expand Up @@ -1584,7 +1593,8 @@ async def _tts_task(
audio_output = self._session.output.audio if self._session.output.audio_enabled else None

wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
await speech_handle.wait_if_not_interrupted([wait_for_authorization])
wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
speech_handle._clear_authorization()

if speech_handle.interrupted:
Expand Down Expand Up @@ -1797,7 +1807,8 @@ async def _pipeline_reply_task(
self._session._update_agent_state("thinking")

wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
await speech_handle.wait_if_not_interrupted([wait_for_authorization])
wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
speech_handle._clear_authorization()

if speech_handle.interrupted:
Expand Down Expand Up @@ -2072,7 +2083,8 @@ async def _realtime_reply_task(

# realtime_reply_task is called only when there's text input, native audio input is handled by _realtime_generation_task
wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
await speech_handle.wait_if_not_interrupted([wait_for_authorization])
wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
if speech_handle.interrupted:
await utils.aio.cancel_and_wait(wait_for_authorization)

Expand Down Expand Up @@ -2142,7 +2154,8 @@ async def _realtime_generation_task(
tool_ctx = llm.ToolContext(self.tools)

wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
await speech_handle.wait_if_not_interrupted([wait_for_authorization])
wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
speech_handle._clear_authorization()

if speech_handle.interrupted:
Expand Down