livekit · chenghao-mou · Nov 26, 2025 · Nov 26, 2025 · longcw · Nov 27, 2025
diff --git a/livekit-agents/livekit/agents/cli/cli.py b/livekit-agents/livekit/agents/cli/cli.py
@@ -35,7 +35,6 @@
 from rich.table import Column, Table
 from rich.text import Text
 from rich.theme import Theme
-from rich.traceback import Traceback as RichTraceback
 
 from livekit import rtc
 

diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -120,6 +120,8 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
 
         self._current_speech: SpeechHandle | None = None
         self._speech_q: list[tuple[int, float, SpeechHandle]] = []
+        self._user_silence_event: asyncio.Event = asyncio.Event()
+        self._user_silence_event.set()
 
         # for false interruption handling
         self._paused_speech: SpeechHandle | None = None
@@ -1213,6 +1215,7 @@ def _interrupt_by_audio_activity(self) -> None:
 
     def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
         self._session._update_user_state("speaking")
+        self._user_silence_event.clear()
 
         if self._false_interruption_timer:
             # cancel the timer when user starts speaking but leave the paused state unchanged
@@ -1227,6 +1230,7 @@ def on_end_of_speech(self, ev: vad.VADEvent | None) -> None:
             "listening",
             last_speaking_time=speech_end_time,
         )
+        self._user_silence_event.set()
 
         if (
             self._paused_speech
@@ -1243,6 +1247,11 @@ def on_vad_inference_done(self, ev: vad.VADEvent) -> None:
         if ev.speech_duration >= self._session.options.min_interruption_duration:
             self._interrupt_by_audio_activity()
 
+        if ev.speaking and ev.raw_accumulated_silence <= 0:
+            self._user_silence_event.clear()
+        else:
+            self._user_silence_event.set()
+
     def on_interim_transcript(self, ev: stt.SpeechEvent, *, speaking: bool | None) -> None:
         if isinstance(self.llm, llm.RealtimeModel) and self.llm.capabilities.user_transcription:
             # skip stt transcription if user_transcription is enabled on the realtime model
@@ -1584,7 +1593,8 @@ async def _tts_task(
         audio_output = self._session.output.audio if self._session.output.audio_enabled else None
 
         wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
-        await speech_handle.wait_if_not_interrupted([wait_for_authorization])
+        wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
+        await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
         speech_handle._clear_authorization()
 
         if speech_handle.interrupted:
@@ -1797,7 +1807,8 @@ async def _pipeline_reply_task(
         self._session._update_agent_state("thinking")
 
         wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
-        await speech_handle.wait_if_not_interrupted([wait_for_authorization])
+        wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
+        await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
         speech_handle._clear_authorization()
 
         if speech_handle.interrupted:
@@ -2072,7 +2083,8 @@ async def _realtime_reply_task(
 
         # realtime_reply_task is called only when there's text input, native audio input is handled by _realtime_generation_task
         wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
-        await speech_handle.wait_if_not_interrupted([wait_for_authorization])
+        wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
+        await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
         if speech_handle.interrupted:
             await utils.aio.cancel_and_wait(wait_for_authorization)
 
@@ -2142,7 +2154,8 @@ async def _realtime_generation_task(
         tool_ctx = llm.ToolContext(self.tools)
 
         wait_for_authorization = asyncio.ensure_future(speech_handle._wait_for_authorization())
-        await speech_handle.wait_if_not_interrupted([wait_for_authorization])
+        wait_for_user_silence = asyncio.ensure_future(self._user_silence_event.wait())
+        await speech_handle.wait_if_not_interrupted([wait_for_authorization, wait_for_user_silence])
         speech_handle._clear_authorization()
 
         if speech_handle.interrupted: