re-emit from last overlap speech

chenghao-mou · chenghao-mou · commit 2bc9cf54e043 · 2025-11-27T12:35:12.000Z
diff --git a/livekit-agents/livekit/agents/inference/bargein.py b/livekit-agents/livekit/agents/inference/bargein.py
@@ -59,6 +59,9 @@ class BargeinEvent:
     inference_duration: float = 0.0
     """Time taken to perform the inference, in seconds."""
 
+    overlap_speech_started_at: float | None = None
+    """Timestamp (in seconds) when the overlap speech started. Useful for emitting held transcripts."""
+
 
 class BargeinError(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)
@@ -247,6 +250,7 @@ def __init__(self, bargein_detector: BargeinDetector, conn_options: APIConnectOp
         self._conn_options = conn_options
         self._sample_rate = bargein_detector._sample_rate
         self._resampler: rtc.AudioResampler | None = None
+        self._overlap_speech_started_at: float | None = None
 
     @abstractmethod
     async def _run(self) -> None: ...
@@ -315,6 +319,7 @@ def start_overlap_speech(self) -> None:
         self._check_input_not_ended()
         self._check_not_closed()
         self._input_ch.send_nowait(self._OverlapSpeechStartedSentinel())
+        self._overlap_speech_started_at = time.time()
 
     def end_overlap_speech(self) -> None:
         """Mark the end of the overlap speech"""
@@ -494,13 +499,15 @@ async def _send_task() -> None:
                         timestamp=time.time(),
                         is_bargein=is_bargein,
                         inference_duration=inference_duration,
+                        overlap_speech_started_at=self._overlap_speech_started_at,
                     )
                 )
 
                 if is_bargein:
                     ev = BargeinEvent(
                         type=BargeinEventType.BARGEIN,
                         timestamp=time.time(),
+                        overlap_speech_started_at=self._overlap_speech_started_at,
                     )
                     self._event_ch.send_nowait(ev)
                     self._bargein_detector.emit("bargein_detected", ev)
@@ -696,13 +703,15 @@ async def recv_task(ws: aiohttp.ClientWebSocketResponse) -> None:
                             timestamp=time.time(),
                             is_bargein=is_bargein_result,
                             inference_duration=inference_duration,
+                            overlap_speech_started_at=self._overlap_speech_started_at,
                         )
                     )
                 elif msg_type == "bargein_detected":
                     logger.debug("bargein detected")
                     ev = BargeinEvent(
                         type=BargeinEventType.BARGEIN,
                         timestamp=time.time(),
+                        overlap_speech_started_at=self._overlap_speech_started_at,
                     )
                     self._event_ch.send_nowait(ev)
                     self._bargein_detector.emit("bargein_detected", ev)
diff --git a/livekit-agents/livekit/agents/voice/agent_activity.py b/livekit-agents/livekit/agents/voice/agent_activity.py
@@ -1281,15 +1281,23 @@ def on_vad_inference_done(self, ev: vad.VADEvent) -> None:
             self._interrupt_by_audio_activity()
 
     def on_bargein_detected(self, ev: inference.BargeinEvent) -> None:
-        logger.debug("bargein detected", extra={"timestamp": ev.timestamp})
+        """Trigger audio-activity interruption for a barge-in and end barge-in monitoring."""
+        extra = {
+            "timestamp": ev.timestamp,
+            "overlap_speech_started_at": ev.overlap_speech_started_at,
+        }
+        logger.debug("bargein detected", extra=extra)
         # restore interruption by audio activity
         self._interruption_by_audio_activity_enabled = self._turn_detection not in (
             "manual",
             "realtime_llm",
         )
         self._interrupt_by_audio_activity()
         if self._audio_recognition:
-            self._audio_recognition.end_barge_in_monitoring(ev.timestamp)
+            # None means no overlap start was recorded: fall back to the event timestamp
+            started_at = ev.overlap_speech_started_at
+            ignore_until = ev.timestamp if started_at is None else started_at
+            self._audio_recognition.end_barge_in_monitoring(ignore_until)
 
     def on_bargein_inference_done(self, ev: inference.BargeinEvent) -> None:
         self._interruption_by_audio_activity_enabled = False
diff --git a/livekit-agents/livekit/agents/voice/audio_recognition.py b/livekit-agents/livekit/agents/voice/audio_recognition.py
@@ -247,10 +247,65 @@ def end_barge_in_monitoring(self, ignore_until: float) -> None:
             else min(ignore_until, self._ignore_until)
         )
 
+        # flush held transcripts if possible
+        task = asyncio.create_task(self._flush_held_transcripts())
+        task.add_done_callback(lambda _: self._tasks.discard(task))
+        self._tasks.add(task)
+
+    async def _flush_held_transcripts(self) -> None:
+        """Re-emit the tail of the held STT events and discard the rest.
+
+        The tail begins at the first event whose ``start_time + end_time +
+        self._input_started_at`` is not below ``self._ignore_until``, extended backwards
+        over events without alternatives that directly precede it (or end the buffer).
+        """
+        if not self._barge_in_enabled or not is_given(self._ignore_until):
+            return
+        if not self._transcript_buffer:
+            return
+        if self._input_started_at is None:
+            # no time origin to compare against: handle like missing vendor timestamps
+            self._transcript_buffer.clear()
+            return
+
+        ignore_until = self._ignore_until
+        emit_from: int | None = None
+        for i, ev in enumerate(self._transcript_buffer):
+            if not ev.alternatives:
+                if emit_from is None:
+                    emit_from = i
+                continue
+            alt = ev.alternatives[0]
+            # vendor doesn't set timestamps properly, in which case we just return
+            if alt.start_time == alt.end_time == 0:
+                self._transcript_buffer.clear()
+                return
+            if alt.start_time + alt.end_time + self._input_started_at < ignore_until:
+                # too early: drop it together with the timestamp-less events before it
+                emit_from = None
+                continue
+            if emit_from is None:
+                emit_from = i
+            break
+
+        # snapshot and reset BEFORE emitting: _on_stt_event may call back into this method
+        events_to_emit = [] if emit_from is None else self._transcript_buffer[emit_from:]
+        self._transcript_buffer.clear()
+        self._ignore_until = NOT_GIVEN
+
+        for ev in events_to_emit:
+            # log the threshold that was applied, not the attribute that was just reset
+            logger.debug(
+                "re-emitting held transcript",
+                extra={"event": ev.type, "ignore_until": ignore_until},
+            )
+            await self._on_stt_event(ev)
+
     def _should_hold_stt_event(self, ev: stt.SpeechEvent) -> bool:
         """Test if the event should be held until the ignore_until timestamp."""
         if not self._barge_in_enabled:
             return False
+
         if self._agent_speaking:
             return True
 
@@ -270,13 +325,11 @@ def _should_hold_stt_event(self, ev: stt.SpeechEvent) -> bool:
             # 3. the event is for audio sent before the ignore_until timestamp
             and self._input_started_at is not None
             and not (ev.alternatives[0].start_time == ev.alternatives[0].end_time == 0)
-            and ev.alternatives[0].start_time + ev.alternatives[0].end_time
-            < self._ignore_until - self._input_started_at
+            and ev.alternatives[0].start_time + ev.alternatives[0].end_time + self._input_started_at
+            < self._ignore_until
         ):
             return True
 
-        # ignore_until expired or we don't have the right timestamp
-        self._ignore_until = NOT_GIVEN
         return False
 
     def push_audio(self, frame: rtc.AudioFrame) -> None:
@@ -484,7 +537,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
         # - hold the event until the ignore_until expires
         # - release only relevant events
         # - allow RECOGNITION_USAGE to pass through immediately
-        if ev.type != stt.SpeechEventType.RECOGNITION_USAGE:
+        if ev.type != stt.SpeechEventType.RECOGNITION_USAGE and self._barge_in_enabled:
             if self._should_hold_stt_event(ev):
                 logger.debug(
                     "holding event until ignore_until expires",
@@ -498,23 +551,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 self._transcript_buffer.append(ev)
                 return
             elif self._transcript_buffer:
-                # emit the preceding sentinel event immediately before this event
-                # assuming *only one* sentinel event could precede the current event
-                # ignore if the previous event is not a sentinel event
-                logger.debug(
-                    "emitting held events",
-                    extra={
-                        "event": ev.type,
-                        "previous_event": self._transcript_buffer[-1].type,
-                    },
-                )
-                prev_event = self._transcript_buffer.pop()
-                self._transcript_buffer.clear()
-                if prev_event.type in {
-                    stt.SpeechEventType.START_OF_SPEECH,
-                }:
-                    await self._on_stt_event(prev_event)
-
+                await self._flush_held_transcripts()
                 # no return here to allow the new event to be processed normally
 
         if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT: