Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion livekit-agents/livekit/agents/voice/agent_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ def __init__(self, agent: Agent, sess: AgentSession) -> None:
self._paused_speech: SpeechHandle | None = None
self._false_interruption_timer: asyncio.TimerHandle | None = None
self._interrupt_paused_speech_task: asyncio.Task[None] | None = None
self._stt_eos_received: bool = False

# fired when a speech_task finishes or when a new speech_handle is scheduled
# this is used to wake up the main task when the scheduling state changes
Expand Down Expand Up @@ -1217,6 +1218,7 @@ def on_start_of_speech(self, ev: vad.VADEvent | None) -> None:
speech_start_time = speech_start_time - ev.speech_duration
self._session._update_user_state("speaking", last_speaking_time=speech_start_time)
self._user_silence_event.clear()
self._stt_eos_received = False

if self._false_interruption_timer:
# cancel the timer when user starts speaking but leave the paused state unchanged
Expand All @@ -1227,6 +1229,9 @@ def on_end_of_speech(self, ev: vad.VADEvent | None) -> None:
speech_end_time = time.time()
if ev:
speech_end_time = speech_end_time - ev.silence_duration
else:
self._stt_eos_received = True

self._session._update_user_state(
"listening",
last_speaking_time=speech_end_time,
Expand All @@ -1245,7 +1250,16 @@ def on_vad_inference_done(self, ev: vad.VADEvent) -> None:
# ignore vad inference done event if turn_detection is manual or realtime_llm
return

if ev.speech_duration >= self._session.options.min_interruption_duration:
active_speech = ev.speech_duration >= self._session.options.min_interruption_duration
if active_speech and (
self._turn_detection != "stt"
or not self._stt_eos_received
or ev.raw_accumulated_silence == 0
):
# STT may send EOS before VAD EOS, we only interrupt if:
# 1. turn detection is not STT; or
# 2. STT EOS hasn't been received yet; or
# 3. VAD speech is still ongoing
self._interrupt_by_audio_activity()

if (
Expand Down
18 changes: 15 additions & 3 deletions livekit-agents/livekit/agents/voice/audio_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -357,7 +357,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:

self._hooks.on_final_transcript(
ev,
speaking=self._speaking if self._vad else None,
speaking=self._speaking
if self._vad or self._turn_detection_mode == "stt"
else None,
)
extra: dict[str, Any] = {"user_transcript": transcript, "language": self._last_language}
if self._last_speaking_time:
Expand Down Expand Up @@ -401,7 +403,12 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
self._run_eou_detection(chat_ctx)

elif ev.type == stt.SpeechEventType.PREFLIGHT_TRANSCRIPT:
self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None)
self._hooks.on_interim_transcript(
ev,
speaking=self._speaking
if self._vad or self._turn_detection_mode == "stt"
else None,
)
transcript = ev.alternatives[0].text
language = ev.alternatives[0].language
confidence = ev.alternatives[0].confidence
Expand Down Expand Up @@ -440,7 +447,12 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
)

elif ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
self._hooks.on_interim_transcript(ev, speaking=self._speaking if self._vad else None)
self._hooks.on_interim_transcript(
ev,
speaking=self._speaking
if self._vad or self._turn_detection_mode == "stt"
else None,
)
self._audio_interim_transcript = ev.alternatives[0].text

elif ev.type == stt.SpeechEventType.END_OF_SPEECH and self._turn_detection_mode == "stt":
Expand Down