From 1f6b4570c552ffdf5051e410da328d8e6df5a757 Mon Sep 17 00:00:00 2001 From: Long Chen Date: Sat, 18 Jan 2025 15:56:01 +0800 Subject: [PATCH] create_response and cancel_response APIs for MultimodalAgent (#1359) Co-authored-by: David Zhao --- .changeset/moody-tools-push.md | 6 ++++ .../agents/multimodal/multimodal_agent.py | 28 +++++++++++++++++-- .../google/beta/realtime/realtime_api.py | 17 +++++++++++ .../plugins/openai/realtime/realtime_model.py | 19 +++++++++++++ 4 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 .changeset/moody-tools-push.md diff --git a/.changeset/moody-tools-push.md b/.changeset/moody-tools-push.md new file mode 100644 index 0000000000..e2987ac40d --- /dev/null +++ b/.changeset/moody-tools-push.md @@ -0,0 +1,6 @@ +--- +"livekit-plugins-openai": patch +"livekit-agents": patch +--- + +add generate_reply api for multimodal agent diff --git a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py index 5599b2e931..f308e2f689 100644 --- a/livekit-agents/livekit/agents/multimodal/multimodal_agent.py +++ b/livekit-agents/livekit/agents/multimodal/multimodal_agent.py @@ -29,7 +29,6 @@ "user_stopped_speaking", "agent_started_speaking", "agent_stopped_speaking", - "input_speech_committed", "user_speech_committed", "agent_speech_committed", "agent_speech_interrupted", @@ -97,7 +96,20 @@ def _push_audio(self, frame: rtc.AudioFrame) -> None: ... def fnc_ctx(self) -> llm.FunctionContext | None: ... @fnc_ctx.setter def fnc_ctx(self, value: llm.FunctionContext | None) -> None: ... + def chat_ctx_copy(self) -> llm.ChatContext: ... + + def cancel_response(self) -> None: ... + def create_response( + self, + on_duplicate: Literal[ + "cancel_existing", "cancel_new", "keep_both" + ] = "keep_both", + ) -> None: ... + def commit_audio_buffer(self) -> None: ... + @property + def server_vad_enabled(self) -> bool: ... + def _recover_from_text_response(self, item_id: str) -> None: ... def _update_conversation_item_content( self, @@ -303,7 +315,6 @@ def _input_speech_committed(): alternatives=[stt.SpeechData(language="", text="")], ) ) - self.emit("input_speech_committed") @self._session.on("input_speech_transcription_completed") def _input_speech_transcription_completed(ev: _InputTranscriptionProto): @@ -349,6 +360,8 @@ def _metrics_collected(metrics: MultimodalLLMMetrics): self.emit("metrics_collected", metrics) def interrupt(self) -> None: + self._session.cancel_response() + if self._playing_handle is not None and not self._playing_handle.done(): self._playing_handle.interrupt() @@ -360,6 +373,17 @@ def interrupt(self) -> None: ) self._update_state("listening") + def generate_reply( + self, + on_duplicate: Literal[ + "cancel_existing", "cancel_new", "keep_both" + ] = "cancel_existing", + ) -> None: + """Generate a reply from the agent""" + if not self._session.server_vad_enabled: + self._session.commit_audio_buffer() + self._session.create_response(on_duplicate=on_duplicate) + def _update_state(self, state: AgentState, delay: float = 0.0): """Set the current state of the agent""" diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py index 722ecdbb7f..07b600166c 100644 --- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py +++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py @@ -281,6 +281,23 @@ def chat_ctx_copy(self) -> llm.ChatContext: async def set_chat_ctx(self, ctx: llm.ChatContext) -> None: self._chat_ctx = ctx.copy() + def cancel_response(self) -> None: + raise NotImplementedError("cancel_response is not supported yet") + + def create_response( + self, + on_duplicate: Literal[ + "cancel_existing", "cancel_new", "keep_both" + ] = "keep_both", + ) -> None: + raise NotImplementedError("create_response is not supported yet") + + def commit_audio_buffer(self) -> None: + raise NotImplementedError("commit_audio_buffer is not supported yet") + + def server_vad_enabled(self) -> bool: + return True + @utils.log_exceptions(logger=logger) async def _main_task(self): @utils.log_exceptions(logger=logger) diff --git a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py index 8b6b717f78..1a607775e5 100644 --- a/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py +++ b/livekit-plugins/livekit-plugins-openai/livekit/plugins/openai/realtime/realtime_model.py @@ -1017,6 +1017,25 @@ def _validate_message(msg: llm.ChatMessage) -> bool: # wait for all the futures to complete await asyncio.gather(*_futs) + def cancel_response(self) -> None: + if self._active_response_id: + self.response.cancel() + + def create_response( + self, + on_duplicate: Literal[ + "cancel_existing", "cancel_new", "keep_both" + ] = "keep_both", + ) -> None: + self.response.create(on_duplicate=on_duplicate) + + def commit_audio_buffer(self) -> None: + self.input_audio_buffer.commit() + + @property + def server_vad_enabled(self) -> bool: + return self._opts.turn_detection is not None + def _create_empty_user_audio_message(self, duration: float) -> llm.ChatMessage: """Create an empty audio message with the given duration.""" samples = int(duration * api_proto.SAMPLE_RATE)