Skip to content

Commit

Permalink
create_response and cancel_response APIs for MultimodalAgent (#1359)
Browse files Browse the repository at this point in the history
Co-authored-by: David Zhao <[email protected]>
  • Loading branch information
longcw and davidzhao authored Jan 18, 2025
1 parent 5e3a32c commit 1f6b457
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 2 deletions.
6 changes: 6 additions & 0 deletions .changeset/moody-tools-push.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"livekit-plugins-openai": patch
"livekit-agents": patch
---

Add `generate_reply` API for the multimodal agent
28 changes: 26 additions & 2 deletions livekit-agents/livekit/agents/multimodal/multimodal_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
"user_stopped_speaking",
"agent_started_speaking",
"agent_stopped_speaking",
"input_speech_committed",
"user_speech_committed",
"agent_speech_committed",
"agent_speech_interrupted",
Expand Down Expand Up @@ -97,7 +96,20 @@ def _push_audio(self, frame: rtc.AudioFrame) -> None: ...
def fnc_ctx(self) -> llm.FunctionContext | None: ...
# Setter half of the `fnc_ctx` property on this interface: accepts an
# llm.FunctionContext or None. Stub only; the body is `...`.
@fnc_ctx.setter
def fnc_ctx(self, value: llm.FunctionContext | None) -> None: ...

# Interface stub: returns an llm.ChatContext.
# NOTE(review): presumably a copy of the session's current chat context, per
# the name -- confirm against the concrete implementations.
def chat_ctx_copy(self) -> llm.ChatContext: ...

# Interface stub: cancel the session's response. The agent's interrupt()
# calls this on its session before interrupting the playing handle.
def cancel_response(self) -> None: ...
# Interface stub: request a new response from the session.
# `on_duplicate` is one of "cancel_existing", "cancel_new" or "keep_both"
# and defaults to "keep_both" here; the agent's generate_reply() forwards its
# own `on_duplicate` argument to this method.
# NOTE(review): the exact meaning of each option is defined by the
# implementations, not visible in this stub -- confirm there.
def create_response(
self,
on_duplicate: Literal[
"cancel_existing", "cancel_new", "keep_both"
] = "keep_both",
) -> None: ...
# Interface stub: commit the session's audio buffer. The agent's
# generate_reply() calls this only when `server_vad_enabled` is falsy.
def commit_audio_buffer(self) -> None: ...
# Interface stub: boolean property read as a plain attribute
# (`session.server_vad_enabled`) by the agent's generate_reply().
@property
def server_vad_enabled(self) -> bool: ...

# Interface stub: takes the id of an item as a string and returns nothing.
# NOTE(review): the recovery behaviour itself is implementation-defined and
# not visible here -- confirm against the concrete session class.
def _recover_from_text_response(self, item_id: str) -> None: ...
def _update_conversation_item_content(
self,
Expand Down Expand Up @@ -303,7 +315,6 @@ def _input_speech_committed():
alternatives=[stt.SpeechData(language="", text="")],
)
)
self.emit("input_speech_committed")

@self._session.on("input_speech_transcription_completed")
def _input_speech_transcription_completed(ev: _InputTranscriptionProto):
Expand Down Expand Up @@ -349,6 +360,8 @@ def _metrics_collected(metrics: MultimodalLLMMetrics):
self.emit("metrics_collected", metrics)

def interrupt(self) -> None:
self._session.cancel_response()

if self._playing_handle is not None and not self._playing_handle.done():
self._playing_handle.interrupt()

Expand All @@ -360,6 +373,17 @@ def interrupt(self) -> None:
)
self._update_state("listening")

def generate_reply(
    self,
    on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "cancel_existing",
) -> None:
    """Ask the underlying session to create a new agent response.

    When the session reports that server VAD is not enabled, the session's
    ``commit_audio_buffer()`` is called first. ``on_duplicate`` is then
    forwarded unchanged to the session's ``create_response()``.

    Args:
        on_duplicate: One of ``"cancel_existing"``, ``"cancel_new"`` or
            ``"keep_both"``; passed through to the session as-is.
    """
    session = self._session
    if not session.server_vad_enabled:
        # Without server VAD the buffer is committed explicitly before
        # requesting the response.
        session.commit_audio_buffer()
    session.create_response(on_duplicate=on_duplicate)

def _update_state(self, state: AgentState, delay: float = 0.0):
"""Set the current state of the agent"""

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,23 @@ def chat_ctx_copy(self) -> llm.ChatContext:
async def set_chat_ctx(self, ctx: llm.ChatContext) -> None:
    """Store the result of ``ctx.copy()`` as this session's chat context.

    Args:
        ctx: The chat context to copy; the passed object itself is not
            stored, only what its ``copy()`` method returns.
    """
    snapshot = ctx.copy()
    self._chat_ctx = snapshot

def cancel_response(self) -> None:
    """Handle a request to cancel the current response.

    Cancellation is not implemented for this session type. The agent's
    ``interrupt()`` calls ``cancel_response()`` on its session
    unconditionally, so raising ``NotImplementedError`` here would make
    every interruption fail before playback is stopped. A warning is logged
    instead and the call returns, letting the caller carry on.
    """
    logger.warning("cancel_response is not supported yet")

def create_response(
    self,
    on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
) -> None:
    """Placeholder: creating a response is not implemented for this session.

    Args:
        on_duplicate: Accepted for interface compatibility; not used.

    Raises:
        NotImplementedError: Always.
    """
    reason = "create_response is not supported yet"
    raise NotImplementedError(reason)

def commit_audio_buffer(self) -> None:
    """Placeholder: committing the audio buffer is not implemented here.

    Raises:
        NotImplementedError: Always.
    """
    reason = "commit_audio_buffer is not supported yet"
    raise NotImplementedError(reason)

@property
def server_vad_enabled(self) -> bool:
    """Whether server VAD is enabled; this implementation always reports True.

    Declared as a property so it matches the session interface and the way
    the agent reads it (``session.server_vad_enabled``, no call). As a plain
    method the attribute would evaluate to a bound method, which is truthy
    regardless of what the method would return.
    """
    return True

@utils.log_exceptions(logger=logger)
async def _main_task(self):
@utils.log_exceptions(logger=logger)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1017,6 +1017,25 @@ def _validate_message(msg: llm.ChatMessage) -> bool:
# wait for all the futures to complete
await asyncio.gather(*_futs)

def cancel_response(self) -> None:
    """Cancel the current response if one is active.

    Does nothing when ``_active_response_id`` is falsy; otherwise calls
    ``self.response.cancel()``.
    """
    if not self._active_response_id:
        return
    self.response.cancel()

def create_response(
    self,
    on_duplicate: Literal["cancel_existing", "cancel_new", "keep_both"] = "keep_both",
) -> None:
    """Request a new response by delegating to ``self.response.create``.

    Args:
        on_duplicate: One of ``"cancel_existing"``, ``"cancel_new"`` or
            ``"keep_both"``; forwarded unchanged as a keyword argument.
    """
    responder = self.response
    responder.create(on_duplicate=on_duplicate)

def commit_audio_buffer(self) -> None:
    """Commit the input audio buffer via ``self.input_audio_buffer.commit()``."""
    audio_buffer = self.input_audio_buffer
    audio_buffer.commit()

@property
def server_vad_enabled(self) -> bool:
    """True when the session options have a ``turn_detection`` value set.

    Only ``None`` counts as disabled; any other value, including falsy
    ones, reports True.
    """
    turn_detection = self._opts.turn_detection
    return turn_detection is not None

def _create_empty_user_audio_message(self, duration: float) -> llm.ChatMessage:
"""Create an empty audio message with the given duration."""
samples = int(duration * api_proto.SAMPLE_RATE)
Expand Down

0 comments on commit 1f6b457

Please sign in to comment.