From 695f7b567d0b7fffa6a18f14c74e50421434a0aa Mon Sep 17 00:00:00 2001
From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com>
Date: Thu, 23 Jan 2025 12:58:49 +0530
Subject: [PATCH] fix: update default model to chirp2 in google stt & update
 generate_reply method in gemini realtime (#1401)

---
 .changeset/late-forks-sell.md             |  5 +++
 examples/multimodal-agent/gemini_agent.py | 22 ++++++-------
 .../google/beta/realtime/realtime_api.py  | 32 ++++++-------------
 .../livekit/plugins/google/stt.py         |  6 ++--
 4 files changed, 27 insertions(+), 38 deletions(-)
 create mode 100644 .changeset/late-forks-sell.md

diff --git a/.changeset/late-forks-sell.md b/.changeset/late-forks-sell.md
new file mode 100644
index 000000000..192e4e9c2
--- /dev/null
+++ b/.changeset/late-forks-sell.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-google": patch
+---
+
+fix: update default model to chirp2 in google stt & update generate_reply method in gemini realtime
diff --git a/examples/multimodal-agent/gemini_agent.py b/examples/multimodal-agent/gemini_agent.py
index 0b7a191d6..12517e8c2 100644
--- a/examples/multimodal-agent/gemini_agent.py
+++ b/examples/multimodal-agent/gemini_agent.py
@@ -50,30 +50,26 @@ async def get_weather(
     await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
     participant = await ctx.wait_for_participant()
 
-    # chat_ctx is used to serve as initial context, Agent will start the conversation first if chat_ctx is provided
+    # create a chat context with chat history, these will be synchronized with the server
+    # upon calling `agent.generate_reply()`
     chat_ctx = llm.ChatContext()
-    chat_ctx.append(text="What is LiveKit?", role="user")
-    chat_ctx.append(
-        text="LiveKit is the platform for building realtime AI. The main use cases are to build AI voice agents. LiveKit also powers livestreaming apps, robotics, and video conferencing.",
-        role="assistant",
-    )
-    chat_ctx.append(text="What is the LiveKit Agents framework?", role="user")
+    # chat_ctx.append(text="I'm planning a trip to Paris next month.", role="user")
+    # chat_ctx.append(
+    #     text="How exciting! Paris is a beautiful city. I'd be happy to suggest some must-visit places and help you plan your trip.",
+    #     role="assistant",
+    # )
 
     agent = multimodal.MultimodalAgent(
         model=google.beta.realtime.RealtimeModel(
             voice="Puck",
             temperature=0.8,
-            instructions="""
-            You are a helpful assistant
-            Here are some helpful information about LiveKit and its products and services:
-            - LiveKit is the platform for building realtime AI. The main use cases are to build AI voice agents. LiveKit also powers livestreaming apps, robotics, and video conferencing.
-            - LiveKit provides an Agents framework for building server-side AI agents, client SDKs for building frontends, and LiveKit Cloud is a global network that transports voice, video, and data traffic in realtime.
-            """,
+            instructions="You are a helpful assistant, greet the user and help them with their trip planning",
         ),
         fnc_ctx=fnc_ctx,
         chat_ctx=chat_ctx,
     )
     agent.start(ctx.room, participant)
+    agent.generate_reply()
 
 
 if __name__ == "__main__":
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py
index 5506d842d..d7cee5e27 100644
--- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py
+++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py
@@ -307,8 +307,6 @@ def __init__(
         self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
         self._send_ch = utils.aio.Chan[ClientEvents]()
         self._active_response_id = None
-        if chat_ctx:
-            self.generate_reply(chat_ctx)
 
     async def aclose(self) -> None:
         if self._send_ch.closed:
@@ -336,25 +334,6 @@ def _push_audio(self, frame: rtc.AudioFrame) -> None:
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
 
-    def generate_reply(
-        self,
-        ctx: llm.ChatContext | llm.ChatMessage,
-        turn_complete: bool = True,
-    ) -> None:
-        if isinstance(ctx, llm.ChatMessage) and isinstance(ctx.content, str):
-            new_chat_ctx = llm.ChatContext()
-            new_chat_ctx.append(text=ctx.content, role=ctx.role)
-        elif isinstance(ctx, llm.ChatContext):
-            new_chat_ctx = ctx
-        else:
-            raise ValueError("Invalid chat context")
-        turns, _ = _build_gemini_ctx(new_chat_ctx, id(self))
-        client_content = LiveClientContent(
-            turn_complete=turn_complete,
-            turns=turns,
-        )
-        self._queue_msg(client_content)
-
     def chat_ctx_copy(self) -> llm.ChatContext:
         return self._chat_ctx.copy()
 
@@ -370,7 +349,16 @@ def create_response(
             "cancel_existing", "cancel_new", "keep_both"
         ] = "keep_both",
     ) -> None:
-        raise NotImplementedError("create_response is not supported yet")
+        turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
+        ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
+
+        if not ctx:
+            logger.warning(
+                "gemini-realtime-session: No chat context to send, sending dummy content."
+            )
+            ctx = [Content(parts=[Part(text=".")])]
+
+        self._queue_msg(LiveClientContent(turns=ctx, turn_complete=True))
 
     def commit_audio_buffer(self) -> None:
         raise NotImplementedError("commit_audio_buffer is not supported yet")
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
index a39bd240d..752062efa 100644
--- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
+++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
@@ -89,9 +89,9 @@ def __init__(
         detect_language: bool = True,
         interim_results: bool = True,
         punctuate: bool = True,
-        spoken_punctuation: bool = True,
-        model: SpeechModels = "long",
-        location: str = "global",
+        spoken_punctuation: bool = False,
+        model: SpeechModels = "chirp_2",
+        location: str = "us-central1",
         sample_rate: int = 16000,
         credentials_info: dict | None = None,
         credentials_file: str | None = None,