From 695f7b567d0b7fffa6a18f14c74e50421434a0aa Mon Sep 17 00:00:00 2001
From: Jayesh Parmar <60539217+jayeshp19@users.noreply.github.com>
Date: Thu, 23 Jan 2025 12:58:49 +0530
Subject: [PATCH] fix: update default model to chirp2 in google stt & update
 generate_reply method in gemini realtime (#1401)

---
 .changeset/late-forks-sell.md             |  5 +++
 examples/multimodal-agent/gemini_agent.py | 22 ++++++-------
 .../google/beta/realtime/realtime_api.py  | 32 ++++++-------------
 .../livekit/plugins/google/stt.py         |  6 ++--
 4 files changed, 27 insertions(+), 38 deletions(-)
 create mode 100644 .changeset/late-forks-sell.md

diff --git a/.changeset/late-forks-sell.md b/.changeset/late-forks-sell.md
new file mode 100644
index 000000000..192e4e9c2
--- /dev/null
+++ b/.changeset/late-forks-sell.md
@@ -0,0 +1,5 @@
+---
+"livekit-plugins-google": patch
+---
+
+fix: update default model to chirp2 in google stt & update generate_reply method in gemini realtime
diff --git a/examples/multimodal-agent/gemini_agent.py b/examples/multimodal-agent/gemini_agent.py
index 0b7a191d6..12517e8c2 100644
--- a/examples/multimodal-agent/gemini_agent.py
+++ b/examples/multimodal-agent/gemini_agent.py
@@ -50,30 +50,26 @@ async def get_weather(
     await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
     participant = await ctx.wait_for_participant()
 
-    # chat_ctx is used to serve as initial context, Agent will start the conversation first if chat_ctx is provided
+    # create a chat context with chat history, these will be synchronized with the server
+    # upon calling `agent.generate_reply()`
     chat_ctx = llm.ChatContext()
-    chat_ctx.append(text="What is LiveKit?", role="user")
-    chat_ctx.append(
-        text="LiveKit is the platform for building realtime AI. The main use cases are to build AI voice agents. LiveKit also powers livestreaming apps, robotics, and video conferencing.",
-        role="assistant",
-    )
-    chat_ctx.append(text="What is the LiveKit Agents framework?", role="user")
+    # chat_ctx.append(text="I'm planning a trip to Paris next month.", role="user")
+    # chat_ctx.append(
+    #     text="How exciting! Paris is a beautiful city. I'd be happy to suggest some must-visit places and help you plan your trip.",
+    #     role="assistant",
+    # )
 
     agent = multimodal.MultimodalAgent(
         model=google.beta.realtime.RealtimeModel(
             voice="Puck",
             temperature=0.8,
-            instructions="""
-            You are a helpful assistant
-            Here are some helpful information about LiveKit and its products and services:
-            - LiveKit is the platform for building realtime AI. The main use cases are to build AI voice agents. LiveKit also powers livestreaming apps, robotics, and video conferencing.
-            - LiveKit provides an Agents framework for building server-side AI agents, client SDKs for building frontends, and LiveKit Cloud is a global network that transports voice, video, and data traffic in realtime.
-            """,
+            instructions="You are a helpful assistant, greet the user and help them with their trip planning",
         ),
         fnc_ctx=fnc_ctx,
         chat_ctx=chat_ctx,
     )
     agent.start(ctx.room, participant)
+    agent.generate_reply()
 
 
 if __name__ == "__main__":
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py
index 5506d842d..d7cee5e27 100644
--- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py
+++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/beta/realtime/realtime_api.py
@@ -307,8 +307,6 @@ def __init__(
         self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
         self._send_ch = utils.aio.Chan[ClientEvents]()
         self._active_response_id = None
-        if chat_ctx:
-            self.generate_reply(chat_ctx)
 
     async def aclose(self) -> None:
         if self._send_ch.closed:
@@ -336,25 +334,6 @@ def _push_audio(self, frame: rtc.AudioFrame) -> None:
     def _queue_msg(self, msg: ClientEvents) -> None:
         self._send_ch.send_nowait(msg)
 
-    def generate_reply(
-        self,
-        ctx: llm.ChatContext | llm.ChatMessage,
-        turn_complete: bool = True,
-    ) -> None:
-        if isinstance(ctx, llm.ChatMessage) and isinstance(ctx.content, str):
-            new_chat_ctx = llm.ChatContext()
-            new_chat_ctx.append(text=ctx.content, role=ctx.role)
-        elif isinstance(ctx, llm.ChatContext):
-            new_chat_ctx = ctx
-        else:
-            raise ValueError("Invalid chat context")
-        turns, _ = _build_gemini_ctx(new_chat_ctx, id(self))
-        client_content = LiveClientContent(
-            turn_complete=turn_complete,
-            turns=turns,
-        )
-        self._queue_msg(client_content)
-
     def chat_ctx_copy(self) -> llm.ChatContext:
         return self._chat_ctx.copy()
 
@@ -370,7 +349,16 @@ def create_response(
             "cancel_existing", "cancel_new", "keep_both"
         ] = "keep_both",
     ) -> None:
-        raise NotImplementedError("create_response is not supported yet")
+        turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
+        ctx = [self._opts.instructions] + turns if self._opts.instructions else turns
+
+        if not ctx:
+            logger.warning(
+                "gemini-realtime-session: No chat context to send, sending dummy content."
+            )
+            ctx = [Content(parts=[Part(text=".")])]
+
+        self._queue_msg(LiveClientContent(turns=ctx, turn_complete=True))
 
     def commit_audio_buffer(self) -> None:
         raise NotImplementedError("commit_audio_buffer is not supported yet")
diff --git a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
index a39bd240d..752062efa 100644
--- a/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
+++ b/livekit-plugins/livekit-plugins-google/livekit/plugins/google/stt.py
@@ -89,9 +89,9 @@ def __init__(
         detect_language: bool = True,
         interim_results: bool = True,
         punctuate: bool = True,
-        spoken_punctuation: bool = True,
-        model: SpeechModels = "long",
-        location: str = "global",
+        spoken_punctuation: bool = False,
+        model: SpeechModels = "chirp_2",
+        location: str = "us-central1",
         sample_rate: int = 16000,
         credentials_info: dict | None = None,
         credentials_file: str | None = None,