Skip to content

Commit

Permalink
fix: update default model to chirp2 in google stt & update generate_reply method in gemini realtime (#1401)
Browse files Browse the repository at this point in the history
  • Loading branch information
jayeshp19 authored Jan 23, 2025
1 parent 7b147bc commit 695f7b5
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 38 deletions.
5 changes: 5 additions & 0 deletions .changeset/late-forks-sell.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"livekit-plugins-google": patch
---

fix: update default model to chirp2 in google stt & update generate_reply method in gemini realtime
22 changes: 9 additions & 13 deletions examples/multimodal-agent/gemini_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,30 +50,26 @@ async def get_weather(
await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
participant = await ctx.wait_for_participant()

# chat_ctx is used to serve as initial context, Agent will start the conversation first if chat_ctx is provided
# create a chat context with chat history, these will be synchronized with the server
# upon calling `agent.generate_reply()`
chat_ctx = llm.ChatContext()
chat_ctx.append(text="What is LiveKit?", role="user")
chat_ctx.append(
text="LiveKit is the platform for building realtime AI. The main use cases are to build AI voice agents. LiveKit also powers livestreaming apps, robotics, and video conferencing.",
role="assistant",
)
chat_ctx.append(text="What is the LiveKit Agents framework?", role="user")
# chat_ctx.append(text="I'm planning a trip to Paris next month.", role="user")
# chat_ctx.append(
# text="How exciting! Paris is a beautiful city. I'd be happy to suggest some must-visit places and help you plan your trip.",
# role="assistant",
# )

agent = multimodal.MultimodalAgent(
model=google.beta.realtime.RealtimeModel(
voice="Puck",
temperature=0.8,
instructions="""
You are a helpful assistant
Here are some helpful information about LiveKit and its products and services:
- LiveKit is the platform for building realtime AI. The main use cases are to build AI voice agents. LiveKit also powers livestreaming apps, robotics, and video conferencing.
- LiveKit provides an Agents framework for building server-side AI agents, client SDKs for building frontends, and LiveKit Cloud is a global network that transports voice, video, and data traffic in realtime.
""",
instructions="You are a helpful assistant, greet the user and help them with their trip planning",
),
fnc_ctx=fnc_ctx,
chat_ctx=chat_ctx,
)
agent.start(ctx.room, participant)
agent.generate_reply()


if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -307,8 +307,6 @@ def __init__(
self._init_sync_task = asyncio.create_task(asyncio.sleep(0))
self._send_ch = utils.aio.Chan[ClientEvents]()
self._active_response_id = None
if chat_ctx:
self.generate_reply(chat_ctx)

async def aclose(self) -> None:
if self._send_ch.closed:
Expand Down Expand Up @@ -336,25 +334,6 @@ def _push_audio(self, frame: rtc.AudioFrame) -> None:
def _queue_msg(self, msg: ClientEvents) -> None:
self._send_ch.send_nowait(msg)

def generate_reply(
    self,
    ctx: llm.ChatContext | llm.ChatMessage,
    turn_complete: bool = True,
) -> None:
    """Send *ctx* to the realtime session as client content.

    Accepts either a single string-content ChatMessage (wrapped into a
    fresh one-entry ChatContext) or a full ChatContext; anything else
    raises ValueError. The resulting turns are queued as a
    LiveClientContent message.
    """
    if isinstance(ctx, llm.ChatMessage) and isinstance(ctx.content, str):
        # Promote the lone message to a single-entry chat context.
        reply_ctx = llm.ChatContext()
        reply_ctx.append(text=ctx.content, role=ctx.role)
    elif isinstance(ctx, llm.ChatContext):
        reply_ctx = ctx
    else:
        raise ValueError("Invalid chat context")

    turns, _ = _build_gemini_ctx(reply_ctx, id(self))
    self._queue_msg(
        LiveClientContent(
            turn_complete=turn_complete,
            turns=turns,
        )
    )

def chat_ctx_copy(self) -> llm.ChatContext:
    """Return an independent copy of the session's current chat context."""
    current = self._chat_ctx
    return current.copy()

Expand All @@ -370,7 +349,16 @@ def create_response(
"cancel_existing", "cancel_new", "keep_both"
] = "keep_both",
) -> None:
raise NotImplementedError("create_response is not supported yet")
turns, _ = _build_gemini_ctx(self._chat_ctx, id(self))
ctx = [self._opts.instructions] + turns if self._opts.instructions else turns

if not ctx:
logger.warning(
"gemini-realtime-session: No chat context to send, sending dummy content."
)
ctx = [Content(parts=[Part(text=".")])]

self._queue_msg(LiveClientContent(turns=ctx, turn_complete=True))

def commit_audio_buffer(self) -> None:
    """Placeholder: committing the audio buffer is not implemented yet."""
    raise NotImplementedError("commit_audio_buffer is not supported yet")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def __init__(
detect_language: bool = True,
interim_results: bool = True,
punctuate: bool = True,
spoken_punctuation: bool = True,
model: SpeechModels = "long",
location: str = "global",
spoken_punctuation: bool = False,
model: SpeechModels = "chirp_2",
location: str = "us-central1",
sample_rate: int = 16000,
credentials_info: dict | None = None,
credentials_file: str | None = None,
Expand Down

0 comments on commit 695f7b5

Please sign in to comment.