11
11
from agora_realtime_ai_api .rtc import Channel , ChatMessage , RtcEngine , RtcOptions
12
12
13
13
from .logger import setup_logger
14
- from .realtime .struct import InputAudioBufferCommitted , InputAudioBufferSpeechStarted , InputAudioBufferSpeechStopped , ItemCreated , RateLimitsUpdated , ResponseAudioDelta , ResponseAudioDone , ResponseAudioTranscriptDelta , ResponseAudioTranscriptDone , ResponseContentPartAdded , ResponseContentPartDone , ResponseCreated , ResponseDone , ResponseOutputItemAdded , ResponseOutputItemDone , ServerVADUpdateParams , SessionUpdate , SessionUpdateParams , SessionUpdated , Voices , to_json
14
+ from .realtime .struct import InputAudioBufferCommitted , InputAudioBufferSpeechStarted , InputAudioBufferSpeechStopped , InputAudioTranscription , ItemCreated , ItemInputAudioTranscriptionCompleted , RateLimitsUpdated , ResponseAudioDelta , ResponseAudioDone , ResponseAudioTranscriptDelta , ResponseAudioTranscriptDone , ResponseContentPartAdded , ResponseContentPartDone , ResponseCreated , ResponseDone , ResponseOutputItemAdded , ResponseOutputItemDone , ServerVADUpdateParams , SessionUpdate , SessionUpdateParams , SessionUpdated , Voices , to_json
15
15
from .realtime .connection import RealtimeApiConnection
16
16
from .tools import ClientToolCallResponse , ToolContext
17
17
from .utils import PCMWriter
@@ -102,6 +102,7 @@ async def setup_and_run_agent(
102
102
modalities = ["text" , "audio" ],
103
103
temperature = 0.8 ,
104
104
max_response_output_tokens = "inf" ,
105
+ input_audio_transcription = InputAudioTranscription (model = "whisper-1" )
105
106
)
106
107
)
107
108
)
@@ -190,7 +191,7 @@ def callback(agora_rtc_conn: RTCConnection, conn_info: RTCConnInfo, reason):
190
191
raise
191
192
192
193
async def rtc_to_model (self ) -> None :
193
- if self .subscribe_user is None :
194
+ while self .subscribe_user is None or self . channel . get_audio_frames ( self . subscribe_user ) is None :
194
195
await asyncio .sleep (0.1 )
195
196
196
197
audio_frames = self .channel .get_audio_frames (self .subscribe_user )
@@ -242,7 +243,7 @@ async def _process_model_messages(self) -> None:
242
243
# logger.info("Received audio message")
243
244
self .audio_queue .put_nowait (base64 .b64decode (message .delta ))
244
245
# loop.call_soon_threadsafe(self.audio_queue.put_nowait, base64.b64decode(message.delta))
245
- logger .info (f"TMS:ResponseAudioDelta: response_id:{ message .response_id } ,item_id: { message .item_id } " )
246
+ logger .debug (f"TMS:ResponseAudioDelta: response_id:{ message .response_id } ,item_id: { message .item_id } " )
246
247
case ResponseAudioTranscriptDelta ():
247
248
# logger.info(f"Received text message {message=}")
248
249
asyncio .create_task (self .channel .chat .send_message (
@@ -267,6 +268,13 @@ async def _process_model_messages(self) -> None:
267
268
case InputAudioBufferSpeechStopped ():
268
269
logger .info (f"TMS:InputAudioBufferSpeechStopped: item_id: { message .item_id } " )
269
270
pass
271
+ case ItemInputAudioTranscriptionCompleted ():
272
+ logger .info (f"ItemInputAudioTranscriptionCompleted: { message = } " )
273
+ asyncio .create_task (self .channel .chat .send_message (
274
+ ChatMessage (
275
+ message = to_json (message ), msg_id = message .item_id
276
+ )
277
+ ))
270
278
# InputAudioBufferCommitted
271
279
case InputAudioBufferCommitted ():
272
280
pass
0 commit comments