diff --git a/examples/foundational/34-audio-recording.py b/examples/foundational/34-audio-recording.py
index cd5250dfc4..001ff4671a 100644
--- a/examples/foundational/34-audio-recording.py
+++ b/examples/foundational/34-audio-recording.py
@@ -50,25 +50,14 @@
 from dotenv import load_dotenv
 from loguru import logger
 
-from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
-from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
-from pipecat.audio.vad.silero import SileroVADAnalyzer
-from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
-from pipecat.processors.aggregators.llm_context import LLMContext
-from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
 from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import create_transport
-from pipecat.services.cartesia.tts import CartesiaTTSService
-from pipecat.services.deepgram.stt import DeepgramSTTService
-from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.transports.base_transport import BaseTransport, TransportParams
 from pipecat.transports.daily.transport import DailyParams
-from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
 
 load_dotenv(override=True)
 
@@ -94,20 +83,10 @@ async def save_audio_file(audio: bytes, filename: str, sample_rate: int, num_cha
     "daily": lambda: DailyParams(
         audio_in_enabled=True,
         audio_out_enabled=True,
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
-        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
-    ),
-    "twilio": lambda: FastAPIWebsocketParams(
-        audio_in_enabled=True,
-        audio_out_enabled=True,
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
-        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
     ),
     "webrtc": lambda: TransportParams(
         audio_in_enabled=True,
         audio_out_enabled=True,
-        vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
-        turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
     ),
 }
 
@@ -115,38 +94,13 @@ async def save_audio_file(audio: bytes, filename: str, sample_rate: int, num_cha
 async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     logger.info(f"Starting bot")
 
-    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"), audio_passthrough=True)
-
-    tts = CartesiaTTSService(
-        api_key=os.getenv("CARTESIA_API_KEY"),
-        voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",
-    )
-
-    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4")
-
     # Create audio buffer processor
-    audiobuffer = AudioBufferProcessor()
-
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a helpful assistant demonstrating audio recording capabilities. Keep your responses brief and clear.",
-        },
-    ]
-
-    context = LLMContext(messages)
-    context_aggregator = LLMContextAggregatorPair(context)
+    audiobuffer = AudioBufferProcessor(sample_rate=48000)
 
     pipeline = Pipeline(
         [
             transport.input(),
-            stt,
-            context_aggregator.user(),
-            llm,
-            tts,
-            transport.output(),
             audiobuffer,  # Add audio buffer to pipeline
-            context_aggregator.assistant(),
         ]
     )
 
@@ -155,6 +109,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
         params=PipelineParams(
             enable_metrics=True,
             enable_usage_metrics=True,
+            audio_in_sample_rate=48000,
+            audio_out_sample_rate=48000,
         ),
         idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
     )
@@ -165,7 +121,7 @@ async def on_client_connected(transport, client):
         # Start recording audio
         await audiobuffer.start_recording()
         # Start conversation - empty prompt to let LLM follow system instructions
-        await task.queue_frames([LLMRunFrame()])
+        # await task.queue_frames([LLMRunFrame()])
 
     @transport.event_handler("on_client_disconnected")
     async def on_client_disconnected(transport, client):
diff --git a/src/pipecat/transports/smallwebrtc/transport.py b/src/pipecat/transports/smallwebrtc/transport.py
index 0e2ea544e4..7e3ba250b7 100644
--- a/src/pipecat/transports/smallwebrtc/transport.py
+++ b/src/pipecat/transports/smallwebrtc/transport.py
@@ -235,7 +235,7 @@ def __init__(self, webrtc_connection: SmallWebRTCConnection, callbacks: SmallWeb
 
-        # We are always resampling it for 16000 if the sample_rate that we receive is bigger than that.
-        # otherwise we face issues with Silero VAD
-        self._pipecat_resampler = AudioResampler("s16", "mono", 16000)
+        # Always resample incoming audio to 48000 so every frame reaches the
+        # pipeline at a consistent sample rate.
+        self._pipecat_resampler = AudioResampler("s16", "mono", 48000)
 
         @self._webrtc_connection.event_handler("connected")
         async def on_connected(connection: SmallWebRTCConnection):
@@ -366,31 +366,16 @@ async def read_audio_frame(self):
                 await asyncio.sleep(0.01)
                 continue
 
-            if frame.sample_rate > self._in_sample_rate:
-                resampled_frames = self._pipecat_resampler.resample(frame)
-                for resampled_frame in resampled_frames:
-                    # 16-bit PCM bytes
-                    pcm_array = resampled_frame.to_ndarray().astype(np.int16)
-                    pcm_bytes = pcm_array.tobytes()
-                    del pcm_array  # free NumPy array immediately
-
-                    audio_frame = InputAudioRawFrame(
-                        audio=pcm_bytes,
-                        sample_rate=resampled_frame.sample_rate,
-                        num_channels=self._audio_in_channels,
-                    )
-                    del pcm_bytes  # reference kept in audio_frame
-
-                    yield audio_frame
-            else:
+            resampled_frames = self._pipecat_resampler.resample(frame)
+            for resampled_frame in resampled_frames:
                 # 16-bit PCM bytes
-                pcm_array = frame.to_ndarray().astype(np.int16)
+                pcm_array = resampled_frame.to_ndarray().astype(np.int16)
                 pcm_bytes = pcm_array.tobytes()
                 del pcm_array  # free NumPy array immediately
 
                 audio_frame = InputAudioRawFrame(
                     audio=pcm_bytes,
-                    sample_rate=frame.sample_rate,
+                    sample_rate=resampled_frame.sample_rate,
                 num_channels=self._audio_in_channels,
                 )
                 del pcm_bytes  # reference kept in audio_frame
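Note: as context for the example changes above, here is a minimal sketch of how the trimmed-down recording example is expected to persist audio, built on the AudioBufferProcessor and the save_audio_file helper that already exist in examples/foundational/34-audio-recording.py. The on_audio_data handler below is assumed wiring for illustration and is not part of this diff.

    import datetime

    from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor

    # Buffer composite (user + bot) audio; 48000 matches the sample rate
    # configured in PipelineParams in the diff above.
    audiobuffer = AudioBufferProcessor(sample_rate=48000)

    @audiobuffer.event_handler("on_audio_data")
    async def on_audio_data(buffer, audio: bytes, sample_rate: int, num_channels: int):
        # save_audio_file() is the example's existing helper (its signature is
        # visible in the hunk headers above); it writes the raw PCM to a WAV file.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        await save_audio_file(audio, f"recording_{timestamp}.wav", sample_rate, num_channels)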