52 changes: 4 additions & 48 deletions examples/foundational/34-audio-recording.py
@@ -50,25 +50,14 @@
from dotenv import load_dotenv
from loguru import logger

from pipecat.audio.turn.smart_turn.base_smart_turn import SmartTurnParams
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
from pipecat.audio.vad.silero import SileroVADAnalyzer
from pipecat.audio.vad.vad_analyzer import VADParams
from pipecat.frames.frames import LLMRunFrame
from pipecat.pipeline.pipeline import Pipeline
from pipecat.pipeline.runner import PipelineRunner
from pipecat.pipeline.task import PipelineParams, PipelineTask
from pipecat.processors.aggregators.llm_context import LLMContext
from pipecat.processors.aggregators.llm_response_universal import LLMContextAggregatorPair
from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor
from pipecat.runner.types import RunnerArguments
from pipecat.runner.utils import create_transport
from pipecat.services.cartesia.tts import CartesiaTTSService
from pipecat.services.deepgram.stt import DeepgramSTTService
from pipecat.services.openai.llm import OpenAILLMService
from pipecat.transports.base_transport import BaseTransport, TransportParams
from pipecat.transports.daily.transport import DailyParams
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams

load_dotenv(override=True)

@@ -94,59 +83,24 @@ async def save_audio_file(audio: bytes, filename: str, sample_rate: int, num_cha
"daily": lambda: DailyParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"twilio": lambda: FastAPIWebsocketParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
"webrtc": lambda: TransportParams(
audio_in_enabled=True,
audio_out_enabled=True,
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
turn_analyzer=LocalSmartTurnAnalyzerV3(params=SmartTurnParams()),
),
}


async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
logger.info(f"Starting bot")

stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"), audio_passthrough=True)

tts = CartesiaTTSService(
api_key=os.getenv("CARTESIA_API_KEY"),
voice_id="71a7ad14-091c-4e8e-a314-022ece01c121",
)

llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"), model="gpt-4")

# Create audio buffer processor
audiobuffer = AudioBufferProcessor()

messages = [
{
"role": "system",
"content": "You are a helpful assistant demonstrating audio recording capabilities. Keep your responses brief and clear.",
},
]

context = LLMContext(messages)
context_aggregator = LLMContextAggregatorPair(context)
audiobuffer = AudioBufferProcessor(sample_rate=48000)

pipeline = Pipeline(
[
transport.input(),
stt,
context_aggregator.user(),
llm,
tts,
transport.output(),
audiobuffer, # Add audio buffer to pipeline
context_aggregator.assistant(),
]
)

@@ -155,6 +109,8 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
params=PipelineParams(
enable_metrics=True,
enable_usage_metrics=True,
audio_in_sample_rate=48000,
audio_out_sample_rate=48000,
),
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
)
@@ -165,7 +121,7 @@ async def on_client_connected(transport, client):
# Start recording audio
await audiobuffer.start_recording()
# Start conversation - empty prompt to let LLM follow system instructions
await task.queue_frames([LLMRunFrame()])
# await task.queue_frames([LLMRunFrame()])

@transport.event_handler("on_client_disconnected")
async def on_client_disconnected(transport, client):
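Note: the hunk header above truncates the example's save_audio_file signature; the buffered audio reaches it through the processor's event handler. A minimal sketch of how the 48 kHz recording is typically persisted, assuming the on_audio_data event signature pipecat's AudioBufferProcessor uses in the upstream examples (the helper function below is illustrative, not this PR's code):

import datetime
import io
import wave

import aiofiles

from pipecat.processors.audio.audio_buffer_processor import AudioBufferProcessor


async def save_audio_file(audio: bytes, filename: str, sample_rate: int, num_channels: int):
    """Wrap raw 16-bit PCM in a WAV container and write it to disk."""
    if not audio:
        return
    with io.BytesIO() as buffer:
        with wave.open(buffer, "wb") as wf:
            wf.setsampwidth(2)  # 16-bit samples
            wf.setnchannels(num_channels)
            wf.setframerate(sample_rate)  # 48000 with the settings in this diff
            wf.writeframes(audio)
        async with aiofiles.open(filename, "wb") as f:
            await f.write(buffer.getvalue())


def register_recording_handler(audiobuffer: AudioBufferProcessor):
    # Fires when recording stops (or the internal buffer flushes), with the
    # same sample_rate the processor was created with.
    @audiobuffer.event_handler("on_audio_data")
    async def on_audio_data(buffer, audio: bytes, sample_rate: int, num_channels: int):
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        await save_audio_file(audio, f"recording_{timestamp}.wav", sample_rate, num_channels)

Keeping AudioBufferProcessor(sample_rate=48000) in sync with the pipeline's audio_in_sample_rate/audio_out_sample_rate avoids an extra resample inside the buffer processor, which appears to be the intent of the matching 48000 values above.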
25 changes: 5 additions & 20 deletions src/pipecat/transports/smallwebrtc/transport.py
@@ -235,7 +235,7 @@ def __init__(self, webrtc_connection: SmallWebRTCConnection, callbacks: SmallWeb

# Always resample incoming audio to a single fixed rate (48000 here) so that
# downstream processors receive a consistent sample rate.
self._pipecat_resampler = AudioResampler("s16", "mono", 16000)
self._pipecat_resampler = AudioResampler("s16", "mono", 48000)

@self._webrtc_connection.event_handler("connected")
async def on_connected(connection: SmallWebRTCConnection):
@@ -366,31 +366,16 @@ async def read_audio_frame(self):
await asyncio.sleep(0.01)
continue

if frame.sample_rate > self._in_sample_rate:
resampled_frames = self._pipecat_resampler.resample(frame)
for resampled_frame in resampled_frames:
# 16-bit PCM bytes
pcm_array = resampled_frame.to_ndarray().astype(np.int16)
pcm_bytes = pcm_array.tobytes()
del pcm_array # free NumPy array immediately

audio_frame = InputAudioRawFrame(
audio=pcm_bytes,
sample_rate=resampled_frame.sample_rate,
num_channels=self._audio_in_channels,
)
del pcm_bytes # reference kept in audio_frame

yield audio_frame
else:
resampled_frames = self._pipecat_resampler.resample(frame)
for resampled_frame in resampled_frames:
# 16-bit PCM bytes
pcm_array = frame.to_ndarray().astype(np.int16)
pcm_array = resampled_frame.to_ndarray().astype(np.int16)
pcm_bytes = pcm_array.tobytes()
del pcm_array # free NumPy array immediately

audio_frame = InputAudioRawFrame(
audio=pcm_bytes,
sample_rate=frame.sample_rate,
sample_rate=resampled_frame.sample_rate,
num_channels=self._audio_in_channels,
)
del pcm_bytes # reference kept in audio_frame
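The rewritten read loop drops the sample-rate branch: every incoming frame now goes through the shared resampler, and the PCM conversion runs once on the resampled output. A minimal sketch of that unified path using PyAV's AudioResampler, as the transport does (the function name and standalone structure are illustrative):

import numpy as np
from av import AudioResampler
from av.audio.frame import AudioFrame

# One resampler reused across frames; "s16"/"mono" match the 16-bit
# single-channel PCM that InputAudioRawFrame carries downstream.
resampler = AudioResampler("s16", "mono", 48000)


def frame_to_pcm_chunks(frame: AudioFrame):
    """Yield (pcm_bytes, sample_rate) chunks for one incoming AV frame.

    resample() buffers internally, so it returns a list of zero or more
    output frames rather than exactly one per input frame.
    """
    for resampled in resampler.resample(frame):
        pcm = resampled.to_ndarray().astype(np.int16)
        yield pcm.tobytes(), resampled.sample_rate

Routing everything through one resampler trades a little redundant work on audio that already arrives at 48 kHz for a single code path and a guaranteed output rate.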