Revamped speech-to-text transcription example (#1083)
davidzhao authored Nov 14, 2024
1 parent 49c0663 commit 0c9eac9
Showing 2 changed files with 33 additions and 15 deletions.
16 changes: 11 additions & 5 deletions examples/speech-to-text/README.md
@@ -1,18 +1,24 @@
 # Speech-to-text
 
-This example show realtime transcription from audio to text.
+This example shows realtime transcription from voice to text.
 
-It uses Deepgram's STT API, but supports other STT plugins by changing this line:
+It uses OpenAI's Whisper STT API, but supports other STT plugins by changing this line:
 
 ```python
-stt = deepgram.STT()
+stt = openai.STT()
 ```
 
+To render the transcriptions into your client application, refer to the [full documentation](https://docs.livekit.io/agents/voice-agent/transcriptions/).
+
 ## Running the example
 
 ```bash
-export DEEPGRAM_API_KEY=your-api-key
-python3 deepgram_stt.py start
+export LIVEKIT_URL=wss://yourhost.livekit.cloud
+export LIVEKIT_API_KEY=livekit-api-key
+export LIVEKIT_API_SECRET=your-api-secret
+export OPENAI_API_KEY=your-api-key
+
+python3 transcriber.py start
 ```
 
 Then connect to any room. For an example frontend, you can use LiveKit's [Agents Playground](https://agents-playground.livekit.io/).
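The provider swap the README describes is symmetric: moving between STT providers only touches the plugin import, the constructor, and the API key. A minimal sketch of going back to the provider this example used before the commit, assuming the pre-commit setup (the livekit-plugins-deepgram package installed and DEEPGRAM_API_KEY exported, as in the old README):

```python
# Sketch only: revert to the STT provider this example used before the commit.
# Assumes livekit-plugins-deepgram is installed and DEEPGRAM_API_KEY is set.
from livekit.plugins import deepgram

stt = deepgram.STT()  # drop-in replacement for openai.STT()
```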
32 changes: 22 additions & 10 deletions examples/speech-to-text/transcriber.py
@@ -11,11 +11,11 @@
     stt,
     transcription,
 )
-from livekit.plugins import deepgram
+from livekit.plugins import openai, silero
 
 load_dotenv()
 
-logger = logging.getLogger("deepgram-stt-demo")
+logger = logging.getLogger("transcriber")
 logger.setLevel(logging.INFO)
 
 
@@ -24,40 +24,52 @@ async def _forward_transcription(
 ):
     """Forward the transcription to the client and log the transcript in the console"""
     async for ev in stt_stream:
-        stt_forwarder.update(ev)
         if ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
-            print(ev.alternatives[0].text, end="")
+            # you may not want to log interim transcripts, they are not final and may be incorrect
+            pass
         elif ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
-            print("\n")
             print(" -> ", ev.alternatives[0].text)
+        stt_forwarder.update(ev)
 
 
 async def entrypoint(ctx: JobContext):
-    logger.info("starting speech-to-text example")
-    stt = deepgram.STT()
+    logger.info(f"starting transcriber (speech to text) example, room: {ctx.room.name}")
+    # this example uses OpenAI Whisper, but you can use assemblyai, deepgram, google, azure, etc.
+    stt_impl = openai.STT()
+
+    if not stt_impl.capabilities.streaming:
+        # wrap with a stream adapter to use streaming semantics
+        stt_impl = stt.StreamAdapter(
+            stt=stt_impl,
+            vad=silero.VAD.load(
+                min_silence_duration=0.2,
+            ),
+        )
 
     async def transcribe_track(participant: rtc.RemoteParticipant, track: rtc.Track):
         audio_stream = rtc.AudioStream(track)
         stt_forwarder = transcription.STTSegmentsForwarder(
             room=ctx.room, participant=participant, track=track
         )
-        stt_stream = stt.stream()
+
+        stt_stream = stt_impl.stream()
         asyncio.create_task(_forward_transcription(stt_stream, stt_forwarder))
 
         async for ev in audio_stream:
            stt_stream.push_frame(ev.frame)
 
-    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
-
     @ctx.room.on("track_subscribed")
     def on_track_subscribed(
         track: rtc.Track,
         publication: rtc.TrackPublication,
         participant: rtc.RemoteParticipant,
     ):
+        # spin up a task to transcribe each track
         if track.kind == rtc.TrackKind.KIND_AUDIO:
             asyncio.create_task(transcribe_track(participant, track))
+
+    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
 
 
 if __name__ == "__main__":
     cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
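The most consequential addition in this diff is the StreamAdapter block: OpenAI's Whisper endpoint is a batch API (note the `capabilities.streaming` check), so the adapter uses Silero VAD to find utterance boundaries, buffers each utterance, and transcribes it as a single request behind a streaming-style interface. A condensed sketch of the same pattern, using only names that appear in the diff above (illustrative, not a verified standalone program):

```python
from livekit.agents import stt
from livekit.plugins import openai, silero

def make_streaming_stt() -> stt.STT:
    stt_impl = openai.STT()  # batch (non-streaming) STT, as in the diff
    if not stt_impl.capabilities.streaming:
        # VAD segments the incoming audio; each complete utterance is sent
        # to the batch STT, so callers can still use stt_impl.stream()
        stt_impl = stt.StreamAdapter(
            stt=stt_impl,
            vad=silero.VAD.load(min_silence_duration=0.2),
        )
    return stt_impl
```

A natively streaming plugin would pass the `capabilities.streaming` check and be used directly, which is why the adapter is applied conditionally rather than unconditionally.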
