Revamped speech-to-text transcription example (#1083)
davidzhao authored Nov 14, 2024
1 parent 49c0663 commit 0c9eac9
Showing 2 changed files with 33 additions and 15 deletions.
16 changes: 11 additions & 5 deletions examples/speech-to-text/README.md
@@ -1,18 +1,24 @@
 # Speech-to-text
 
-This example show realtime transcription from audio to text.
+This example shows realtime transcription from voice to text.
 
-It uses Deepgram's STT API, but supports other STT plugins by changing this line:
+It uses OpenAI's Whisper STT API, but supports other STT plugins by changing this line:
 
 ```python
-stt = deepgram.STT()
+stt = openai.STT()
 ```
 
+To render the transcriptions into your client application, refer to the [full documentation](https://docs.livekit.io/agents/voice-agent/transcriptions/).
+
 ## Running the example
 
 ```bash
-export DEEPGRAM_API_KEY=your-api-key
-python3 deepgram_stt.py start
+export LIVEKIT_URL=wss://yourhost.livekit.cloud
+export LIVEKIT_API_KEY=livekit-api-key
+export LIVEKIT_API_SECRET=your-api-secret
+export OPENAI_API_KEY=your-api-key
+
+python3 transcriber.py start
 ```
 
 Then connect to any room. For an example frontend, you can use LiveKit's [Agents Playground](https://agents-playground.livekit.io/).
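The provider swap the README describes is symmetric: moving between STT providers only touches the plugin import, the constructor, and the API key. A minimal sketch of going back to the provider this example used before the commit, assuming the pre-commit setup (the livekit-plugins-deepgram package installed and DEEPGRAM_API_KEY exported, as in the old README):

```python
# Sketch only: revert to the STT provider this example used before the commit.
# Assumes livekit-plugins-deepgram is installed and DEEPGRAM_API_KEY is set.
from livekit.plugins import deepgram

stt = deepgram.STT()  # drop-in replacement for openai.STT()
```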
32 changes: 22 additions & 10 deletions examples/speech-to-text/transcriber.py
@@ -11,11 +11,11 @@
     stt,
     transcription,
 )
-from livekit.plugins import deepgram
+from livekit.plugins import openai, silero
 
 load_dotenv()
 
-logger = logging.getLogger("deepgram-stt-demo")
+logger = logging.getLogger("transcriber")
 logger.setLevel(logging.INFO)
 
 
@@ -24,40 +24,52 @@ async def _forward_transcription(
 ):
     """Forward the transcription to the client and log the transcript in the console"""
     async for ev in stt_stream:
-        stt_forwarder.update(ev)
         if ev.type == stt.SpeechEventType.INTERIM_TRANSCRIPT:
-            print(ev.alternatives[0].text, end="")
+            # you may not want to log interim transcripts, they are not final and may be incorrect
+            pass
         elif ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
-            print("\n")
             print(" -> ", ev.alternatives[0].text)
+        stt_forwarder.update(ev)
 
 
 async def entrypoint(ctx: JobContext):
-    logger.info("starting speech-to-text example")
-    stt = deepgram.STT()
+    logger.info(f"starting transcriber (speech to text) example, room: {ctx.room.name}")
+    # this example uses OpenAI Whisper, but you can use assemblyai, deepgram, google, azure, etc.
+    stt_impl = openai.STT()
+
+    if not stt_impl.capabilities.streaming:
+        # wrap with a stream adapter to use streaming semantics
+        stt_impl = stt.StreamAdapter(
+            stt=stt_impl,
+            vad=silero.VAD.load(
+                min_silence_duration=0.2,
+            ),
+        )
 
     async def transcribe_track(participant: rtc.RemoteParticipant, track: rtc.Track):
         audio_stream = rtc.AudioStream(track)
         stt_forwarder = transcription.STTSegmentsForwarder(
             room=ctx.room, participant=participant, track=track
         )
-        stt_stream = stt.stream()
+
+        stt_stream = stt_impl.stream()
         asyncio.create_task(_forward_transcription(stt_stream, stt_forwarder))
 
         async for ev in audio_stream:
            stt_stream.push_frame(ev.frame)
 
-    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
-
     @ctx.room.on("track_subscribed")
     def on_track_subscribed(
         track: rtc.Track,
         publication: rtc.TrackPublication,
         participant: rtc.RemoteParticipant,
     ):
+        # spin up a task to transcribe each track
         if track.kind == rtc.TrackKind.KIND_AUDIO:
             asyncio.create_task(transcribe_track(participant, track))
+
+    await ctx.connect(auto_subscribe=AutoSubscribe.AUDIO_ONLY)
 
 
 if __name__ == "__main__":
     cli.run_app(WorkerOptions(entrypoint_fnc=entrypoint))
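The most consequential addition in this diff is the StreamAdapter block: OpenAI's Whisper endpoint is a batch API (note the `capabilities.streaming` check), so the adapter uses Silero VAD to find utterance boundaries, buffers each utterance, and transcribes it as a single request behind a streaming-style interface. A condensed sketch of the same pattern, using only names that appear in the diff above (illustrative, not a verified standalone program):

```python
from livekit.agents import stt
from livekit.plugins import openai, silero

def make_streaming_stt() -> stt.STT:
    stt_impl = openai.STT()  # batch (non-streaming) STT, as in the diff
    if not stt_impl.capabilities.streaming:
        # VAD segments the incoming audio; each complete utterance is sent
        # to the batch STT, so callers can still use stt_impl.stream()
        stt_impl = stt.StreamAdapter(
            stt=stt_impl,
            vad=silero.VAD.load(min_silence_duration=0.2),
        )
    return stt_impl
```

A natively streaming plugin would pass the `capabilities.streaming` check and be used directly, which is why the adapter is applied conditionally rather than unconditionally.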
