uberduck-ai · hecko-yes · Mar 1, 2024 · Mar 6, 2024 · Mar 14, 2024 · Mar 14, 2024
diff --git a/clients/daily/daily_bot.py b/clients/daily/daily_bot.py
@@ -15,7 +15,6 @@
 
 
 class PyAudioApp:
-
     def __init__(self):
         self.__app_quit = False
 

diff --git a/docker-compose.yml b/docker-compose.yml
@@ -5,8 +5,8 @@ services:
     build: .
     command: >
       bash -c "python setup.py develop &&  \
-               mkdir -p models/styletts2  && \
-               aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \ 
+               mkdir -p models  && \
+               aws s3 sync s3://uberduck-models-us-west-2/prototype models && \ 
                uvicorn openduck_py.routers.main:app --reload --host 0.0.0.0 --port 8000"
     working_dir: /openduck-py/openduck-py
     volumes:
@@ -21,8 +21,8 @@ services:
     build: .
     command: >
       bash -c "python setup.py develop &&  \ 
-               mkdir -p models/styletts2 && \ 
-               aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \ 
+               mkdir -p models && \ 
+               aws s3 sync s3://uberduck-models-us-west-2/prototype models && \ 
                apt update && apt install -y python3-pip && \ 
                pip install watchdog[watchmedo] && \ 
                watchmedo auto-restart --directory=./openduck_py --pattern=*.py --recursive -- python openduck_py/routers/voice.py"

diff --git a/openduck-py/openduck_py/routers/voice.py b/openduck-py/openduck_py/routers/voice.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 from uuid import uuid4
 import threading
+import traceback
 
 from fastapi import APIRouter, Depends, WebSocket, WebSocketDisconnect
 import numpy as np
@@ -23,14 +24,14 @@
 from openduck_py.models.chat_record import EventName
 from openduck_py.db import get_db_async, AsyncSession, SessionAsync
 from openduck_py.prompts import prompt
-from openduck_py.voices.styletts2 import styletts2_inference, STYLETTS2_SAMPLE_RATE
 from openduck_py.settings import (
     IS_DEV,
     WS_SAMPLE_RATE,
     OUTPUT_SAMPLE_RATE,
     CHUNK_SIZE,
     LOG_TO_SLACK,
     CHAT_MODEL,
+    TTS_ARCHITECTURE,
 )
 from openduck_py.utils.daily import create_room, RoomCreateResponse, CustomEventHandler
 from openduck_py.utils.speaker_identification import (
@@ -39,6 +40,8 @@
 )
 from openduck_py.logging.slack import log_audio_to_slack
 
+TTS_SAMPLE_RATE = 24000
+
 if IS_DEV:
     normalize_text = lambda x: x
 else:
@@ -47,6 +50,19 @@
     normalizer = Normalizer(input_case="cased", lang="en")
     normalize_text = normalizer.normalize
 
+# Pick the TTS backend at import time; whichever backend is chosen is exposed
+# under the single name `tts_inference`. Any value other than "styletts2" or
+# "piper" fails fast with a ValueError.
+if TTS_ARCHITECTURE == "styletts2":
+    from openduck_py.voices.styletts2 import styletts2_inference as tts_inference
+elif TTS_ARCHITECTURE == "piper":
+    from openduck_py.voices.piper import inference as tts_inference
+else:
+    raise ValueError(f"Unsupported TTS architecture: {TTS_ARCHITECTURE}")
+
+# Human-readable name of the selected backend.
+# NOTE(review): the "xtts" / "xtts_streaming" entries cannot be reached yet,
+# because the dispatch above raises for those values before this lookup runs.
+tts_name = {
+    "styletts2": "StyleTTS2",
+    "piper": "Piper",
+    "xtts": "XTTS v2",
+    "xtts_streaming": "XTTS v2 (streaming)",
+}[TTS_ARCHITECTURE]
 
 try:
     pipeline, inference = load_pipelines()
@@ -326,12 +342,10 @@ async def speak_response(
         )
 
         def _inference(sentence: str):
-            audio_chunk = styletts2_inference(
-                text=sentence,
-                output_sample_rate=OUTPUT_SAMPLE_RATE,
-            )
-            audio_chunk = np.int16(audio_chunk * 32767).tobytes()
-            return audio_chunk
+            audio_chunk = tts_inference(text=sentence)
+
+            audio_chunk_bytes = np.int16(audio_chunk * 32767).tobytes()
+            return audio_chunk_bytes
 
         audio_chunk_bytes = await asyncio.to_thread(_inference, normalized)
         t_styletts = time()
@@ -357,6 +371,7 @@ def _check_for_exceptions(response_task: Optional[asyncio.Task]) -> bool:
             print("response task was cancelled")
         except Exception as e:
             print("response task raised an exception:", e)
+            print(traceback.format_exc())  # reads the active exception; its first arg is `limit`, not an exception
         else:
             print(
                 "response task completed successfully. Resetting audio_data and response_task"
@@ -383,7 +398,7 @@ async def log_event(
 
         sample_rate = WS_SAMPLE_RATE
         if event == "generated_tts":
-            sample_rate = STYLETTS2_SAMPLE_RATE
+            sample_rate = TTS_SAMPLE_RATE
         wavfile.write(abs_path, sample_rate, audio)
         print(f"Wrote wavfile to {abs_path}")
 

diff --git a/openduck-py/openduck_py/settings/__init__.py b/openduck-py/openduck_py/settings/__init__.py
@@ -10,6 +10,7 @@
 CHUNK_SIZE = 10240
 LOG_TO_SLACK = bool(os.environ.get("LOG_TO_SLACK", False))
 CHAT_MODEL = "azure/gpt-35-turbo-deployment"
+TTS_ARCHITECTURE = "styletts2"
 
 # to not break existing env files
 os.environ["AZURE_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")

diff --git a/openduck-py/openduck_py/voices/piper.py b/openduck-py/openduck_py/voices/piper.py
@@ -0,0 +1,26 @@
+import numpy as np
+import torch
+from torchaudio.functional import resample
+
+from piper.voice import PiperVoice
+
+from .settings import DEVICE
+
+# Load the Piper voice once at import time; inference() below reuses this
+# module-level instance on every call. The model and config paths are relative,
+# so they resolve against the process's working directory. CUDA is requested
+# only when DEVICE is "cuda".
+model = PiperVoice.load(
+    "models/cartoon-boy-upbeat-piper.onnx",
+    config_path="models/cartoon-boy-upbeat-piper.onnx.json",
+    use_cuda=(DEVICE == "cuda"),
+)
+
+
+def inference(
+    text: str, output_sample_rate: int = 24000, language: str = "english"
+) -> np.ndarray:
+    """Synthesize `text` with the module-level Piper voice.
+
+    Args:
+        text: Text to synthesize.
+        output_sample_rate: Sample rate, in Hz, that the audio is resampled to.
+            Defaults to 24000 so callers that pass only `text` keep working.
+        language: Currently unused by this backend.
+
+    Returns:
+        A 1-D float32 NumPy array of samples (int16 values divided by 32767),
+        resampled from the model's native rate to `output_sample_rate`. The
+        array is empty when the model produces no audio.
+    """
+    # The model yields raw byte chunks; gather them into one buffer, which is
+    # interpreted below as 16-bit integer samples.
+    pcm_bytes = b"".join(model.synthesize_stream_raw(text, speaker_id=0))
+    if not pcm_bytes:
+        # torch.frombuffer rejects a zero-length buffer, so return no samples.
+        return np.zeros(0, dtype=np.float32)
+    audio = torch.frombuffer(pcm_bytes, dtype=torch.int16).float() / 32767  # TODO silly
+    audio = resample(audio, model.config.sample_rate, output_sample_rate)
+    # Hand back a NumPy array, as the return annotation promises.
+    return audio.numpy()
diff --git a/openduck-py/requirements.txt b/openduck-py/requirements.txt
@@ -17,6 +17,7 @@ numpy
 openai
 openai-whisper
 phonemizer
+piper-tts
 pyannote.core
 pyannote.audio
 pylru
Original file line number	Diff line number	Diff line change
Expand Up		@@ -15,7 +15,6 @@


		class PyAudioApp:

		def __init__(self):
		self.__app_quit = False

Expand Down