Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion clients/daily/daily_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@


class PyAudioApp:

def __init__(self):
self.__app_quit = False

Expand Down
8 changes: 4 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ services:
build: .
command: >
bash -c "python setup.py develop && \
mkdir -p models/styletts2 && \
aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it looks like this branch is a bit out of date with main, can you run:

git checkout more-tts-archs
git pull --rebase origin main
<resolve any merge conflicts>
git push -f

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aye aye captain

mkdir -p models && \
aws s3 sync s3://uberduck-models-us-west-2/prototype models && \
uvicorn openduck_py.routers.main:app --reload --host 0.0.0.0 --port 8000"
working_dir: /openduck-py/openduck-py
volumes:
Expand All @@ -21,8 +21,8 @@ services:
build: .
command: >
bash -c "python setup.py develop && \
mkdir -p models/styletts2 && \
aws s3 sync s3://uberduck-models-us-west-2/prototype/styletts2 models/styletts2 && \
mkdir -p models && \
aws s3 sync s3://uberduck-models-us-west-2/prototype models && \
apt update && apt install -y python3-pip && \
pip install watchdog[watchmedo] && \
watchmedo auto-restart --directory=./openduck_py --pattern=*.py --recursive -- python openduck_py/routers/voice.py"
Expand Down
31 changes: 23 additions & 8 deletions openduck-py/openduck_py/routers/voice.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from pathlib import Path
from uuid import uuid4
import threading
import traceback

from fastapi import APIRouter, Depends, WebSocket, WebSocketDisconnect
import numpy as np
Expand All @@ -23,14 +24,14 @@
from openduck_py.models.chat_record import EventName
from openduck_py.db import get_db_async, AsyncSession, SessionAsync
from openduck_py.prompts import prompt
from openduck_py.voices.styletts2 import styletts2_inference, STYLETTS2_SAMPLE_RATE
from openduck_py.settings import (
IS_DEV,
WS_SAMPLE_RATE,
OUTPUT_SAMPLE_RATE,
CHUNK_SIZE,
LOG_TO_SLACK,
CHAT_MODEL,
TTS_ARCHITECTURE,
)
from openduck_py.utils.daily import create_room, RoomCreateResponse, CustomEventHandler
from openduck_py.utils.speaker_identification import (
Expand All @@ -39,6 +40,8 @@
)
from openduck_py.logging.slack import log_audio_to_slack

TTS_SAMPLE_RATE = 24000

if IS_DEV:
normalize_text = lambda x: x
else:
Expand All @@ -47,6 +50,19 @@
normalizer = Normalizer(input_case="cased", lang="en")
normalize_text = normalizer.normalize

if TTS_ARCHITECTURE == "styletts2":
from openduck_py.voices.styletts2 import styletts2_inference as tts_inference
elif TTS_ARCHITECTURE == "piper":
from openduck_py.voices.piper import inference as tts_inference
else:
raise ValueError(f"Unsupported TTS architecture: {TTS_ARCHITECTURE}")

tts_name = {
"styletts2": "StyleTTS2",
"piper": "Piper",
"xtts": "XTTS v2",
"xtts_streaming": "XTTS v2 (streaming)",
}[TTS_ARCHITECTURE]

try:
pipeline, inference = load_pipelines()
Expand Down Expand Up @@ -326,12 +342,10 @@ async def speak_response(
)

def _inference(sentence: str):
audio_chunk = styletts2_inference(
text=sentence,
output_sample_rate=OUTPUT_SAMPLE_RATE,
)
audio_chunk = np.int16(audio_chunk * 32767).tobytes()
return audio_chunk
audio_chunk = tts_inference(text=sentence)

audio_chunk_bytes = np.int16(audio_chunk * 32767).tobytes()
return audio_chunk_bytes

audio_chunk_bytes = await asyncio.to_thread(_inference, normalized)
t_styletts = time()
Expand All @@ -357,6 +371,7 @@ def _check_for_exceptions(response_task: Optional[asyncio.Task]) -> bool:
print("response task was cancelled")
except Exception as e:
print("response task raised an exception:", e)
print(traceback.format_exc(e))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

except for the bit where it raises an exception of its own somehow :p

else:
print(
"response task completed successfully. Resetting audio_data and response_task"
Expand All @@ -383,7 +398,7 @@ async def log_event(

sample_rate = WS_SAMPLE_RATE
if event == "generated_tts":
sample_rate = STYLETTS2_SAMPLE_RATE
sample_rate = TTS_SAMPLE_RATE
wavfile.write(abs_path, sample_rate, audio)
print(f"Wrote wavfile to {abs_path}")

Expand Down
1 change: 1 addition & 0 deletions openduck-py/openduck_py/settings/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
CHUNK_SIZE = 10240
LOG_TO_SLACK = bool(os.environ.get("LOG_TO_SLACK", False))
CHAT_MODEL = "azure/gpt-35-turbo-deployment"
TTS_ARCHITECTURE = "styletts2"

# to not break existing env files
os.environ["AZURE_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")
Expand Down
26 changes: 26 additions & 0 deletions openduck-py/openduck_py/voices/piper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import numpy as np
import torch
from torchaudio.functional import resample

from piper.voice import PiperVoice

from .settings import DEVICE

model = PiperVoice.load(
"models/cartoon-boy-upbeat-piper.onnx",
config_path="models/cartoon-boy-upbeat-piper.onnx.json",
use_cuda=(DEVICE == "cuda"),
)


def inference(
    text: str, output_sample_rate: int = 24000, language: str = "english"
) -> np.ndarray:
    """Synthesize speech for *text* with the module-level Piper voice.

    Args:
        text: Text to synthesize.
        output_sample_rate: Target sample rate of the returned audio.
            Defaults to 24000 so generic callers that pass only ``text``
            (e.g. ``tts_inference(text=sentence)`` in voice.py) work.
        language: Unused; kept for signature parity with the other TTS
            backends selected via ``TTS_ARCHITECTURE``.

    Returns:
        1-D float32 ``np.ndarray`` with samples in [-1.0, 1.0].
    """
    raw_chunks = model.synthesize_stream_raw(
        text,
        speaker_id=0,
    )
    pcm_bytes = b"".join(raw_chunks)
    # Piper emits int16 PCM scaled by 32767; divide by the same constant to
    # undo its conversion and recover floats in [-1, 1].
    audio = torch.frombuffer(pcm_bytes, dtype=torch.int16).float() / 32767
    # Resampling is wasted work when the rates already match (the common
    # case: both sides use 24000), so skip it.
    if model.config.sample_rate != output_sample_rate:
        audio = resample(audio, model.config.sample_rate, output_sample_rate)
    # Return an ndarray as the annotation promises; callers do numpy math
    # on the result (np.int16(audio_chunk * 32767).tobytes()).
    return audio.numpy()
1 change: 1 addition & 0 deletions openduck-py/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ numpy
openai
openai-whisper
phonemizer
piper-tts
pyannote.core
pyannote.audio
pylru
Expand Down