-
Notifications
You must be signed in to change notification settings - Fork 1
More TTS architectures #29
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
d8e9701
4853b21
3c9e594
e458a28
5e72f61
345e2fa
cac55fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,6 @@ | |
|
|
||
|
|
||
| class PyAudioApp: | ||
|
|
||
| def __init__(self): | ||
| self.__app_quit = False | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,6 +9,7 @@ | |
| from pathlib import Path | ||
| from uuid import uuid4 | ||
| import threading | ||
| import traceback | ||
|
|
||
| from fastapi import APIRouter, Depends, WebSocket, WebSocketDisconnect | ||
| import numpy as np | ||
|
|
@@ -23,14 +24,14 @@ | |
| from openduck_py.models.chat_record import EventName | ||
| from openduck_py.db import get_db_async, AsyncSession, SessionAsync | ||
| from openduck_py.prompts import prompt | ||
| from openduck_py.voices.styletts2 import styletts2_inference, STYLETTS2_SAMPLE_RATE | ||
| from openduck_py.settings import ( | ||
| IS_DEV, | ||
| WS_SAMPLE_RATE, | ||
| OUTPUT_SAMPLE_RATE, | ||
| CHUNK_SIZE, | ||
| LOG_TO_SLACK, | ||
| CHAT_MODEL, | ||
| TTS_ARCHITECTURE, | ||
| ) | ||
| from openduck_py.utils.daily import create_room, RoomCreateResponse, CustomEventHandler | ||
| from openduck_py.utils.speaker_identification import ( | ||
|
|
@@ -39,6 +40,8 @@ | |
| ) | ||
| from openduck_py.logging.slack import log_audio_to_slack | ||
|
|
||
| TTS_SAMPLE_RATE = 24000 | ||
|
|
||
| if IS_DEV: | ||
| normalize_text = lambda x: x | ||
| else: | ||
|
|
@@ -47,6 +50,19 @@ | |
| normalizer = Normalizer(input_case="cased", lang="en") | ||
| normalize_text = normalizer.normalize | ||
|
|
||
| if TTS_ARCHITECTURE == "styletts2": | ||
| from openduck_py.voices.styletts2 import styletts2_inference as tts_inference | ||
| elif TTS_ARCHITECTURE == "piper": | ||
| from openduck_py.voices.piper import inference as tts_inference | ||
| else: | ||
| raise ValueError(f"Unsupported TTS architecture: {TTS_ARCHITECTURE}") | ||
|
|
||
| tts_name = { | ||
| "styletts2": "StyleTTS2", | ||
| "piper": "Piper", | ||
| "xtts": "XTTS v2", | ||
| "xtts_streaming": "XTTS v2 (streaming)", | ||
| }[TTS_ARCHITECTURE] | ||
|
|
||
| try: | ||
| pipeline, inference = load_pipelines() | ||
|
|
@@ -326,12 +342,10 @@ async def speak_response( | |
| ) | ||
|
|
||
| def _inference(sentence: str): | ||
| audio_chunk = styletts2_inference( | ||
| text=sentence, | ||
| output_sample_rate=OUTPUT_SAMPLE_RATE, | ||
| ) | ||
| audio_chunk = np.int16(audio_chunk * 32767).tobytes() | ||
| return audio_chunk | ||
| audio_chunk = tts_inference(text=sentence) | ||
|
|
||
| audio_chunk_bytes = np.int16(audio_chunk * 32767).tobytes() | ||
| return audio_chunk_bytes | ||
|
|
||
| audio_chunk_bytes = await asyncio.to_thread(_inference, normalized) | ||
| t_styletts = time() | ||
|
|
@@ -357,6 +371,7 @@ def _check_for_exceptions(response_task: Optional[asyncio.Task]) -> bool: | |
| print("response task was cancelled") | ||
| except Exception as e: | ||
| print("response task raised an exception:", e) | ||
| print(traceback.format_exc(e)) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nice
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. except for the bit where it raises an exception of its own somehow :p |
||
| else: | ||
| print( | ||
| "response task completed successfully. Resetting audio_data and response_task" | ||
|
|
@@ -383,7 +398,7 @@ async def log_event( | |
|
|
||
| sample_rate = WS_SAMPLE_RATE | ||
| if event == "generated_tts": | ||
| sample_rate = STYLETTS2_SAMPLE_RATE | ||
| sample_rate = TTS_SAMPLE_RATE | ||
| wavfile.write(abs_path, sample_rate, audio) | ||
| print(f"Wrote wavfile to {abs_path}") | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,26 @@ | ||
| import numpy as np | ||
| import torch | ||
| from torchaudio.functional import resample | ||
|
|
||
| from piper.voice import PiperVoice | ||
|
|
||
| from .settings import DEVICE | ||
|
|
||
| model = PiperVoice.load( | ||
| "models/cartoon-boy-upbeat-piper.onnx", | ||
| config_path="models/cartoon-boy-upbeat-piper.onnx.json", | ||
| use_cuda=(DEVICE == "cuda"), | ||
| ) | ||
|
|
||
|
|
||
| def inference( | ||
| text: str, output_sample_rate: int, language: str = "english" | ||
| ) -> np.ndarray: | ||
| audio = model.synthesize_stream_raw( | ||
| text, | ||
| speaker_id=0, | ||
| ) | ||
| audio = b"".join(audio) | ||
| audio = torch.frombuffer(audio, dtype=torch.int16).float() / 32767 # TODO silly | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should it be / 32768 (2^15)? Not a big difference, though. Also, what's `# TODO silly`?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's silly because I'm undoing the conversion Piper does (which uses 32767, btw) |
||
| audio = resample(audio, model.config.sample_rate, output_sample_rate) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can we skip this step if the input and output sample rates are the same? (Which I think they usually should be, if they're both using 24000.)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. torchaudio already has an if-clause for that: https://pytorch.org/audio/stable/_modules/torchaudio/functional/functional.html#resample |
||
| return audio | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,7 @@ numpy | |
| openai | ||
| openai-whisper | ||
| phonemizer | ||
| piper-tts | ||
| pyannote.core | ||
| pyannote.audio | ||
| pylru | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it looks like this branch is a bit out of date with main, can you run:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
aye aye captain