feat(server): switch to python api
The server can then synthesise with any Coqui model, including XTTS, Fairseq,
Tortoise, Bark, etc.
eginhard committed Jan 10, 2025
1 parent d6c8dae commit 38fda7f
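With the server now built on the Python API, any model that `TTS()` can load should be servable by name. A minimal launch sketch (the model id below is only an assumed example of a released multilingual model; substitute any entry from `tts-server --list_models`):

```bash
# Install the server extras, then launch with a released model by name.
# "tts_models/multilingual/multi-dataset/xtts_v2" is an assumed example id;
# --device is the new flag introduced here (--use_cuda still forces "cuda").
pip install coqui-tts[server]
tts-server --model_name tts_models/multilingual/multi-dataset/xtts_v2 --device cuda
```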
Showing 2 changed files with 43 additions and 72 deletions.
109 changes: 40 additions & 69 deletions TTS/server/server.py
@@ -8,7 +8,6 @@
import logging
import os
import sys
from pathlib import Path
from threading import Lock
from typing import Union
from urllib.parse import parse_qs
@@ -19,10 +18,9 @@
msg = "Server requires requires flask, use `pip install coqui-tts[server]`"
raise ImportError(msg) from e

from TTS.config import load_config
from TTS.api import TTS
from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer

logger = logging.getLogger(__name__)
setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
@@ -60,6 +58,7 @@ def create_argparser() -> argparse.ArgumentParser:
parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
parser.add_argument("--port", type=int, default=5002, help="port to listen on.")
parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="true to use CUDA.")
parser.add_argument(
"--debug", action=argparse.BooleanOptionalAction, default=False, help="true to enable Flask debug mode."
@@ -73,8 +72,7 @@ def create_argparser() -> argparse.ArgumentParser:
# parse the args
args = create_argparser().parse_args()

path = Path(__file__).parent / "../.models.json"
manager = ModelManager(path)
manager = ModelManager(models_file=TTS.get_models_file_path())

# update in-use models to the specified released models.
model_path = None
@@ -86,51 +84,27 @@ def create_argparser() -> argparse.ArgumentParser:
# CASE1: list pre-trained TTS models
if args.list_models:
manager.list_models()
sys.exit()

# CASE2: load pre-trained model paths
if args.model_name is not None and not args.model_path:
model_path, config_path, model_item = manager.download_model(args.model_name)
args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name

if args.vocoder_name is not None and not args.vocoder_path:
vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name)

# CASE3: set custom model paths
if args.model_path is not None:
model_path = args.model_path
config_path = args.config_path
speakers_file_path = args.speakers_file_path

if args.vocoder_path is not None:
vocoder_path = args.vocoder_path
vocoder_config_path = args.vocoder_config_path

# load models
synthesizer = Synthesizer(
tts_checkpoint=model_path,
tts_config_path=config_path,
tts_speakers_file=speakers_file_path,
tts_languages_file=None,
vocoder_checkpoint=vocoder_path,
vocoder_config=vocoder_config_path,
encoder_checkpoint="",
encoder_config="",
use_cuda=args.use_cuda,
)

use_multi_speaker = hasattr(synthesizer.tts_model, "num_speakers") and (
synthesizer.tts_model.num_speakers > 1 or synthesizer.tts_speakers_file is not None
)
speaker_manager = getattr(synthesizer.tts_model, "speaker_manager", None)

use_multi_language = hasattr(synthesizer.tts_model, "num_languages") and (
synthesizer.tts_model.num_languages > 1 or synthesizer.tts_languages_file is not None
)
language_manager = getattr(synthesizer.tts_model, "language_manager", None)
sys.exit(0)

device = args.device
if args.use_cuda:
device = "cuda"

# CASE2: load models
model_name = args.model_name if args.model_path is None else None
api = TTS(
model_name=model_name,
model_path=args.model_path,
config_path=args.config_path,
vocoder_name=args.vocoder_name,
vocoder_path=args.vocoder_path,
vocoder_config_path=args.vocoder_config_path,
speakers_file_path=args.speakers_file_path,
# language_ids_file_path=args.language_ids_file_path,
).to(device)

# TODO: set this from SpeakerManager
use_gst = synthesizer.tts_config.get("use_gst", False)
use_gst = api.synthesizer.tts_config.get("use_gst", False)
app = Flask(__name__)


@@ -158,27 +132,18 @@ def index():
return render_template(
"index.html",
show_details=args.show_details,
use_multi_speaker=use_multi_speaker,
use_multi_language=use_multi_language,
speaker_ids=speaker_manager.name_to_id if speaker_manager is not None else None,
language_ids=language_manager.name_to_id if language_manager is not None else None,
use_multi_speaker=api.is_multi_speaker,
use_multi_language=api.is_multi_lingual,
speaker_ids=api.speakers,
language_ids=api.languages,
use_gst=use_gst,
)


@app.route("/details")
def details():
if args.config_path is not None and os.path.isfile(args.config_path):
model_config = load_config(args.config_path)
elif args.model_name is not None:
model_config = load_config(config_path)

if args.vocoder_config_path is not None and os.path.isfile(args.vocoder_config_path):
vocoder_config = load_config(args.vocoder_config_path)
elif args.vocoder_name is not None:
vocoder_config = load_config(vocoder_config_path)
else:
vocoder_config = None
model_config = api.synthesizer.tts_config
vocoder_config = api.synthesizer.vocoder_config or None

return render_template(
"details.html",
@@ -196,17 +161,23 @@ def details():
def tts():
with lock:
text = request.headers.get("text") or request.values.get("text", "")
speaker_idx = request.headers.get("speaker-id") or request.values.get("speaker_id", "")
language_idx = request.headers.get("language-id") or request.values.get("language_id", "")
speaker_idx = (
request.headers.get("speaker-id") or request.values.get("speaker_id", "") if api.is_multi_speaker else None
)
language_idx = (
request.headers.get("language-id") or request.values.get("language_id", "")
if api.is_multi_lingual
else None
)
style_wav = request.headers.get("style-wav") or request.values.get("style_wav", "")
style_wav = style_wav_uri_to_dict(style_wav)

logger.info("Model input: %s", text)
logger.info("Speaker idx: %s", speaker_idx)
logger.info("Language idx: %s", language_idx)
wavs = synthesizer.tts(text, speaker_name=speaker_idx, language_name=language_idx, style_wav=style_wav)
wavs = api.tts(text, speaker=speaker_idx, language=language_idx, style_wav=style_wav)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
api.synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")
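
As a usage sketch: the speaker and language can be passed either as request headers or as query/form values, and are only honoured when the loaded model is multi-speaker or multilingual. Assuming the handler keeps its existing `/api/tts` route and a model that defines the placeholder ids below, a request could look like:

```bash
# Hypothetical request; speaker_id and language_id are placeholders and must
# match ids exposed by the loaded model (they are listed in the web UI).
curl -G "http://localhost:5002/api/tts" \
  --data-urlencode "text=Hello from the demo server" \
  --data-urlencode "speaker_id=p225" \
  --data-urlencode "language_id=en" \
  -o out.wav
```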


@@ -248,9 +219,9 @@ def mary_tts_api_process():
else:
text = request.args.get("INPUT_TEXT", "")
logger.info("Model input: %s", text)
wavs = synthesizer.tts(text)
wavs = api.tts(text)
out = io.BytesIO()
synthesizer.save_wav(wavs, out)
api.synthesizer.save_wav(wavs, out)
return send_file(out, mimetype="audio/wav")


6 changes: 3 additions & 3 deletions docs/source/server.md
@@ -4,8 +4,7 @@

You can boot up a demo 🐸TTS server to run an inference with your models (make
sure to install the additional dependencies with `pip install coqui-tts[server]`).
Note that the server is not optimized for performance and does not support all
Coqui models yet.
Note that the server is not optimized for performance.

The demo server provides pretty much the same interface as the CLI command.

@@ -15,7 +14,8 @@ tts-server --list_models # list the available models.
```

Run a TTS model, from the release models list, with its default vocoder.
If the model you choose is a multi-speaker TTS model, you can select different speakers on the Web interface and synthesize
If the model you choose is a multi-speaker or multilingual TTS model, you can
select different speakers and languages on the Web interface and synthesize
speech.

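A sketch of such a run, using the generic model-name pattern rather than a specific model (any id from `tts-server --list_models` can be substituted):

```bash
# Placeholder pattern; replace with a real released model id,
# e.g. one printed by `tts-server --list_models`.
tts-server --model_name "tts_models/<language>/<dataset>/<model_name>"
```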
