From b960d027ed0ce113d9229f3b263f5a7e44cd1f55 Mon Sep 17 00:00:00 2001 From: geramyloveless Date: Mon, 15 Jun 2026 13:39:19 -0700 Subject: [PATCH 1/2] Add Chatterbox text-to-speech backend Integrate Resemble AI's Chatterbox TTS as a new backend supporting CUDA, ROCm, Metal, and CPU, defaulting to GPU acceleration with CPU fallback. Exposes the existing OpenAI-compatible /v1/audio/speech endpoint with byte-level PCM streaming. Registers English, Multilingual, and Turbo variants, with variant-aware selective downloads to avoid pulling the full multi-gigabyte repo. Bundles are built by the lemonade-sdk/chatterbox-rocm distribution repo. --- CMakeLists.txt | 1 + docs/guide/configuration/chatterbox.md | 117 +++++ mkdocs.yml | 1 + src/app/src/renderer/BackendManager.tsx | 1 + src/app/src/renderer/utils/recipeNames.ts | 1 + .../lemon/backends/chatterbox_server.h | 54 ++ src/cpp/include/lemon/model_manager.h | 5 + src/cpp/include/lemon/model_types.h | 3 + src/cpp/resources/backend_versions.json | 6 + src/cpp/resources/server_models.json | 28 + src/cpp/server/backends/backend_utils.cpp | 2 + src/cpp/server/backends/chatterbox_server.cpp | 237 +++++++++ src/cpp/server/model_manager.cpp | 58 +++ src/cpp/server/recipe_options.cpp | 8 + src/cpp/server/router.cpp | 4 + src/cpp/server/runtime_config.cpp | 4 +- src/cpp/server/system_info.cpp | 17 + tools/chatterbox-server/main.py | 480 ++++++++++++++++++ 18 files changed, 1025 insertions(+), 2 deletions(-) create mode 100644 docs/guide/configuration/chatterbox.md create mode 100644 src/cpp/include/lemon/backends/chatterbox_server.h create mode 100644 src/cpp/server/backends/chatterbox_server.cpp create mode 100644 tools/chatterbox-server/main.py diff --git a/CMakeLists.txt b/CMakeLists.txt index f8121dace..e08642d64 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -614,6 +614,7 @@ set(SOURCES_CORE src/cpp/server/backends/whisper_server.cpp src/cpp/server/backends/moonshine_server.cpp src/cpp/server/backends/kokoro_server.cpp + 
src/cpp/server/backends/chatterbox_server.cpp src/cpp/server/backends/sd_server.cpp src/cpp/server/backends/vllm_server.cpp src/cpp/server/backends/backend_utils.cpp diff --git a/docs/guide/configuration/chatterbox.md b/docs/guide/configuration/chatterbox.md new file mode 100644 index 000000000..1684f2fe5 --- /dev/null +++ b/docs/guide/configuration/chatterbox.md @@ -0,0 +1,117 @@ +# Chatterbox Backend Options + +Lemonade integrates [Chatterbox](https://github.com/resemble-ai/chatterbox) (Resemble AI) as a **text-to-speech** backend, exposed through the OpenAI-compatible `/v1/audio/speech` endpoint (the same endpoint the Kokoro backend uses). Unlike Kokoro (CPU/Metal only), Chatterbox is a PyTorch model and supports **GPU acceleration** across vendors: + +1. **GPU by default, CPU fallback.** Chatterbox auto-selects the best available device — CUDA (NVIDIA), ROCm (AMD, Linux), or Metal/MPS (Apple Silicon) — and falls back to CPU when no GPU is present. PyTorch's ROCm build drives AMD GPUs through the CUDA API, so a single bundle per device covers all GPU architectures. +2. **Expressive, multilingual, voice cloning.** Chatterbox supports emotion/exaggeration control, 23+ languages (multilingual variant), and zero-shot voice cloning from a reference clip. +3. **Byte-level streaming.** When the loaded Chatterbox build provides `generate_stream`, audio is emitted incrementally as raw PCM16 @ 24 kHz (`stream_format: "audio"`) for low time-to-first-audio; otherwise it falls back transparently to a single full-utterance response. 
+ +## Available Backends + +Chatterbox auto-selects in this preference order (first available wins): + +| Backend | Device | Platforms | +|---------|--------|-----------| +| `metal` | Apple Silicon GPU (MPS) | macOS arm64 | +| `cuda` | NVIDIA GPU | Windows x64, Linux x64 | +| `rocm` | AMD GPU (HIP via PyTorch ROCm) | Linux x64 | +| `cpu` | CPU | Windows x64, Linux x64, macOS arm64 | + +Each is a self-contained PyInstaller bundle from [lemonade-sdk/chatterbox-rocm](https://github.com/lemonade-sdk/chatterbox-rocm) with an embedded Python runtime, the device-appropriate PyTorch wheel, and the `chatterbox-tts` library. No system Python is required (or touched) on the host; Lemonade additionally sets `PYTHONNOUSERSITE=1` at launch. + +## Install + +The correct device bundle is installed automatically the first time a Chatterbox model is loaded. To install explicitly: + +```bash +lemonade backends install chatterbox:cuda # or rocm / metal / cpu +``` + +Or via HTTP: +```bash +curl -X POST http://localhost:13305/api/v1/install \ + -H 'Content-Type: application/json' \ + -d '{"recipe": "chatterbox", "backend": "cuda"}' +``` + +Bundle versions are pinned in [`backend_versions.json`](https://github.com/lemonade-sdk/lemonade/blob/main/src/cpp/resources/backend_versions.json) (`chatterbox.{cuda,rocm,metal,cpu}`), with tags following the upstream library version (`chatterbox0.1.7` = `chatterbox-tts` 0.1.7). Bundles are built automatically by [lemonade-sdk/chatterbox-rocm](https://github.com/lemonade-sdk/chatterbox-rocm), a distribution-only repo that tracks `chatterbox-tts` PyPI releases — no Chatterbox code is forked; the `main.py` wrapper in `tools/chatterbox-server/` here is frozen together with the PyPI wheel into a self-contained bundle. 
+ +## Models + +Three variants are registered in [`server_models.json`](https://github.com/lemonade-sdk/lemonade/blob/main/src/cpp/resources/server_models.json), downloading from Hugging Face into the standard HF cache: + +| Model | Variant | Checkpoint | +|-------|---------|-----------| +| `Chatterbox` | English (`ChatterboxTTS`) | `ResembleAI/chatterbox` | +| `Chatterbox-Multilingual` | 23+ languages (`ChatterboxMultilingualTTS`) | `ResembleAI/chatterbox` | +| `Chatterbox-Turbo` | Fast English w/ paralinguistic tags (`ChatterboxTurboTTS`) | `ResembleAI/chatterbox-turbo` | + +```bash +lemonade pull Chatterbox +``` + +To register your own Chatterbox checkpoint (loaded via `from_local`): + +```bash +lemonade pull user.MyChatterbox \ + --checkpoint main ResembleAI/chatterbox \ + --recipe chatterbox +``` + +## Use + +### Speech synthesis (OpenAI-compatible) + +```bash +curl http://localhost:13305/v1/audio/speech \ + -H 'Content-Type: application/json' \ + -d '{"model": "Chatterbox", "input": "Hello from Lemonade.", "response_format": "mp3"}' \ + --output speech.mp3 +``` + +`response_format` accepts `mp3` (default), `wav`, `pcm`, `flac`, and `opus`. + +### Streaming + +Request raw PCM streaming (24 kHz, signed 16-bit, little-endian): + +```bash +curl http://localhost:13305/v1/audio/speech \ + -H 'Content-Type: application/json' \ + -d '{"model": "Chatterbox", "input": "Streaming audio.", "stream_format": "audio"}' \ + --output speech.pcm +``` + +### Voice cloning and expressive controls + +The OpenAI `voice` field is treated as a reference-audio path for zero-shot voice cloning when it points at an existing file (or pass `audio_prompt_path` explicitly). 
Chatterbox-specific controls `exaggeration`, `cfg_weight`, and `temperature` are passed through, and `language_id` selects the language for the multilingual variant: + +```bash +curl http://localhost:13305/v1/audio/speech \ + -H 'Content-Type: application/json' \ + -d '{"model": "Chatterbox-Multilingual", "input": "Bonjour.", "language_id": "fr", "exaggeration": 0.6}' \ + --output bonjour.mp3 +``` + +## Tuning + +Force a specific device (overriding auto-selection) via config or per-load: + +```bash +lemonade config set chatterbox.backend=cpu +``` + +Free-form CLI args can be appended to `chatterbox-server` via `chatterbox_args`: + +```bash +lemonade config set chatterbox_args="..." +``` + +(`--ckpt-dir`, `--variant`, `--device`, `--host`, and `--port` are managed by Lemonade and rejected as custom args.) + +## Known gotchas + +- **ROCm is Linux-only.** PyTorch publishes ROCm wheels for Linux only, so the `rocm` bundle is offered on Linux x64. On Windows, AMD GPUs fall back to the `cpu` bundle. +- **Large bundles.** Chatterbox ships a full PyTorch runtime; the GPU bundles are multi-gigabyte downloads. The first load also downloads the model weights (~3 GB) from Hugging Face. +- **GPU memory.** Chatterbox participates in the GPU LRU like other GPU models; on tight-VRAM systems it may evict (or be evicted by) an LLM. Use `--max-loaded-models` and per-model eviction settings to tune coexistence. +- **Streaming support is version-dependent.** Byte-level streaming uses Chatterbox's `generate_stream` when present in the installed build; otherwise the wrapper returns the full utterance as a single chunk over the same streaming contract. 
diff --git a/mkdocs.yml b/mkdocs.yml index 6cd10220f..335b25f6b 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -74,6 +74,7 @@ nav: - llama.cpp: guide/configuration/llamacpp.md - vLLM: guide/configuration/vllm.md - Moonshine: guide/configuration/moonshine.md + - Chatterbox: guide/configuration/chatterbox.md - Cloud Offload: guide/configuration/cloud.md - FAQ: guide/faq.md - Development: diff --git a/src/app/src/renderer/BackendManager.tsx b/src/app/src/renderer/BackendManager.tsx index bc6381d34..4286fb551 100644 --- a/src/app/src/renderer/BackendManager.tsx +++ b/src/app/src/renderer/BackendManager.tsx @@ -12,6 +12,7 @@ const RECIPE_ORDER = new Map([ 'moonshine', 'sd-cpp', 'kokoro', + 'chatterbox', 'flm', 'ryzenai-llm', 'vllm', diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts index d654c635a..48b082c4c 100644 --- a/src/app/src/renderer/utils/recipeNames.ts +++ b/src/app/src/renderer/utils/recipeNames.ts @@ -13,6 +13,7 @@ export const RECIPE_DISPLAY_NAMES: Record = { 'moonshine': 'Moonshine', 'sd-cpp': 'StableDiffusion.cpp', 'kokoro': 'Kokoro', + 'chatterbox': 'Chatterbox', 'cloud': 'Cloud', 'vllm': 'vLLM ROCm (experimental)', }; diff --git a/src/cpp/include/lemon/backends/chatterbox_server.h b/src/cpp/include/lemon/backends/chatterbox_server.h new file mode 100644 index 000000000..3e1bc3060 --- /dev/null +++ b/src/cpp/include/lemon/backends/chatterbox_server.h @@ -0,0 +1,54 @@ +#pragma once + +#include "../server_capabilities.h" +#include "../wrapped_server.h" +#include "backend_utils.h" +#include + +namespace lemon { +namespace backends { + +// Chatterbox text-to-speech backend (Resemble AI). Runs the self-contained +// chatterbox-server subprocess (PyInstaller bundle from the +// lemonade-sdk/chatterbox-rocm distribution repo) and forwards OpenAI-style +// /v1/audio/speech requests to it. 
Supports CUDA, ROCm, Metal, and CPU; the +// device variant is auto-selected (GPU when available, else CPU) via the +// RECIPE_DEFS preference order in system_info.cpp. +class ChatterboxServer : public WrappedServer, public ITextToSpeechServer { +public: + static InstallParams get_install_params(const std::string& backend, const std::string& version); + + inline static const BackendSpec SPEC = BackendSpec( + "chatterbox", +#ifdef _WIN32 + "chatterbox-server.exe" +#else + "chatterbox-server" +#endif + , get_install_params + ); + + explicit ChatterboxServer(const std::string& log_level, + ModelManager* model_manager, + BackendManager* backend_manager); + + ~ChatterboxServer() override; + + void load(const std::string& model_name, + const ModelInfo& model_info, + const RecipeOptions& options, + bool do_not_upgrade = false) override; + + void unload() override; + + // ICompletionServer implementation (not supported - return errors) + json chat_completion(const json& request) override; + json completion(const json& request) override; + json responses(const json& request) override; + + // ITextToSpeechServer implementation + void audio_speech(const json& request, httplib::DataSink& sink) override; +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h index b0a68d360..7e0376a4a 100644 --- a/src/cpp/include/lemon/model_manager.h +++ b/src/cpp/include/lemon/model_manager.h @@ -104,6 +104,11 @@ struct ModelInfo { // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING) int moonshine_arch = -1; + // Chatterbox-specific model variant ("english", "multilingual", "turbo"). + // Selects which Chatterbox class chatterbox-server instantiates. Empty + // defaults to "english". + std::string chatterbox_variant; + // Utility std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? 
checkpoints.at(type) : ""; } std::string resolved_path(const std::string& type = "main") const { return resolved_paths.count(type) ? resolved_paths.at(type) : ""; } diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h index eb5d4e0b4..c7e7849da 100644 --- a/src/cpp/include/lemon/model_types.h +++ b/src/cpp/include/lemon/model_types.h @@ -156,6 +156,9 @@ inline DeviceType get_device_type_from_recipe(const std::string& recipe) { return DEVICE_CPU; } else if (recipe == "kokoro") { return DEVICE_CPU; + } else if (recipe == "chatterbox") { + // Defaults to GPU; ChatterboxServer overrides to CPU when the cpu backend is selected. + return DEVICE_GPU; } else if (is_collection_recipe(recipe)) { return DEVICE_NONE; } else if (recipe == "cloud") { diff --git a/src/cpp/resources/backend_versions.json b/src/cpp/resources/backend_versions.json index 4a84bd394..97446c71e 100644 --- a/src/cpp/resources/backend_versions.json +++ b/src/cpp/resources/backend_versions.json @@ -92,5 +92,11 @@ "moonshine": { "cpu": "moonshine0.0.62" }, + "chatterbox": { + "cuda": "chatterbox0.1.7", + "rocm": "chatterbox0.1.7", + "metal": "chatterbox0.1.7", + "cpu": "chatterbox0.1.7" + }, "clear_bin_if_lemonade_below": "9.4.0" } diff --git a/src/cpp/resources/server_models.json b/src/cpp/resources/server_models.json index 0ad84ed17..065cd712d 100644 --- a/src/cpp/resources/server_models.json +++ b/src/cpp/resources/server_models.json @@ -1581,6 +1581,34 @@ ], "size": 0.354 }, + "Chatterbox": { + "checkpoint": "ResembleAI/chatterbox", + "recipe": "chatterbox", + "chatterbox_variant": "english", + "suggested": true, + "labels": [ + "tts" + ], + "size": 3.2 + }, + "Chatterbox-Multilingual": { + "checkpoint": "ResembleAI/chatterbox", + "recipe": "chatterbox", + "chatterbox_variant": "multilingual", + "labels": [ + "tts" + ], + "size": 3.2 + }, + "Chatterbox-Turbo": { + "checkpoint": "ResembleAI/chatterbox-turbo", + "recipe": "chatterbox", + "chatterbox_variant": "turbo", + 
"labels": [ + "tts" + ], + "size": 3.0 + }, "RealESRGAN-x4plus": { "checkpoint": "amd/realesrgan-x4plus:RealESRGAN_x4plus.pth", "recipe": "sd-cpp", diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp index 28feccaab..fbada4ac7 100644 --- a/src/cpp/server/backends/backend_utils.cpp +++ b/src/cpp/server/backends/backend_utils.cpp @@ -4,6 +4,7 @@ #include "lemon/backends/llamacpp_server.h" #include "lemon/backends/whisper_server.h" #include "lemon/backends/sd_server.h" +#include "lemon/backends/chatterbox_server.h" #include "lemon/backends/kokoro_server.h" #include "lemon/backends/ryzenaiserver.h" #include "lemon/backends/vllm_server.h" @@ -43,6 +44,7 @@ namespace lemon::backends { if (recipe == "whispercpp") return &WhisperServer::SPEC; if (recipe == "sd-cpp") return &SDServer::SPEC; if (recipe == "kokoro") return &KokoroServer::SPEC; + if (recipe == "chatterbox") return &ChatterboxServer::SPEC; if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC; if (recipe == "vllm") return &VLLMServer::SPEC; if (recipe == "flm") return &FastFlowLMServer::SPEC; diff --git a/src/cpp/server/backends/chatterbox_server.cpp b/src/cpp/server/backends/chatterbox_server.cpp new file mode 100644 index 000000000..ed95673cb --- /dev/null +++ b/src/cpp/server/backends/chatterbox_server.cpp @@ -0,0 +1,237 @@ +#include "lemon/backends/chatterbox_server.h" +#include "lemon/backends/backend_utils.h" +#include "lemon/backend_manager.h" +#include "lemon/error_types.h" +#include "lemon/runtime_config.h" +#include "lemon/system_info.h" +#include "lemon/utils/custom_args.h" +#include "lemon/utils/process_manager.h" +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#include +#endif + +namespace fs = std::filesystem; +using namespace lemon::utils; + +namespace lemon { +namespace backends { + +namespace { +// Map the resolved backend (cuda/rocm/metal/cpu) to the torch device string the +// 
chatterbox-server wrapper expects. PyTorch's ROCm build exposes AMD GPUs +// through the CUDA API, so "rocm" also maps to "cuda". +std::string backend_to_device(const std::string& backend) { + if (backend == "metal") { + return "mps"; + } + if (backend == "cpu") { + return "cpu"; + } + return "cuda"; // cuda (NVIDIA) and rocm (AMD via HIP) +} +} // namespace + +InstallParams ChatterboxServer::get_install_params(const std::string& backend, const std::string& version) { + InstallParams params; + params.repo = "lemonade-sdk/chatterbox-rocm"; + + // Self-contained PyInstaller bundles built by the lemonade-sdk/chatterbox-rocm + // distribution repo (tracks chatterbox-tts PyPI releases; tag scheme + // chatterbox<version>) — no system Python needed. One PyTorch wheel covers + // all GPU architectures, so assets vary only by OS and device variant + // (cuda/rocm/metal/cpu), not by sm_/gfx arch. + // + // Device availability per platform: + // windows x64 : cuda, cpu + // linux x64 : cuda, rocm, cpu + // macos arm64: metal, cpu +#ifdef _WIN32 + params.filename = "chatterbox-server-" + version + "-windows-x64-" + backend + ".zip"; +#elif defined(__APPLE__) + params.filename = "chatterbox-server-" + version + "-macos-arm64-" + backend + ".tar.gz"; +#else + params.filename = "chatterbox-server-" + version + "-linux-x64-" + backend + ".tar.gz"; +#endif + + return params; +} + +ChatterboxServer::ChatterboxServer(const std::string& log_level, ModelManager* model_manager, + BackendManager* backend_manager) + : WrappedServer("chatterbox-server", log_level, model_manager, backend_manager) { +} + +ChatterboxServer::~ChatterboxServer() { + unload(); +} + +void ChatterboxServer::load(const std::string& model_name, const ModelInfo& model_info, + const RecipeOptions& options, bool do_not_upgrade) { + (void)do_not_upgrade; + LOG(INFO, "ChatterboxServer") << "Loading model: " << model_name << std::endl; + LOG(DEBUG, "ChatterboxServer") << "Per-model settings: " << options.to_log_string() << 
std::endl; + + // Resolve the device backend. An empty/"auto" option resolves to the first + // supported backend in RECIPE_DEFS preference order (GPU when present, else + // CPU). A user/config override is validated against the supported set. + std::string backend_option = options.get_option("chatterbox_backend"); + RuntimeConfig::validate_backend_choice("chatterbox", backend_option); + std::string backend = backend_option; + if (backend.empty() || backend == "auto") { + auto supported = SystemInfo::get_supported_backends("chatterbox"); + if (supported.backends.empty()) { + throw std::runtime_error( + supported.not_supported_error.empty() + ? SystemInfo::get_unsupported_backend_error("chatterbox", "auto") + : supported.not_supported_error); + } + backend = supported.backends[0]; + } + LOG(INFO, "ChatterboxServer") << "Using backend: " << backend << std::endl; + + // get_device_type_from_recipe() defaults chatterbox to GPU; the cpu backend runs on CPU. + device_type_ = (backend == "cpu") ? DEVICE_CPU : DEVICE_GPU; + + // Install chatterbox-server (device-specific bundle) if needed. + backend_manager_->install_backend(SPEC.recipe, backend); + + // Resolve the checkpoint directory (HF snapshot) downloaded by Lemonade. + std::string ckpt_dir = model_info.resolved_path(); + if (ckpt_dir.empty() || !fs::exists(ckpt_dir)) { + throw std::runtime_error("Model directory not found for checkpoint: " + model_info.checkpoint()); + } + + std::string variant = model_info.chatterbox_variant.empty() ? 
"english" : model_info.chatterbox_variant; + std::string device = backend_to_device(backend); + + std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend); + LOG(INFO, "ChatterboxServer") << "Using executable: " << exe_path << std::endl; + + port_ = choose_port(); + if (port_ == 0) { + throw std::runtime_error("Failed to find an available port"); + } + LOG(INFO, "ChatterboxServer") << "Starting server on port " << port_ + << " (variant=" << variant << ", device=" << device << ")" << std::endl; + + // Lemonade manages the model path, variant, device, and port; optional + // chatterbox-server flags come from chatterbox_args. + std::vector args = { + "--ckpt-dir", ckpt_dir, + "--variant", variant, + "--device", device, + "--host", "127.0.0.1", + "--port", std::to_string(port_) + }; + + std::string chatterbox_args = options.get_option("chatterbox_args"); + if (!chatterbox_args.empty()) { + std::set reserved_flags = { + "--ckpt-dir", "--variant", "--device", "--host", "--port" + }; + std::string validation_error = validate_custom_args(chatterbox_args, reserved_flags); + if (!validation_error.empty()) { + throw std::invalid_argument( + "Invalid custom chatterbox-server arguments:\n" + validation_error); + } + std::vector custom_args_vec = parse_custom_args(chatterbox_args); + args.insert(args.end(), custom_args_vec.begin(), custom_args_vec.end()); + } + + // Prevent system/user Python packages from leaking into the bundled environment. 
+ std::vector> env_vars; + env_vars.push_back({"PYTHONNOUSERSITE", "1"}); + + ProcessHandle started_handle = utils::ProcessManager::start_process( + exe_path, + args, + "", // working_dir + is_debug(), // inherit_output + false, + env_vars + ); + set_process_handle(started_handle); + + if (!has_process_handle(started_handle)) { + throw std::runtime_error("Failed to start chatterbox-server process"); + } + LOG(INFO, "ChatterboxServer") << "Process started with PID: " << started_handle.pid << std::endl; + + // Model load can be slow (downloads/weights init); wait_for_ready polls + // /health, which returns "starting" until the model is loaded. + if (!wait_for_ready("/health")) { + unload(); + throw std::runtime_error("chatterbox-server failed to start or become ready"); + } + LOG(INFO, "ChatterboxServer") << "Server is ready!" << std::endl; +} + +void ChatterboxServer::unload() { + stop_backend_watchdog(); + const ProcessHandle handle = consume_process_handle_for_cleanup(); + if (has_process_handle(handle)) { + LOG(INFO, "ChatterboxServer") << "Stopping server (PID: " << handle.pid << ")" << std::endl; + utils::ProcessManager::stop_process(handle); + } +} + +// ICompletionServer implementation (not supported - return errors) +json ChatterboxServer::chat_completion(const json& request) { + (void)request; + return json{ + {"error", { + {"message", "Chatterbox does not support text completion. Use audio speech endpoints instead."}, + {"type", "unsupported_operation"}, + {"code", "model_not_applicable"} + }} + }; +} + +json ChatterboxServer::completion(const json& request) { + (void)request; + return json{ + {"error", { + {"message", "Chatterbox does not support text completion. Use audio speech endpoints instead."}, + {"type", "unsupported_operation"}, + {"code", "model_not_applicable"} + }} + }; +} + +json ChatterboxServer::responses(const json& request) { + (void)request; + return json{ + {"error", { + {"message", "Chatterbox does not support text completion. 
Use audio speech endpoints instead."}, + {"type", "unsupported_operation"}, + {"code", "model_not_applicable"} + }} + }; +} + +void ChatterboxServer::audio_speech(const json& request, httplib::DataSink& sink) { + json tts_request = request; + tts_request["model"] = "chatterbox"; + + // OpenAI does not define "stream" for the speech endpoint, relying solely on + // stream_format. The wrapper honors stream_format but we also set the + // boolean for parity with the Kokoro contract. + if (request.contains("stream_format")) { + tts_request["stream"] = true; + } + + forward_streaming_request("/v1/audio/speech", tts_request.dump(), sink, false); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index c4bb292b2..41ef878f8 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -1298,6 +1298,12 @@ std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::s return model_cache_path; // Return directory even if index not found } + // For chatterbox, chatterbox-server reads the whole HF snapshot directory + // (--ckpt-dir), so resolve to the model cache directory itself. 
+ if (info.recipe == "chatterbox") { + return model_cache_path; + } + // For whispercpp, find the .bin model file if (info.recipe == "whispercpp" && variant.empty()) { // No variant specified - use fallback logic to find any .bin file @@ -1857,6 +1863,7 @@ void ModelManager::build_cache() { info.size = JsonUtils::get_or_default(value, "size", 0.0); info.cloud_provider = JsonUtils::get_or_default(value, "cloud_provider", ""); info.moonshine_arch = JsonUtils::get_or_default(value, "moonshine_arch", -1); + info.chatterbox_variant = JsonUtils::get_or_default(value, "chatterbox_variant", ""); if (value.contains("labels") && value["labels"].is_array()) { for (const auto& label : value["labels"]) { @@ -1898,6 +1905,7 @@ void ModelManager::build_cache() { info.size = JsonUtils::get_or_default(value, "size", 0.0); info.cloud_provider = JsonUtils::get_or_default(value, "cloud_provider", ""); info.moonshine_arch = JsonUtils::get_or_default(value, "moonshine_arch", -1); + info.chatterbox_variant = JsonUtils::get_or_default(value, "chatterbox_variant", ""); if (value.contains("labels") && value["labels"].is_array()) { for (const auto& label : value["labels"]) { @@ -2684,6 +2692,9 @@ void ModelManager::register_user_model(const std::string& model_name, labels.insert("transcription"); labels.insert("realtime-transcription"); } + if (recipe == "chatterbox") { + labels.insert("tts"); + } model_entry["labels"] = labels; model_entry["suggested"] = true; // Always set suggested=true for user models @@ -3878,6 +3889,48 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, } } } + } else if (info.recipe == "chatterbox") { + // Chatterbox ships redundant weights in a single repo: both .pt and + // .safetensors copies, plus every t3 variant (English / multilingual / + // turbo). Downloading the whole repo would pull ~14 GB for a model that + // only needs ~3 GB. 
Select only the large weight files the chosen + // variant's from_local() actually loads; always keep small config and + // tokenizer files so the loader never misses a dependency. + const std::string variant = info.chatterbox_variant.empty() ? "english" : info.chatterbox_variant; + + // All large, mutually-exclusive weight files across variants. Any of + // these is skipped unless it is in the selected variant's wanted set. + static const std::set kLargeWeights = { + "ve.pt", "ve.safetensors", + "s3gen.pt", "s3gen.safetensors", + "s3gen_v3.pt", "s3gen_v3.safetensors", "s3gen_meanflow.safetensors", + "t3_cfg.pt", "t3_cfg.safetensors", + "t3_23lang.safetensors", + "t3_mtl23ls_v2.safetensors", "t3_mtl23ls_v3.safetensors", + "t3_turbo_v1.safetensors" + }; + + std::set wanted; + if (variant == "multilingual") { + // ChatterboxMultilingualTTS.from_local: ve.pt, s3gen.pt, t3_mtl23ls_v2.safetensors + wanted = {"ve.pt", "s3gen.pt", "t3_mtl23ls_v2.safetensors"}; + } else if (variant == "turbo") { + // ChatterboxTurboTTS.from_local: ve.safetensors, s3gen_meanflow.safetensors, t3_turbo_v1.safetensors + wanted = {"ve.safetensors", "s3gen_meanflow.safetensors", "t3_turbo_v1.safetensors"}; + } else { + // ChatterboxTTS (english).from_local: ve.safetensors, s3gen.safetensors, t3_cfg.safetensors + wanted = {"ve.safetensors", "s3gen.safetensors", "t3_cfg.safetensors"}; + } + + for (const auto& file : repo_files) { + if (kLargeWeights.count(file) && !wanted.count(file)) { + continue; // redundant weight not needed by this variant + } + files_to_download[main_repo_id].push_back(file); + } + LOG(INFO, "ModelManager") << "Chatterbox (" << variant << "): selected " + << files_to_download[main_repo_id].size() << " of " + << repo_files.size() << " repo files" << std::endl; } else { // Non-GGUF model (ONNX, etc.): Download all files in repository files_to_download[main_repo_id].insert(files_to_download[main_repo_id].end(), repo_files.begin(), repo_files.end()); @@ -4800,6 +4853,11 @@ 
ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name) info.moonshine_arch = (*model_json)["moonshine_arch"].get(); } + // Parse chatterbox_variant + if (model_json->contains("chatterbox_variant") && (*model_json)["chatterbox_variant"].is_string()) { + info.chatterbox_variant = (*model_json)["chatterbox_variant"].get(); + } + return info; } diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp index f5c025da1..a58a6d52c 100644 --- a/src/cpp/server/recipe_options.cpp +++ b/src/cpp/server/recipe_options.cpp @@ -23,6 +23,8 @@ static const json DEFAULTS = { {"whispercpp_backend", ""}, // "" means auto-detect (mapped from "auto" in config.json) {"whispercpp_args", ""}, {"moonshine_args", ""}, // Custom arguments to pass to moonshine-server + {"chatterbox_backend", ""}, // "" means auto-detect (GPU if available, else CPU) + {"chatterbox_args", ""}, // Custom arguments to pass to chatterbox-server // Image generation defaults (for sd-cpp recipe) // These are recipe-level defaults only, not CLI arguments — per reviewer guidance, // there are too many image gen params for CLI flags, and no universal defaults. 
@@ -63,6 +65,8 @@ static const std::map OPTION_TO_CLI_FLAG = { {"whispercpp_backend", "--whispercpp"}, {"whispercpp_args", "--whispercpp-args"}, {"moonshine_args", "--moonshine-args"}, + {"chatterbox_backend", "--chatterbox"}, + {"chatterbox_args", "--chatterbox-args"}, {"vllm_backend", "--vllm"}, {"vllm_args", "--vllm-args"} }; @@ -75,6 +79,8 @@ static std::vector get_keys_for_recipe(const std::string& recipe) { keys = {"whispercpp_backend", "whispercpp_args", "merge_args"}; } else if (recipe == "moonshine") { keys = {"moonshine_args", "merge_args"}; + } else if (recipe == "chatterbox") { + keys = {"chatterbox_backend", "chatterbox_args", "merge_args"}; } else if (recipe == "flm") { return {"ctx_size", "merge_args"}; } else if (recipe == "ryzenai-llm") { @@ -253,6 +259,8 @@ static const json CLI_OPTIONS = { {"--whispercpp", {{"option_name", "whispercpp_backend"}, {"type_name", "BACKEND"}, {"help", "WhisperCpp backend to use"}, {"group", "Whisper.cpp Options"}}}, {"--whispercpp-args", {{"option_name", "whispercpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to whisper-server"}, {"group", "Whisper.cpp Options"}}}, {"--moonshine-args", {{"option_name", "moonshine_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to moonshine-server"}}}, + {"--chatterbox", {{"option_name", "chatterbox_backend"}, {"type_name", "BACKEND"}, {"help", "Chatterbox backend to use (cuda, rocm, metal, cpu)"}, {"group", "Chatterbox Options"}}}, + {"--chatterbox-args", {{"option_name", "chatterbox_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to chatterbox-server"}, {"group", "Chatterbox Options"}}}, {"--vllm", {{"option_name", "vllm_backend"}, {"type_name", "BACKEND"}, {"help", "vLLM backend to use"}, {"group", "vLLM Options"}}}, {"--vllm-args", {{"option_name", "vllm_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to vllm-server"}, {"group", "vLLM Options"}}}, // Note: Image gen params (--steps, --cfg-scale, --width, 
--height) removed — recipe-level only. diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp index 82db5494d..07041521f 100644 --- a/src/cpp/server/router.cpp +++ b/src/cpp/server/router.cpp @@ -6,6 +6,7 @@ #include "lemon/backends/ryzenaiserver.h" #include "lemon/backends/whisper_server.h" #include "lemon/backends/moonshine_server.h" +#include "lemon/backends/chatterbox_server.h" #include "lemon/backends/kokoro_server.h" #include "lemon/backends/sd_server.h" #include "lemon/backends/vllm_server.h" @@ -305,6 +306,9 @@ std::unique_ptr Router::create_backend_server(const ModelInfo& mo } else if (model_info.recipe == "kokoro") { LOG(DEBUG, "Router") << "Creating Kokoro backend" << std::endl; new_server = std::make_unique(log_level, model_manager_, backend_manager_); + } else if (model_info.recipe == "chatterbox") { + LOG(DEBUG, "Router") << "Creating Chatterbox backend" << std::endl; + new_server = std::make_unique(log_level, model_manager_, backend_manager_); } else if (model_info.recipe == "sd-cpp") { LOG(DEBUG, "Router") << "Creating SDServer backend" << std::endl; new_server = std::make_unique(log_level, model_manager_, backend_manager_); diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp index a33a4098e..15e3f90fc 100644 --- a/src/cpp/server/runtime_config.cpp +++ b/src/cpp/server/runtime_config.cpp @@ -31,7 +31,7 @@ RuntimeConfig* RuntimeConfig::global() { } static const std::vector s_backend_names = { - "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro" + "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro", "chatterbox" }; static bool is_backend_name(const std::string& key) { @@ -40,7 +40,7 @@ static bool is_backend_name(const std::string& key) { // Backends that have a selectable "backend" key static const std::vector s_selectable_backends = { - "llamacpp", "whispercpp", "sdcpp", "vllm" + "llamacpp", "whispercpp", "sdcpp", "vllm", "chatterbox" }; static 
bool has_backend_selection(const std::string& config_section) { diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index fa6d5874e..1c4ea2dfc 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -467,6 +467,23 @@ static const std::vector RECIPE_DEFS = { {"metal", {}}, }}, + // chatterbox - PyTorch TTS (Resemble AI). One torch wheel per device covers + // all GPU archs, so backends vary only by device, not sm_/gfx family. + // Preference order: GPU first (metal/cuda/rocm), CPU fallback. PyTorch's + // ROCm build is Linux-only, so chatterbox rocm is offered on Linux only. + {"chatterbox", "metal", {"macos"}, { + {"metal", {}}, + }}, + {"chatterbox", "cuda", {"windows", "linux"}, { + {"nvidia_gpu", {}}, // all NVIDIA GPU families (torch cuda wheel) + }}, + {"chatterbox", "rocm", {"linux"}, { + {"amd_gpu", {}}, // all AMD GPU families (torch rocm wheel, Linux) + }}, + {"chatterbox", "cpu", {"windows", "linux", "macos"}, { + {"cpu", {"x86_64", "arm64"}}, + }}, + // stable-diffusion.cpp - ROCm backend for AMD GPUs {"sd-cpp", "rocm", {"windows", "linux"}, { {"amd_gpu", { diff --git a/tools/chatterbox-server/main.py b/tools/chatterbox-server/main.py new file mode 100644 index 000000000..91602b1db --- /dev/null +++ b/tools/chatterbox-server/main.py @@ -0,0 +1,480 @@ +#!/usr/bin/env python3 +"""chatterbox-server — thin OpenAI-compatible TTS HTTP wrapper around Resemble +AI's Chatterbox models, consumed by Lemonade's ``chatterbox`` backend. + +No Chatterbox code is vendored here. ``chatterbox-tts`` is installed from PyPI +at build time and frozen, together with this wrapper, into a self-contained +PyInstaller bundle by the ``lemonade-sdk/chatterbox-rocm`` distribution repo — +no system Python is required (or touched) on user machines. 
+ +The wrapper exposes a single inference endpoint, ``POST /v1/audio/speech``, +matching the OpenAI text-to-speech contract that Lemonade's Router forwards to +(identical to the Kokoro backend's contract), plus ``GET /`` and +``GET /health`` readiness probes. + +Device selection: ``cuda`` (also covers AMD ROCm — PyTorch's ROCm build reports +``torch.cuda.is_available()``), then Apple ``mps`` (Metal), else ``cpu``. A +``--device`` override is honored; ``auto`` (the default) picks the best +available, giving "GPU by default, CPU fallback". + +Streaming: when the client requests ``stream_format: "audio"`` (or +``stream: true``), audio is emitted as raw little-endian PCM16 at 24 kHz — +exactly the format Lemonade advertises (``audio/l16;rate=24000``) and the +native sample rate of Chatterbox (S3GEN_SR = 24000), so no resampling is +needed. Byte-level streaming uses Chatterbox's ``generate_stream`` when the +installed version provides it (detected at runtime); otherwise it falls back to +a single full-utterance chunk, keeping the HTTP contract identical. +""" + +import argparse +import io +import json +import os +import struct +import sys +import threading +import wave + +import numpy as np + +# Chatterbox is built on PyTorch; importing torch up front lets us probe the +# available accelerators before the (slower) model import. +import torch + + +# Chatterbox's S3Gen vocoder samples at 24 kHz. This matches OpenAI's "pcm" +# response format (audio/l16;rate=24000;little-endian), so PCM streaming is a +# zero-resample passthrough. +SAMPLE_RATE = 24000 + +# Set once the model finishes loading; gates the /health readiness probe. +_READY = threading.Event() +_MODEL = None +_VARIANT = "english" +# Serializes generation: a single Chatterbox model instance is not safe to call +# concurrently from multiple request threads. 
+_GEN_LOCK = threading.Lock() + + +# --------------------------------------------------------------------------- # +# Device selection +# --------------------------------------------------------------------------- # +def pick_device(requested): + """Resolve the torch device string. + + ``auto`` prefers CUDA (NVIDIA, and AMD via PyTorch's ROCm build, which + masquerades as CUDA), then Apple MPS (Metal), then CPU. + """ + if requested and requested != "auto": + return requested + if torch.cuda.is_available(): + return "cuda" + mps = getattr(torch.backends, "mps", None) + if mps is not None and mps.is_available(): + return "mps" + return "cpu" + + +# --------------------------------------------------------------------------- # +# Model loading +# --------------------------------------------------------------------------- # +def load_model(variant, ckpt_dir, device): + """Instantiate the requested Chatterbox variant on ``device``. + + Prefers ``from_local(ckpt_dir, device)`` so the model is read from the + checkpoint directory Lemonade already downloaded into the Hugging Face + cache; falls back to ``from_pretrained(device)`` (which downloads on + demand) if a local load is not possible. 
+ """ + if variant == "multilingual": + from chatterbox.mtl_tts import ChatterboxMultilingualTTS as Model + elif variant == "turbo": + from chatterbox.tts_turbo import ChatterboxTurboTTS as Model + else: + from chatterbox.tts import ChatterboxTTS as Model + + if ckpt_dir and os.path.isdir(ckpt_dir): + try: + return Model.from_local(ckpt_dir, device) + except Exception as exc: # noqa: BLE001 - fall back to hub download + print( + f"[chatterbox-server] from_local({ckpt_dir!r}) failed ({exc}); " + "falling back to from_pretrained", + file=sys.stderr, + flush=True, + ) + return Model.from_pretrained(device) + + +# --------------------------------------------------------------------------- # +# Audio conversion / encoding +# --------------------------------------------------------------------------- # +def to_pcm16(wav): + """Convert a Chatterbox waveform (torch tensor or ndarray, float32 in + [-1, 1], shape (1, N) or (N,)) to little-endian signed-16-bit PCM bytes.""" + if hasattr(wav, "detach"): + wav = wav.detach().to("cpu").float().numpy() + wav = np.asarray(wav, dtype=np.float32).reshape(-1) + wav = np.clip(wav, -1.0, 1.0) + return (wav * 32767.0).astype("'}", + flush=True, + ) + + # Start the HTTP server first so /health answers "starting" during the + # (potentially slow) model load, then load the model in this thread. 
+ server = ThreadingHTTPServer((args.host, args.port), Handler) + threading.Thread(target=server.serve_forever, daemon=True).start() + print(f"[chatterbox-server] listening on {args.host}:{args.port}", flush=True) + + _MODEL = load_model(args.variant, args.ckpt_dir, device) + _READY.set() + print("[chatterbox-server] model ready", flush=True) + + try: + threading.Event().wait() # block forever; serving happens in the thread + except KeyboardInterrupt: + pass + finally: + server.shutdown() + + +if __name__ == "__main__": + main() From 553f77d5b3bc3fd3004fe25d262dee1ec7aa748a Mon Sep 17 00:00:00 2001 From: geramyloveless Date: Mon, 15 Jun 2026 14:27:03 -0700 Subject: [PATCH 2/2] chatterbox: enable split-archive install for multi-GB GPU bundles GitHub release assets are capped at 2 GiB; frozen torch+CUDA/ROCm bundles exceed that. Enable supports_split_archive and switch the Windows asset to .tar.gz (extracted via native tar) so the split-archive installer path serves all platforms. --- src/cpp/include/lemon/backends/chatterbox_server.h | 1 + src/cpp/server/backends/chatterbox_server.cpp | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cpp/include/lemon/backends/chatterbox_server.h b/src/cpp/include/lemon/backends/chatterbox_server.h index 3e1bc3060..f4b2899d4 100644 --- a/src/cpp/include/lemon/backends/chatterbox_server.h +++ b/src/cpp/include/lemon/backends/chatterbox_server.h @@ -26,6 +26,7 @@ class ChatterboxServer : public WrappedServer, public ITextToSpeechServer { "chatterbox-server" #endif , get_install_params + , true // supports_split_archive: GPU bundles exceed GitHub's 2 GiB asset limit ); explicit ChatterboxServer(const std::string& log_level, diff --git a/src/cpp/server/backends/chatterbox_server.cpp b/src/cpp/server/backends/chatterbox_server.cpp index ed95673cb..2c06dff7f 100644 --- a/src/cpp/server/backends/chatterbox_server.cpp +++ b/src/cpp/server/backends/chatterbox_server.cpp @@ -55,8 +55,12 @@ InstallParams 
ChatterboxServer::get_install_params(const std::string& backend, c // windows x64 : cuda, cpu // linux x64 : cuda, rocm, cpu // macos arm64: metal, cpu + // + // All platforms use .tar.gz (extracted via native tar on Windows too) so the + // split-archive path can serve the multi-GB GPU bundles that exceed GitHub's + // 2 GiB per-asset release limit. #ifdef _WIN32 - params.filename = "chatterbox-server-" + version + "-windows-x64-" + backend + ".zip"; + params.filename = "chatterbox-server-" + version + "-windows-x64-" + backend + ".tar.gz"; #elif defined(__APPLE__) params.filename = "chatterbox-server-" + version + "-macos-arm64-" + backend + ".tar.gz"; #else