From b960d027ed0ce113d9229f3b263f5a7e44cd1f55 Mon Sep 17 00:00:00 2001
From: geramyloveless <gloveless@jqluv.com>
Date: Mon, 15 Jun 2026 13:39:19 -0700
Subject: [PATCH 1/2] Add Chatterbox text-to-speech backend

Integrate Resemble AI's Chatterbox TTS as a new backend supporting CUDA,
ROCm, Metal, and CPU, defaulting to GPU acceleration with CPU fallback.
Exposes the existing OpenAI-compatible /v1/audio/speech endpoint with
byte-level PCM streaming. Registers English, Multilingual, and Turbo
variants, with variant-aware selective downloads to avoid pulling the
full multi-gigabyte repo.

Bundles are built by the lemonade-sdk/chatterbox-rocm distribution repo.
---
 CMakeLists.txt                                |   1 +
 docs/guide/configuration/chatterbox.md        | 117 +++++
 mkdocs.yml                                    |   1 +
 src/app/src/renderer/BackendManager.tsx       |   1 +
 src/app/src/renderer/utils/recipeNames.ts     |   1 +
 .../lemon/backends/chatterbox_server.h        |  54 ++
 src/cpp/include/lemon/model_manager.h         |   5 +
 src/cpp/include/lemon/model_types.h           |   3 +
 src/cpp/resources/backend_versions.json       |   6 +
 src/cpp/resources/server_models.json          |  28 +
 src/cpp/server/backends/backend_utils.cpp     |   2 +
 src/cpp/server/backends/chatterbox_server.cpp | 237 +++++++++
 src/cpp/server/model_manager.cpp              |  58 +++
 src/cpp/server/recipe_options.cpp             |   8 +
 src/cpp/server/router.cpp                     |   4 +
 src/cpp/server/runtime_config.cpp             |   4 +-
 src/cpp/server/system_info.cpp                |  17 +
 tools/chatterbox-server/main.py               | 480 ++++++++++++++++++
 18 files changed, 1025 insertions(+), 2 deletions(-)
 create mode 100644 docs/guide/configuration/chatterbox.md
 create mode 100644 src/cpp/include/lemon/backends/chatterbox_server.h
 create mode 100644 src/cpp/server/backends/chatterbox_server.cpp
 create mode 100644 tools/chatterbox-server/main.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8121dace..e08642d64 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -614,6 +614,7 @@ set(SOURCES_CORE
     src/cpp/server/backends/whisper_server.cpp
     src/cpp/server/backends/moonshine_server.cpp
     src/cpp/server/backends/kokoro_server.cpp
+    src/cpp/server/backends/chatterbox_server.cpp
     src/cpp/server/backends/sd_server.cpp
     src/cpp/server/backends/vllm_server.cpp
     src/cpp/server/backends/backend_utils.cpp
diff --git a/docs/guide/configuration/chatterbox.md b/docs/guide/configuration/chatterbox.md
new file mode 100644
index 000000000..1684f2fe5
--- /dev/null
+++ b/docs/guide/configuration/chatterbox.md
@@ -0,0 +1,117 @@
+# Chatterbox Backend Options
+
+Lemonade integrates [Chatterbox](https://github.com/resemble-ai/chatterbox) (Resemble AI) as a **text-to-speech** backend, exposed through the OpenAI-compatible `/v1/audio/speech` endpoint (the same endpoint the Kokoro backend uses). Unlike Kokoro (CPU/Metal only), Chatterbox is a PyTorch model and supports **GPU acceleration** across vendors:
+
+1. **GPU by default, CPU fallback.** Chatterbox auto-selects the best available device — CUDA (NVIDIA), ROCm (AMD, Linux), or Metal/MPS (Apple Silicon) — and falls back to CPU when no GPU is present. PyTorch's ROCm build drives AMD GPUs through the CUDA API, so a single bundle per device covers all GPU architectures.
+2. **Expressive, multilingual, voice cloning.** Chatterbox supports emotion/exaggeration control, 23+ languages (multilingual variant), and zero-shot voice cloning from a reference clip.
+3. **Byte-level streaming.** When the loaded Chatterbox build provides `generate_stream`, audio is emitted incrementally as raw PCM16 @ 24 kHz (`stream_format: "audio"`) for low time-to-first-audio; otherwise it falls back transparently to a single full-utterance response.
+
+## Available Backends
+
+Chatterbox auto-selects in this preference order (first available wins):
+
+| Backend | Device | Platforms |
+|---------|--------|-----------|
+| `metal` | Apple Silicon GPU (MPS) | macOS arm64 |
+| `cuda`  | NVIDIA GPU | Windows x64, Linux x64 |
+| `rocm`  | AMD GPU (HIP via PyTorch ROCm) | Linux x64 |
+| `cpu`   | CPU | Windows x64, Linux x64, macOS arm64 |
+
+Each is a self-contained PyInstaller bundle from [lemonade-sdk/chatterbox-rocm](https://github.com/lemonade-sdk/chatterbox-rocm) with an embedded Python runtime, the device-appropriate PyTorch wheel, and the `chatterbox-tts` library. No system Python is required (or touched) on the host; Lemonade additionally sets `PYTHONNOUSERSITE=1` at launch.
+
+## Install
+
+The correct device bundle is installed automatically the first time a Chatterbox model is loaded. To install explicitly:
+
+```bash
+lemonade backends install chatterbox:cuda   # or rocm / metal / cpu
+```
+
+Or via HTTP:
+```bash
+curl -X POST http://localhost:13305/api/v1/install \
+  -H 'Content-Type: application/json' \
+  -d '{"recipe": "chatterbox", "backend": "cuda"}'
+```
+
+Bundle versions are pinned in [`backend_versions.json`](https://github.com/lemonade-sdk/lemonade/blob/main/src/cpp/resources/backend_versions.json) (`chatterbox.{cuda,rocm,metal,cpu}`), with tags following the upstream library version (`chatterbox0.1.7` = `chatterbox-tts` 0.1.7). Bundles are built automatically by [lemonade-sdk/chatterbox-rocm](https://github.com/lemonade-sdk/chatterbox-rocm), a distribution-only repo that tracks `chatterbox-tts` PyPI releases — no Chatterbox code is forked; the `main.py` wrapper in `tools/chatterbox-server/` here is frozen together with the PyPI wheel into a self-contained bundle.
+
+## Models
+
+Three variants are registered in [`server_models.json`](https://github.com/lemonade-sdk/lemonade/blob/main/src/cpp/resources/server_models.json), downloading from Hugging Face into the standard HF cache:
+
+| Model | Variant | Checkpoint |
+|-------|---------|-----------|
+| `Chatterbox` | English (`ChatterboxTTS`) | `ResembleAI/chatterbox` |
+| `Chatterbox-Multilingual` | 23+ languages (`ChatterboxMultilingualTTS`) | `ResembleAI/chatterbox` |
+| `Chatterbox-Turbo` | Fast English w/ paralinguistic tags (`ChatterboxTurboTTS`) | `ResembleAI/chatterbox-turbo` |
+
+```bash
+lemonade pull Chatterbox
+```
+
+To register your own Chatterbox checkpoint (loaded via `from_local`):
+
+```bash
+lemonade pull user.MyChatterbox \
+  --checkpoint main ResembleAI/chatterbox \
+  --recipe chatterbox
+```
+
+## Use
+
+### Speech synthesis (OpenAI-compatible)
+
+```bash
+curl http://localhost:13305/v1/audio/speech \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Chatterbox", "input": "Hello from Lemonade.", "response_format": "mp3"}' \
+  --output speech.mp3
+```
+
+`response_format` accepts `mp3` (default), `wav`, `pcm`, `flac`, and `opus`.
+
+### Streaming
+
+Request raw PCM streaming (24 kHz, signed 16-bit, little-endian):
+
+```bash
+curl http://localhost:13305/v1/audio/speech \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Chatterbox", "input": "Streaming audio.", "stream_format": "audio"}' \
+  --output speech.pcm
+```
+
+### Voice cloning and expressive controls
+
+The OpenAI `voice` field is treated as a reference-audio path for zero-shot voice cloning when it points at an existing file (or pass `audio_prompt_path` explicitly). Chatterbox-specific controls `exaggeration`, `cfg_weight`, and `temperature` are passed through, and `language_id` selects the language for the multilingual variant:
+
+```bash
+curl http://localhost:13305/v1/audio/speech \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Chatterbox-Multilingual", "input": "Bonjour.", "language_id": "fr", "exaggeration": 0.6}' \
+  --output bonjour.mp3
+```
+
+## Tuning
+
+Force a specific device (overriding auto-selection) per load with the `--chatterbox` flag, or persistently via config:
+
+```bash
+lemonade config set chatterbox.backend=cpu
+```
+
+Free-form CLI args can be appended to `chatterbox-server` via `chatterbox_args`:
+
+```bash
+lemonade config set chatterbox_args="..."
+```
+
+(`--ckpt-dir`, `--variant`, `--device`, `--host`, and `--port` are managed by Lemonade and rejected as custom args.)
+
+## Known gotchas
+
+- **ROCm is Linux-only.** PyTorch publishes ROCm wheels for Linux only, so the `rocm` bundle is offered on Linux x64. On Windows, AMD GPUs fall back to the `cpu` bundle.
+- **Large bundles.** Chatterbox ships a full PyTorch runtime; the GPU bundles are multi-gigabyte downloads. The first load also downloads the model weights (~3 GB) from Hugging Face.
+- **GPU memory.** Chatterbox participates in the GPU LRU like other GPU models; on tight-VRAM systems it may evict (or be evicted by) an LLM. Use `--max-loaded-models` and per-model eviction settings to tune coexistence.
+- **Streaming support is version-dependent.** Byte-level streaming uses Chatterbox's `generate_stream` when present in the installed build; otherwise the wrapper returns the full utterance as a single chunk over the same streaming contract.
diff --git a/mkdocs.yml b/mkdocs.yml
index 6cd10220f..335b25f6b 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -74,6 +74,7 @@ nav:
       - llama.cpp: guide/configuration/llamacpp.md
       - vLLM: guide/configuration/vllm.md
       - Moonshine: guide/configuration/moonshine.md
+      - Chatterbox: guide/configuration/chatterbox.md
       - Cloud Offload: guide/configuration/cloud.md
     - FAQ: guide/faq.md
   - Development:
diff --git a/src/app/src/renderer/BackendManager.tsx b/src/app/src/renderer/BackendManager.tsx
index bc6381d34..4286fb551 100644
--- a/src/app/src/renderer/BackendManager.tsx
+++ b/src/app/src/renderer/BackendManager.tsx
@@ -12,6 +12,7 @@ const RECIPE_ORDER = new Map([
   'moonshine',
   'sd-cpp',
   'kokoro',
+  'chatterbox',
   'flm',
   'ryzenai-llm',
   'vllm',
diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts
index d654c635a..48b082c4c 100644
--- a/src/app/src/renderer/utils/recipeNames.ts
+++ b/src/app/src/renderer/utils/recipeNames.ts
@@ -13,6 +13,7 @@ export const RECIPE_DISPLAY_NAMES: Record<string, string> = {
   'moonshine': 'Moonshine',
   'sd-cpp': 'StableDiffusion.cpp',
   'kokoro': 'Kokoro',
+  'chatterbox': 'Chatterbox',
   'cloud': 'Cloud',
   'vllm': 'vLLM ROCm (experimental)',
 };
diff --git a/src/cpp/include/lemon/backends/chatterbox_server.h b/src/cpp/include/lemon/backends/chatterbox_server.h
new file mode 100644
index 000000000..3e1bc3060
--- /dev/null
+++ b/src/cpp/include/lemon/backends/chatterbox_server.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "../server_capabilities.h"
+#include "../wrapped_server.h"
+#include "backend_utils.h"
+#include <string>
+
+namespace lemon {
+namespace backends {
+
+// Chatterbox (Resemble AI) text-to-speech backend. Launches the self-contained
+// chatterbox-server subprocess (a PyInstaller bundle from the
+// lemonade-sdk/chatterbox-rocm distribution repo) and forwards OpenAI-style
+// /v1/audio/speech requests to it. Bundles exist for CUDA, ROCm, Metal, and CPU;
+// unless a backend is configured, load() uses the first supported one, following
+// the RECIPE_DEFS preference order in system_info.cpp (GPU first, else CPU).
+class ChatterboxServer : public WrappedServer, public ITextToSpeechServer {
+public:
+    static InstallParams get_install_params(const std::string& backend, const std::string& version);
+
+    inline static const BackendSpec SPEC = BackendSpec(
+        "chatterbox",
+#ifdef _WIN32
+        "chatterbox-server.exe"
+#else
+        "chatterbox-server"
+#endif
+        , get_install_params  // callback that resolves the release repo and asset filename
+    );
+
+    explicit ChatterboxServer(const std::string& log_level,
+                              ModelManager* model_manager,
+                              BackendManager* backend_manager);
+
+    ~ChatterboxServer() override;  // calls unload()
+
+    void load(const std::string& model_name,
+             const ModelInfo& model_info,
+             const RecipeOptions& options,
+             bool do_not_upgrade = false) override;  // do_not_upgrade is ignored by this backend
+
+    void unload() override;  // stops the watchdog and the chatterbox-server process, if running
+
+    // ICompletionServer implementation: not supported; each returns an "unsupported_operation" error JSON.
+    json chat_completion(const json& request) override;
+    json completion(const json& request) override;
+    json responses(const json& request) override;
+
+    // ITextToSpeechServer implementation: forwards the request to chatterbox-server's /v1/audio/speech.
+    void audio_speech(const json& request, httplib::DataSink& sink) override;
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index b0a68d360..7e0376a4a 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -104,6 +104,11 @@ struct ModelInfo {
     // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING)
     int moonshine_arch = -1;
 
+    // Chatterbox-specific model variant ("english", "multilingual", "turbo").
+    // Selects which Chatterbox class chatterbox-server instantiates. Empty
+    // defaults to "english".
+    std::string chatterbox_variant;
+
     // Utility
     std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? checkpoints.at(type) : ""; }
     std::string resolved_path(const std::string& type = "main") const { return resolved_paths.count(type) ? resolved_paths.at(type) : ""; }
diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h
index eb5d4e0b4..c7e7849da 100644
--- a/src/cpp/include/lemon/model_types.h
+++ b/src/cpp/include/lemon/model_types.h
@@ -156,6 +156,9 @@ inline DeviceType get_device_type_from_recipe(const std::string& recipe) {
         return DEVICE_CPU;
     } else if (recipe == "kokoro") {
         return DEVICE_CPU;
+    } else if (recipe == "chatterbox") {
+        // Defaults to GPU; ChatterboxServer overrides to CPU when the cpu backend is selected.
+        return DEVICE_GPU;
     } else if (is_collection_recipe(recipe)) {
         return DEVICE_NONE;
     } else if (recipe == "cloud") {
diff --git a/src/cpp/resources/backend_versions.json b/src/cpp/resources/backend_versions.json
index 4a84bd394..97446c71e 100644
--- a/src/cpp/resources/backend_versions.json
+++ b/src/cpp/resources/backend_versions.json
@@ -92,5 +92,11 @@
   "moonshine": {
     "cpu": "moonshine0.0.62"
   },
+  "chatterbox": {
+    "cuda": "chatterbox0.1.7",
+    "rocm": "chatterbox0.1.7",
+    "metal": "chatterbox0.1.7",
+    "cpu": "chatterbox0.1.7"
+  },
   "clear_bin_if_lemonade_below": "9.4.0"
 }
diff --git a/src/cpp/resources/server_models.json b/src/cpp/resources/server_models.json
index 0ad84ed17..065cd712d 100644
--- a/src/cpp/resources/server_models.json
+++ b/src/cpp/resources/server_models.json
@@ -1581,6 +1581,34 @@
         ],
         "size": 0.354
     },
+    "Chatterbox": {
+        "checkpoint": "ResembleAI/chatterbox",
+        "recipe": "chatterbox",
+        "chatterbox_variant": "english",
+        "suggested": true,
+        "labels": [
+            "tts"
+        ],
+        "size": 3.2
+    },
+    "Chatterbox-Multilingual": {
+        "checkpoint": "ResembleAI/chatterbox",
+        "recipe": "chatterbox",
+        "chatterbox_variant": "multilingual",
+        "labels": [
+            "tts"
+        ],
+        "size": 3.2
+    },
+    "Chatterbox-Turbo": {
+        "checkpoint": "ResembleAI/chatterbox-turbo",
+        "recipe": "chatterbox",
+        "chatterbox_variant": "turbo",
+        "labels": [
+            "tts"
+        ],
+        "size": 3.0
+    },
     "RealESRGAN-x4plus": {
         "checkpoint": "amd/realesrgan-x4plus:RealESRGAN_x4plus.pth",
         "recipe": "sd-cpp",
diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
index 28feccaab..fbada4ac7 100644
--- a/src/cpp/server/backends/backend_utils.cpp
+++ b/src/cpp/server/backends/backend_utils.cpp
@@ -4,6 +4,7 @@
 #include "lemon/backends/llamacpp_server.h"
 #include "lemon/backends/whisper_server.h"
 #include "lemon/backends/sd_server.h"
+#include "lemon/backends/chatterbox_server.h"
 #include "lemon/backends/kokoro_server.h"
 #include "lemon/backends/ryzenaiserver.h"
 #include "lemon/backends/vllm_server.h"
@@ -43,6 +44,7 @@ namespace lemon::backends {
         if (recipe == "whispercpp") return &WhisperServer::SPEC;
         if (recipe == "sd-cpp") return &SDServer::SPEC;
         if (recipe == "kokoro") return &KokoroServer::SPEC;
+        if (recipe == "chatterbox") return &ChatterboxServer::SPEC;
         if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC;
         if (recipe == "vllm") return &VLLMServer::SPEC;
         if (recipe == "flm") return &FastFlowLMServer::SPEC;
diff --git a/src/cpp/server/backends/chatterbox_server.cpp b/src/cpp/server/backends/chatterbox_server.cpp
new file mode 100644
index 000000000..ed95673cb
--- /dev/null
+++ b/src/cpp/server/backends/chatterbox_server.cpp
@@ -0,0 +1,237 @@
+#include "lemon/backends/chatterbox_server.h"
+#include "lemon/backends/backend_utils.h"
+#include "lemon/backend_manager.h"
+#include "lemon/error_types.h"
+#include "lemon/runtime_config.h"
+#include "lemon/system_info.h"
+#include "lemon/utils/custom_args.h"
+#include "lemon/utils/process_manager.h"
+#include <httplib.h>
+#include <filesystem>
+#include <iostream>
+#include <set>
+#include <vector>
+#include <lemon/utils/aixlog.hpp>
+
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <sys/stat.h>
+#include <unistd.h>
+#endif
+
+namespace fs = std::filesystem;
+using namespace lemon::utils;
+
+namespace lemon {
+namespace backends {
+
+namespace {
+// Translates a resolved Lemonade backend name into the torch device string that
+// chatterbox-server receives through --device:
+//   "metal" -> "mps", "cpu" -> "cpu", anything else -> "cuda".
+// "rocm" lands on "cuda" as well because PyTorch's ROCm build exposes AMD GPUs
+// through the CUDA API.
+std::string backend_to_device(const std::string& backend) {
+    const bool on_metal = (backend == "metal");
+    if (on_metal || backend == "cpu") {
+        return on_metal ? "mps" : "cpu";
+    }
+    return "cuda";
+}
+}  // namespace
+
+InstallParams ChatterboxServer::get_install_params(const std::string& backend, const std::string& version) {
+    // Assets are self-contained PyInstaller bundles (no system Python needed)
+    // published by the lemonade-sdk/chatterbox-rocm distribution repo, which
+    // tracks chatterbox-tts PyPI releases (tag scheme chatterbox<version>). One
+    // PyTorch wheel covers all GPU architectures, so names vary only by OS and
+    // device variant (cuda/rocm/metal/cpu), not by sm_/gfx arch.
+    //
+    // Device variants offered per platform:
+    //   windows x64  : cuda, cpu
+    //   linux   x64  : cuda, rocm, cpu
+    //   macos   arm64: metal, cpu
+    const std::string stem = "chatterbox-server-" + version;
+#ifdef _WIN32
+    const std::string asset = stem + "-windows-x64-" + backend + ".zip";
+#elif defined(__APPLE__)
+    const std::string asset = stem + "-macos-arm64-" + backend + ".tar.gz";
+#else
+    const std::string asset = stem + "-linux-x64-" + backend + ".tar.gz";
+#endif
+    InstallParams install;
+    install.repo = "lemonade-sdk/chatterbox-rocm";
+    install.filename = asset;
+    return install;
+}
+
+// Delegates to WrappedServer with "chatterbox-server" as the leading argument; nothing else to set up.
+ChatterboxServer::ChatterboxServer(const std::string& log_level, ModelManager* model_manager,
+                                   BackendManager* backend_manager)
+    : WrappedServer("chatterbox-server", log_level, model_manager, backend_manager) {}
+
+ChatterboxServer::~ChatterboxServer() {
+    unload();  // stops the watchdog and the chatterbox-server child process, if one is running
+}
+
+void ChatterboxServer::load(const std::string& model_name, const ModelInfo& model_info,
+                            const RecipeOptions& options, bool do_not_upgrade) {
+    (void)do_not_upgrade;  // accepted to match the overridden signature; never consulted here
+    LOG(INFO, "ChatterboxServer") << "Loading model: " << model_name << std::endl;
+    LOG(DEBUG, "ChatterboxServer") << "Per-model settings: " << options.to_log_string() << std::endl;
+    // Flow: choose backend -> install bundle -> find checkpoint -> spawn server -> wait for /health.
+    // Backend choice: a user/config override is validated against the supported
+    // set; an empty/"auto" option resolves to the first supported backend in
+    // RECIPE_DEFS preference order (GPU when present, else CPU).
+    std::string backend_option = options.get_option("chatterbox_backend");
+    RuntimeConfig::validate_backend_choice("chatterbox", backend_option);
+    std::string backend = backend_option;
+    if (backend.empty() || backend == "auto") {
+        auto supported = SystemInfo::get_supported_backends("chatterbox");
+        if (supported.backends.empty()) {  // nothing usable: report the recorded reason if there is one
+            throw std::runtime_error(
+                supported.not_supported_error.empty()
+                    ? SystemInfo::get_unsupported_backend_error("chatterbox", "auto")
+                    : supported.not_supported_error);
+        }
+        backend = supported.backends[0];
+    }
+    LOG(INFO, "ChatterboxServer") << "Using backend: " << backend << std::endl;
+
+    // get_device_type_from_recipe() reports GPU for chatterbox; switch to CPU when the cpu backend is used.
+    device_type_ = (backend == "cpu") ? DEVICE_CPU : DEVICE_GPU;
+
+    // Install chatterbox-server (device-specific bundle) if needed.
+    backend_manager_->install_backend(SPEC.recipe, backend);
+
+    // Resolve the checkpoint directory (HF snapshot) downloaded by Lemonade; it is passed as --ckpt-dir.
+    std::string ckpt_dir = model_info.resolved_path();
+    if (ckpt_dir.empty() || !fs::exists(ckpt_dir)) {
+        throw std::runtime_error("Model directory not found for checkpoint: " + model_info.checkpoint());
+    }
+
+    std::string variant = model_info.chatterbox_variant.empty() ? "english" : model_info.chatterbox_variant;
+    std::string device = backend_to_device(backend);
+
+    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend);
+    LOG(INFO, "ChatterboxServer") << "Using executable: " << exe_path << std::endl;
+
+    port_ = choose_port();
+    if (port_ == 0) {
+        throw std::runtime_error("Failed to find an available port");
+    }
+    LOG(INFO, "ChatterboxServer") << "Starting server on port " << port_
+                                  << " (variant=" << variant << ", device=" << device << ")" << std::endl;
+
+    // Lemonade owns the model path, variant, device, host, and port; any other
+    // chatterbox-server flags come from the chatterbox_args option below.
+    std::vector<std::string> args = {
+        "--ckpt-dir", ckpt_dir,
+        "--variant", variant,
+        "--device", device,
+        "--host", "127.0.0.1",
+        "--port", std::to_string(port_)
+    };
+
+    std::string chatterbox_args = options.get_option("chatterbox_args");
+    if (!chatterbox_args.empty()) {
+        std::set<std::string> reserved_flags = {
+            "--ckpt-dir", "--variant", "--device", "--host", "--port"
+        };
+        std::string validation_error = validate_custom_args(chatterbox_args, reserved_flags);
+        if (!validation_error.empty()) {
+            throw std::invalid_argument(
+                "Invalid custom chatterbox-server arguments:\n" + validation_error);
+        }
+        std::vector<std::string> custom_args_vec = parse_custom_args(chatterbox_args);
+        args.insert(args.end(), custom_args_vec.begin(), custom_args_vec.end());
+    }
+
+    // Prevent system/user Python packages from leaking into the bundled environment.
+    std::vector<std::pair<std::string, std::string>> env_vars;
+    env_vars.push_back({"PYTHONNOUSERSITE", "1"});
+
+    ProcessHandle started_handle = utils::ProcessManager::start_process(
+        exe_path,
+        args,
+        "",          // working_dir
+        is_debug(),  // inherit_output
+        false,       // NOTE(review): unnamed flag - confirm its meaning against ProcessManager::start_process
+        env_vars
+    );
+    set_process_handle(started_handle);
+
+    if (!has_process_handle(started_handle)) {
+        throw std::runtime_error("Failed to start chatterbox-server process");
+    }
+    LOG(INFO, "ChatterboxServer") << "Process started with PID: " << started_handle.pid << std::endl;
+
+    // Model load can be slow (downloads/weights init); wait_for_ready polls
+    // /health, which returns "starting" until the model is loaded.
+    if (!wait_for_ready("/health")) {
+        unload();  // tear down the half-started child before reporting the failure
+        throw std::runtime_error("chatterbox-server failed to start or become ready");
+    }
+    LOG(INFO, "ChatterboxServer") << "Server is ready!" << std::endl;
+}
+
+void ChatterboxServer::unload() {
+    // Stop the watchdog first, then take over the process handle for cleanup.
+    stop_backend_watchdog();
+    const ProcessHandle child = consume_process_handle_for_cleanup();
+    if (!has_process_handle(child)) return;  // no chatterbox-server process to stop
+    LOG(INFO, "ChatterboxServer") << "Stopping server (PID: " << child.pid << ")" << std::endl;
+    utils::ProcessManager::stop_process(child);
+}
+
+// ICompletionServer implementation (not supported - return errors).
+// Chatterbox is speech-only, so the three text-generation entry points below
+// all answer with the same error object. It is built in one file-local helper
+// so the message, type, and code cannot drift apart between endpoints.
+namespace {
+json unsupported_text_endpoint_error() {
+    return json{
+        {"error", {
+            {"message", "Chatterbox does not support text completion. Use audio speech endpoints instead."},
+            {"type", "unsupported_operation"},
+            {"code", "model_not_applicable"}
+        }}
+    };
+}
+}  // namespace
+
+// Chat completions are unavailable on this backend; the request is ignored.
+json ChatterboxServer::chat_completion(const json& request) {
+    (void)request;
+    return unsupported_text_endpoint_error();
+}
+
+// Plain completions are unavailable on this backend; the request is ignored.
+json ChatterboxServer::completion(const json& request) {
+    (void)request;
+    return unsupported_text_endpoint_error();
+}
+
+// The responses endpoint is unavailable on this backend; the request is ignored.
+json ChatterboxServer::responses(const json& request) {
+    (void)request;
+    return unsupported_text_endpoint_error();
+}
+
+void ChatterboxServer::audio_speech(const json& request, httplib::DataSink& sink) {
+    // Forward a copy of the caller's request with the model name pinned to "chatterbox".
+    json payload = request;
+    payload["model"] = "chatterbox";
+    // OpenAI's speech endpoint defines no "stream" field, only stream_format.
+    // The wrapper honors stream_format on its own; the boolean is added as well
+    // for parity with the Kokoro contract.
+    const bool wants_stream = request.contains("stream_format");
+    if (wants_stream) {
+        payload["stream"] = true;
+    }
+    forward_streaming_request("/v1/audio/speech", payload.dump(), sink, false);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index c4bb292b2..41ef878f8 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1298,6 +1298,12 @@ std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::s
         return model_cache_path;  // Return directory even if index not found
     }
 
+    // For chatterbox, chatterbox-server reads the whole HF snapshot directory
+    // (--ckpt-dir), so resolve to the model cache directory itself.
+    if (info.recipe == "chatterbox") {
+        return model_cache_path;
+    }
+
     // For whispercpp, find the .bin model file
     if (info.recipe == "whispercpp" && variant.empty()) {
         // No variant specified - use fallback logic to find any .bin file
@@ -1857,6 +1863,7 @@ void ModelManager::build_cache() {
         info.size = JsonUtils::get_or_default<double>(value, "size", 0.0);
         info.cloud_provider = JsonUtils::get_or_default<std::string>(value, "cloud_provider", "");
         info.moonshine_arch = JsonUtils::get_or_default<int>(value, "moonshine_arch", -1);
+        info.chatterbox_variant = JsonUtils::get_or_default<std::string>(value, "chatterbox_variant", "");
 
         if (value.contains("labels") && value["labels"].is_array()) {
             for (const auto& label : value["labels"]) {
@@ -1898,6 +1905,7 @@ void ModelManager::build_cache() {
         info.size = JsonUtils::get_or_default<double>(value, "size", 0.0);
         info.cloud_provider = JsonUtils::get_or_default<std::string>(value, "cloud_provider", "");
         info.moonshine_arch = JsonUtils::get_or_default<int>(value, "moonshine_arch", -1);
+        info.chatterbox_variant = JsonUtils::get_or_default<std::string>(value, "chatterbox_variant", "");
 
         if (value.contains("labels") && value["labels"].is_array()) {
             for (const auto& label : value["labels"]) {
@@ -2684,6 +2692,9 @@ void ModelManager::register_user_model(const std::string& model_name,
         labels.insert("transcription");
         labels.insert("realtime-transcription");
     }
+    if (recipe == "chatterbox") {
+        labels.insert("tts");
+    }
 
     model_entry["labels"] = labels;
     model_entry["suggested"] = true; // Always set suggested=true for user models
@@ -3878,6 +3889,48 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
                 }
             }
         }
+    } else if (info.recipe == "chatterbox") {
+        // Chatterbox ships redundant weights in a single repo: both .pt and
+        // .safetensors copies, plus every t3 variant (English / multilingual /
+        // turbo). Downloading the whole repo would pull ~14 GB for a model that
+        // only needs ~3 GB. Select only the large weight files the chosen
+        // variant's from_local() actually loads; always keep small config and
+        // tokenizer files so the loader never misses a dependency.
+        const std::string variant = info.chatterbox_variant.empty() ? "english" : info.chatterbox_variant;
+
+        // All large, mutually-exclusive weight files across variants. Any of
+        // these is skipped unless it is in the selected variant's wanted set.
+        static const std::set<std::string> kLargeWeights = {
+            "ve.pt", "ve.safetensors",
+            "s3gen.pt", "s3gen.safetensors",
+            "s3gen_v3.pt", "s3gen_v3.safetensors", "s3gen_meanflow.safetensors",
+            "t3_cfg.pt", "t3_cfg.safetensors",
+            "t3_23lang.safetensors",
+            "t3_mtl23ls_v2.safetensors", "t3_mtl23ls_v3.safetensors",
+            "t3_turbo_v1.safetensors"
+        };
+
+        std::set<std::string> wanted;
+        if (variant == "multilingual") {
+            // ChatterboxMultilingualTTS.from_local: ve.pt, s3gen.pt, t3_mtl23ls_v2.safetensors
+            wanted = {"ve.pt", "s3gen.pt", "t3_mtl23ls_v2.safetensors"};
+        } else if (variant == "turbo") {
+            // ChatterboxTurboTTS.from_local: ve.safetensors, s3gen_meanflow.safetensors, t3_turbo_v1.safetensors
+            wanted = {"ve.safetensors", "s3gen_meanflow.safetensors", "t3_turbo_v1.safetensors"};
+        } else {
+            // ChatterboxTTS (english).from_local: ve.safetensors, s3gen.safetensors, t3_cfg.safetensors
+            wanted = {"ve.safetensors", "s3gen.safetensors", "t3_cfg.safetensors"};
+        }
+
+        for (const auto& file : repo_files) {
+            if (kLargeWeights.count(file) && !wanted.count(file)) {
+                continue;  // redundant weight not needed by this variant
+            }
+            files_to_download[main_repo_id].push_back(file);
+        }
+        LOG(INFO, "ModelManager") << "Chatterbox (" << variant << "): selected "
+                                  << files_to_download[main_repo_id].size() << " of "
+                                  << repo_files.size() << " repo files" << std::endl;
     } else {
         // Non-GGUF model (ONNX, etc.): Download all files in repository
         files_to_download[main_repo_id].insert(files_to_download[main_repo_id].end(), repo_files.begin(), repo_files.end());
@@ -4800,6 +4853,11 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name)
         info.moonshine_arch = (*model_json)["moonshine_arch"].get<int>();
     }
 
+    // Parse chatterbox_variant
+    if (model_json->contains("chatterbox_variant") && (*model_json)["chatterbox_variant"].is_string()) {
+        info.chatterbox_variant = (*model_json)["chatterbox_variant"].get<std::string>();
+    }
+
     return info;
 }
 
diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp
index f5c025da1..a58a6d52c 100644
--- a/src/cpp/server/recipe_options.cpp
+++ b/src/cpp/server/recipe_options.cpp
@@ -23,6 +23,8 @@ static const json DEFAULTS = {
     {"whispercpp_backend", ""},  // "" means auto-detect (mapped from "auto" in config.json)
     {"whispercpp_args", ""},
     {"moonshine_args", ""},      // Custom arguments to pass to moonshine-server
+    {"chatterbox_backend", ""},  // "" means auto-detect (GPU if available, else CPU)
+    {"chatterbox_args", ""},     // Custom arguments to pass to chatterbox-server
     // Image generation defaults (for sd-cpp recipe)
     // These are recipe-level defaults only, not CLI arguments — per reviewer guidance,
     // there are too many image gen params for CLI flags, and no universal defaults.
@@ -63,6 +65,8 @@ static const std::map<std::string, std::string> OPTION_TO_CLI_FLAG = {
     {"whispercpp_backend", "--whispercpp"},
     {"whispercpp_args", "--whispercpp-args"},
     {"moonshine_args", "--moonshine-args"},
+    {"chatterbox_backend", "--chatterbox"},
+    {"chatterbox_args", "--chatterbox-args"},
     {"vllm_backend", "--vllm"},
     {"vllm_args", "--vllm-args"}
 };
@@ -75,6 +79,8 @@ static std::vector<std::string> get_keys_for_recipe(const std::string& recipe) {
         keys = {"whispercpp_backend", "whispercpp_args", "merge_args"};
     } else if (recipe == "moonshine") {
         keys = {"moonshine_args", "merge_args"};
+    } else if (recipe == "chatterbox") {
+        keys = {"chatterbox_backend", "chatterbox_args", "merge_args"};
     } else if (recipe == "flm") {
         return {"ctx_size", "merge_args"};
     } else if (recipe == "ryzenai-llm") {
@@ -253,6 +259,8 @@ static const json CLI_OPTIONS = {
     {"--whispercpp", {{"option_name", "whispercpp_backend"}, {"type_name", "BACKEND"}, {"help", "WhisperCpp backend to use"}, {"group", "Whisper.cpp Options"}}},
     {"--whispercpp-args", {{"option_name", "whispercpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to whisper-server"}, {"group", "Whisper.cpp Options"}}},
     {"--moonshine-args", {{"option_name", "moonshine_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to moonshine-server"}}},
+    {"--chatterbox", {{"option_name", "chatterbox_backend"}, {"type_name", "BACKEND"}, {"help", "Chatterbox backend to use (cuda, rocm, metal, cpu)"}, {"group", "Chatterbox Options"}}},
+    {"--chatterbox-args", {{"option_name", "chatterbox_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to chatterbox-server"}, {"group", "Chatterbox Options"}}},
     {"--vllm", {{"option_name", "vllm_backend"}, {"type_name", "BACKEND"}, {"help", "vLLM backend to use"}, {"group", "vLLM Options"}}},
     {"--vllm-args", {{"option_name", "vllm_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to vllm-server"}, {"group", "vLLM Options"}}},
     // Note: Image gen params (--steps, --cfg-scale, --width, --height) removed — recipe-level only.
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index 82db5494d..07041521f 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -6,6 +6,7 @@
 #include "lemon/backends/ryzenaiserver.h"
 #include "lemon/backends/whisper_server.h"
 #include "lemon/backends/moonshine_server.h"
+#include "lemon/backends/chatterbox_server.h"
 #include "lemon/backends/kokoro_server.h"
 #include "lemon/backends/sd_server.h"
 #include "lemon/backends/vllm_server.h"
@@ -305,6 +306,9 @@ std::unique_ptr<WrappedServer> Router::create_backend_server(const ModelInfo& mo
     } else if (model_info.recipe == "kokoro") {
         LOG(DEBUG, "Router") << "Creating Kokoro backend" << std::endl;
         new_server = std::make_unique<backends::KokoroServer>(log_level, model_manager_, backend_manager_);
+    } else if (model_info.recipe == "chatterbox") {
+        LOG(DEBUG, "Router") << "Creating Chatterbox backend" << std::endl;
+        new_server = std::make_unique<backends::ChatterboxServer>(log_level, model_manager_, backend_manager_);
     } else if (model_info.recipe == "sd-cpp") {
         LOG(DEBUG, "Router") << "Creating SDServer backend" << std::endl;
         new_server = std::make_unique<backends::SDServer>(log_level, model_manager_, backend_manager_);
diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp
index a33a4098e..15e3f90fc 100644
--- a/src/cpp/server/runtime_config.cpp
+++ b/src/cpp/server/runtime_config.cpp
@@ -31,7 +31,7 @@ RuntimeConfig* RuntimeConfig::global() {
 }
 
 static const std::vector<std::string> s_backend_names = {
-    "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro"
+    "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro", "chatterbox"
 };
 
 static bool is_backend_name(const std::string& key) {
@@ -40,7 +40,7 @@ static bool is_backend_name(const std::string& key) {
 
 // Backends that have a selectable "backend" key
 static const std::vector<std::string> s_selectable_backends = {
-    "llamacpp", "whispercpp", "sdcpp", "vllm"
+    "llamacpp", "whispercpp", "sdcpp", "vllm", "chatterbox"
 };
 
 static bool has_backend_selection(const std::string& config_section) {
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index fa6d5874e..1c4ea2dfc 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -467,6 +467,23 @@ static const std::vector<RecipeBackendDef> RECIPE_DEFS = {
         {"metal", {}},
     }},
 
+    // chatterbox - PyTorch TTS (Resemble AI). One torch wheel per device covers
+    // all GPU archs, so backends vary only by device, not sm_/gfx family.
+    // Preference order: GPU first (metal/cuda/rocm), CPU fallback. PyTorch's
+    // ROCm build is Linux-only, so chatterbox rocm is offered on Linux only.
+    {"chatterbox", "metal", {"macos"}, {
+        {"metal", {}},
+    }},
+    {"chatterbox", "cuda", {"windows", "linux"}, {
+        {"nvidia_gpu", {}},      // all NVIDIA GPU families (torch cuda wheel)
+    }},
+    {"chatterbox", "rocm", {"linux"}, {
+        {"amd_gpu", {}},         // all AMD GPU families (torch rocm wheel, Linux)
+    }},
+    {"chatterbox", "cpu", {"windows", "linux", "macos"}, {
+        {"cpu", {"x86_64", "arm64"}},
+    }},
+
     // stable-diffusion.cpp - ROCm backend for AMD GPUs
     {"sd-cpp", "rocm", {"windows", "linux"}, {
         {"amd_gpu", {
diff --git a/tools/chatterbox-server/main.py b/tools/chatterbox-server/main.py
new file mode 100644
index 000000000..91602b1db
--- /dev/null
+++ b/tools/chatterbox-server/main.py
@@ -0,0 +1,480 @@
+#!/usr/bin/env python3
+"""chatterbox-server — thin OpenAI-compatible TTS HTTP wrapper around Resemble
+AI's Chatterbox models, consumed by Lemonade's ``chatterbox`` backend.
+
+No Chatterbox code is vendored here. ``chatterbox-tts`` is installed from PyPI
+at build time and frozen, together with this wrapper, into a self-contained
+PyInstaller bundle by the ``lemonade-sdk/chatterbox-rocm`` distribution repo —
+no system Python is required (or touched) on user machines.
+
+The wrapper exposes a single inference endpoint, ``POST /v1/audio/speech``,
+matching the OpenAI text-to-speech contract that Lemonade's Router forwards to
+(identical to the Kokoro backend's contract), plus ``GET /`` and
+``GET /health`` readiness probes.
+
+Device selection: ``cuda`` (also covers AMD ROCm — PyTorch's ROCm build reports
+``torch.cuda.is_available()``), then Apple ``mps`` (Metal), else ``cpu``. A
+``--device`` override is honored; ``auto`` (the default) picks the best
+available, giving "GPU by default, CPU fallback".
+
+Streaming: when the client requests ``stream_format: "audio"`` (or
+``stream: true``), audio is emitted as raw little-endian PCM16 at 24 kHz —
+exactly the format Lemonade advertises (``audio/l16;rate=24000``) and the
+native sample rate of Chatterbox (S3GEN_SR = 24000), so no resampling is
+needed. Byte-level streaming uses Chatterbox's ``generate_stream`` when the
+installed version provides it (detected at runtime); otherwise it falls back to
+a single full-utterance chunk, keeping the HTTP contract identical.
+"""
+
+import argparse
+import io
+import json
+import os
+import struct
+import sys
+import threading
+import wave
+
+import numpy as np
+
+# Chatterbox is built on PyTorch; importing torch up front lets us probe the
+# available accelerators before the (slower) model import.
+import torch
+
+
+# Chatterbox's S3Gen vocoder samples at 24 kHz. This matches OpenAI's "pcm"
+# response format (audio/l16;rate=24000;little-endian), so PCM streaming is a
+# zero-resample passthrough.
+SAMPLE_RATE = 24000
+
+# Set by main() once load_model() returns; gates /health and POST requests.
+_READY = threading.Event()
+_MODEL = None  # loaded model instance; assigned by main() before _READY is set
+_VARIANT = "english"  # replaced by main() with --variant; read by build_gen_kwargs()
+# Serializes generation: a single Chatterbox model instance is not safe to call
+# concurrently from multiple request threads.
+_GEN_LOCK = threading.Lock()
+
+
+# --------------------------------------------------------------------------- #
+# Device selection
+# --------------------------------------------------------------------------- #
+def pick_device(requested):
+    """Return the torch device string to run inference on.
+
+    Any explicit request other than ``auto`` is returned untouched; otherwise
+    CUDA (incl. AMD via PyTorch's ROCm build), then Apple MPS, then CPU is probed.
+    """
+    explicit = bool(requested) and requested != "auto"
+    if explicit:
+        return requested
+    if torch.cuda.is_available():
+        return "cuda"
+    mps_backend = getattr(torch.backends, "mps", None)
+    has_mps = mps_backend is not None and mps_backend.is_available()
+    return "mps" if has_mps else "cpu"
+
+
+# --------------------------------------------------------------------------- #
+# Model loading
+# --------------------------------------------------------------------------- #
+def load_model(variant, ckpt_dir, device):
+    """Build the Chatterbox model class selected by ``variant`` on ``device``.
+
+    A local load via ``from_local(ckpt_dir, device)`` is tried first when
+    ``ckpt_dir`` is an existing directory (the snapshot Lemonade downloaded);
+    if that is skipped or raises, ``from_pretrained(device)`` is used instead,
+    which fetches the weights from the hub on demand.
+    """
+    if variant == "multilingual":
+        from chatterbox.mtl_tts import ChatterboxMultilingualTTS as Model
+    elif variant == "turbo":
+        from chatterbox.tts_turbo import ChatterboxTurboTTS as Model
+    else:
+        from chatterbox.tts import ChatterboxTTS as Model
+
+    have_local = bool(ckpt_dir) and os.path.isdir(ckpt_dir)
+    if have_local:
+        try:
+            return Model.from_local(ckpt_dir, device)
+        except Exception as exc:  # noqa: BLE001 - fall back to hub download
+            warning = (
+                f"[chatterbox-server] from_local({ckpt_dir!r}) failed ({exc}); "
+                "falling back to from_pretrained"
+            )
+            print(warning, file=sys.stderr, flush=True)
+    return Model.from_pretrained(device)
+
+
+# --------------------------------------------------------------------------- #
+# Audio conversion / encoding
+# --------------------------------------------------------------------------- #
+def to_pcm16(wav):
+    """Serialize a float waveform in [-1, 1] (torch tensor or ndarray, any
+    shape; flattened) as little-endian signed 16-bit PCM bytes."""
+    is_tensor = hasattr(wav, "detach")
+    samples = wav.detach().to("cpu").float().numpy() if is_tensor else wav
+    samples = np.clip(np.asarray(samples, dtype=np.float32).reshape(-1), -1.0, 1.0)
+    scaled = samples * 32767.0
+    return scaled.astype("<i2").tobytes()
+
+
+def pcm_to_wav(pcm_bytes):
+    """Return ``pcm_bytes`` (PCM16, mono, SAMPLE_RATE Hz) inside a RIFF/WAVE file."""
+    mono, sample_width = 1, 2  # one channel, two bytes (16 bits) per sample
+    container = io.BytesIO()
+    with wave.open(container, "wb") as writer:
+        writer.setparams((mono, sample_width, SAMPLE_RATE, 0, "NONE", "not compressed"))
+        writer.writeframes(pcm_bytes)
+    wav_bytes = container.getvalue()
+    return wav_bytes
+
+
+def encode_full(pcm_bytes, fmt):
+    """Turn finished PCM16 audio into the requested OpenAI ``response_format``.
+
+    The result is an ``(encoded_bytes, ok)`` pair. ``ok`` is False (and the
+    bytes empty) when encoding ``fmt`` failed or no encoder could be loaded.
+    """
+    fmt = (fmt or "mp3").lower()
+    failure = (b"", False)
+
+    # Formats handled with the standard library alone.
+    if fmt in ("pcm", "wav"):
+        return (pcm_bytes if fmt == "pcm" else pcm_to_wav(pcm_bytes)), True
+
+    if fmt == "mp3":
+        try:
+            import lameenc
+
+            encoder = lameenc.Encoder()
+            encoder.set_bit_rate(128)
+            encoder.set_in_sample_rate(SAMPLE_RATE)
+            encoder.set_channels(1)
+            encoder.set_quality(2)
+            mp3_head = encoder.encode(pcm_bytes)
+            return mp3_head + encoder.flush(), True
+        except Exception as exc:  # noqa: BLE001
+            print(f"[chatterbox-server] mp3 encode failed: {exc}", file=sys.stderr, flush=True)
+            return failure
+
+    # flac / ogg-opus go through libsndfile (soundfile).
+    sndfile_args = {"flac": {"format": "FLAC"}, "opus": {"format": "OGG", "subtype": "OPUS"}}
+    if fmt in sndfile_args:
+        try:
+            import soundfile as sf
+
+            pcm_samples = np.frombuffer(pcm_bytes, dtype="<i2")
+            sink = io.BytesIO()
+            sf.write(sink, pcm_samples, SAMPLE_RATE, **sndfile_args[fmt])
+            return sink.getvalue(), True
+        except Exception as exc:  # noqa: BLE001
+            print(f"[chatterbox-server] {fmt} encode failed: {exc}", file=sys.stderr, flush=True)
+            return failure
+
+    # Everything else (aac, ...) is attempted through PyAV when it is installed.
+    try:
+        import av  # noqa: F401
+
+        encoded = _encode_via_av(pcm_bytes, fmt)
+    except Exception as exc:  # noqa: BLE001
+        print(f"[chatterbox-server] no encoder for format '{fmt}': {exc}", file=sys.stderr, flush=True)
+        return failure
+    return encoded, True
+
+
+def _encode_via_av(pcm_bytes, fmt):
+    """Encode mono PCM16 at SAMPLE_RATE into ``fmt`` using PyAV (ffmpeg).
+
+    The output container is closed even if encoding raises (no leaked muxer).
+    """
+    import av
+
+    container_fmt = {"aac": "adts", "opus": "ogg", "mp3": "mp3", "flac": "flac"}.get(fmt, fmt)
+    samples = np.frombuffer(pcm_bytes, dtype="<i2").reshape(1, -1)
+    out = io.BytesIO()
+    with av.open(out, mode="w", format=container_fmt) as container:
+        stream = container.add_stream(fmt, rate=SAMPLE_RATE)
+        stream.layout = "mono"
+        frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono")
+        frame.rate = SAMPLE_RATE
+        for frame_or_flush in (frame, None):  # encoding None drains the encoder
+            for packet in stream.encode(frame_or_flush):
+                container.mux(packet)
+    return out.getvalue()
+
+
+# --------------------------------------------------------------------------- #
+# Generation
+# --------------------------------------------------------------------------- #
+def build_gen_kwargs(body):
+    """Map request fields to Chatterbox generate() kwargs.
+
+    ``audio_prompt_path`` (or OpenAI's ``voice`` when it names an existing file)
+    selects reference audio for cloning; ``exaggeration``/``cfg_weight``/
+    ``temperature`` are coerced to float; the multilingual variant also forwards
+    ``language_id`` (or ``language``). Raises ValueError/TypeError on a missing
+    prompt file or a non-numeric control so the caller can answer 400.
+    """
+    kwargs = {}
+    prompt = body.get("audio_prompt_path")
+    voice = body.get("voice")
+    if not prompt and isinstance(voice, str) and os.path.isfile(voice):
+        prompt = voice
+    if prompt:
+        if not (isinstance(prompt, str) and os.path.isfile(prompt)):
+            raise ValueError(f"audio_prompt_path is not an existing file: {prompt!r}")
+        kwargs["audio_prompt_path"] = prompt
+
+    for key in ("exaggeration", "cfg_weight", "temperature"):
+        if body.get(key) is not None:
+            kwargs[key] = float(body[key])  # rejects non-numeric values up front
+    if _VARIANT == "multilingual":
+        lang = body.get("language_id") or body.get("language")
+        if lang:
+            kwargs["language_id"] = lang
+    return kwargs
+
+
+def generate_full(text, gen_kwargs):
+    """Synthesize all of ``text`` in one call and return it as PCM16 bytes."""
+    with _GEN_LOCK:  # one generation at a time on the shared model
+        waveform = _MODEL.generate(text, **gen_kwargs)
+    return to_pcm16(waveform)
+
+
+def iter_pcm_chunks(text, gen_kwargs, chunk_size):
+    """Yield PCM16 byte chunks as Chatterbox produces them.
+
+    Uses ``generate_stream`` when the loaded model exposes it; otherwise the
+    whole utterance from ``generate`` is yielded as a single chunk so the
+    streaming HTTP contract still holds. A truthy ``chunk_size`` is offered to
+    ``generate_stream`` as a keyword unless ``gen_kwargs`` already sets one.
+
+    If ``generate_stream`` raises ``TypeError`` while that extra keyword is in
+    play and before any audio was yielded, the call is retried once with plain
+    ``gen_kwargs``; any other ``TypeError`` propagates. ``_GEN_LOCK`` is held
+    for as long as the model is generating.
+    """
+    with _GEN_LOCK:
+        if hasattr(_MODEL, "generate_stream"):
+            stream_kwargs = dict(gen_kwargs)
+            if chunk_size:
+                stream_kwargs.setdefault("chunk_size", chunk_size)
+            emitted = False
+            for kwargs in (stream_kwargs, gen_kwargs):
+                try:
+                    for item in _MODEL.generate_stream(text, **kwargs):
+                        # Some versions yield (audio_chunk, metrics); others just audio.
+                        chunk = item[0] if isinstance(item, (tuple, list)) else item
+                        pcm = b"" if chunk is None else to_pcm16(chunk)
+                        if pcm:
+                            emitted = True
+                            yield pcm
+                    return
+                except TypeError:
+                    # Retrying after audio went out would send it twice, and a
+                    # retry with identical kwargs is pointless.
+                    if emitted or kwargs == gen_kwargs:
+                        raise
+        wav = _MODEL.generate(text, **gen_kwargs)
+    yield to_pcm16(wav)
+
+
+# --------------------------------------------------------------------------- #
+# HTTP server
+# --------------------------------------------------------------------------- #
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer  # noqa: E402
+
+
+class Handler(BaseHTTPRequestHandler):
+    """HTTP/1.1 handler: readiness probes on GET, OpenAI-style speech on POST."""
+
+    protocol_version = "HTTP/1.1"
+    _stream_started = False  # True once chunked response headers have been sent
+    _MIME = {"mp3": "audio/mpeg", "opus": "audio/opus", "aac": "audio/aac", "flac": "audio/flac",
+             "wav": "audio/wav", "pcm": "audio/l16;rate=24000;endianness=little-endian"}
+
+    def log_message(self, fmt, *args):  # access log only if CHATTERBOX_VERBOSE is set
+        if os.environ.get("CHATTERBOX_VERBOSE"):
+            super().log_message(fmt, *args)
+
+    # -- helpers ----------------------------------------------------------- #
+    def _send_json(self, status, obj):
+        body = json.dumps(obj).encode("utf-8")
+        self.send_response(status)
+        self.send_header("Content-Type", "application/json")
+        self.send_header("Content-Length", str(len(body)))
+        self.end_headers()
+        self.wfile.write(body)
+
+    def _send_error(self, status, message, etype="invalid_request_error"):
+        self._send_json(status, {"error": {"message": message, "type": etype}})
+
+    def _begin_stream(self, content_type):
+        """Send 200 + chunked headers; afterwards no error response can be sent."""
+        self.send_response(200)
+        self.send_header("Content-Type", content_type)
+        self.send_header("Transfer-Encoding", "chunked")
+        self.send_header("Cache-Control", "no-cache")
+        self.end_headers()
+        self._stream_started = True
+
+    def _write_chunk(self, data):
+        """Write + flush one chunked-transfer chunk; empty data is skipped (it would end the stream)."""
+        if data:
+            self.wfile.write(f"{len(data):X}\r\n".encode("ascii") + data + b"\r\n")
+            self.wfile.flush()
+
+    # -- routing ----------------------------------------------------------- #
+    def do_GET(self):
+        path = self.path.split("?", 1)[0]
+        if path not in ("/", "/health"):
+            self._send_error(404, "Not found")
+        elif _READY.is_set():
+            self._send_json(200, {"status": "ok"})
+        else:
+            self._send_json(503, {"status": "starting"})
+
+    def do_POST(self):
+        """Validate a speech request (400/404/503 on bad input), then stream or send full audio."""
+        self._stream_started = False  # the handler object is reused across keep-alive requests
+        path = self.path.split("?", 1)[0]
+        if path not in ("/v1/audio/speech", "/audio/speech", "/api/v1/audio/speech"):
+            self.close_connection = True  # body is left unread; the socket cannot be reused
+            self._send_error(404, "Not found")
+            return
+
+        try:
+            length = max(0, int(self.headers.get("Content-Length", 0)))
+            body = json.loads(self.rfile.read(length) or b"{}")
+            if not isinstance(body, dict):
+                raise ValueError("expected a JSON object")
+        except Exception as exc:  # noqa: BLE001
+            self._send_error(400, f"Invalid JSON body: {exc}")
+            return
+
+        text = body.get("input")
+        if not isinstance(text, str) or not text:
+            self._send_error(400, "Missing or non-string 'input' field")
+            return
+        if not _READY.is_set():
+            self._send_error(503, "Model is still loading", etype="server_error")
+            return
+
+        streaming = bool(body.get("stream")) or "stream_format" in body
+        stream_format = body.get("stream_format")  # Lemonade only sends "audio"
+        response_format = str(body.get("response_format") or "mp3").lower()
+        try:
+            gen_kwargs = build_gen_kwargs(body)
+            int(body.get("chunk_size", 0) or 0)  # reject a bad chunk_size before headers go out
+        except Exception as exc:  # noqa: BLE001
+            self._send_error(400, f"Invalid generation parameters: {exc}")
+            return
+
+        try:
+            if streaming and stream_format == "sse":
+                self._stream_sse(text, gen_kwargs, body)
+            elif streaming:
+                self._stream_audio(text, gen_kwargs, body)
+            else:
+                self._send_full(text, gen_kwargs, response_format)
+        except ConnectionError:
+            self.close_connection = True  # client went away (broken pipe / reset / abort)
+        except Exception as exc:  # noqa: BLE001
+            print(f"[chatterbox-server] generation error: {exc}", file=sys.stderr, flush=True)
+            if self._stream_started:
+                # A second response would corrupt the chunked body; truncate the stream instead.
+                self.close_connection = True
+                return
+            try:
+                self._send_error(500, str(exc), etype="server_error")
+            except Exception:  # noqa: BLE001
+                self.close_connection = True  # could not report the error; give up on the socket
+
+    # -- response strategies ----------------------------------------------- #
+    def _send_full(self, text, gen_kwargs, response_format):
+        pcm = generate_full(text, gen_kwargs)
+        data, ok = encode_full(pcm, response_format)
+        if not ok:
+            self._send_error(400, f"Unsupported audio format: {response_format}")
+            return
+        self.send_response(200)
+        self.send_header("Content-Type", self._MIME.get(response_format, "application/octet-stream"))
+        self.send_header("Content-Length", str(len(data)))
+        self.end_headers()
+        self.wfile.write(data)
+
+    def _stream_audio(self, text, gen_kwargs, body):
+        """Chunked raw PCM16 @ 24 kHz — Lemonade's stream_format: "audio"."""
+        chunk_size = int(body.get("chunk_size", 0) or 0)
+        self._begin_stream("audio/l16;rate=24000;endianness=little-endian")
+        for pcm in iter_pcm_chunks(text, gen_kwargs, chunk_size):
+            self._write_chunk(pcm)
+        self.wfile.write(b"0\r\n\r\n")
+        self.wfile.flush()
+
+    def _stream_sse(self, text, gen_kwargs, body):
+        """OpenAI Server-Sent-Events streaming (speech.audio.delta/done)."""
+        import base64
+
+        chunk_size = int(body.get("chunk_size", 0) or 0)
+        self._begin_stream("text/event-stream")
+        def sse(obj):
+            self._write_chunk(("data: " + json.dumps(obj) + "\n\n").encode("utf-8"))
+        for pcm in iter_pcm_chunks(text, gen_kwargs, chunk_size):
+            sse({"type": "speech.audio.delta", "audio": base64.b64encode(pcm).decode("ascii")})
+        sse({"type": "speech.audio.done"})
+        self.wfile.write(b"0\r\n\r\n")
+        self.wfile.flush()
+
+
+def main():
+    """CLI entry point: serve HTTP right away, load the model, then mark ready."""
+    global _MODEL, _VARIANT
+
+    cli = argparse.ArgumentParser(description="OpenAI-compatible Chatterbox TTS server")
+    cli.add_argument("--ckpt-dir", default="", help="Local checkpoint directory (HF snapshot)")
+    cli.add_argument(
+        "--variant",
+        choices=["english", "multilingual", "turbo"],
+        default="english",
+        help="Chatterbox model class to load",
+    )
+    cli.add_argument(
+        "--device",
+        choices=["auto", "cuda", "mps", "cpu"],
+        default="auto",
+        help="Inference device (auto = GPU if available, else CPU)",
+    )
+    cli.add_argument("--host", default="127.0.0.1")
+    cli.add_argument("--port", type=int, default=8080)
+    opts = cli.parse_args()
+
+    _VARIANT = opts.variant
+    device = pick_device(opts.device)
+    ckpt_label = opts.ckpt_dir or "<hub>"
+    print(f"[chatterbox-server] variant={opts.variant} device={device} ckpt_dir={ckpt_label}", flush=True)
+
+    # Bind and serve from a daemon thread before the (potentially slow) model
+    # load so /health can report "starting" in the meantime.
+    server = ThreadingHTTPServer((opts.host, opts.port), Handler)
+    serve_thread = threading.Thread(target=server.serve_forever, daemon=True)
+    serve_thread.start()
+    print(f"[chatterbox-server] listening on {opts.host}:{opts.port}", flush=True)
+
+    _MODEL = load_model(opts.variant, opts.ckpt_dir, device)
+    _READY.set()
+    print("[chatterbox-server] model ready", flush=True)
+
+    park = threading.Event()  # never set: the main thread just idles here
+    try:
+        park.wait()
+    except KeyboardInterrupt:
+        pass
+    finally:
+        server.shutdown()
+
+
+if __name__ == "__main__":
+    main()

From 553f77d5b3bc3fd3004fe25d262dee1ec7aa748a Mon Sep 17 00:00:00 2001
From: geramyloveless <gloveless@jqluv.com>
Date: Mon, 15 Jun 2026 14:27:03 -0700
Subject: [PATCH 2/2] chatterbox: enable split-archive install for multi-GB GPU
 bundles

GitHub release assets are capped at 2 GiB; frozen torch+CUDA/ROCm bundles
exceed that. Enable supports_split_archive and switch the Windows asset to
.tar.gz (extracted via native tar) so the split-archive installer path
serves all platforms.
---
 src/cpp/include/lemon/backends/chatterbox_server.h | 1 +
 src/cpp/server/backends/chatterbox_server.cpp      | 6 +++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/cpp/include/lemon/backends/chatterbox_server.h b/src/cpp/include/lemon/backends/chatterbox_server.h
index 3e1bc3060..f4b2899d4 100644
--- a/src/cpp/include/lemon/backends/chatterbox_server.h
+++ b/src/cpp/include/lemon/backends/chatterbox_server.h
@@ -26,6 +26,7 @@ class ChatterboxServer : public WrappedServer, public ITextToSpeechServer {
         "chatterbox-server"
 #endif
         , get_install_params
+        , true  // supports_split_archive: GPU bundles exceed GitHub's 2 GiB asset limit
     );
 
     explicit ChatterboxServer(const std::string& log_level,
diff --git a/src/cpp/server/backends/chatterbox_server.cpp b/src/cpp/server/backends/chatterbox_server.cpp
index ed95673cb..2c06dff7f 100644
--- a/src/cpp/server/backends/chatterbox_server.cpp
+++ b/src/cpp/server/backends/chatterbox_server.cpp
@@ -55,8 +55,12 @@ InstallParams ChatterboxServer::get_install_params(const std::string& backend, c
     //   windows x64  : cuda, cpu
     //   linux   x64  : cuda, rocm, cpu
     //   macos   arm64: metal, cpu
+    //
+    // All platforms use .tar.gz (extracted via native tar on Windows too) so the
+    // split-archive path can serve the multi-GB GPU bundles that exceed GitHub's
+    // 2 GiB per-asset release limit.
 #ifdef _WIN32
-    params.filename = "chatterbox-server-" + version + "-windows-x64-" + backend + ".zip";
+    params.filename = "chatterbox-server-" + version + "-windows-x64-" + backend + ".tar.gz";
 #elif defined(__APPLE__)
     params.filename = "chatterbox-server-" + version + "-macos-arm64-" + backend + ".tar.gz";
 #else