lemonade-sdk · Geramy · Jun 15, 2026 · Jun 15, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -614,6 +614,7 @@ set(SOURCES_CORE
     src/cpp/server/backends/whisper_server.cpp
     src/cpp/server/backends/moonshine_server.cpp
     src/cpp/server/backends/kokoro_server.cpp
+    src/cpp/server/backends/chatterbox_server.cpp
     src/cpp/server/backends/sd_server.cpp
     src/cpp/server/backends/vllm_server.cpp
     src/cpp/server/backends/backend_utils.cpp

diff --git a/docs/guide/configuration/chatterbox.md b/docs/guide/configuration/chatterbox.md
@@ -0,0 +1,117 @@
+# Chatterbox Backend Options
+
+Lemonade integrates [Chatterbox](https://github.com/resemble-ai/chatterbox) (Resemble AI) as a **text-to-speech** backend, exposed through the OpenAI-compatible `/v1/audio/speech` endpoint (the same endpoint the Kokoro backend uses). Unlike Kokoro (CPU/Metal only), Chatterbox is a PyTorch model and supports **GPU acceleration** across vendors:
+
+1. **GPU by default, CPU fallback.** Chatterbox auto-selects the best available device — CUDA (NVIDIA), ROCm (AMD, Linux), or Metal/MPS (Apple Silicon) — and falls back to CPU when no GPU is present. PyTorch's ROCm build exposes AMD GPUs through the same `torch.cuda` API (backed by HIP), so a single bundle per device type covers all GPU architectures for that device type.
+2. **Expressive, multilingual, voice cloning.** Chatterbox supports emotion/exaggeration control, 23+ languages (multilingual variant), and zero-shot voice cloning from a reference clip.
+3. **Byte-level streaming.** When the loaded Chatterbox build provides `generate_stream`, audio is emitted incrementally as raw PCM16 @ 24 kHz (`stream_format: "audio"`) for low time-to-first-audio; otherwise it falls back transparently to a single full-utterance response.
+
+## Available Backends
+
+Chatterbox auto-selects in this preference order (first available wins):
+
+| Backend | Device | Platforms |
+|---------|--------|-----------|
+| `metal` | Apple Silicon GPU (MPS) | macOS arm64 |
+| `cuda`  | NVIDIA GPU | Windows x64, Linux x64 |
+| `rocm`  | AMD GPU (HIP via PyTorch ROCm) | Linux x64 |
+| `cpu`   | CPU | Windows x64, Linux x64, macOS arm64 |
+
+Each is a self-contained PyInstaller bundle from [lemonade-sdk/chatterbox-rocm](https://github.com/lemonade-sdk/chatterbox-rocm) with an embedded Python runtime, the device-appropriate PyTorch wheel, and the `chatterbox-tts` library. No system Python is required (or touched) on the host; Lemonade additionally sets `PYTHONNOUSERSITE=1` at launch.
+
+## Install
+
+The correct device bundle is installed automatically the first time a Chatterbox model is loaded. To install explicitly:
+
+```bash
+lemonade backends install chatterbox:cuda   # or rocm / metal / cpu
+```
+
+Or via HTTP:
+```bash
+curl -X POST http://localhost:13305/api/v1/install \
+  -H 'Content-Type: application/json' \
+  -d '{"recipe": "chatterbox", "backend": "cuda"}'
+```
+
+Bundle versions are pinned in [`backend_versions.json`](https://github.com/lemonade-sdk/lemonade/blob/main/src/cpp/resources/backend_versions.json) (`chatterbox.{cuda,rocm,metal,cpu}`), with tags following the upstream library version (`chatterbox0.1.7` = `chatterbox-tts` 0.1.7). Bundles are built automatically by [lemonade-sdk/chatterbox-rocm](https://github.com/lemonade-sdk/chatterbox-rocm), a distribution-only repo that tracks `chatterbox-tts` PyPI releases. No Chatterbox code is forked: the `main.py` wrapper in `tools/chatterbox-server/` in this repository is frozen together with the PyPI wheel into a self-contained bundle.
+
+## Models
+
+Three variants are registered in [`server_models.json`](https://github.com/lemonade-sdk/lemonade/blob/main/src/cpp/resources/server_models.json); their weights are downloaded from Hugging Face into the standard HF cache:
+
+| Model | Variant | Checkpoint |
+|-------|---------|-----------|
+| `Chatterbox` | English (`ChatterboxTTS`) | `ResembleAI/chatterbox` |
+| `Chatterbox-Multilingual` | 23+ languages (`ChatterboxMultilingualTTS`) | `ResembleAI/chatterbox` |
+| `Chatterbox-Turbo` | Fast English w/ paralinguistic tags (`ChatterboxTurboTTS`) | `ResembleAI/chatterbox-turbo` |
+
+```bash
+lemonade pull Chatterbox
+```
+
+To register your own Chatterbox checkpoint (loaded via `from_local`):
+
+```bash
+lemonade pull user.MyChatterbox \
+  --checkpoint main ResembleAI/chatterbox \
+  --recipe chatterbox
+```
+
+## Use
+
+### Speech synthesis (OpenAI-compatible)
+
+```bash
+curl http://localhost:13305/v1/audio/speech \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Chatterbox", "input": "Hello from Lemonade.", "response_format": "mp3"}' \
+  --output speech.mp3
+```
+
+`response_format` accepts `mp3` (default), `wav`, `pcm`, `flac`, and `opus`.
+
+### Streaming
+
+Request raw PCM streaming (24 kHz, signed 16-bit, little-endian):
+
+```bash
+curl http://localhost:13305/v1/audio/speech \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Chatterbox", "input": "Streaming audio.", "stream_format": "audio"}' \
+  --output speech.pcm
+```
+
+### Voice cloning and expressive controls
+
+The OpenAI `voice` field is treated as a reference-audio path for zero-shot voice cloning when it points at an existing file; alternatively, pass `audio_prompt_path` explicitly. The Chatterbox-specific controls `exaggeration`, `cfg_weight`, and `temperature` are passed through, and `language_id` selects the language for the multilingual variant:
+
+```bash
+curl http://localhost:13305/v1/audio/speech \
+  -H 'Content-Type: application/json' \
+  -d '{"model": "Chatterbox-Multilingual", "input": "Bonjour.", "language_id": "fr", "exaggeration": 0.6}' \
+  --output bonjour.mp3
+```
+
+## Tuning
+
+Force a specific device (overriding auto-selection) via config or per-load:
+
+```bash
+lemonade config set chatterbox.backend=cpu
+```
+
+Free-form CLI args can be appended to `chatterbox-server` via `chatterbox_args`:
+
+```bash
+lemonade config set chatterbox_args="..."
+```
+
+(`--ckpt-dir`, `--variant`, `--device`, `--host`, and `--port` are managed by Lemonade and rejected as custom args.)
+
+## Known gotchas
+
+- **ROCm is Linux-only.** PyTorch publishes ROCm wheels for Linux only, so the `rocm` bundle is offered only on Linux x64. On Windows, AMD GPUs fall back to the `cpu` bundle.
+- **Large bundles.** Chatterbox ships a full PyTorch runtime; the GPU bundles are multi-gigabyte downloads. The first load also downloads the model weights (~2 GB) from Hugging Face.
+- **GPU memory.** Chatterbox participates in the GPU LRU like other GPU models; on tight-VRAM systems it may evict (or be evicted by) an LLM. Use `--max-loaded-models` and per-model eviction settings to tune coexistence.
+- **Streaming support is version-dependent.** Byte-level streaming uses Chatterbox's `generate_stream` when present in the installed build; otherwise the wrapper returns the full utterance as a single chunk over the same streaming contract.
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -74,6 +74,7 @@ nav:
       - llama.cpp: guide/configuration/llamacpp.md
       - vLLM: guide/configuration/vllm.md
       - Moonshine: guide/configuration/moonshine.md
+      - Chatterbox: guide/configuration/chatterbox.md
       - Cloud Offload: guide/configuration/cloud.md
     - FAQ: guide/faq.md
   - Development:

diff --git a/src/app/src/renderer/BackendManager.tsx b/src/app/src/renderer/BackendManager.tsx
@@ -12,6 +12,7 @@ const RECIPE_ORDER = new Map([
   'moonshine',
   'sd-cpp',
   'kokoro',
+  'chatterbox',
   'flm',
   'ryzenai-llm',
   'vllm',

diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts
@@ -13,6 +13,7 @@ export const RECIPE_DISPLAY_NAMES: Record<string, string> = {
   'moonshine': 'Moonshine',
   'sd-cpp': 'StableDiffusion.cpp',
   'kokoro': 'Kokoro',
+  'chatterbox': 'Chatterbox',
   'cloud': 'Cloud',
   'vllm': 'vLLM ROCm (experimental)',
 };
diff --git a/src/cpp/include/lemon/backends/chatterbox_server.h b/src/cpp/include/lemon/backends/chatterbox_server.h
@@ -0,0 +1,54 @@
+#pragma once
+
+#include "../server_capabilities.h"
+#include "../wrapped_server.h"
+#include "backend_utils.h"
+#include <string>
+
+namespace lemon {
+namespace backends {
+
+// Chatterbox text-to-speech backend (Resemble AI). Runs the self-contained
+// chatterbox-server subprocess (PyInstaller bundle from the
+// lemonade-sdk/chatterbox-rocm distribution repo) and forwards OpenAI-style
+// /v1/audio/speech requests to it. Supports CUDA, ROCm, Metal, and CPU; the
+// device variant is auto-selected (GPU when available, else CPU) via the
+// RECIPE_DEFS preference order in system_info.cpp.
+class ChatterboxServer : public WrappedServer, public ITextToSpeechServer {
+public:
+    static InstallParams get_install_params(const std::string& backend, const std::string& version);  // also handed to SPEC below as its last argument
+
+    inline static const BackendSpec SPEC = BackendSpec(  // args look like: recipe name, per-platform server binary name, install-params callback - confirm against BackendSpec
+        "chatterbox",
+#ifdef _WIN32
+        "chatterbox-server.exe"  // Windows builds
+#else
+        "chatterbox-server"  // every non-Windows build
+#endif
+        , get_install_params
+    );
+
+    explicit ChatterboxServer(const std::string& log_level,
+                              ModelManager* model_manager,
+                              BackendManager* backend_manager);
+
+    ~ChatterboxServer() override;
+
+    void load(const std::string& model_name,
+             const ModelInfo& model_info,
+             const RecipeOptions& options,
+             bool do_not_upgrade = false) override;
+
+    void unload() override;
+
+    // ICompletionServer implementation. Text generation is not supported by this text-to-speech backend, so these return errors.
+    json chat_completion(const json& request) override;
+    json completion(const json& request) override;
+    json responses(const json& request) override;
+
+    // ITextToSpeechServer implementation: handles the /v1/audio/speech requests described in the class comment (void return, so output presumably goes through `sink` - confirm in the .cpp).
+    void audio_speech(const json& request, httplib::DataSink& sink) override;
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
@@ -104,6 +104,11 @@ struct ModelInfo {
     // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING)
     int moonshine_arch = -1;
 
+    // Chatterbox-specific model variant ("english", "multilingual", "turbo").
+    // Selects which Chatterbox class chatterbox-server instantiates. Empty
+    // defaults to "english".
+    std::string chatterbox_variant;
+
     // Utility
     std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? checkpoints.at(type) : ""; }
     std::string resolved_path(const std::string& type = "main") const { return resolved_paths.count(type) ? resolved_paths.at(type) : ""; }

diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h
@@ -156,6 +156,9 @@ inline DeviceType get_device_type_from_recipe(const std::string& recipe) {
         return DEVICE_CPU;
     } else if (recipe == "kokoro") {
         return DEVICE_CPU;
+    } else if (recipe == "chatterbox") {
+        // Defaults to GPU; ChatterboxServer overrides to CPU when the cpu backend is selected.
+        return DEVICE_GPU;
     } else if (is_collection_recipe(recipe)) {
         return DEVICE_NONE;
     } else if (recipe == "cloud") {

diff --git a/src/cpp/resources/backend_versions.json b/src/cpp/resources/backend_versions.json
@@ -92,5 +92,11 @@
   "moonshine": {
     "cpu": "moonshine0.0.62"
   },
+  "chatterbox": {
+    "cuda": "chatterbox0.1.7",
+    "rocm": "chatterbox0.1.7",
+    "metal": "chatterbox0.1.7",
+    "cpu": "chatterbox0.1.7"
+  },
   "clear_bin_if_lemonade_below": "9.4.0"
 }
diff --git a/src/cpp/resources/server_models.json b/src/cpp/resources/server_models.json
@@ -1581,6 +1581,34 @@
         ],
         "size": 0.354
     },
+    "Chatterbox": {
+        "checkpoint": "ResembleAI/chatterbox",
+        "recipe": "chatterbox",
+        "chatterbox_variant": "english",
+        "suggested": true,
+        "labels": [
+            "tts"
+        ],
+        "size": 3.2
+    },
+    "Chatterbox-Multilingual": {
+        "checkpoint": "ResembleAI/chatterbox",
+        "recipe": "chatterbox",
+        "chatterbox_variant": "multilingual",
+        "labels": [
+            "tts"
+        ],
+        "size": 3.2
+    },
+    "Chatterbox-Turbo": {
+        "checkpoint": "ResembleAI/chatterbox-turbo",
+        "recipe": "chatterbox",
+        "chatterbox_variant": "turbo",
+        "labels": [
+            "tts"
+        ],
+        "size": 3.0
+    },
     "RealESRGAN-x4plus": {
         "checkpoint": "amd/realesrgan-x4plus:RealESRGAN_x4plus.pth",
         "recipe": "sd-cpp",

diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
@@ -4,6 +4,7 @@
 #include "lemon/backends/llamacpp_server.h"
 #include "lemon/backends/whisper_server.h"
 #include "lemon/backends/sd_server.h"
+#include "lemon/backends/chatterbox_server.h"
 #include "lemon/backends/kokoro_server.h"
 #include "lemon/backends/ryzenaiserver.h"
 #include "lemon/backends/vllm_server.h"
@@ -43,6 +44,7 @@ namespace lemon::backends {
         if (recipe == "whispercpp") return &WhisperServer::SPEC;
         if (recipe == "sd-cpp") return &SDServer::SPEC;
         if (recipe == "kokoro") return &KokoroServer::SPEC;
+        if (recipe == "chatterbox") return &ChatterboxServer::SPEC;
         if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC;
         if (recipe == "vllm") return &VLLMServer::SPEC;
         if (recipe == "flm") return &FastFlowLMServer::SPEC;