diff --git a/.github/workflows/docs_and_style.yml b/.github/workflows/docs_and_style.yml
index a64e4d7f2..3354f50fd 100644
--- a/.github/workflows/docs_and_style.yml
+++ b/.github/workflows/docs_and_style.yml
@@ -24,6 +24,23 @@ jobs:
       - name: Run app regression tests
         run: node test/app/run-app-regression-tests.cjs
 
+  backend-docs-drift:
+    # Drift guard: docs/dev/backends-reference.md is generated from the self-describing
+    # backend descriptors. Build lemond (presumably the generator queries it — confirm),
+    # run the generator with --check, and fail if the committed doc is stale, like a lint.
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-backend-docs-${{ github.ref }}
+      cancel-in-progress: true
+    steps:
+      - uses: actions/checkout@v5
+      - name: Configure and install build dependencies
+        run: ./setup.sh
+      - name: Build lemond
+        run: cmake --build --preset default --target lemond
+      - name: Check backend reference docs are up to date
+        run: python3 docs/tools/gen_backend_boilerplate.py --check
+
   markdown-link-check:
     runs-on: ubuntu-latest
     concurrency:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70c3bf352..2b7dee8a4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -607,15 +607,6 @@ set(SOURCES_CORE
     src/cpp/server/utils/wmi_helper.cpp
     src/cpp/server/utils/network_beacon.cpp
     src/cpp/server/utils/tcp_jsonl_client.cpp
-    src/cpp/server/backends/cloud_server.cpp
-    src/cpp/server/backends/llamacpp_server.cpp
-    src/cpp/server/backends/fastflowlm_server.cpp
-    src/cpp/server/backends/ryzenaiserver.cpp
-    src/cpp/server/backends/whisper_server.cpp
-    src/cpp/server/backends/moonshine_server.cpp
-    src/cpp/server/backends/kokoro_server.cpp
-    src/cpp/server/backends/sd_server.cpp
-    src/cpp/server/backends/vllm_server.cpp
     src/cpp/server/backends/backend_utils.cpp
     src/cpp/server/backend_manager.cpp
     src/cpp/server/ollama_api.cpp
@@ -647,6 +638,83 @@ elseif(UNIX)
     list(APPEND SOURCES_CORE src/cpp/server/utils/platform/process_unix.cpp)
 endif()
 
+# ============================================================
+# Self-describing backends registry
+# ============================================================
+# The authoritative backend list. Each entry is "<recipe>|<stem>":
+#   recipe - the recipe string used in server_models.json (may contain dashes); the foreach below consumes only the stem
+#   stem   - identifier-safe name and folder. Each backend lives in its own
+#            <stem>/ folders under src/cpp, shipping (in namespace lemon::backends::<stem>):
+#              include/lemon/backends/<stem>/<stem>.h         inline const descriptor (CLI-safe data)
+#              include/lemon/backends/<stem>/<stem>_server.h  WrappedServer subclass + create() decl
+#              server/backends/<stem>/<stem>_server.cpp       implementation + create() def
+#
+# Adding a backend is one line here plus that folder. The foreach below compiles
+# every .cpp in the backend's server folder and regenerates the registry headers,
+# which bind each descriptor to its create(). Because this list is a tracked input,
+# editing it forces regeneration on the next build (globbing for backend folders
+# would silently miss a newly added backend). The descriptor is a header-only inline
+# const, so it links into both the lemonade CLI and lemond; only lemond links the server sources.
+set(LEMON_BACKENDS
+    # "<recipe>|<stem>"
+    "llamacpp|llamacpp"
+    "whispercpp|whispercpp"
+    "moonshine|moonshine"
+    "kokoro|kokoro"
+    "sd-cpp|sdcpp"
+    "flm|fastflowlm"
+    "ryzenai-llm|ryzenai"
+    "vllm|vllm"
+    "cloud|cloud"
+)
+
+set(LEMON_DESCRIPTOR_INCLUDES "")
+set(LEMON_DESCRIPTOR_ENTRIES "")
+set(LEMON_FACTORY_INCLUDES "")
+set(LEMON_FACTORY_ENTRIES "")
+# The data registry (descriptors, header-only) links into both binaries; the
+# factory registry + per-backend server sources are server-only.
+# Absolute paths so the CLI subdirectory can reuse LEMON_BACKEND_DESCRIPTOR_SOURCES.
+set(LEMON_BACKEND_DESCRIPTOR_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp)
+set(LEMON_BACKEND_FACTORY_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/hf_cache_util.cpp)
+foreach(_backend_entry IN LISTS LEMON_BACKENDS)
+    string(REPLACE "|" ";" _backend_parts "${_backend_entry}")
+    list(GET _backend_parts 1 _backend_stem)
+    # Descriptors are header-only. Compile every .cpp in the backend's folder (server
+    # class + private helpers such as GGUF parsing); CONFIGURE_DEPENDS re-globs on file
+    # add/remove, so a new helper needs no CMake edit. The backend LIST stays explicit.
+    file(GLOB _backend_srcs CONFIGURE_DEPENDS
+        "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/*.cpp")
+    # An empty glob means the folder is missing or misnamed: stop at configure time
+    # instead of surfacing later as an unresolved <stem>::create() when linking.
+    if(NOT _backend_srcs)
+        message(FATAL_ERROR "Backend '${_backend_entry}': no .cpp sources found in src/cpp/server/backends/${_backend_stem}/")
+    endif()
+    list(APPEND LEMON_BACKEND_FACTORY_SOURCES ${_backend_srcs})
+    string(APPEND LEMON_DESCRIPTOR_INCLUDES "#include \"lemon/backends/${_backend_stem}/${_backend_stem}.h\"\n")
+    string(APPEND LEMON_DESCRIPTOR_ENTRIES "        &lemon::backends::${_backend_stem}::descriptor,\n")
+    string(APPEND LEMON_FACTORY_INCLUDES "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n")
+    string(APPEND LEMON_FACTORY_ENTRIES
+        "        { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec(), lemon::backends::${_backend_stem}::ops() },\n")
+endforeach()
+
+# Render both registry headers into the build tree; @ONLY limits substitution to @VAR@ references.
+foreach(_registry_header backend_descriptors_generated.h backend_factories_generated.h)
+    configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_registry_header}.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/include/${_registry_header}"
+        @ONLY)
+endforeach()
+
+# The server core receives the descriptor registry plus every factory/server source;
+# the CLI takes only LEMON_BACKEND_DESCRIPTOR_SOURCES (see src/cpp/cli/CMakeLists.txt).
+list(APPEND SOURCES_CORE
+    ${LEMON_BACKEND_DESCRIPTOR_SOURCES} ${LEMON_BACKEND_FACTORY_SOURCES})
+
 # ============================================================
 # Server core OBJECT library (shared by lemond and Lemonade.exe)
 # ============================================================
diff --git a/README.md b/README.md
index 38d9db6fe..2175b846e 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,7 @@ Use `lemonade pull` or the built-in **Model Manager** to download models. You ca
 
 Lemonade supports multiple inference engines for LLM, speech, TTS, and image generation, and each has its own backend and hardware requirements.
 
+<!-- BEGIN GENERATED: backends-matrix -->
 <table>
   <thead>
     <tr>
@@ -137,14 +138,14 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
     <tr>
       <td rowspan="9"><strong>Text generation</strong></td>
       <td rowspan="6"><code>llamacpp</code></td>
-      <td><code>vulkan</code></td>
-      <td><code>x86_64</code> CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)</td>
-      <td>Windows, Linux</td>
+      <td><code>system</code></td>
+      <td><code>x86_64</code>/ARM64 CPU, GPU</td>
+      <td>Linux</td>
     </tr>
     <tr>
-      <td><code>rocm</code></td>
-      <td>Supported AMD ROCm iGPU/dGPU families*</td>
-      <td>Windows, Linux</td>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
     </tr>
     <tr>
       <td><code>cuda</code></td>
@@ -152,49 +153,54 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>cpu</code></td>
-      <td><code>x86_64</code> CPU; ARM64 CPU (Linux)</td>
+      <td><code>vulkan</code></td>
+      <td><code>x86_64</code> CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>metal</code></td>
-      <td>Apple Silicon GPU</td>
-      <td>macOS</td>
+      <td><code>rocm</code></td>
+      <td>Supported AMD ROCm iGPU/dGPU families*</td>
+      <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>system</code></td>
-      <td><code>x86_64</code>/ARM64 CPU, GPU</td>
-      <td>Linux</td>
+      <td><code>cpu</code></td>
+      <td><code>x86_64</code> CPU; ARM64 CPU (Linux)</td>
+      <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>flm</code></td>
+      <td rowspan="1"><code>flm</code></td>
       <td><code>npu</code></td>
       <td>XDNA2 NPU</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>ryzenai-llm</code></td>
+      <td rowspan="1"><code>ryzenai-llm</code></td>
       <td><code>npu</code></td>
       <td>XDNA2 NPU</td>
       <td>Windows</td>
     </tr>
     <tr>
-      <td><code>vllm</code> (experimental)</td>
+      <td rowspan="1"><code>vllm</code> (experimental)</td>
       <td><code>rocm</code></td>
       <td>Strix Halo iGPU (gfx1151)</td>
       <td>Linux</td>
     </tr>
     <tr>
-      <td rowspan="4"><strong>Speech-to-text</strong></td>
-      <td rowspan="3"><code>whispercpp</code></td>
+      <td rowspan="6"><strong>Speech-to-text</strong></td>
+      <td rowspan="5"><code>whispercpp</code></td>
       <td><code>npu</code></td>
       <td>XDNA2 NPU</td>
       <td>Windows</td>
     </tr>
+    <tr>
+      <td><code>rocm</code></td>
+      <td>Supported AMD ROCm iGPU/dGPU families*</td>
+      <td>Windows, Linux</td>
+    </tr>
     <tr>
       <td><code>vulkan</code></td>
       <td><code>x86_64</code> CPU</td>
-      <td>Linux</td>
+      <td>Windows, Linux</td>
     </tr>
     <tr>
       <td><code>cpu</code></td>
@@ -202,28 +208,33 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>moonshine</code></td>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
+    </tr>
+    <tr>
+      <td rowspan="1"><code>moonshine</code></td>
       <td><code>cpu</code></td>
       <td><code>x86_64</code>/<code>arm64</code> CPU</td>
       <td>Windows, Linux, macOS</td>
     </tr>
     <tr>
-      <td><strong>Text-to-speech</strong></td>
-      <td><code>kokoro</code></td>
+      <td rowspan="2"><strong>Text-to-speech</strong></td>
+      <td rowspan="2"><code>kokoro</code></td>
       <td><code>cpu</code></td>
       <td><code>x86_64</code> CPU</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td rowspan="4"><strong>Image generation</strong></td>
-      <td rowspan="4"><code>sd-cpp</code></td>
-      <td><code>rocm</code></td>
-      <td>Supported AMD ROCm iGPU/dGPU families*</td>
-      <td>Windows, Linux</td>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
     </tr>
     <tr>
-      <td><code>vulkan</code></td>
-      <td>Vulkan-capable GPUs</td>
+      <td rowspan="5"><strong>Image generation</strong></td>
+      <td rowspan="5"><code>sd-cpp</code></td>
+      <td><code>rocm</code></td>
+      <td>Supported AMD ROCm iGPU/dGPU families*</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
@@ -231,13 +242,24 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
       <td>NVIDIA GPUs (Turing or newer)**</td>
       <td>Linux</td>
     </tr>
+    <tr>
+      <td><code>vulkan</code></td>
+      <td>Vulkan-capable GPUs</td>
+      <td>Windows, Linux</td>
+    </tr>
     <tr>
       <td><code>cpu</code></td>
       <td><code>x86_64</code> CPU</td>
       <td>Windows, Linux</td>
     </tr>
+    <tr>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
+    </tr>
   </tbody>
 </table>
+<!-- END GENERATED: backends-matrix -->
 
 To check exactly which recipes/backends are supported on your own machine, run:
 
diff --git a/docs/assets/models.js b/docs/assets/models.js
index 5bb604006..d9814cccb 100644
--- a/docs/assets/models.js
+++ b/docs/assets/models.js
@@ -2,25 +2,25 @@ const GITHUB_REPO = 'lemonade-sdk/lemonade';
 const TAGS_URL = `https://api.github.com/repos/${GITHUB_REPO}/tags?per_page=100`;
 const RAW_BASE = 'https://raw.githubusercontent.com/lemonade-sdk/lemonade';
 
+/* BEGIN GENERATED: models-js-recipes */
 const RECIPE_PRIORITY = [
   'llamacpp',
   'ryzenai-llm',
   'flm',
   'whispercpp',
   'sd-cpp',
-  'oga-hybrid',
-  'oga-npu',
-  'oga-cpu',
   'kokoro'
 ];
 
 const RECIPE_DISPLAY_NAMES = {
   llamacpp: 'llama.cpp GPU',
-  'ryzenai-llm': 'Ryzen AI SW NPU',
-  flm: 'FastFlowLM NPU',
   whispercpp: 'whisper.cpp',
-  'sd-cpp': 'stable-diffusion.cpp'
+  'sd-cpp': 'stable-diffusion.cpp',
+  flm: 'FastFlowLM NPU',
+  'ryzenai-llm': 'Ryzen AI SW NPU',
+  vllm: 'vLLM ROCm (experimental)'
 };
+/* END GENERATED: models-js-recipes */
 
 const state = {
   tag: null,
diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md
new file mode 100644
index 000000000..7699f97e6
--- /dev/null
+++ b/docs/dev/adding-a-backend.md
@@ -0,0 +1,151 @@
+# Adding a backend
+
+Lemonade backends are **self-describing**. A backend declares *what it is* in a
+plain-data **descriptor** and implements *how it runs* in a **server class**, and
+both live together in the backend's own folder. A registry collects every
+descriptor, and the router, the CLI, `/system-info`, and the generated docs all
+read it — so there are no scattered `if (recipe == "...")` sites to update.
+
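+Adding a backend is **one `<stem>/` folder (headers + server source) plus three small appends** (`CMakeLists.txt`, `backend_versions.json`, `server_models.json`):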
+
+| You edit | What goes there |
+|----------|-----------------|
+| `CMakeLists.txt` → `LEMON_BACKENDS` | **one line**: `"<recipe>\|<stem>"` |
+| `src/cpp/include/lemon/backends/<stem>/<stem>.h` | the descriptor (header-only `inline const`) |
+| `src/cpp/include/lemon/backends/<stem>/<stem>_server.h` | the `WrappedServer` subclass + `create()` declaration |
+| `src/cpp/server/backends/<stem>/<stem>_server.cpp` | the implementation + `create()` definition |
+| `src/cpp/resources/backend_versions.json` | version pin(s) — skip if there's no downloaded binary (e.g. cloud) |
+| `src/cpp/resources/server_models.json` | the models |
+
+No router edits, no CLI edits, no doc edits, no support-matrix edits.
+
+Everything for one backend lives in `lemon::backends::<stem>`. The descriptor is
+header-only so it links into **both** the `lemonade` CLI and `lemond`; the server
+class and `create()` are server-only (compiled into `lemond`).
+
+## The descriptor — `<stem>/<stem>.h`
+
+Plain data. The single object the registry, CLI, `/system-info`, and docs all read.
+
+```cpp
+#pragma once
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon { namespace backends { namespace myrecipe {
+
+inline const BackendDescriptor descriptor = {
+    /*recipe*/          "myrecipe",
+    /*display_name*/    "My Backend",
+    /*binary*/          "my-server",        // "" = no subprocess (e.g. cloud)
+    /*config_section*/  "myrecipe",         // defaults to recipe
+    /*default_device*/  DEVICE_GPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ false,           // true auto-exposes "<recipe>_backend" + "--<recipe>"
+    /*uses_ctx_size*/   true,               // opt in to the shared ctx_size option
+    /*dynamic_models*/  false,              // true = models discovered at runtime (cloud)
+    /*options*/ {                           // backend-specific knobs (common ones are automatic)
+        {"myrecipe_args", "--myrecipe-args", "", "ARGS", "Custom args to pass", "My Options"},
+    },
+    /*support*/ {                           // OS / device families ({} = no local gating)
+        {"myrecipe", "cpu", {"linux", "windows"}, {{"cpu", {"x86_64"}}}},
+    },
+    /*default_labels*/  {},                 // labels injected when a model omits them
+    /*required_checkpoints*/ {"main"},      // unconditional files; conditional ones checked in load()
+};
+
+}}}  // namespace lemon::backends::myrecipe
+```
+
+`SlotPolicy` controls accelerator sharing: `Standard` (counts toward LRU slots),
+`ExclusiveNpu` (evicts all NPU servers first), `CoexistByType` (one per model
+type), `Unmetered` (never counted, never auto-evicted — cloud).
+
+## The server class + factory — `<stem>/<stem>_server.{h,cpp}`
+
+The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`,
+and only the capability interfaces you serve (`ITranscriptionServer`,
+`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default
+"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend
+does not stub them. Alongside it, a free `create()` builds the instance.
+
+`<stem>_server.h`:
+
+```cpp
+#pragma once
+#include "lemon/backends/backend_registry.h"   // BackendContext
+#include "lemon/wrapped_server.h"
+
+namespace lemon { namespace backends {
+
+class MyServer : public WrappedServer, public ICompletionServer {
+    // load(), unload(), the capability methods you serve …
+};
+
+namespace myrecipe {
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);  // server-only
+}
+
+}}  // namespace lemon::backends
+```
+
+`<stem>_server.cpp`:
+
+```cpp
+#include "lemon/backends/myrecipe/myrecipe_server.h"
+// … MyServer method definitions …
+
+namespace lemon { namespace backends { namespace myrecipe {
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<MyServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+}}}  // namespace lemon::backends::myrecipe
+```
+
+## Register it: one line
+
+```cmake
+set(LEMON_BACKENDS
+    ...
+    "myrecipe|myrecipe"   # "<recipe>|<stem>"
+)
+```
+
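+The `foreach` in `CMakeLists.txt` compiles every `.cpp` in `src/cpp/server/backends/<stem>/` and regenerates the registry headers, binding `<stem>::descriptor` to `<stem>::create`.
+The generated factory entry also references `<stem>::spec()` and `<stem>::ops()`, so the backend's namespace must provide those too.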
+
+## What you get for free
+
+- **Standard options:** `merge_args`, `auto_evict`, `evict_idle_timeout`,
+  `downsize_idle_timeout`, `evict_weight_factor`, `pinned`. `ctx_size` is opt-in
+  via `uses_ctx_size`.
+- **Generated CLI flags** for every descriptor option with a `cli_flag`, plus
+  `--<recipe>` when `selectable_backend = true`.
+- **Install/download** via the backend's `BackendSpec` (binary + install params).
+- **`/system-info`** `recipes` entry (display name, options schema, support matrix).
+- **Generated docs** — your backend appears automatically in
+  [`backends-reference.md`](backends-reference.md), the README "Supported
+  Configurations" matrix, and the multi-model NPU-exclusivity list. A CI job
+  (`backend-docs-drift`) fails if the committed docs are stale. The descriptor's
+  `modality`, `experimental`, `web_display_name`, and each support row's
+  `device_summary` supply the editorial bits the matrix needs.
+
+## Escape hatches
+
+| Need | Hook |
+|------|------|
+| Device depends on the chosen backend variant (whisper npu vs cpu) | override `WrappedServer::effective_device(opts)` |
+| Eviction rule depends on the variant | override `WrappedServer::effective_slot_policy(opts)` |
+| Availability decided at runtime (cloud creds) | override `WrappedServer::availability()` |
+| Conditional / grouped checkpoints (sd-cpp flux, whisper npu_cache) | validate in `load()`; list only unconditional files in `required_checkpoints` |
+| Custom per-model fields without editing `ModelInfo` | read `model_info.extra<T>("my_field", fallback)` (populated from unknown `server_models.json` keys) |
+| Models supplied at runtime, not from `server_models.json` | set `dynamic_models = true` and provide them in the class (see cloud's `discover_models()`) |
+| Per-create setup before load (ryzenai `set_model_path`) | do it in `create()` |
+
+## The simplest end-to-end example
+
+**Moonshine** is the minimal case: a single descriptor option, no backend
+selection, CPU-only, one capability interface. See
+`src/cpp/server/backends/moonshine/` and `src/cpp/include/lemon/backends/moonshine/`.
+
+> Note: collections (`collection.omni`) are orchestrator-driven, not
+> `WrappedServer` subprocesses, and are the one explicit exception to this model.
diff --git a/docs/dev/backends-reference.md b/docs/dev/backends-reference.md
new file mode 100644
index 000000000..3993fe8fe
--- /dev/null
+++ b/docs/dev/backends-reference.md
@@ -0,0 +1,328 @@
+# Backend reference
+
+<!-- This file is generated by docs/tools/gen_backend_boilerplate.py from the C++ backend
+descriptors. Do not edit the regions between the GENERATED markers by hand; run
+the generator instead. Prose outside the markers is preserved. -->
+
+## Backends
+
+<!-- BEGIN GENERATED: backends-overview -->
+| Recipe | Name | Selectable backend | Uses ctx_size | Backends |
+|--------|------|--------------------|---------------|----------|
+| `flm` | FastFlowLM NPU | no | yes | npu |
+| `kokoro` | Kokoro | no | no | cpu, metal |
+| `llamacpp` | Llama.cpp GPU | yes | yes | cpu, cuda, metal, rocm, system, vulkan |
+| `moonshine` | Moonshine | no | no | cpu |
+| `ryzenai-llm` | Ryzen AI LLM | no | yes | npu |
+| `sd-cpp` | StableDiffusion.cpp | yes | no | cpu, cuda, metal, rocm, vulkan |
+| `vllm` | vLLM ROCm (experimental) | yes | yes | rocm |
+| `whispercpp` | Whisper.cpp | yes | no | cpu, metal, npu, rocm, vulkan |
+<!-- END GENERATED: backends-overview -->
+
+## Support matrix
+
+<!-- BEGIN GENERATED: backends-matrix -->
+| Recipe | Backend | OS | Device families |
+|--------|---------|----|-----------------|
+| `flm` | npu | linux, windows | amd_npu (XDNA2) |
+| `kokoro` | cpu | linux, windows | cpu (x86_64) |
+| `kokoro` | metal | macos | metal |
+| `llamacpp` | system | linux | cpu (arm64, x86_64) |
+| `llamacpp` | metal | macos | metal |
+| `llamacpp` | cuda | linux, windows | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) |
+| `llamacpp` | vulkan | linux, windows | amd_gpu; cpu (arm64, x86_64) |
+| `llamacpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) |
+| `llamacpp` | cpu | linux, windows | cpu (arm64, x86_64) |
+| `moonshine` | cpu | windows | cpu (x86_64) |
+| `moonshine` | cpu | linux | cpu (arm64, x86_64) |
+| `moonshine` | cpu | macos | cpu (arm64) |
+| `ryzenai-llm` | npu | windows | amd_npu (XDNA2) |
+| `sd-cpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) |
+| `sd-cpp` | cuda | linux | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) |
+| `sd-cpp` | vulkan | linux, windows | amd_gpu; cpu (x86_64); nvidia_gpu |
+| `sd-cpp` | cpu | linux, windows | cpu (x86_64) |
+| `sd-cpp` | metal | macos | metal |
+| `vllm` | rocm | linux | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) |
+| `whispercpp` | npu | windows | amd_npu (XDNA2) |
+| `whispercpp` | rocm | linux, windows | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) |
+| `whispercpp` | vulkan | linux, windows | amd_gpu; cpu (x86_64) |
+| `whispercpp` | cpu | linux, windows | cpu (x86_64) |
+| `whispercpp` | metal | macos | metal |
+<!-- END GENERATED: backends-matrix -->
+
+## Recipe options
+
+<!-- BEGIN GENERATED: backend-options -->
+#### `llamacpp` — Llama.cpp GPU
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |
+| `llamacpp_backend` | `--llamacpp` | BACKEND | "" | LlamaCpp backend to use |
+| `llamacpp_device` | `--llamacpp-device` | DEVICES | "" | Comma-separated list of accelerator devices to use (e.g. Vulkan0) |
+| `llamacpp_args` | `--llamacpp-args` | ARGS | "" | Custom arguments to pass to llama-server |
+
+#### `moonshine` — Moonshine
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `moonshine_args` | `--moonshine-args` | ARGS | "" | Custom arguments to pass to moonshine-server |
+
+#### `sd-cpp` — StableDiffusion.cpp
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `sd-cpp_backend` | `--sdcpp` | BACKEND | "" | SD.cpp backend to use |
+| `sdcpp_args` | `--sdcpp-args` | ARGS | "" | Custom arguments to pass to sd-server (must not conflict with managed args) |
+| `steps` | — | SIZE | 20 | Number of diffusion steps |
+| `cfg_scale` | — | SIZE | 7.0 | Classifier-free guidance scale |
+| `width` | — | SIZE | 512 | Output image width |
+| `height` | — | SIZE | 512 | Output image height |
+| `sampling_method` | — | ARGS | "" | Sampling method |
+| `flow_shift` | — | SIZE | 0.0 | Flow shift |
+
+#### `vllm` — vLLM ROCm (experimental)
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |
+| `vllm_backend` | `--vllm` | BACKEND | "" | vLLM backend to use |
+| `vllm_args` | `--vllm-args` | ARGS | "" | Custom arguments to pass to vllm-server |
+
+#### `whispercpp` — Whisper.cpp
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `whispercpp_backend` | `--whispercpp` | BACKEND | "" | WhisperCpp backend to use |
+| `whispercpp_args` | `--whispercpp-args` | ARGS | "" | Custom arguments to pass to whisper-server |
+<!-- END GENERATED: backend-options -->
+
+## Models
+
+<!-- BEGIN GENERATED: backend-models -->
+#### `collection.omni` — collection.omni (4 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `LMX-Omni-5.5B-Lite` | 9.3 | — |
+| `LMX-Omni-52B-Halo` | 44.77 | — |
+| `Lite Collection` |  | — |
+| `Ultra Collection` |  | — |
+
+#### `kokoro` — Kokoro (1 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `kokoro-v1` | 0.354 | tts |
+
+#### `llamacpp` — Llama.cpp GPU (77 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Bonsai-1.7B-gguf` | 0.25 | llamacpp |
+| `Bonsai-4B-gguf` | 0.572 | llamacpp |
+| `Bonsai-8B-gguf` | 1.16 | llamacpp |
+| `Cogito-v2-llama-109B-MoE-GGUF` | 65.4 | vision |
+| `DeepSeek-Qwen3-8B-GGUF` | 5.25 | reasoning |
+| `Devstral-Small-2507-GGUF` | 14.3 | coding, tool-calling |
+| `GLM-4.5-Air-UD-Q4K-XL-GGUF` | 67.7 | reasoning |
+| `GLM-4.7-Flash-GGUF` | 17.5 | tool-calling |
+| `Gemma-3-4b-it-GGUF` | 3.34 | vision |
+| `Gemma-4-12B-it-GGUF` | 7.29 | tool-calling, vision, llamacpp |
+| `Gemma-4-12B-it-MTP-GGUF` | 7.75 | tool-calling, llamacpp, vision, mtp |
+| `Gemma-4-26B-A4B-it-GGUF` | 18.1 | hot, tool-calling, vision, llamacpp |
+| `Gemma-4-26B-A4B-it-MTP-GGUF` | 18.5 | hot, tool-calling, vision, llamacpp, mtp |
+| `Gemma-4-31B-it-GGUF` | 19.5 | hot, tool-calling, vision, llamacpp |
+| `Gemma-4-31B-it-MTP-GGUF` | 20.0 | hot, tool-calling, vision, llamacpp, mtp |
+| `Gemma-4-E2B-it-GGUF` | 4.09 | tool-calling, vision, llamacpp |
+| `Gemma-4-E4B-it-GGUF` | 5.97 | tool-calling, vision, llamacpp |
+| `Jan-nano-128k-GGUF` | 2.5 | — |
+| `Jan-v1-4B-GGUF` | 2.5 | — |
+| `LFM2-1.2B-GGUF` | 0.731 | — |
+| `LFM2-24B-A2B-GGUF` | 14.4 | — |
+| `LFM2-8B-A1B-GGUF` | 5.04 | — |
+| `LFM2.5-1.2B-Instruct-GGUF` | 0.731 | — |
+| `LFM2.5-8B-A1B` | 5.16 | — |
+| `Llama-3.2-1B-Instruct-GGUF` | 0.834 | — |
+| `Llama-3.2-3B-Instruct-GGUF` | 2.06 | — |
+| `Llama-4-Scout-17B-16E-Instruct-GGUF` | 63.2 | vision |
+| `Ministral-3-3B-Instruct-2512-GGUF` | 2.99 | vision |
+| `Nemotron-3-Nano-30B-A3B-GGUF` | 22.8 | — |
+| `Phi-4-mini-instruct-GGUF` | 2.49 | — |
+| `Playable1-GGUF` | 4.68 | coding |
+| `PromptBridge-0.6b-Alpha-GGUF` | 0.397 | — |
+| `Qwen2.5-Coder-32B-Instruct-GGUF` | 19.9 | coding |
+| `Qwen2.5-Omni-3B-GGUF` | 4.73 | vision, chat-transcription |
+| `Qwen2.5-Omni-7B-GGUF` | 7.33 | vision, chat-transcription |
+| `Qwen2.5-VL-3B-Instruct-GGUF` | 3.27 | vision |
+| `Qwen2.5-VL-7B-Instruct-GGUF` | 6.04 | vision |
+| `Qwen3-0.6B-GGUF` | 0.38 | reasoning |
+| `Qwen3-1.7B-GGUF` | 1.06 | reasoning |
+| `Qwen3-14B-GGUF` | 8.54 | reasoning |
+| `Qwen3-30B-A3B-GGUF` | 17.4 | reasoning |
+| `Qwen3-30B-A3B-Instruct-2507-GGUF` | 17.4 | tool-calling |
+| `Qwen3-4B-GGUF` | 2.38 | reasoning |
+| `Qwen3-4B-Instruct-2507-GGUF` | 2.5 | tool-calling |
+| `Qwen3-8B-GGUF` | 5.25 | reasoning |
+| `Qwen3-Coder-30B-A3B-Instruct-GGUF` | 18.6 | coding, tool-calling, hot |
+| `Qwen3-Coder-Next-GGUF` | 48.0 | coding, tool-calling, hot |
+| `Qwen3-Embedding-0.6B-GGUF` | 0.64 | embeddings |
+| `Qwen3-Embedding-4B-GGUF` | 4.28 | embeddings |
+| `Qwen3-Embedding-8B-GGUF` | 8.05 | embeddings |
+| `Qwen3-Next-80B-A3B-Instruct-GGUF` | 46.1 | tool-calling |
+| `Qwen3-VL-4B-Instruct-GGUF` | 3.33 | vision |
+| `Qwen3-VL-8B-Instruct-GGUF` | 6.19 | vision |
+| `Qwen3.5-0.8B-GGUF` | 0.764 | vision, tool-calling |
+| `Qwen3.5-122B-A10B-GGUF` | 77.9 | vision, tool-calling |
+| `Qwen3.5-122B-A10B-MTP-GGUF` | 79.6 | vision, tool-calling, mtp |
+| `Qwen3.5-27B-GGUF` | 18.5 | vision, tool-calling |
+| `Qwen3.5-2B-GGUF` | 2.01 | vision, tool-calling |
+| `Qwen3.5-35B-A3B-GGUF` | 23.1 | vision, tool-calling |
+| `Qwen3.5-4B-GGUF` | 3.58 | vision, tool-calling, hot |
+| `Qwen3.5-4B-MTP-GGUF` | 3.66 | vision, tool-calling, mtp |
+| `Qwen3.5-9B-GGUF` | 6.88 | vision, tool-calling |
+| `Qwen3.6-27B-GGUF` | 18.5 | vision, tool-calling |
+| `Qwen3.6-27B-MTP-GGUF` | 18.8 | vision, tool-calling, mtp, hot |
+| `Qwen3.6-35B-A3B-GGUF` | 23.3 | vision, tool-calling, hot |
+| `Qwen3.6-35B-A3B-MTP-GGUF` | 23.8 | vision, tool-calling, mtp |
+| `SmolLM3-3B-GGUF` | 1.94 | — |
+| `Tiny-Test-Model-GGUF` | 0.18 | — |
+| `bge-reranker-v2-m3-GGUF` | 0.636 | reranking |
+| `gpt-oss-120b-GGUF` | 62.8 | reasoning, tool-calling |
+| `gpt-oss-120b-mxfp-GGUF` | 63.4 | hot, reasoning, tool-calling |
+| `gpt-oss-20b-GGUF` | 11.6 | reasoning, tool-calling |
+| `gpt-oss-20b-mxfp4-GGUF` | 12.1 | hot, reasoning, tool-calling |
+| `granite-4.0-h-tiny-GGUF` | 4.25 | tool-calling |
+| `jina-reranker-v1-tiny-en-GGUF` | 0.0367 | reranking |
+| `nomic-embed-text-v1-GGUF` | 0.0781 | embeddings |
+| `nomic-embed-text-v2-moe-GGUF` | 0.51 | embeddings |
+
+#### `moonshine` — Moonshine (3 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Moonshine-Medium-Streaming` | 1.08 | transcription, realtime-transcription, hot |
+| `Moonshine-Small-Streaming` | 0.431 | transcription, realtime-transcription |
+| `Moonshine-Tiny-Streaming` | 0.202 | transcription, realtime-transcription |
+
+#### `ryzenai-llm` — Ryzen AI LLM (79 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `AMD-OLMo-1B-SFT-DPO-Hybrid` | 1.48 | — |
+| `CodeLlama-7b-Instruct-hf-Hybrid` | 7.24 | coding |
+| `CodeLlama-7b-Instruct-hf-NPU` | 7.54 | coding |
+| `DeepSeek-R1-Distill-Llama-8B-CPU` | 6.2 | reasoning |
+| `DeepSeek-R1-Distill-Llama-8B-Hybrid` | 9.09 | reasoning |
+| `DeepSeek-R1-Distill-Llama-8B-NPU` | 9.3 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-1.5B-Hybrid` | 2.19 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-1.5B-NPU` | 2.3 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-7B-CPU` | 6.2 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-7B-Hybrid` | 8.67 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-7B-NPU` | 8.87 | reasoning |
+| `Gemma-3-4b-it-mm-NPU` | 6.68 | vision |
+| `Llama-2-7b-chat-hf-Hybrid` | 7.31 | — |
+| `Llama-2-7b-chat-hf-NPU` | 7.47 | — |
+| `Llama-2-7b-hf-Hybrid` | 7.31 | — |
+| `Llama-2-7b-hf-NPU` | 7.47 | — |
+| `Llama-3.1-8B-Hybrid` | 9.09 | — |
+| `Llama-3.1-8B-NPU` | 9.3 | — |
+| `Llama-3.2-1B-Hybrid` | 1.89 | — |
+| `Llama-3.2-1B-Instruct-CPU` | 1.76 | — |
+| `Llama-3.2-1B-Instruct-Hybrid` | 1.89 | — |
+| `Llama-3.2-1B-Instruct-NPU` | 1.96 | — |
+| `Llama-3.2-1B-NPU` | 1.96 | — |
+| `Llama-3.2-3B-Hybrid` | 4.28 | — |
+| `Llama-3.2-3B-Instruct-CPU` | 3.38 | — |
+| `Llama-3.2-3B-Instruct-Hybrid` | 4.28 | — |
+| `Meta-Llama-3-8B-Hybrid` | 9.06 | — |
+| `Meta-Llama-3-8B-NPU` | 9.23 | — |
+| `Meta-Llama-3.1-8B-Instruct-Hybrid` | 9.09 | — |
+| `Meta-Llama-3.1-8B-Instruct-NPU` | 9.3 | — |
+| `Mistral-7B-Instruct-v0.1-Hybrid` | 7.84 | — |
+| `Mistral-7B-Instruct-v0.1-NPU` | 8.01 | — |
+| `Mistral-7B-Instruct-v0.2-Hybrid` | 7.84 | — |
+| `Mistral-7B-Instruct-v0.2-NPU` | 8.01 | — |
+| `Mistral-7B-Instruct-v0.3-Hybrid` | 7.85 | — |
+| `Mistral-7B-Instruct-v0.3-NPU` | 8.09 | — |
+| `Mistral-7B-v0.3-Hybrid` | 7.85 | — |
+| `Mistral-7B-v0.3-NPU` | 8.09 | — |
+| `Phi-3-Mini-Instruct-CPU` | 2.39 | — |
+| `Phi-3-mini-128k-instruct-Hybrid` | 4.21 | — |
+| `Phi-3-mini-128k-instruct-NPU` | 4.35 | — |
+| `Phi-3-mini-4k-instruct-Hybrid` | 4.19 | — |
+| `Phi-3-mini-4k-instruct-NPU` | 4.3 | — |
+| `Phi-3.5-mini-instruct-Hybrid` | 4.21 | — |
+| `Phi-3.5-mini-instruct-NPU` | 4.35 | — |
+| `Phi-4-mini-instruct-Hybrid` | 5.47 | — |
+| `Phi-4-mini-instruct-NPU` | 5.59 | — |
+| `Phi-4-mini-reasoning-Hybrid` | 5.47 | reasoning |
+| `Qwen-1.5-7B-Chat-CPU` | 6.32 | — |
+| `Qwen-2.5-1.5B-Instruct-Hybrid` | 2.17 | — |
+| `Qwen-2.5-1.5B-Instruct-NPU` | 2.25 | — |
+| `Qwen1.5-7B-Chat-Hybrid` | 8.83 | — |
+| `Qwen1.5-7B-Chat-NPU` | 9.02 | — |
+| `Qwen2-1.5B-Hybrid` | 2.19 | — |
+| `Qwen2-1.5B-NPU` | 2.3 | — |
+| `Qwen2-7B-Hybrid` | 8.68 | — |
+| `Qwen2-7B-NPU` | 8.88 | — |
+| `Qwen2.5-0.5B-Instruct-CPU` | 0.834 | — |
+| `Qwen2.5-0.5B-Instruct-Hybrid` | 0.828 | — |
+| `Qwen2.5-14B-instruct-Hybrid` | 16.5 | — |
+| `Qwen2.5-3B-Instruct-Hybrid` | 3.97 | — |
+| `Qwen2.5-3B-Instruct-NPU` | 4.1 | — |
+| `Qwen2.5-7B-Instruct-Hybrid` | 8.65 | — |
+| `Qwen2.5-7B-Instruct-NPU` | 8.83 | — |
+| `Qwen2.5-Coder-0.5B-Instruct-Hybrid` | 0.828 | coding |
+| `Qwen2.5-Coder-1.5B-Instruct-Hybrid` | 2.17 | coding |
+| `Qwen2.5-Coder-1.5B-Instruct-NPU` | 2.25 | coding |
+| `Qwen2.5-Coder-7B-Instruct-Hybrid` | 8.65 | coding |
+| `Qwen2.5-Coder-7B-Instruct-NPU` | 8.83 | coding |
+| `Qwen3-1.7B-Hybrid` | 2.55 | reasoning |
+| `Qwen3-14B-Hybrid` | 16.5 | reasoning |
+| `Qwen3-4B-Hybrid` | 5.17 | reasoning |
+| `Qwen3-8B-Hybrid` | 9.42 | reasoning |
+| `SmolLM-135M-Instruct-Hybrid` | 0.232 | — |
+| `SmolLM2-135M-Instruct-Hybrid` | 0.233 | — |
+| `chatglm3-6b-Hybrid` | 6.9 | — |
+| `chatglm3-6b-NPU` | 7.04 | — |
+| `gemma-2-2b-Hybrid` | 4.04 | — |
+| `gpt-oss-20b-NPU` | 13.4 | — |
+
+#### `sd-cpp` — StableDiffusion.cpp (12 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Flux-2-Klein-4B` | 16.1 | image, edit |
+| `Flux-2-Klein-9B-GGUF` | 19.0 | image, edit |
+| `Qwen-Image-2512-GGUF` | 19.4 | image |
+| `Qwen-Image-GGUF` | 18.2 | image |
+| `RealESRGAN-x4plus` | 0.064 | upscaling, image |
+| `RealESRGAN-x4plus-anime` | 0.017 | upscaling, image |
+| `SD-1.5` | 7.7 | image |
+| `SD-Turbo` | 5.21 | image |
+| `SD-Turbo-GGUF` | 2.02 | image |
+| `SDXL-Base-1.0` | 6.94 | image |
+| `SDXL-Turbo` | 6.94 | image |
+| `Z-Image-Turbo` | 20.7 | image |
+
+#### `vllm` — vLLM ROCm (experimental) (4 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Qwen3.5-0.8B-FP16-vLLM` | 1.77 | reasoning |
+| `Qwen3.5-2B-FP16-vLLM` | 4.57 | reasoning, tool-calling |
+| `Qwen3.5-4B-FP16-vLLM` | 9.34 | reasoning, hot, tool-calling |
+| `Qwen3.5-9B-FP16-vLLM` | 19.3 | reasoning, tool-calling |
+
+#### `whispercpp` — Whisper.cpp (6 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Whisper-Base` | 0.148 | transcription, realtime-transcription |
+| `Whisper-Large-v3` | 3.1 | transcription, realtime-transcription |
+| `Whisper-Large-v3-Turbo` | 1.62 | transcription, realtime-transcription, hot |
+| `Whisper-Medium` | 1.53 | transcription, realtime-transcription |
+| `Whisper-Small` | 0.488 | transcription, realtime-transcription |
+| `Whisper-Tiny` | 0.075 | transcription, realtime-transcription |
+<!-- END GENERATED: backend-models -->
diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md
index 766744ea8..5835b1fae 100644
--- a/docs/dev/contribute.md
+++ b/docs/dev/contribute.md
@@ -23,6 +23,10 @@ Lemonade's roadmap is defined by a set of [working groups](./working-groups/READ
 
 Not sure what to work on? Come to the feature-requests and troubleshooting channels on the Discord and see what people need!
 
+### Adding a Backend
+
+Inference backends are self-describing: a backend is a descriptor (plain data) plus a server class, and everything else (router, CLI, `/system-info`, docs) is derived from it. See [Adding a backend](./adding-a-backend.md) for the full contract and a minimal example.
+
 ### Issues
 
 Issues are a great way to document a bug or feature request. However, Lemonade is a community-driven project and you still need to find someone to implement your issue. It is highly recommended that you bring your issue to the [Lemonade discord community](https://discord.gg/5xXzkMu8Zk) and connect with a contributor who wants to implement it.
diff --git a/docs/dev/getting-started.md b/docs/dev/getting-started.md
index b8e487c4c..ef1769059 100644
--- a/docs/dev/getting-started.md
+++ b/docs/dev/getting-started.md
@@ -625,6 +625,7 @@ Internal endpoints accept connections from any address, so first-party clients o
 | `POST` | `/internal/shutdown` | Unloads all models and shuts down the server |
 | `POST` | `/internal/set` | Unified config setter (see below) |
 | `GET`  | `/internal/config` | Returns the full runtime config snapshot |
+| `GET`  | `/internal/config/defaults` | Returns the canonical default config (factory defaults) |
 | `POST` | `/internal/cleanup-cache` | Cleans up orphaned files in the Hugging Face cache |
 | `POST` | `/internal/pin` | Pin or unpin a loaded model |
 
@@ -676,6 +677,15 @@ Returns the full runtime configuration as a flat JSON object containing all serv
 curl http://localhost:13305/internal/config
 ```
 
+#### `GET /internal/config/defaults`
+
+Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or deployment overrides. The per-recipe sections come from the backend descriptors (each descriptor's `config_defaults()`), making this the authoritative source of the factory defaults. `docs/tools/gen_backend_boilerplate.py` reads this endpoint to regenerate the committed `src/cpp/resources/defaults.json`, and a CI `--check` fails if that file drifts from the descriptors.
+
+**Example:**
+```bash
+curl http://localhost:13305/internal/config/defaults
+```
+
 ### Dependencies
 
 All dependencies are automatically fetched by CMake via FetchContent:
diff --git a/docs/embeddable/runtime.md b/docs/embeddable/runtime.md
index a50b8c4af..983038e95 100644
--- a/docs/embeddable/runtime.md
+++ b/docs/embeddable/runtime.md
@@ -114,6 +114,7 @@ Your app can manage its `lemond` instance at runtime by using `/internal` endpoi
 |--------|------|-------------|
 | `POST` | `/internal/set` | Unified config setter (see below) |
 | `GET`  | `/internal/config` | Returns the full runtime config snapshot |
+| `GET`  | `/internal/config/defaults` | Returns the canonical default config (factory defaults) |
 | `POST` | `/internal/pin` | Pin or unpin a loaded model (prevents auto-eviction) |
 
 The settings defined in `config.json` can all be changed at runtime without restarting `lemond` with the `/internal/set` endpoint. See the [Configuration Guide](../guide/configuration/README.md) for details on all settings.
@@ -137,6 +138,23 @@ Returns the full runtime configuration as a flat JSON object containing all serv
     curl http://localhost:8000/internal/config
     ```
 
+#### `GET /internal/config/defaults`
+
+Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or any deployment override. The per-recipe sections are derived from the backend descriptors, so this is the authoritative source for "what are the factory defaults." It is what `docs/tools/gen_backend_boilerplate.py` reads to regenerate `src/cpp/resources/defaults.json`.
+
+**Example:**
+=== "Windows (cmd.exe)"
+
+    ```cmd
+    curl http://localhost:8000/internal/config/defaults
+    ```
+
+=== "Linux (bash)"
+
+    ```bash
+    curl http://localhost:8000/internal/config/defaults
+    ```
+
 #### `POST /internal/set`
 
 Accepts a JSON object with one or more keys to update atomically. Returns `{"status":"success","updated":{...}}` on success, or `400` with an error message on validation failure.
diff --git a/docs/guide/cli.md b/docs/guide/cli.md
index 50d388bbb..53ed5f1f5 100644
--- a/docs/guide/cli.md
+++ b/docs/guide/cli.md
@@ -325,44 +325,56 @@ The following options apply to all model loads:
 
 The following options are available depending on the recipe being used:
 
-#### Llama.cpp (`llamacpp` recipe)
+<!-- BEGIN GENERATED: cli-recipe-options -->
+#### Llama.cpp GPU (`llamacpp` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--ctx-size SIZE` | Context size for the model | `4096` |
+| `--ctx-size SIZE` | Context size for the model | auto |
 | `--llamacpp BACKEND` | LlamaCpp backend to use | Auto-detected |
-| `--llamacpp-device DEVICE` | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | (empty) |
-| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server (must not conflict with managed args) | `""` |
+| `--llamacpp-device DEVICES` | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | `""` |
+| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server | `""` |
 
-#### FLM (`flm` recipe)
+#### Whisper.cpp (`whispercpp` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--ctx-size SIZE` | Context size for the model | `4096` |
+| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected |
+| `--whispercpp-args ARGS` | Custom arguments to pass to whisper-server | `""` |
 
-#### RyzenAI LLM (`ryzenai-llm` recipe)
+#### Moonshine (`moonshine` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--ctx-size SIZE` | Context size for the model | `4096` |
+| `--moonshine-args ARGS` | Custom arguments to pass to moonshine-server | `""` |
 
-#### SD.cpp (`sd-cpp` recipe)
+#### StableDiffusion.cpp (`sd-cpp` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--sdcpp BACKEND` | SD.cpp backend to use (`cpu` for CPU, `rocm` for AMD GPU) | Auto-detected |
+| `--sdcpp BACKEND` | SD.cpp backend to use | Auto-detected |
 | `--sdcpp-args ARGS` | Custom arguments to pass to sd-server (must not conflict with managed args) | `""` |
-| `--steps N` | Number of inference steps for image generation | `20` |
-| `--cfg-scale SCALE` | Classifier-free guidance scale for image generation | `7.0` |
-| `--width PX` | Image width in pixels | `512` |
-| `--height PX` | Image height in pixels | `512` |
 
-#### Whisper.cpp (`whispercpp` recipe)
+#### FastFlowLM NPU (`flm` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected |
+| `--ctx-size SIZE` | Context size for the model | auto |
+
+#### Ryzen AI LLM (`ryzenai-llm` recipe)
 
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--ctx-size SIZE` | Context size for the model | auto |
+
+#### vLLM ROCm (experimental) (`vllm` recipe)
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--ctx-size SIZE` | Context size for the model | auto |
+| `--vllm BACKEND` | vLLM backend to use | Auto-detected |
+| `--vllm-args ARGS` | Custom arguments to pass to vllm-server | `""` |
+<!-- END GENERATED: cli-recipe-options -->
 **Notes:**
 - Unspecified options will use the backend's default values
 - Backend options (`--llamacpp`, `--sdcpp`, `--whispercpp`) are auto-detected based on system capabilities
diff --git a/docs/guide/configuration/README.md b/docs/guide/configuration/README.md
index 93977148c..2a388dc8f 100644
--- a/docs/guide/configuration/README.md
+++ b/docs/guide/configuration/README.md
@@ -31,68 +31,81 @@ Values set in the user's `config.json` always take precedence over these seeded
 
 ### Example config.json
 
+<!-- BEGIN GENERATED: config-example -->
 ```json
 {
-  "config_version": 1,
-  "port": 13305,
-  "host": "localhost",
-  "log_level": "info",
-  "global_timeout": 600,
-  "max_loaded_models": 1,
-  "no_broadcast": false,
-  "extra_models_dir": "",
-  "models_dir": "auto",
+  "cloud_providers": [],
+  "config_version": 2,
   "ctx_size": -1,
-  "offline": false,
-  "no_fetch_executables": false,
   "disable_model_filtering": false,
   "enable_dgpu_gtt": false,
-  "rocm_channel": "stable",
+  "extra_models_dir": "",
+  "flm": {
+    "args": ""
+  },
+  "global_timeout": 600,
+  "host": "localhost",
+  "kokoro": {
+    "cpu_bin": "builtin"
+  },
   "llamacpp": {
-    "backend": "auto",
     "args": "",
-    "vulkan_args": "",
-    "rocm_args": "",
+    "backend": "auto",
     "cpu_args": "",
-	"device": "",
-    "prefer_system": false,
+    "cpu_bin": "builtin",
+    "cuda_bin": "builtin",
+    "prefer_system": true,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin",
-    "cpu_bin": "builtin"
+    "vulkan_args": "",
+    "vulkan_bin": "builtin"
   },
-  "whispercpp": {
-    "backend": "auto",
+  "log_level": "info",
+  "max_loaded_models": 1,
+  "models_dir": "auto",
+  "moonshine": {
     "args": "",
     "cpu_args": "",
-    "npu_args": "",
-    "cpu_bin": "builtin",
-    "npu_bin": "builtin"
+    "cpu_bin": "builtin"
+  },
+  "no_broadcast": false,
+  "no_fetch_executables": false,
+  "offline": false,
+  "port": 13305,
+  "rocm_channel": "stable",
+  "ryzenai": {
+    "server_bin": "builtin"
   },
   "sdcpp": {
-    "backend": "auto",
     "args": "",
-    "cpu_args": "",
-    "rocm_args": "",
-    "vulkan_args": "",
-    "steps": 20,
+    "backend": "auto",
     "cfg_scale": 7.0,
-    "width": 512,
-    "height": 512,
+    "cpu_args": "",
     "cpu_bin": "builtin",
+    "height": 512,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin"
+    "steps": 20,
+    "vulkan_args": "",
+    "vulkan_bin": "builtin",
+    "width": 512
   },
-  "flm": {
+  "vllm": {
     "args": "",
+    "backend": "auto"
   },
-  "ryzenai": {
-    "server_bin": "builtin"
-  },
-  "kokoro": {
-    "cpu_bin": "builtin"
+  "websocket_port": "auto",
+  "whispercpp": {
+    "args": "",
+    "backend": "auto",
+    "cpu_args": "",
+    "cpu_bin": "builtin",
+    "npu_args": "",
+    "npu_bin": "builtin"
   }
 }
 ```
+<!-- END GENERATED: config-example -->
 
 ### Settings Reference
 
diff --git a/docs/guide/configuration/custom-models.md b/docs/guide/configuration/custom-models.md
index c3e770442..5a7dbd878 100644
--- a/docs/guide/configuration/custom-models.md
+++ b/docs/guide/configuration/custom-models.md
@@ -71,7 +71,7 @@ Supported registration flags:
 | Flag | Description |
 |------|-------------|
 | `--checkpoint TYPE CHECKPOINT` | Add a checkpoint entry. Repeat for multi-file models such as `main` + `mmproj` or `main` + `vae`. |
-| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: `llamacpp`, `flm`, `ryzenai-llm`, `vllm`, `whispercpp`, `moonshine`, `sd-cpp`, `kokoro`, `collection.omni`. |
+| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: <!-- BEGIN GENERATED: recipe-values -->`llamacpp`, `whispercpp`, `moonshine`, `kokoro`, `sd-cpp`, `flm`, `ryzenai-llm`, `vllm`, `collection.omni`<!-- END GENERATED: recipe-values -->. |
 | `--label LABEL` | Add a label to the new model. Repeatable. Valid labels include `coding`, `embeddings`, `hot`, `mtp`, `reasoning`, `reranking`, `tool-calling`, `vision`. |
 | `--components MODEL [MODEL ...]` | Components for an omni collection (see below). Use with `--recipe collection.omni`. |
 
diff --git a/docs/guide/configuration/multi-model.md b/docs/guide/configuration/multi-model.md
index 30ed840d5..db9944ff9 100644
--- a/docs/guide/configuration/multi-model.md
+++ b/docs/guide/configuration/multi-model.md
@@ -22,7 +22,9 @@ Each type has its own independent LRU cache, all sharing the same slot limit set
 
 ## Device Constraints
 
-- **NPU Exclusivity:** `flm`, `ryzenai-llm`, and `whispercpp` are mutually exclusive on the NPU.
+<!-- BEGIN GENERATED: npu-exclusivity -->
+- **NPU Exclusivity:** `whispercpp`, `flm`, and `ryzenai-llm` are mutually exclusive on the NPU.
+<!-- END GENERATED: npu-exclusivity -->
     - Loading a model from one of these backends will automatically evict all NPU models from the other backends.
     - `flm` supports loading 1 ASR model, 1 LLM, and 1 embedding model on the NPU at the same time.
     - `ryzenai-llm` supports loading exactly 1 LLM, which uses the entire NPU.
diff --git a/docs/tools/gen_backend_boilerplate.py b/docs/tools/gen_backend_boilerplate.py
new file mode 100644
index 000000000..b4e8ac8d9
--- /dev/null
+++ b/docs/tools/gen_backend_boilerplate.py
@@ -0,0 +1,621 @@
+#!/usr/bin/env python3
+"""Generate backend boilerplate (docs + config defaults) from the descriptors.
+
+The C++ backend descriptors (src/cpp/include/lemon/backends/<stem>/<stem>.h) are
+the single source of truth for what each backend is. This script boots a `lemond`
+server and regenerates the committed artifacts that would otherwise be
+hand-maintained:
+
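+  * Marker-delimited regions of the curated docs (the backend reference,
+    README.md, the CLI/configuration guides and docs/assets/models.js), from
+    ``/system-info`` ``recipes``, ``/internal/config`` and ``server_models.json``.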
+  * The whole of ``src/cpp/resources/defaults.json``, mirrored verbatim from
+    ``/internal/config/defaults`` (its per-recipe blocks come from each
+    descriptor's ``config_defaults()``).
+
+A CI step runs it with ``--check`` and fails if any committed artifact drifts.
+
+Usage:
+    python docs/tools/gen_backend_boilerplate.py [--lemond PATH] [--check]
+
+``--check`` regenerates in memory and exits non-zero if any on-disk artifact
+differs, without modifying it. For the docs, only the regions between::
+
+    <!-- BEGIN GENERATED: <id> -->
+    <!-- END GENERATED: <id> -->
+
+are rewritten; surrounding prose is left untouched.
+"""
+
+import argparse
+import json
+import re
+import socket
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.request
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SERVER_MODELS = REPO_ROOT / "src" / "cpp" / "resources" / "server_models.json"
+TARGET_DOC = REPO_ROOT / "docs" / "dev" / "backends-reference.md"
+
+
+def free_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))
+        return s.getsockname()[1]
+
+
+def find_lemond(explicit: str | None) -> Path:
+    if explicit:
+        p = Path(explicit)
+        if not p.exists():
+            sys.exit(f"lemond not found at {p}")
+        return p
+    for candidate in [
+        REPO_ROOT / "build" / "lemond",
+        REPO_ROOT / "build" / "lemond.exe",
+    ]:
+        if candidate.exists():
+            return candidate
+    sys.exit("Could not find a built lemond (looked in build/). Pass --lemond PATH.")
+
+
+class Lemond:
+    """Boots a throwaway lemond on a free port with an isolated cache dir."""
+
+    def __init__(self, binary: Path):
+        self.binary = binary
+        self.port = free_port()
+        self._cache = tempfile.TemporaryDirectory(prefix="lemond-docs-")
+        self._proc: subprocess.Popen | None = None
+
+    def __enter__(self):
+        self._proc = subprocess.Popen(
+            [str(self.binary), self._cache.name, "--port", str(self.port)],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+        )
+        deadline = time.time() + 60
+        while time.time() < deadline:
+            try:
+                self._get("/api/v1/health")
+                return self
+            except Exception:
+                if self._proc.poll() is not None:
+                    sys.exit("lemond exited before becoming ready")
+                time.sleep(0.5)
+        self.__exit__(None, None, None)
+        sys.exit("lemond did not become ready within 60s")
+
+    def __exit__(self, *exc):
+        if self._proc and self._proc.poll() is None:
+            try:
+                self._get("/internal/shutdown", timeout=2)
+            except Exception:
+                pass
+            try:
+                self._proc.wait(timeout=10)
+            except Exception:
+                self._proc.kill()
+        self._cache.cleanup()
+
+    def _get(self, path: str, timeout: float = 5):
+        url = f"http://127.0.0.1:{self.port}{path}"
+        with urllib.request.urlopen(url, timeout=timeout) as r:
+            return r.read()
+
+    def system_info(self) -> dict:
+        return json.loads(self._get("/api/v1/system-info", timeout=30))
+
+    def config(self) -> dict:
+        return json.loads(self._get("/internal/config", timeout=10))
+
+    def config_defaults_text(self) -> str:
+        # Verbatim text of the canonical default config (the server's own
+        # serialization) so the committed resources/defaults.json is byte-stable.
+        text = self._get("/internal/config/defaults", timeout=10).decode("utf-8")
+        return text if text.endswith("\n") else text + "\n"
+
+
+def md_escape(text: str) -> str:
+    return str(text).replace("|", "\\|")
+
+
+MODALITY_ORDER = [
+    "Text generation",
+    "Speech-to-text",
+    "Text-to-speech",
+    "Image generation",
+]
+OS_LABEL = {"windows": "Windows", "linux": "Linux", "macos": "macOS"}
+OS_ORDER = ["windows", "linux", "macos"]
+
+
+def _fmt_os(os_set) -> str:
+    return ", ".join(OS_LABEL.get(o, o) for o in OS_ORDER if o in os_set)
+
+
+def _code_devices(summary: str) -> str:
+    # Light formatting: render bare arch tokens as <code>, matching the README style.
+    summary = re.sub(r"\bx86_64\b", "<code>x86_64</code>", summary)
+    summary = re.sub(r"\barm64\b", "<code>arm64</code>", summary)
+    return summary
+
+
+def _ordered(recipes: dict) -> list:
+    # Recipes in descriptor registry order (stable, deterministic doc rendering).
+    return sorted(recipes.items(), key=lambda kv: kv[1].get("order", 999))
+
+
+def render_readme_matrix(recipes: dict) -> str:
+    # Group descriptor-backed recipes by modality, in descriptor registry order.
+    by_mod: dict[str, list] = {m: [] for m in MODALITY_ORDER}
+    for recipe, info in _ordered(recipes):
+        mod = info.get("modality")
+        if not mod or mod not in by_mod:
+            continue
+        # Merge support rows sharing a (backend, device summary); union their OS.
+        merged: list[dict] = []
+        seen: dict[tuple, dict] = {}
+        for row in info.get("support", []):
+            key = (row["backend"], row.get("device_summary", ""))
+            if key in seen:
+                seen[key]["os"] |= set(row.get("os", []))
+            else:
+                d = {
+                    "backend": row["backend"],
+                    "summary": row.get("device_summary", ""),
+                    "os": set(row.get("os", [])),
+                }
+                seen[key] = d
+                merged.append(d)
+        if merged:
+            by_mod[mod].append((recipe, info, merged))
+
+    out = [
+        "<table>",
+        "  <thead>",
+        "    <tr>",
+        "      <th>Modality</th>",
+        "      <th>Engine</th>",
+        "      <th>Backend</th>",
+        "      <th>Device</th>",
+        "      <th>OS</th>",
+        "    </tr>",
+        "  </thead>",
+        "  <tbody>",
+    ]
+    for mod in MODALITY_ORDER:
+        recipes_in = by_mod[mod]
+        if not recipes_in:
+            continue
+        mod_span = sum(len(m) for _, _, m in recipes_in)
+        first_mod = True
+        for recipe, info, merged in recipes_in:
+            engine = f"<code>{recipe}</code>" + (
+                " (experimental)" if info.get("experimental") else ""
+            )
+            first_recipe = True
+            for d in merged:
+                out.append("    <tr>")
+                if first_mod:
+                    out.append(
+                        f'      <td rowspan="{mod_span}"><strong>{mod}</strong></td>'
+                    )
+                    first_mod = False
+                if first_recipe:
+                    out.append(f'      <td rowspan="{len(merged)}">{engine}</td>')
+                    first_recipe = False
+                out.append(f'      <td><code>{d["backend"]}</code></td>')
+                out.append(f"      <td>{_code_devices(d['summary'])}</td>")
+                out.append(f"      <td>{_fmt_os(d['os'])}</td>")
+                out.append("    </tr>")
+    out += ["  </tbody>", "</table>"]
+    return "\n".join(out)
+
+
+def _cli_default(opt: dict) -> str:
+    d = opt.get("default")
+    if opt.get("type_name") == "BACKEND" and d == "":
+        return "Auto-detected"
+    if isinstance(d, str):
+        return '`""`' if d == "" else f"`{d}`"
+    if isinstance(d, bool):
+        return f"`{str(d).lower()}`"
+    if d == -1:
+        return "auto"
+    return f"`{d}`"
+
+
+def render_cli_recipe_options(recipes: dict) -> str:
+    # Per-recipe load options, exactly as the CLI registers them from descriptors.
+    # Recipes with no CLI options (kokoro, cloud) are omitted.
+    blocks: list[str] = []
+    for recipe, info in _ordered(recipes):
+        cli_opts = [o for o in info.get("options", []) if o.get("cli_flag")]
+        if not info.get("uses_ctx_size") and not cli_opts:
+            continue
+        blocks.append(f"#### {info.get('display_name', recipe)} (`{recipe}` recipe)\n")
+        blocks.append("| Option | Description | Default |")
+        blocks.append("|--------|-------------|---------|")
+        if info.get("uses_ctx_size"):
+            blocks.append("| `--ctx-size SIZE` | Context size for the model | auto |")
+        for o in cli_opts:
+            blocks.append(
+                "| `{flag} {t}` | {h} | {d} |".format(
+                    flag=o["cli_flag"],
+                    t=o.get("type_name", ""),
+                    h=md_escape(o.get("help", "")),
+                    d=_cli_default(o),
+                )
+            )
+        blocks.append("")
+    return "\n".join(blocks).rstrip()
+
+
+def _oxford(items: list) -> str:
+    items = [f"`{i}`" for i in items]
+    if len(items) <= 1:
+        return "".join(items)
+    if len(items) == 2:
+        return f"{items[0]} and {items[1]}"
+    return ", ".join(items[:-1]) + f", and {items[-1]}"
+
+
+def _js_to_title(recipe: str) -> str:
+    # Mirror models.js toTitle(): the website's fallback for unlisted display names.
+    return re.sub(
+        r"\b\w",
+        lambda m: m.group(0).upper(),
+        recipe.replace("_", " ").replace("-", " "),
+    )
+
+
+def _js_key(recipe: str) -> str:
+    # Bare identifier if it's a valid JS key, else quoted (matches models.js style).
+    return recipe if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", recipe) else f"'{recipe}'"
+
+
+def render_models_js(recipes: dict) -> str:
+    # RECIPE_PRIORITY: recipes with web_priority > 0, in that order (legacy oga-*
+    # recipes have no descriptor and are intentionally dropped).
+    prioritized = sorted(
+        (r for r, i in recipes.items() if i.get("web_priority", 0) > 0),
+        key=lambda r: recipes[r]["web_priority"],
+    )
+    pri_lines = ",\n".join(f"  '{r}'" for r in prioritized)
+
+    # RECIPE_DISPLAY_NAMES: only recipes whose name differs from the JS toTitle()
+    # fallback (matching the curated map, which omits redundant entries).
+    name_lines = []
+    for r, info in _ordered(recipes):
+        name = info.get("web_display_name") or info.get("display_name", r)
+        if name and name != _js_to_title(r):
+            name_lines.append(f"  {_js_key(r)}: '{name}'")
+    names = ",\n".join(name_lines)
+
+    return (
+        f"const RECIPE_PRIORITY = [\n{pri_lines}\n];\n\n"
+        f"const RECIPE_DISPLAY_NAMES = {{\n{names}\n}};"
+    )
+
+
+def render_config_example(config: dict) -> str:
+    # The canonical config.json, straight from a fresh lemond's /internal/config.
+    # `port` is the only environment-dependent field (it reflects the launch port);
+    # normalize it to the documented default.
+    cfg = dict(config)
+    cfg["port"] = 13305
+    return "```json\n" + json.dumps(cfg, indent=2) + "\n```"
+
+
+def render_recipe_values(recipes: dict) -> str:
+    # Inline list of recipe values for `--recipe`, plus the collection orchestrator.
+    rs = [r for r, _ in _ordered(recipes)] + ["collection.omni"]
+    return ", ".join(f"`{r}`" for r in rs)
+
+
+def render_npu_exclusivity(recipes: dict) -> str:
+    npu = [
+        r
+        for r, info in _ordered(recipes)
+        if any(
+            row.get("backend") == "npu"
+            or any(d.get("device") == "amd_npu" for d in row.get("devices", []))
+            for row in info.get("support", [])
+        )
+    ]
+    return f"- **NPU Exclusivity:** {_oxford(npu)} are mutually exclusive on the NPU."
+
+
+def render_overview(recipes: dict) -> str:
+    rows = [
+        "| Recipe | Name | Selectable backend | Uses ctx_size | Backends |",
+        "|--------|------|--------------------|---------------|----------|",
+    ]
+    for recipe in sorted(recipes):
+        info = recipes[recipe]
+        if "display_name" not in info:
+            continue  # not a descriptor-backed recipe on this run
+        backends = sorted({b["backend"] for b in info.get("support", [])}) or sorted(
+            info.get("backends", {})
+        )
+        rows.append(
+            "| `{r}` | {n} | {s} | {c} | {b} |".format(
+                r=recipe,
+                n=md_escape(info.get("display_name", "")),
+                s="yes" if info.get("selectable_backend") else "no",
+                c="yes" if info.get("uses_ctx_size") else "no",
+                b=", ".join(backends) if backends else "—",
+            )
+        )
+    return "\n".join(rows)
+
+
+def render_support_matrix(recipes: dict) -> str:
+    rows = [
+        "| Recipe | Backend | OS | Device families |",
+        "|--------|---------|----|-----------------|",
+    ]
+    for recipe in sorted(recipes):
+        info = recipes[recipe]
+        for row in info.get("support", []):
+            fams = []
+            for d in row.get("devices", []):
+                f = d.get("families") or []
+                fams.append(d["device"] + (f" ({', '.join(f)})" if f else ""))
+            rows.append(
+                "| `{r}` | {b} | {o} | {d} |".format(
+                    r=recipe,
+                    b=row.get("backend", ""),
+                    o=", ".join(sorted(row.get("os", []))),
+                    d=md_escape("; ".join(fams)) if fams else "—",
+                )
+            )
+    return "\n".join(rows)
+
+
+def render_options(recipes: dict) -> str:
+    blocks = []
+    for recipe in sorted(recipes):
+        info = recipes[recipe]
+        opts = info.get("options")
+        if not opts:
+            continue
+        blocks.append(f"#### `{recipe}` — {info.get('display_name', recipe)}\n")
+        blocks.append("| Option | CLI flag | Type | Default | Description |")
+        blocks.append("|--------|----------|------|---------|-------------|")
+        if info.get("uses_ctx_size"):
+            blocks.append(
+                "| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |"
+            )
+        for o in opts:
+            blocks.append(
+                "| `{n}` | {f} | {t} | {d} | {h} |".format(
+                    n=o["name"],
+                    f=f"`{o['cli_flag']}`" if o.get("cli_flag") else "—",
+                    t=o.get("type_name", ""),
+                    d=md_escape(
+                        json.dumps(o.get("default"))
+                        if not isinstance(o.get("default"), str)
+                        else o.get("default") or '""'
+                    ),
+                    h=md_escape(o.get("help", "")),
+                )
+            )
+        blocks.append("")
+    return "\n".join(blocks).rstrip()
+
+
+def render_models(recipes: dict) -> str:
+    models = json.loads(SERVER_MODELS.read_text())
+    by_recipe: dict[str, list] = {}
+    for name, data in models.items():
+        if not isinstance(data, dict):
+            continue
+        by_recipe.setdefault(data.get("recipe", "(unspecified)"), []).append(
+            (name, data)
+        )
+    blocks = []
+    for recipe in sorted(by_recipe):
+        entries = sorted(by_recipe[recipe])
+        display = recipes.get(recipe, {}).get("display_name", recipe)
+        blocks.append(f"#### `{recipe}` — {display} ({len(entries)} models)\n")
+        blocks.append("| Model | Size (GB) | Labels |")
+        blocks.append("|-------|-----------|--------|")
+        for name, data in entries:
+            blocks.append(
+                "| `{n}` | {s} | {l} |".format(
+                    n=md_escape(name),
+                    s=data.get("size", ""),
+                    l=md_escape(", ".join(data.get("labels", []))) or "—",
+                )
+            )
+        blocks.append("")
+    return "\n".join(blocks).rstrip()
+
+
+DEFAULT_TEMPLATE = """# Backend reference
+
+<!-- This file is generated by docs/tools/gen_backend_boilerplate.py from the C++ backend
+descriptors. Do not edit the regions between the GENERATED markers by hand; run
+the generator instead. Prose outside the markers is preserved. -->
+
+## Backends
+
+<!-- BEGIN GENERATED: backends-overview -->
+<!-- END GENERATED: backends-overview -->
+
+## Support matrix
+
+<!-- BEGIN GENERATED: backends-matrix -->
+<!-- END GENERATED: backends-matrix -->
+
+## Recipe options
+
+<!-- BEGIN GENERATED: backend-options -->
+<!-- END GENERATED: backend-options -->
+
+## Models
+
+<!-- BEGIN GENERATED: backend-models -->
+<!-- END GENERATED: backend-models -->
+"""
+
+
+def apply_sections(text: str, sections: dict[str, str]) -> str:
+    for marker_id, body in sections.items():
+        # Accept HTML (`<!-- ... -->`) markers for Markdown and block (`/* ... */`)
+        # markers for code files like .js, so the same generator drives both.
+        mid = re.escape(marker_id)
+        begin = (
+            r"(<!-- BEGIN GENERATED: "
+            + mid
+            + r" -->|/\* BEGIN GENERATED: "
+            + mid
+            + r" \*/)"
+        )
+        end = (
+            r"(<!-- END GENERATED: "
+            + mid
+            + r" -->|/\* END GENERATED: "
+            + mid
+            + r" \*/)"
+        )
+        pattern = re.compile(begin + r".*?" + end, re.DOTALL)
+        m = pattern.search(text)
+        if not m:
+            sys.exit(f"Marker region '{marker_id}' not found in target doc")
+
+        # Inline regions (markers mid-line, e.g. inside a table cell) get no
+        # surrounding newlines; block regions are wrapped on their own lines.
+        inline = m.start() > 0 and text[m.start() - 1] != "\n"
+        # Escape backslashes and group-ref markers in the body for re.sub.
+        safe_body = body.replace("\\", "\\\\")
+        sep = "" if inline else "\n"
+        replacement = r"\1" + sep + safe_body + sep + r"\2"
+        text = pattern.sub(replacement, text)
+    return text
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    ap.add_argument("--lemond", help="Path to the built lemond binary")
+    ap.add_argument(
+        "--check", action="store_true", help="Fail if docs are stale; do not write"
+    )
+    args = ap.parse_args()
+
+    binary = find_lemond(args.lemond)
+    with Lemond(binary) as server:
+        info = server.system_info()
+        config = server.config()
+        defaults_text = server.config_defaults_text()
+    recipes = info.get("recipes", {})
+    if not recipes:
+        sys.exit("/system-info returned no recipes")
+    if not config:
+        sys.exit("/internal/config returned nothing")
+
+    # Each target doc maps marker IDs -> generated content. backends-reference.md
+    # is created from a template if missing; the others must already contain their
+    # markers (the regions were added to the curated docs by hand once).
+    targets: dict = {
+        TARGET_DOC: {
+            "sections": {
+                "backends-overview": render_overview(recipes),
+                "backends-matrix": render_support_matrix(recipes),
+                "backend-options": render_options(recipes),
+                "backend-models": render_models(recipes),
+            },
+            "template": DEFAULT_TEMPLATE,
+        },
+        REPO_ROOT
+        / "README.md": {
+            "sections": {"backends-matrix": render_readme_matrix(recipes)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "configuration"
+        / "multi-model.md": {
+            "sections": {"npu-exclusivity": render_npu_exclusivity(recipes)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "cli.md": {
+            "sections": {"cli-recipe-options": render_cli_recipe_options(recipes)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "configuration"
+        / "custom-models.md": {
+            "sections": {"recipe-values": render_recipe_values(recipes)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "configuration"
+        / "README.md": {
+            "sections": {"config-example": render_config_example(config)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "assets"
+        / "models.js": {
+            "sections": {"models-js-recipes": render_models_js(recipes)},
+        },
+    }
+
+    # Whole-file generated artifacts (not marker-delimited): resources/defaults.json
+    # is the canonical default config, mirrored verbatim from GET
+    # /internal/config/defaults (per-recipe blocks come from the descriptors).
+    raw_targets: dict = {
+        REPO_ROOT / "src" / "cpp" / "resources" / "defaults.json": defaults_text,
+    }
+
+    stale = []
+    for path, content in raw_targets.items():
+        rel = path.relative_to(REPO_ROOT)
+        if args.check:
+            if not path.exists() or path.read_text() != content:
+                stale.append(str(rel))
+        else:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(content)
+            print(f"Wrote {rel}")
+
+    for path, spec in targets.items():
+        rel = path.relative_to(REPO_ROOT)
+        current = path.read_text() if path.exists() else spec.get("template", "")
+        if not current:
+            sys.exit(f"{rel} is missing and has no template")
+        updated = apply_sections(current, spec["sections"])
+        if args.check:
+            if not path.exists() or path.read_text() != updated:
+                stale.append(str(rel))
+        else:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(updated)
+            print(f"Wrote {rel}")
+
+    if args.check:
+        if stale:
+            sys.exit(
+                "Stale generated files: "
+                + ", ".join(stale)
+                + "\nRun: python docs/tools/gen_backend_boilerplate.py"
+            )
+        print("All generated files are up to date.")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/mkdocs.yml b/mkdocs.yml
index 18201bba3..73ecc9981 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -83,6 +83,8 @@ nav:
     - Contribute: dev/contribute.md
     - Documentation Guide: dev/documentation.md
     - C++: dev/getting-started.md
+    - Adding a Backend: dev/adding-a-backend.md
+    - Backends Reference: dev/backends-reference.md
     - Desktop App: dev/app.md
     - Web UI: dev/web-ui.md
     - Lemonade Omni Models: dev/lemonade-omni.md
diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts
index d654c635a..8f1fdbb1f 100644
--- a/src/app/src/renderer/utils/recipeNames.ts
+++ b/src/app/src/renderer/utils/recipeNames.ts
@@ -4,15 +4,28 @@ export const isCollectionRecipe = (recipe?: string): boolean => {
   return recipe === COLLECTION_OMNI_MODEL_RECIPE;
 };
 
+// Recipe display names. Hardware-backend names (llamacpp, whispercpp, sd-cpp, …)
+// are populated at runtime from /system-info's `recipes[].display_name`, which is
+// generated from the C++ backend descriptors — the single source of truth. Only
+// recipes NOT surfaced by /system-info's hardware support matrix are seeded here:
+// the collection orchestrator (not a backend) and cloud offload (a backend with
+// no local support rows).
 export const RECIPE_DISPLAY_NAMES: Record<string, string> = {
   [COLLECTION_OMNI_MODEL_RECIPE]: 'Lemonade',
-  'flm': 'FastFlowLM NPU',
-  'llamacpp': 'Llama.cpp GPU',
-  'ryzenai-llm': 'Ryzen AI LLM',
-  'whispercpp': 'Whisper.cpp',
-  'moonshine': 'Moonshine',
-  'sd-cpp': 'StableDiffusion.cpp',
-  'kokoro': 'Kokoro',
   'cloud': 'Cloud',
-  'vllm': 'vLLM ROCm (experimental)',
+};
+
+// Merge display names from a /system-info `recipes` object into RECIPE_DISPLAY_NAMES.
+// Called whenever system info is (re)fetched so the map reflects the descriptors.
+export const updateRecipeDisplayNames = (
+  recipes?: Record<string, { display_name?: string }>
+): void => {
+  if (!recipes) {
+    return;
+  }
+  for (const [recipe, info] of Object.entries(recipes)) {
+    if (info && typeof info.display_name === 'string' && info.display_name) {
+      RECIPE_DISPLAY_NAMES[recipe] = info.display_name;
+    }
+  }
 };
diff --git a/src/app/src/renderer/utils/systemData.ts b/src/app/src/renderer/utils/systemData.ts
index 63f1d9427..fcd3b8f92 100644
--- a/src/app/src/renderer/utils/systemData.ts
+++ b/src/app/src/renderer/utils/systemData.ts
@@ -39,8 +39,23 @@ export interface Recipes {
   [recipeName: string]: Recipe;
 }
 
+// Per-recipe option schema, generated from the C++ backend descriptor.
+export interface RecipeOptionSchema {
+  name: string;
+  cli_flag: string;
+  default: unknown;
+  type_name: string;
+  help: string;
+  group: string;
+}
+
 export interface Recipe {
   default_backend?: string;
+  // Descriptor metadata (generated from the C++ backend descriptors).
+  display_name?: string;
+  selectable_backend?: boolean;
+  uses_ctx_size?: boolean;
+  options?: RecipeOptionSchema[];
   backends: {
     [backendName: string]: BackendInfo;
   };
@@ -75,6 +90,11 @@ const fetchSystemInfoFromAPI = async (): Promise<SystemData> => {
     const data = await response.json();
     const systemInfo: SystemInfo = { ...data };
 
+    // Seed recipe display names from the descriptor-generated /system-info data
+    // so the UI doesn't hardcode per-recipe names.
+    const { updateRecipeDisplayNames } = await import('./recipeNames');
+    updateRecipeDisplayNames(systemInfo.recipes);
+
     return { info: systemInfo };
   } catch (error) {
     console.error('Failed to fetch supported inference data from API:', error);
diff --git a/src/cpp/cli/CMakeLists.txt b/src/cpp/cli/CMakeLists.txt
index bd58c60ba..b6a0f26d6 100644
--- a/src/cpp/cli/CMakeLists.txt
+++ b/src/cpp/cli/CMakeLists.txt
@@ -97,6 +97,10 @@ set(COMMON_SOURCES
     agent_config_file.cpp
     opencode_profile.cpp
     pi_profile.cpp
+    # Self-describing backend descriptors (plain data; CLI-safe). Lets the CLI
+    # read recipe options/flags from descriptors without linking server classes.
+    # The matching factories (create()) are server-only and NOT listed here.
+    ${LEMON_BACKEND_DESCRIPTOR_SOURCES}
 )
 
 # Add platform-specific sources
diff --git a/src/cpp/cli/bench.cpp b/src/cpp/cli/bench.cpp
index 6cf1b1a5b..280b26d33 100644
--- a/src/cpp/cli/bench.cpp
+++ b/src/cpp/cli/bench.cpp
@@ -1,5 +1,6 @@
 #include "lemon_cli/bench.h"
 #include "lemon_cli/lemonade_client.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include <CLI/CLI.hpp>
 #include <lemon/utils/path_utils.h>
 #include <algorithm>
@@ -406,9 +407,10 @@ bool load_model_for_backend(lemonade::LemonadeClient& client,
         request_body["model_name"] = model;
         request_body["save_options"] = false;
 
-        // For llamacpp recipe, pass backend override
-        if (recipe == "llamacpp") {
-            request_body["llamacpp_backend"] = backend;
+        // For recipes that expose a selectable backend, pass the override.
+        if (const auto* desc = lemon::backends::descriptor_for(recipe);
+            desc && desc->selectable_backend) {
+            request_body[desc->effective_config_section() + "_backend"] = backend;
         }
 
         if (ctx_size > 0) {
diff --git a/src/cpp/cli/hf_pull.cpp b/src/cpp/cli/hf_pull.cpp
index 8ed30ca0a..f5a84c051 100644
--- a/src/cpp/cli/hf_pull.cpp
+++ b/src/cpp/cli/hf_pull.cpp
@@ -255,11 +255,12 @@ int hf_pull_flow(lemonade::LemonadeClient& client,
 
     const auto& variants = variants_response["variants"];
     std::string recipe = variants_response.value("recipe", std::string("llamacpp"));
+    std::string repo_kind = variants_response.value("repo_kind", std::string("gguf"));
 
-    // Non-llamacpp recipes (currently: ONNX RyzenAI) ship as a single
-    // installable unit — no per-variant menu, no `:variant` checkpoint
-    // suffix, no `-VARIANT` model name tail.
-    if (recipe != "llamacpp") {
+    // Non-GGUF repos (currently: ONNX RyzenAI) ship as a single installable
+    // unit — no per-variant menu, no `:variant` checkpoint suffix, no
+    // `-VARIANT` model name tail. (Collections returned earlier above.)
+    if (repo_kind != "gguf") {
         if (!variant.empty()) {
             std::cerr << "warning: variant '" << variant << "' ignored for "
                       << recipe << " checkpoints" << std::endl;
diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
new file mode 100644
index 000000000..03ca71e69
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -0,0 +1,128 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <nlohmann/json.hpp>
+#include "lemon/model_types.h"
+#include "lemon/recipe_backend_def.h"
+
+namespace lemon {
+
+// A single declarative configuration knob a backend exposes. The same list
+// drives config.json defaults, CLI flag registration, and load-time option
+// resolution, so they can never drift apart.
+struct BackendOption {
+    std::string name;                 // option key, e.g. "vllm_args"
+    std::string cli_flag;             // CLI flag, e.g. "--vllm-args" ("" = not a CLI flag)
+    nlohmann::json default_value;     // default value when the option is unset
+    std::string type_name;            // "ARGS" | "SIZE" | "BACKEND" | "BOOL"
+    std::string help;                 // CLI help text
+    std::string group;                // CLI help group, e.g. "General Options"
+};
+
+// How a backend shares the accelerator. Replaces the router's recipe-string
+// checks for NPU exclusivity and LRU slot accounting.
+enum class SlotPolicy {
+    Standard,      // counts toward the LRU slots, no device exclusivity (llamacpp, sd-cpp)
+    ExclusiveNpu,  // evict ALL npu servers before loading (ryzenai-llm, whispercpp-npu)
+    CoexistByType, // one per model type, evicts exclusive-npu peers (flm)
+    Unmetered      // never counts toward slots, never auto-evicted (cloud)
+};
+
+// How an installed backend version is compared against the expected pin.
+enum class VersionPolicy {
+    Exact,    // installed must match the expected version
+    AtLeast   // installed >= expected is acceptable (system-managed packages, e.g. flm)
+};
+
+inline const char* slot_policy_to_string(SlotPolicy p) {
+    switch (p) {
+        case SlotPolicy::Standard:      return "standard";
+        case SlotPolicy::ExclusiveNpu:  return "exclusive_npu";
+        case SlotPolicy::CoexistByType: return "coexist_by_type";
+        case SlotPolicy::Unmetered:     return "unmetered";
+    }
+    return "standard";
+}
+
+// Plain data declaring *what a backend is*. This is the single object the
+// registry, the CLI, /system-info, and the docs all read. Behavior lives in the
+// paired WrappedServer subclass (see backend_registry.h for how they bind).
+struct BackendDescriptor {
+    std::string recipe;             // "vllm"
+    std::string display_name;       // "vLLM ROCm (experimental)"
+    std::string binary;             // subprocess to launch/install ("" = none, e.g. cloud)
+    std::string config_section;     // config.json section; defaults to recipe (sd-cpp -> "sdcpp")
+
+    DeviceType default_device = DEVICE_GPU;           // default; override effective_device() if variant-dependent
+    SlotPolicy slot_policy    = SlotPolicy::Standard; // default; override effective_slot_policy() if variant-dependent
+    bool selectable_backend   = false;  // auto-creates "<recipe>_backend" option + "--<recipe>" flag
+    bool uses_ctx_size        = false;  // opt in to the shared ctx_size option
+    bool dynamic_models       = false;  // true = ops supply models at runtime (cloud, flm), not server_models.json
+
+    std::vector<BackendOption>    options;                       // backend-specific knobs (common ones are automatic)
+    std::vector<BackendSupport>   support;                       // which OS / GPU families it runs on ({} = no local gating)
+    std::vector<std::string>      default_labels;                // labels injected when a model omits them
+    std::vector<std::string>      required_checkpoints{"main"};  // unconditional files; conditional ones checked in load()
+
+    // Editorial metadata for the generated docs (README support matrix, website).
+    std::string modality;           // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation"
+    bool        experimental = false; // true renders "(experimental)" next to the recipe in generated docs
+    std::string web_display_name;   // name used on the docs website ("" = fall back to display_name)
+    int         web_priority = 0;    // model-grouping order on the docs website (lower = higher; 0 = unlisted)
+
+    // ROCm release channels this backend publishes (e.g. {"stable","nightly"}).
+    // Empty = the backend has no ROCm channels (its "rocm" build is a single
+    // artifact). Drives the rocm-stable/rocm-nightly bin-key collapse and the
+    // channel clamp (a requested channel not listed here falls back to the first).
+    std::vector<std::string> rocm_channels;
+
+    // True if the backend's subprocess exposes a Prometheus /metrics endpoint
+    // that lemond should scrape and re-export (llama-server does).
+    bool exposes_prometheus_metrics = false;
+
+    // True if this backend's ROCm build requires the gfx1151 (Strix Halo) kernel
+    // CWSR fix. Gates the availability/remediation check for the "rocm" backend.
+    bool rocm_requires_cwsr_fix = false;
+
+    // How the installed version is compared against the expected pin. Exact by
+    // default; system-managed packages (flm) accept any version >= expected.
+    VersionPolicy version_policy = VersionPolicy::Exact;
+
+    // True if the backend pulls its own models on demand (flm self-pulls via its
+    // CLI) rather than being pre-downloaded from Hugging Face by the router. Such
+    // backends are skipped by the load-time auto-download path.
+    bool self_manages_downloads = false;
+
+    // --- config.json per-recipe defaults schema ---
+    // The backend's section of config.json is derived from these fields, so a new
+    // backend's defaults live in its descriptor instead of a hand-maintained
+    // defaults.json block. (selectable_backend additionally emits `backend: "auto"`.)
+    bool takes_args = false;                       // emits `args: ""`
+    std::vector<std::string> arg_variants;         // each emits `<variant>_args: ""`
+    std::vector<std::string> bin_variants;         // each emits `<variant>_bin: "builtin"`
+    nlohmann::json config_extra = nlohmann::json::object();  // fixed extras (e.g. prefer_system, image defaults)
+
+    // The config.json section name for this backend, falling back to the recipe.
+    std::string effective_config_section() const {
+        return config_section.empty() ? recipe : config_section;
+    }
+
+    // Build this backend's config.json default section from the schema above.
+    // Returns an empty object when the backend has no configurable section.
+    nlohmann::json config_defaults() const {
+        nlohmann::json block = nlohmann::json::object();
+        if (selectable_backend) block["backend"] = "auto";
+        if (takes_args) block["args"] = "";
+        for (const auto& v : arg_variants) block[v + "_args"] = "";
+        for (const auto& v : bin_variants) block[v + "_bin"] = "builtin";
+        if (config_extra.is_object()) {
+            for (auto it = config_extra.begin(); it != config_extra.end(); ++it) {
+                block[it.key()] = it.value();
+            }
+        }
+        return block;
+    }
+};
+
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_descriptor_registry.h b/src/cpp/include/lemon/backends/backend_descriptor_registry.h
new file mode 100644
index 000000000..44ec7e15d
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_descriptor_registry.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// Read-only view over every backend descriptor (plain data). This API is
+// CLI-safe: it pulls in no server classes, so it links into both the lemonade
+// CLI and lemond. The factory side (create_server) lives in backend_registry.h
+// and is server-only.
+
+// All registered descriptors, in LEMON_BACKENDS order.
+const std::vector<const BackendDescriptor*>& all_descriptors();
+
+// Descriptor for a recipe, or nullptr if the recipe has no registered backend.
+const BackendDescriptor* descriptor_for(const std::string& recipe);
+
+// True if the recipe is backed by a registered descriptor.
+bool has_backend(const std::string& recipe);
+
+// True if the recipe publishes ROCm release channels (stable/nightly) — i.e. its
+// "rocm" backend resolves to a channel-specific artifact. False for recipes whose
+// rocm build is a single artifact (or that have no rocm build at all).
+bool recipe_has_rocm_channels(const std::string& recipe);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
new file mode 100644
index 000000000..047c6795d
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -0,0 +1,177 @@
+#pragma once
+
+#include <optional>
+#include <string>
+#include <vector>
+#include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback (server-side only)
+
+namespace lemon {
+
+class CloudProviderRegistry;
+
+namespace backends {
+
+// Context handed to BackendOps methods — the bits of server state model
+// management needs without a running subprocess. Grows as migrations require.
+struct BackendOpsContext {
+    ModelManager* model_manager = nullptr;
+    CloudProviderRegistry* cloud_registry = nullptr;  // for dynamic cloud discovery
+};
+
+// Inputs for resolving a checkpoint's on-disk path. The model manager computes
+// the HF-cache locations generically; each backend's ops decide how to find its
+// artifact within (a .gguf file, a genai_config.json directory, a .bin, …).
+struct CheckpointResolveContext {
+    std::string hf_cache;          // HF cache root dir
+    std::string model_cache_path;  // hf_cache/<checkpoint repo cache dir>
+    std::string repo_id;           // checkpoint's repo id
+    std::string main_repo_id;      // the model's "main" checkpoint repo id (fallback)
+    std::string variant;           // checkpoint variant after ':' ("" if none)
+    std::string type;              // checkpoint type ("main", "mmproj", "npu_cache", …)
+    std::string checkpoint;        // the raw checkpoint string
+};
+
+// Stateless per-backend behavior for model management that happens WITHOUT a
+// running subprocess: checkpoint-path resolution, download, dynamic discovery,
+// per-model metadata, version detection, availability. One singleton per
+// backend, exposed via lemon::backends::<stem>::ops() and bound in the registry
+// (see BackendRegistration::ops).
+//
+// The base class is the shared default behavior (the common HF-backed case);
+// each backend folder overrides ONLY the policy points it needs, so shared
+// logic is inherited rather than copied. Methods are added here incrementally as
+// switchboards in model_manager / system_info are migrated; every method has a
+// default so adding one never forces edits to backends that don't override it.
+class BackendOps {
+public:
+    virtual ~BackendOps() = default;
+
+    // Populate model-specific metadata (context window, capability labels, …)
+    // for a downloaded model. Default: nothing.
+    virtual void populate_metadata(ModelInfo& info, const BackendOpsContext& ctx) const {
+        (void)info;
+        (void)ctx;
+    }
+
+    // Resolve a checkpoint to its absolute on-disk path (file or directory).
+    // Default: the shared HF behavior — locate the variant/aux file in the active
+    // snapshot, else fall back to the model cache directory. Backends with a
+    // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override.
+    virtual std::string resolve_checkpoint_path(const ModelInfo& info,
+                                                const CheckpointResolveContext& ctx) const;
+
+    // Find the primary checkpoint artifact inside a freshly-imported local
+    // directory (a local_import pull), e.g. the .gguf / .bin file or the
+    // genai_config.json directory. Returns the absolute path to register, or ""
+    // to register the directory itself. Default: "" (register the directory).
+    virtual std::string find_imported_checkpoint(const std::string& import_dir) const {
+        (void)import_dir;
+        return "";
+    }
+
+    // Validate a user-supplied checkpoint string when registering a new model.
+    // Return an error message if invalid, "" if acceptable. Default: accept.
+    // llamacpp requires a :variant on GGUF checkpoints.
+    virtual std::string validate_registration_checkpoint(const std::string& checkpoint) const {
+        (void)checkpoint;
+        return "";
+    }
+
+    // Select the repo-relative files to download for the main checkpoint
+    // `main_variant`, for backends whose artifact layout isn't a GGUF file.
+    // Return nullopt to use the default GGUF selection. (Direct single-file
+    // variants — .safetensors/.pth/.ckpt — are handled generically upstream.)
+    // moonshine overrides: its variant names a directory of files to fetch.
+    virtual std::optional<std::vector<std::string>> select_checkpoint_files(
+        const std::string& main_variant, const std::vector<std::string>& repo_files) const {
+        (void)main_variant;
+        (void)repo_files;
+        return std::nullopt;
+    }
+
+    // Models supplied at runtime rather than from server_models.json (descriptor
+    // dynamic_models = true). Default: none. cloud/flm override.
+    virtual std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const {
+        (void)ctx;
+        return {};
+    }
+
+    // Whether a model's local artifacts are present. Default: the shared HF
+    // checkpoint-completeness check (ModelManager::checkpoints_complete). cloud
+    // (always true) and flm (installed-set membership) override.
+    virtual bool is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const;
+
+    // Validate a resolved checkpoint file for the cache. Returns "" if valid, or
+    // a reason it should be treated as not-downloaded. Default: always valid;
+    // llamacpp checks GGUF magic.
+    virtual std::string validate_checkpoint_file(const std::string& resolved_path) const {
+        (void)resolved_path;
+        return "";
+    }
+
+    // Download a model's artifacts. Default: the shared Hugging Face download.
+    // cloud (no-op) and flm (flm pull) override.
+    virtual void download_model(const ModelInfo& info, bool do_not_upgrade,
+                                DownloadProgressCallback progress,
+                                const BackendOpsContext& ctx) const;
+
+    // Whether the model cache must be rebuilt after this backend downloads a
+    // model (e.g. flm, whose model list changes). Default: false.
+    virtual bool invalidates_cache_after_download() const { return false; }
+
+    // Resolve a backend's installed version for a given backend variant. The
+    // caller passes the version read from the on-disk version.txt (or "" if
+    // absent); the default returns it unchanged. Backends that detect their
+    // version another way override: llamacpp's "system" build runs
+    // `llama-server --version`; flm queries `flm version` when no file is present.
+    virtual std::string resolve_version(const std::string& backend,
+                                        const std::string& file_version) const {
+        (void)backend;
+        return file_version;
+    }
+
+    // Result of a backend-specific install check: whether the backend variant is
+    // usable, plus an optional error explaining why not.
+    struct InstallCheck {
+        bool installed = false;
+        std::string error;
+    };
+
+    // Decide whether a backend variant is installed, given whether its managed
+    // binary was found on disk. Default: installed iff the binary was found.
+    // llamacpp's "system" build also requires the ggml HIP plugin when an AMD GPU
+    // is present; flm can be a system PATH package even without a managed binary.
+    virtual InstallCheck check_install(const std::string& backend, bool binary_found) const {
+        (void)backend;
+        return {binary_found, ""};
+    }
+
+    // The /system-info state for a backend variant that is supported but not
+    // currently available (install probe failed).
+    struct UnavailableState {
+        std::string state;    // "installable" | "update_required" | "action_required"
+        std::string message;  // shown to the user
+        std::string action;   // remediation (a URL or an install command)
+        bool attach_installed_version = false;  // surface the installed version too
+    };
+
+    // Classify a "supported but not available" backend variant for /system-info,
+    // given the install probe's error text and the generic install command the
+    // caller would otherwise use. Return nullopt to use the generic
+    // installable/no-fetch default. flm overrides: it is a system .deb + drivers
+    // needing manual setup, so its states and remediation links differ.
+    virtual std::optional<UnavailableState> classify_unavailable(
+        const std::string& backend, const std::string& install_error,
+        const std::string& default_install_command) const {
+        (void)backend;
+        (void)install_error;
+        (void)default_install_command;
+        return std::nullopt;
+    }
+};
+
+// Shared default ops instance for backends that override nothing.
+const BackendOps* default_backend_ops();
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
new file mode 100644
index 000000000..240ddf728
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lemon/backends/backend_descriptor.h"
+#include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/backends/backend_ops.h"
+
+namespace lemon {
+
+class WrappedServer;
+class ModelManager;
+class BackendManager;
+class CloudProviderRegistry;
+struct ModelInfo;
+
+namespace backends {
+
+struct BackendSpec;  // install/download spec, defined in backend_utils.h
+
+// Everything a backend's create() needs to build an instance. Mirrors the
+// arguments the old router factory passed to each backend constructor.
+struct BackendContext {
+    std::string log_level;
+    ModelManager* model_manager = nullptr;
+    BackendManager* backend_manager = nullptr;
+    CloudProviderRegistry* cloud_registry = nullptr;
+    const ModelInfo* model_info = nullptr;  // for per-create setup (cloud provider, ryzenai model path)
+};
+
+using BackendCreateFn = std::unique_ptr<WrappedServer> (*)(const BackendContext&);
+
+// Convenience for the common create(): construct a server class from the
+// standard (log_level, model_manager, backend_manager) context fields. Backends
+// needing extra constructor arguments (cloud, ryzenai) build theirs by hand.
+template <typename T>
+std::unique_ptr<WrappedServer> make_server(const BackendContext& ctx) {
+    return std::make_unique<T>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+// Construct-on-first-use singleton for a stateless ops class, giving the
+// registry a stable pointer ({} keeps a const T without a user-provided ctor
+// well-formed). Backends with no custom behavior use default_backend_ops().
+template <typename T>
+const BackendOps* single_ops() {
+    static const T kOps{};
+    return &kOps;
+}
+
+// Binds a descriptor (what the backend is) to its server class's create() (how
+// it runs). The generated factory registry supplies one per backend. This API is
+// server-only: it references server classes via create(), so it is compiled into
+// lemond but not the CLI. The CLI reads descriptors through backend_descriptor_registry.h.
+struct BackendRegistration {
+    const BackendDescriptor* descriptor;
+    BackendCreateFn create;
+    const BackendSpec* spec;  // install/download spec, or nullptr (e.g. cloud has none)
+    const BackendOps* ops;    // stateless model-management behavior (never null)
+};
+
+// All registered (descriptor, create, spec, ops) entries, in LEMON_BACKENDS order.
+const std::vector<BackendRegistration>& all_registrations();
+
+// Install/download spec for a recipe, or nullptr if the recipe has none.
+const BackendSpec* spec_for(const std::string& recipe);
+
+// Stateless model-management ops for a recipe. Falls back to the shared default
+// ops (base behavior) for recipes with no registered backend.
+const BackendOps* ops_for(const std::string& recipe);
+
+// Construct a backend instance for a recipe and associate its descriptor, or
+// nullptr if the recipe has no registered backend.
+std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_utils.h b/src/cpp/include/lemon/backends/backend_utils.h
index bfc37734d..bdbfe0869 100644
--- a/src/cpp/include/lemon/backends/backend_utils.h
+++ b/src/cpp/include/lemon/backends/backend_utils.h
@@ -5,6 +5,7 @@
 #include <filesystem>
 #include <utility>
 #include <vector>
+#include "lemon/backends/backend_descriptor.h"
 
 namespace fs = std::filesystem;
 
@@ -42,6 +43,17 @@ namespace lemon::backends {
         std::string log_name() const { return recipe + " Server"; };
     };
 
+    // Build a backend's install/download spec from its descriptor's recipe/binary
+    // and the server class T's get_install_params. The function-local static gives
+    // the registry a stable pointer; it is built once per T, so `d`/`split` from
+    // later calls are ignored. Backends whose install key differs from the recipe
+    // (ryzenai) or with no installable artifact (cloud) build a BackendSpec by hand.
+    template <typename T>
+    const BackendSpec* make_spec(const BackendDescriptor& d, bool split = false) {
+        static const BackendSpec kSpec(d.recipe, d.binary, T::get_install_params, split);
+        return &kSpec;
+    }
+
     // Return the backend spec for recipes that use the standard BackendSpec flow.
     // Returns nullptr for recipes that require custom handling (e.g., flm) or unknown recipes.
     const BackendSpec* try_get_spec_for_recipe(const std::string& recipe);
diff --git a/src/cpp/include/lemon/backends/cloud/cloud.h b/src/cpp/include/lemon/backends/cloud/cloud.h
new file mode 100644
index 000000000..976a84f70
--- /dev/null
+++ b/src/cpp/include/lemon/backends/cloud/cloud.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace cloud {
+
+// The cloud backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
+    /*recipe*/          "cloud",
+    /*display_name*/    "Cloud",
+    /*binary*/          "",  // no subprocess: runs on a remote provider
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_NONE,
+    /*slot_policy*/     SlotPolicy::Unmetered,  // never counts toward slots, never auto-evicted
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  true,   // models discovered at runtime from the provider
+    /*options*/ {},
+    /*support*/ {},             // no local gating: install/support machinery skips cloud
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {},  // no downloaded files
+    /*modality*/        "",
+    /*experimental*/    false,
+    /*web_display_name*/ "",
+};
+
+}  // namespace cloud
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h
similarity index 92%
rename from src/cpp/include/lemon/backends/cloud_server.h
rename to src/cpp/include/lemon/backends/cloud/cloud_server.h
index 21bf20642..51b61d6f4 100644
--- a/src/cpp/include/lemon/backends/cloud_server.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../model_manager.h"
-#include "../wrapped_server.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/model_manager.h"
+#include "lemon/wrapped_server.h"
 #include <string>
 #include <vector>
 
@@ -109,5 +111,11 @@ class CloudServer : public WrappedServer {
     bool loaded_ = false;
 };
 
-} // namespace backends
-} // namespace lemon
+namespace cloud {
+// Factory for the cloud backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
+const BackendOps* ops();
+}  // namespace cloud
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
new file mode 100644
index 000000000..24049ab31
--- /dev/null
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace fastflowlm {
+
+// The fastflowlm backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "flm",
+    /*display_name*/    "FastFlowLM NPU",
+#ifdef _WIN32
+    /*binary*/          "flm.exe",
+#else
+    /*binary*/          "flm",
+#endif  // _WIN32
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_NPU,
+    /*slot_policy*/     SlotPolicy::CoexistByType,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  true,  // models come from flm's model_list.json, not server_models.json
+    /*options*/ {},
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label} — confirm against backend_descriptor.h
+        {"npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "FastFlowLM NPU",
+    /*web_priority*/    3,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::AtLeast,  // system-managed package
+    /*self_manages_downloads*/ true,  // flm pulls its own models via the flm CLI
+    /*takes_args*/      true,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {},
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace fastflowlm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
new file mode 100644
index 000000000..87470300c
--- /dev/null
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -0,0 +1,55 @@
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <string>
+#include <vector>
+#include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback
+
+namespace lemon {
+
+namespace backends {
+namespace fastflowlm {
+
+// Locate the FLM executable (install dir on Windows, system PATH on Linux).
+std::string find_flm_binary();  // NOTE(review): overlaps find_flm_executable() below — confirm both are needed
+
+// Installed FLM model checkpoints (from `flm list --filter installed`).
+std::vector<std::string> flm_installed_checkpoints();
+
+// Discover all available FLM models (from `flm list --json`), each with its
+// downloaded status set. Returns empty if FLM is not ready.
+std::vector<ModelInfo> flm_discover_models();
+
+// FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH /
+// platform-default roots and describes them with a config.json; this knowledge
+// lives in the fastflowlm backend folder rather than in the shared model manager.
+
+// Derive the on-disk repo directory name from an FLM model URL.
+std::string repo_dir_from_url(const std::string& url);
+
+// Locate config.json for an FLM repo dir across the candidate model roots.
+std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo_dir);  // NOTE(review): not-found result is undocumented here
+
+// Read the model's max context window from its FLM config.json (0 if unknown).
+int64_t read_flm_max_context_window(const ModelInfo& info);
+
+// Locate the flm executable on PATH / install dirs ("" if not found).
+std::string find_flm_executable();
+
+// Run `flm validate` and report readiness via the return value; error_message is set on failure.
+bool run_flm_validate(const std::string& flm_path, std::string& error_message);
+
+// Detect the installed FLM version via `flm version` ("unknown" if unavailable).
+std::string flm_version();
+
+// Download (pull) an FLM model by checkpoint via the `flm` CLI.
+void flm_download(const std::string& checkpoint, bool do_not_upgrade,
+                  DownloadProgressCallback progress_callback);  // NOTE(review): do_not_upgrade semantics and failure reporting are undocumented here
+
+// Remove an installed FLM model by checkpoint via `flm remove`; throws on failure.
+void flm_remove(const std::string& checkpoint);
+
+} // namespace fastflowlm
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
similarity index 82%
rename from src/cpp/include/lemon/backends/fastflowlm_server.h
rename to src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
index bd9c554ac..bdcb1d88a 100644
--- a/src/cpp/include/lemon/backends/fastflowlm_server.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -11,17 +13,6 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        // recipe
-            "flm",
-        // executable
-    #ifdef _WIN32
-            "flm.exe"
-    #else
-            "flm"
-    #endif
-        , get_install_params
-    );
 
     FastFlowLMServer(const std::string& log_level, ModelManager* model_manager = nullptr,
                      BackendManager* backend_manager = nullptr);
@@ -70,5 +61,11 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public
     bool is_loaded_ = false;
 };
 
-} // namespace backends
-} // namespace lemon
+namespace fastflowlm {
+// Factory for the fastflowlm backend (constructs the server class — lemond only); the caller owns the returned server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();  // presumably the fastflowlm backend's BackendSpec — defined elsewhere; confirm pointer lifetime
+const BackendOps* ops();    // presumably the fastflowlm backend's BackendOps — defined elsewhere; confirm pointer lifetime
+}  // namespace fastflowlm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/hf_cache_util.h b/src/cpp/include/lemon/backends/hf_cache_util.h
new file mode 100644
index 000000000..91c64278e
--- /dev/null
+++ b/src/cpp/include/lemon/backends/hf_cache_util.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <filesystem>
+#include <string>
+
+namespace lemon {
+namespace backends {
+namespace hf_cache {
+
+// Shared Hugging Face cache mechanics used by backend ops to locate model
+// artifacts on disk (the same logic model_manager uses for its own cache work).
+
+// Exists check that tolerates the symlinks HF uses for dedup (Win32 on Windows,
+// where MSVC's std::filesystem refuses untrusted reparse points).
+bool exists(const std::filesystem::path& p);  // NOTE: always call qualified (hf_cache::exists) — an unqualified exists(p) also finds std::filesystem::exists via ADL
+
+// Directory-iteration options that skip inaccessible/symlinked entries instead
+// of throwing.
+std::filesystem::directory_options dir_options();
+
+// The active HF snapshot directory (snapshots/<refs/main>) for a model cache
+// dir, or an empty path if there is no recorded ref / it doesn't exist.
+std::filesystem::path active_snapshot_path(const std::filesystem::path& model_cache_path);
+
+// HF cache directory name for a repo id ("org/repo" -> "models--org--repo").
+std::string repo_id_to_cache_dir_name(const std::string& repo_id);
+
+} // namespace hf_cache
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
new file mode 100644
index 000000000..5f3fbf97c
--- /dev/null
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace kokoro {
+
+// The kokoro backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "kokoro",
+    /*display_name*/    "Kokoro",
+#ifdef _WIN32
+    /*binary*/          "koko.exe",
+#else
+    /*binary*/          "koko",
+#endif  // _WIN32
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_CPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {},
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label} — confirm against backend_descriptor.h
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+    },
+    /*default_labels*/  {},  // kokoro models carry "tts" explicitly in server_models.json
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text-to-speech",
+    /*experimental*/    false,
+    /*web_display_name*/ "",
+    /*web_priority*/    6,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      false,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {"cpu"},
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace kokoro
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
similarity index 69%
rename from src/cpp/include/lemon/backends/kokoro_server.h
rename to src/cpp/include/lemon/backends/kokoro/kokoro_server.h
index 0b99bcb96..6a9738252 100644
--- a/src/cpp/include/lemon/backends/kokoro_server.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
@@ -1,8 +1,10 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 #include <filesystem>
 
@@ -13,15 +15,6 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "kokoro",
-    #ifdef _WIN32
-            "koko.exe"
-    #else
-            "koko"
-    #endif
-        , get_install_params
-    );
 
     explicit KokoroServer(const std::string& log_level,
                           ModelManager* model_manager,
@@ -45,5 +38,11 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer {
     void audio_speech(const json& request, httplib::DataSink& sink) override;
 };
 
-} // namespace backends
-} // namespace lemon
+namespace kokoro {
+// Factory for the kokoro backend (constructs the server class — lemond only); the caller owns the returned server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();  // presumably the kokoro backend's BackendSpec — defined elsewhere; confirm pointer lifetime
+const BackendOps* ops();    // presumably the kokoro backend's BackendOps — defined elsewhere; confirm pointer lifetime
+}  // namespace kokoro
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
new file mode 100644
index 000000000..7c58a73f3
--- /dev/null
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -0,0 +1,62 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+
+// The llamacpp backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "llamacpp",
+    /*display_name*/    "Llama.cpp GPU",
+#ifdef _WIN32
+    /*binary*/          "llama-server.exe",
+#else
+    /*binary*/          "llama-server",
+#endif  // _WIN32
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_GPU,   // cpu/system variants resolve to CPU via effective_device()
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {  // rows look like {option key, CLI flag, default value, value placeholder, help text, help group} — confirm against backend_descriptor.h
+        {"llamacpp_backend", "--llamacpp", "", "BACKEND",
+         "LlamaCpp backend to use", "Llama.cpp Backend Options"},
+        {"llamacpp_device", "--llamacpp-device", "", "DEVICES",
+         "Comma-separated list of accelerator devices to use (e.g. Vulkan0)", "Llama.cpp Backend Options"},
+        {"llamacpp_args", "--llamacpp-args", "", "ARGS",
+         "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"},
+    },
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label}; trailing '*'/'**' in labels are presumably footnote markers for rendered docs — confirm
+        {"system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+        {"cuda", {"windows", "linux"},
+         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"},
+        {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"},
+        {"rocm", {"windows", "linux"},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "llama.cpp GPU",
+    /*web_priority*/    1,
+    /*rocm_channels*/   {"stable", "nightly"},
+    /*exposes_prometheus_metrics*/ true,
+    /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"rocm", "vulkan", "cpu"},
+    /*bin_variants*/    {"rocm", "vulkan", "cuda", "cpu"},
+    /*config_extra*/    {{"prefer_system", true}},
+};
+
+}  // namespace llamacpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
new file mode 100644
index 000000000..6bf170584
--- /dev/null
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <string>
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+
+// Resolve the on-disk path of the GGUF file for the model cache directory
+// `model_cache_path` and the requested `variant` (handles sharding, folder
+// variants, and quant-token fallback). Returns the cache directory if no GGUF
+// is present, or "" if the requested variant can't be resolved.
+std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant);
+
+} // namespace llamacpp
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
similarity index 77%
rename from src/cpp/include/lemon/backends/llamacpp_server.h
rename to src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
index c9356f6b8..8a7a8405f 100644
--- a/src/cpp/include/lemon/backends/llamacpp_server.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -11,15 +13,6 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "llamacpp",
-    #ifdef _WIN32
-            "llama-server.exe"
-    #else
-            "llama-server"
-    #endif
-        , get_install_params
-    );
 
     LlamaCppServer(const std::string& log_level,
                    ModelManager* model_manager,
@@ -56,5 +49,11 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR
     json tokenize(const json& request) override;
 };
 
-} // namespace backends
-} // namespace lemon
+namespace llamacpp {
+// Factory for the llamacpp backend (constructs the server class — lemond only); the caller owns the returned server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();  // presumably the llamacpp backend's BackendSpec — defined elsewhere; confirm pointer lifetime
+const BackendOps* ops();    // presumably the llamacpp backend's BackendOps — defined elsewhere; confirm pointer lifetime
+}  // namespace llamacpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h
new file mode 100644
index 000000000..ae7313714
--- /dev/null
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace moonshine {
+
+// The moonshine backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "moonshine",
+    /*display_name*/    "Moonshine",
+    /*binary*/          "moonshine-server",
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_CPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {  // rows look like {option key, CLI flag, default value, value placeholder, help text, help group} — confirm against backend_descriptor.h
+        {"moonshine_args", "--moonshine-args", "", "ARGS",
+         "Custom arguments to pass to moonshine-server", ""},
+    },
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label} — confirm against backend_descriptor.h
+        {"cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"},  // NOTE(review): label says x86_64/arm64 but this row lists only x86_64 — confirm intended
+        {"cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"},
+        {"cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"},  // NOTE(review): label says x86_64/arm64 but this row lists only arm64 — confirm intended
+    },
+    /*default_labels*/  {"transcription", "realtime-transcription"},
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Speech-to-text",
+    /*experimental*/    false,
+    /*web_display_name*/ "",
+    /*web_priority*/    0,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"cpu"},
+    /*bin_variants*/    {"cpu"},
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace moonshine
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
similarity index 76%
rename from src/cpp/include/lemon/backends/moonshine_server.h
rename to src/cpp/include/lemon/backends/moonshine/moonshine_server.h
index 6f13f216b..e6535a34b 100644
--- a/src/cpp/include/lemon/backends/moonshine_server.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
@@ -1,8 +1,10 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -12,11 +14,6 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        "moonshine",
-        "moonshine-server",
-        get_install_params
-    );
 
     explicit MoonshineServer(const std::string& log_level,
                             ModelManager* model_manager,
@@ -51,5 +48,11 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi
     int tcp_port_ = 0;     // Port for line-delimited JSON streaming
 };
 
-} // namespace backends
-} // namespace lemon
+namespace moonshine {
+// Factory for the moonshine backend (constructs the server class — lemond only); the caller owns the returned server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();  // presumably the moonshine backend's BackendSpec — defined elsewhere; confirm pointer lifetime
+const BackendOps* ops();    // presumably the moonshine backend's BackendOps — defined elsewhere; confirm pointer lifetime
+}  // namespace moonshine
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
new file mode 100644
index 000000000..dbc15d7f3
--- /dev/null
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace ryzenai {
+
+// The ryzenai backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "ryzenai-llm",  // NOTE(review): the RyzenAIServer::SPEC this replaces passed "ryzenai-server" as its first argument — confirm the rename is intended
+    /*display_name*/    "Ryzen AI LLM",
+#ifdef _WIN32
+    /*binary*/          "ryzenai-server.exe",
+#else
+    /*binary*/          "ryzenai-server",
+#endif  // _WIN32
+    /*config_section*/  "ryzenai",  // differs from recipe "ryzenai-llm"
+    /*default_device*/  DEVICE_NPU,
+    /*slot_policy*/     SlotPolicy::ExclusiveNpu,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {},
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label} — confirm against backend_descriptor.h
+        {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "Ryzen AI SW NPU",
+    /*web_priority*/    2,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      false,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {"server"},
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace ryzenai
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenaiserver.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
similarity index 78%
rename from src/cpp/include/lemon/backends/ryzenaiserver.h
rename to src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
index 36e1ba98d..f3a6806e7 100644
--- a/src/cpp/include/lemon/backends/ryzenaiserver.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "lemon/backends/backend_registry.h"
+
 #include "lemon/wrapped_server.h"
 #include "lemon/server_capabilities.h"
 #include "lemon/backends/backend_utils.h"
@@ -15,15 +17,6 @@ class RyzenAIServer : public WrappedServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        "ryzenai-server",
-#ifdef _WIN32
-        "ryzenai-server.exe"
-#else
-        "ryzenai-server"
-#endif
-        , get_install_params
-    );
 
     RyzenAIServer(const std::string& model_name, bool debug, ModelManager* model_manager,
                   BackendManager* backend_manager);
@@ -54,3 +47,14 @@ class RyzenAIServer : public WrappedServer {
 };
 
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace ryzenai {
+// Factory for the ryzenai backend (constructs the server class — lemond only); the caller owns the returned server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();  // presumably the ryzenai backend's BackendSpec — defined elsewhere; confirm pointer lifetime
+const BackendOps* ops();    // presumably the ryzenai backend's BackendOps — defined elsewhere; confirm pointer lifetime
+}  // namespace ryzenai
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
new file mode 100644
index 000000000..986d26fbe
--- /dev/null
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace sdcpp {
+
+// The sdcpp backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "sd-cpp",
+    /*display_name*/    "StableDiffusion.cpp",
+#ifdef _WIN32
+    /*binary*/          "sd-server.exe",
+#else
+    /*binary*/          "sd-server",
+#endif  // _WIN32
+    /*config_section*/  "sdcpp",  // differs from recipe "sd-cpp"
+    /*default_device*/  DEVICE_CPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {  // rows look like {option key, CLI flag, default value, value placeholder, help text, help group} — confirm against backend_descriptor.h
+        {"sd-cpp_backend", "--sdcpp", "", "BACKEND",  // key uses the recipe spelling "sd-cpp"; the args key below uses "sdcpp"
+         "SD.cpp backend to use", "Stable Diffusion Options"},
+        {"sdcpp_args", "--sdcpp-args", "", "ARGS",
+         "Custom arguments to pass to sd-server (must not conflict with managed args)", "Stable Diffusion Options"},
+        // Image generation defaults (recipe-level only, not CLI flags).
+        {"steps", "", 20, "SIZE", "Number of diffusion steps", "Stable Diffusion Options"},
+        {"cfg_scale", "", 7.0, "SIZE", "Classifier-free guidance scale", "Stable Diffusion Options"},
+        {"width", "", 512, "SIZE", "Output image width", "Stable Diffusion Options"},
+        {"height", "", 512, "SIZE", "Output image height", "Stable Diffusion Options"},
+        {"sampling_method", "", "", "ARGS", "Sampling method", "Stable Diffusion Options"},
+        {"flow_shift", "", 0.0, "SIZE", "Flow shift", "Stable Diffusion Options"},
+    },
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label} — confirm against backend_descriptor.h
+        {"rocm", {"windows", "linux"},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
+        {"cuda", {"linux"},  // NOTE(review): cuda is listed here but not in arg_variants/bin_variants below — confirm intended
+         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"},
+        {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"},
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+    },
+    /*default_labels*/  {"image"},
+    /*required_checkpoints*/ {"main"},  // flux text_encoder+vae validated together in load()
+    /*modality*/        "Image generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "stable-diffusion.cpp",
+    /*web_priority*/    5,
+    /*rocm_channels*/   {"stable"},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"cpu", "rocm", "vulkan"},
+    /*bin_variants*/    {"cpu", "rocm", "vulkan"},
+    /*config_extra*/    {{"steps", 20}, {"cfg_scale", 7.0}, {"width", 512}, {"height", 512}},  // repeats the steps/cfg_scale/width/height defaults from options above — keep in sync
+};
+
+}  // namespace sdcpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sd_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
similarity index 85%
rename from src/cpp/include/lemon/backends/sd_server.h
rename to src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index 857374951..185108afc 100644
--- a/src/cpp/include/lemon/backends/sd_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -1,11 +1,13 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "../model_manager.h"
-#include "../recipe_options.h"
-#include "../utils/process_manager.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/model_manager.h"
+#include "lemon/recipe_options.h"
+#include "lemon/utils/process_manager.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 #include <filesystem>
 
@@ -16,15 +18,6 @@ class SDServer : public WrappedServer, public IImageServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "sd-cpp",
-    #ifdef _WIN32
-            "sd-server.exe"
-    #else
-            "sd-server"
-    #endif
-        , get_install_params
-    );
 
     explicit SDServer(const std::string& log_level,
                       ModelManager* model_manager,
@@ -93,5 +86,11 @@ class SDServer : public WrappedServer, public IImageServer {
     std::string resolve_size(const nlohmann::json& request) const;
 };
 
-} // namespace backends
-} // namespace lemon
+namespace sdcpp {
+// Factory for the sdcpp backend (constructs the server class — lemond only); the caller owns the returned server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();  // presumably the sdcpp backend's BackendSpec — defined elsewhere; confirm pointer lifetime
+const BackendOps* ops();    // presumably the sdcpp backend's BackendOps — defined elsewhere; confirm pointer lifetime
+}  // namespace sdcpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
new file mode 100644
index 000000000..8984e15b3
--- /dev/null
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -0,0 +1,49 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace vllm {
+
+// The vllm backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {  // positional init: the /*name*/ tags are comments only, so order must match the BackendDescriptor declaration
+    /*recipe*/          "vllm",
+    /*display_name*/    "vLLM ROCm (experimental)",
+    /*binary*/          "vllm-server",
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_GPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {  // rows look like {option key, CLI flag, default value, value placeholder, help text, help group} — confirm against backend_descriptor.h
+        {"vllm_backend", "--vllm", "", "BACKEND",
+         "vLLM backend to use", "vLLM Options"},
+        {"vllm_args", "--vllm-args", "", "ARGS",
+         "Custom arguments to pass to vllm-server", "vLLM Options"},
+    },
+    /*support*/ {  // rows look like {backend variant, OS list, {device type -> families}, display label} — confirm against backend_descriptor.h
+        {"rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"},  // NOTE(review): label names only gfx1151 but the family list also has gfx1150/gfx110X/gfx120X — confirm intended
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    true,
+    /*web_display_name*/ "",
+    /*web_priority*/    0,
+    /*rocm_channels*/   {},  // single rocm artifact, no stable/nightly channels
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {},
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace vllm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h
similarity index 74%
rename from src/cpp/include/lemon/backends/vllm_server.h
rename to src/cpp/include/lemon/backends/vllm/vllm_server.h
index 62ec94af2..1ac9438ed 100644
--- a/src/cpp/include/lemon/backends/vllm_server.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -11,12 +13,6 @@ class VLLMServer : public WrappedServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "vllm",
-            "vllm-server"
-        , get_install_params
-        , /*supports_split_archive=*/true
-    );
 
     VLLMServer(const std::string& log_level,
                ModelManager* model_manager,
@@ -45,5 +41,11 @@ class VLLMServer : public WrappedServer {
 
 };
 
-} // namespace backends
-} // namespace lemon
+namespace vllm {
+// Factory for the vllm backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
+const BackendOps* ops();
+}  // namespace vllm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
new file mode 100644
index 000000000..9c38b66d5
--- /dev/null
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+namespace whispercpp {
+
+// The whispercpp backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
+    /*recipe*/          "whispercpp",
+    /*display_name*/    "Whisper.cpp",
+#ifdef _WIN32
+    /*binary*/          "whisper-server.exe",
+#else
+    /*binary*/          "whisper-server",
+#endif
+    /*config_section*/  "",  // defaults to recipe
+    /*default_device*/  DEVICE_CPU,   // npu variant resolves to NPU + ExclusiveNpu via effective_*()
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {
+        {"whispercpp_backend", "--whispercpp", "", "BACKEND",
+         "WhisperCpp backend to use", "Whisper.cpp Options"},
+        {"whispercpp_args", "--whispercpp-args", "", "ARGS",
+         "Custom arguments to pass to whisper-server", "Whisper.cpp Options"},
+    },
+    /*support*/ {  // row order is the preference order (first = most preferred); {} = all families
+        {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
+        {"rocm", {"windows", "linux"},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
+        {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"},  // NOTE(review): summary omits the amd_gpu constraint — confirm intended
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+    },
+    /*default_labels*/  {"transcription", "realtime-transcription"},
+    /*required_checkpoints*/ {"main"},  // npu_cache validated in load() (npu variant only)
+    /*modality*/        "Speech-to-text",
+    /*experimental*/    false,
+    /*web_display_name*/ "whisper.cpp",
+    /*web_priority*/    4,
+    /*rocm_channels*/   {},  // empty => recipe_has_rocm_channels() is false for this recipe
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"cpu", "npu"},  // presumably seeds cpu_args / npu_args in the config section — confirm
+    /*bin_variants*/    {"cpu", "npu"},  // presumably seeds cpu_bin / npu_bin in the config section — confirm
+    /*config_extra*/    nlohmann::json::object(),
+};
+
+}  // namespace whispercpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whisper_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
similarity index 83%
rename from src/cpp/include/lemon/backends/whisper_server.h
rename to src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
index 55a1734d9..dc97cbd9f 100644
--- a/src/cpp/include/lemon/backends/whisper_server.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
@@ -1,8 +1,10 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 #include <filesystem>
 
@@ -13,15 +15,6 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        "whispercpp",
-#ifdef _WIN32
-        "whisper-server.exe"
-#else
-        "whisper-server"
-#endif
-        , get_install_params
-    );
 
     explicit WhisperServer(const std::string& log_level,
                           ModelManager* model_manager,
@@ -74,5 +67,11 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer {
     std::filesystem::path temp_dir_;  // Directory for temporary audio files
 };
 
-} // namespace backends
-} // namespace lemon
+namespace whispercpp {
+// Factory for the whispercpp backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
+const BackendOps* ops();
+}  // namespace whispercpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/config_file.h b/src/cpp/include/lemon/config_file.h
index ec56c17fb..8c46e125f 100644
--- a/src/cpp/include/lemon/config_file.h
+++ b/src/cpp/include/lemon/config_file.h
@@ -84,8 +84,15 @@ static inline bool config_migrate(json& config,
 /// Manages reading and writing config.json in the lemonade cache dir.
 class ConfigFile {
 public:
-    /// Returns the full default config loaded from installed resource JSON.
-    /// On Linux, an optional distro override at /usr/share/lemonade/defaults.json
+    /// The canonical default config: resources/defaults.json (global keys) with
+    /// each backend's per-recipe section seeded from its descriptor. Host- and
+    /// deployment-independent, so it is reproducible — this is what
+    /// GET /internal/config/defaults emits and gen_backend_boilerplate.py writes
+    /// back into resources/defaults.json.
+    static json base_defaults();
+
+    /// base_defaults() plus deployment overrides. On Linux, an optional distro
+    /// override at /usr/share/lemonade/defaults.json (and LEMONADE_DEFAULTS_PATH)
     /// is merged on top when present.
     static json get_defaults();
 
diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index 77a10066e..c346e3ab9 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -77,11 +77,6 @@ struct ModelInfo {
     bool suggested = false;
     std::string source;  // "local_upload" for locally uploaded models
     bool downloaded = false;     // Whether model is downloaded and available
-    // When true, LlamaCppServer launches llama-server with `-hf <checkpoint>`
-    // instead of `-m <gguf> [--mmproj <mmproj>]`. Required for models like
-    // Qwen2.5-Omni where llama-server's manual-load path rejects audio content
-    // parts — the -hf path drives the dual-clip (vision+audio) context correctly.
-    bool hf_load = false;
     double size = 0.0;   // Model size in GB
     int64_t max_context_window = 0;  // Static model-supported text context, when known
 
@@ -105,8 +100,18 @@ struct ModelInfo {
     double cost_input_per_million = -1.0;
     double cost_output_per_million = -1.0;
 
-    // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING)
-    int moonshine_arch = -1;
+    // Generic per-model fields a backend declares for itself. Any server_models.json
+    // key not consumed by a typed field above lands here, so a new backend can read
+    // custom per-model config in load() without editing this shared struct.
+    std::map<std::string, json> extras;
+
+    // Typed read from extras; yields `fallback` if absent, null, or unconvertible.
+    template <typename T>
+    T extra(const std::string& key, const T& fallback) const {
+        const auto entry = extras.find(key);
+        if (entry != extras.end() && !entry->second.is_null()) try { return entry->second.get<T>(); } catch (...) {}
+        return fallback;
+    }
 
     // Utility
     std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? checkpoints.at(type) : ""; }
@@ -209,11 +214,19 @@ class ModelManager {
     // Check if model is downloaded
     bool is_model_downloaded(const std::string& model_name);
 
-    // Get list of installed FLM models (for caching)
-    std::vector<std::string> get_flm_installed_models();
+    // True if the model's backend pulls its own models on demand (e.g. flm) and
+    // so should be skipped by the router's load-time auto-download path.
+    bool backend_self_manages_downloads(const std::string& recipe) const;
+
+    // Shared Hugging Face completeness check: true if all required checkpoints
+    // are present and complete (per-backend file validation runs via ops). The
+    // default BackendOps::is_downloaded delegates here for HF-backed backends.
+    bool checkpoints_complete(const ModelInfo& info) const;
 
-    // Get list of all available FLM models from 'flm list --json'
-    std::vector<ModelInfo> get_flm_available_models();
+    // Shared Hugging Face download engine. The default BackendOps::download_model
+    // delegates here; flm/cloud override with their own download.
+    void download_from_huggingface_engine(const ModelInfo& info,
+                                          DownloadProgressCallback progress_callback = nullptr);
 
     // Get HuggingFace cache directory (respects HF_HUB_CACHE, HF_HOME, and platform defaults)
     std::string get_hf_cache_dir() const;
@@ -295,11 +308,6 @@ class ModelManager {
     void download_from_huggingface(const ModelInfo& info,
                                    DownloadProgressCallback progress_callback = nullptr);
 
-    // Download from FLM
-    void download_from_flm(const std::string& checkpoint,
-                          bool do_not_upgrade = true,
-                          DownloadProgressCallback progress_callback = nullptr);
-
     // Discover GGUF models from extra_models_dir
     std::map<std::string, ModelInfo> discover_extra_models() const;
 
diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h
index eb5d4e0b4..c92bedb37 100644
--- a/src/cpp/include/lemon/model_types.h
+++ b/src/cpp/include/lemon/model_types.h
@@ -139,28 +139,14 @@ inline ModelType get_model_type_from_labels(const std::vector<std::string>& labe
     return ModelType::LLM;
 }
 
-// Determine device type from recipe
-// Default device from recipe — individual backends override based on their config
+// Fallback device type for recipes with no registered backend descriptor
+// (collections and unknown recipes). The authoritative per-backend default lives
+// in BackendDescriptor::default_device; ModelManager::device_type_for_recipe
+// consults the descriptor registry first and only falls back here. Kept in this
+// low-level header (which must not depend on the backend registry) for that
+// fallback alone — it intentionally carries no per-backend knowledge.
 inline DeviceType get_device_type_from_recipe(const std::string& recipe) {
-    if (recipe == "llamacpp") {
-        return DEVICE_GPU;
-    } else if (recipe == "ryzenai-llm") {
-        return DEVICE_NPU;
-    } else if (recipe == "flm") {
-        return DEVICE_NPU;
-    } else if (recipe == "whispercpp") {
-        return DEVICE_CPU;
-    } else if (recipe == "moonshine") {
-        return DEVICE_CPU;
-    } else if (recipe == "sd-cpp") {
-        return DEVICE_CPU;
-    } else if (recipe == "kokoro") {
-        return DEVICE_CPU;
-    } else if (is_collection_recipe(recipe)) {
-        return DEVICE_NONE;
-    } else if (recipe == "cloud") {
-        return DEVICE_NONE;  // Cloud-offloaded models execute on a remote provider
-    }
+    (void)recipe;
     return DEVICE_NONE;
 }
 
diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h
new file mode 100644
index 000000000..ec0af9a9d
--- /dev/null
+++ b/src/cpp/include/lemon/recipe_backend_def.h
@@ -0,0 +1,39 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+namespace lemon {
+
+// Device constraints: device_type -> set of allowed families (empty = all families)
+using DeviceConstraints = std::map<std::string, std::set<std::string>>;
+
+// A single recipe/backend support row: which OS and device families a given
+// (recipe, backend) pair runs on. The canonical support matrix is assembled by
+// collecting these rows from every backend descriptor (see BackendDescriptor::support).
+//
+// IMPORTANT: For recipes with multiple backends (e.g. llamacpp), the order in
+// which these rows appear defines the preference order — first listed = most
+// preferred. Empty family set {} means "all families of that device type".
+struct RecipeBackendDef {
+    std::string recipe;                  // recipe the row belongs to (e.g. "whispercpp")
+    std::string backend;                 // backend variant within that recipe (e.g. "rocm", "cpu")
+    std::set<std::string> supported_os;  // OS names the pair runs on (e.g. "windows", "linux", "macos")
+    DeviceConstraints devices;           // device_type -> allowed families; empty set = all families
+    // Human-friendly device description for the generated support matrix (README).
+    // May contain footnote markers (e.g. "*") whose text lives as prose in the doc.
+    std::string device_summary = "";
+};
+
+// A backend descriptor's support row, without the recipe (it's always the
+// owning descriptor's recipe — assembling a RecipeBackendDef fills it in). Keeps
+// the descriptor literals from repeating their own recipe on every row.
+struct BackendSupport {
+    std::string backend;                 // backend variant name (e.g. "rocm", "cpu")
+    std::set<std::string> supported_os;  // OS names this variant runs on (e.g. "windows", "linux")
+    DeviceConstraints devices;           // device_type -> allowed families; empty set = all families
+    std::string device_summary = "";     // human-friendly device text for the generated support matrix
+};
+
+} // namespace lemon
diff --git a/src/cpp/include/lemon/router.h b/src/cpp/include/lemon/router.h
index e98a8b11d..a4f2d9629 100644
--- a/src/cpp/include/lemon/router.h
+++ b/src/cpp/include/lemon/router.h
@@ -167,7 +167,7 @@ class Router {
     bool has_npu_server() const;
     WrappedServer* find_npu_server() const;
     WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const;
-    WrappedServer* find_flm_server_by_type(ModelType type) const;
+    WrappedServer* find_coexisting_server_by_type(ModelType type) const;
     void evict_all_npu_servers();
     void evict_server(WrappedServer* server, int timeout_seconds = -1);
     void evict_all_servers();
diff --git a/src/cpp/include/lemon/server.h b/src/cpp/include/lemon/server.h
index d481f5b80..3e39357d3 100644
--- a/src/cpp/include/lemon/server.h
+++ b/src/cpp/include/lemon/server.h
@@ -73,6 +73,7 @@ class Server {
     // Unified config endpoints
     void handle_config_set(const httplib::Request& req, httplib::Response& res);
     void handle_config_get(const httplib::Request& req, httplib::Response& res);
+    void handle_config_defaults_get(const httplib::Request& req, httplib::Response& res);
 
     // Side-effect callback for RuntimeConfig::set(). Receives a nested JSON
     // mirroring the input shape, containing only entries that actually changed.
diff --git a/src/cpp/include/lemon/system_info.h b/src/cpp/include/lemon/system_info.h
index 9b143ae47..a67c744b6 100644
--- a/src/cpp/include/lemon/system_info.h
+++ b/src/cpp/include/lemon/system_info.h
@@ -104,9 +104,6 @@ class SystemInfo {
     };
     static std::vector<RecipeStatus> get_all_recipe_statuses();
 
-    static std::string get_flm_version();
-    static std::string get_system_llamacpp_version();
-
     // Device support detection
     static std::string get_rocm_arch();
     static std::string get_cuda_arch();
diff --git a/src/cpp/include/lemon/utils/path_utils.h b/src/cpp/include/lemon/utils/path_utils.h
index 96561186c..63f142ee6 100644
--- a/src/cpp/include/lemon/utils/path_utils.h
+++ b/src/cpp/include/lemon/utils/path_utils.h
@@ -35,22 +35,6 @@ bool is_safe_executable_path(const std::string& path);
  */
 bool looks_like_path(const std::string& v);
 
-/**
- * Find the FLM executable (flm.exe on Windows, flm on Unix).
- * Uses SearchPathA on Windows (same API as CreateProcessA) to search PATH,
- * then falls back to the default installation directory.
- * @return Full path to flm executable, or empty string if not found.
- */
-std::string find_flm_executable();
-
-/**
- * Run 'flm validate' command and check if it succeeds.
- * @param flm_path Optional path to flm executable. If empty, will search for it.
- * @param error_message Output parameter for error message if validation fails.
- * @return true if validation succeeds, false otherwise.
- */
-bool run_flm_validate(const std::string& flm_path, std::string& error_message);
-
 /**
  * Get an environment variable as UTF-8 text.
  */
@@ -73,13 +57,6 @@ std::string path_to_utf8(const std::filesystem::path& path);
  */
 std::string find_executable_in_path(const std::string& executable_name);
 
-/**
- * Check if the HIP plugin for GGML backends is available on the system.
- * This function checks common installation paths for libggml-hip.so.
- * @return true if the HIP plugin is found, false otherwise.
- */
-bool is_ggml_hip_plugin_available();
-
 /**
  * Set the lemonade cache directory. Must be called once at startup before
  * get_cache_dir(). After this call, get_cache_dir() returns this path.
diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h
index f3ec74da4..41e91595b 100644
--- a/src/cpp/include/lemon/wrapped_server.h
+++ b/src/cpp/include/lemon/wrapped_server.h
@@ -17,6 +17,7 @@
 #include "model_manager.h"
 #include "backend_manager.h"
 #include "recipe_options.h"
+#include "backends/backend_descriptor.h"
 
 namespace lemon {
 
@@ -307,10 +308,46 @@ class WrappedServer : public ICompletionServer {
         // No-op by default
     }
 
-    // ICompletionServer implementation - forward requests to the wrapped server
-    virtual json chat_completion(const json& request) override = 0;
-    virtual json completion(const json& request) override = 0;
-    virtual json responses(const json& request) = 0;
+    // ICompletionServer implementation. These base versions ignore the request and
+    // return an "unsupported" error, so non-chat backends (TTS, image,
+    // transcription) inherit a sensible response; backends that serve them override.
+    virtual json chat_completion(const json& /*request*/) override {
+        return unsupported_capability_error("chat completion");
+    }
+    virtual json completion(const json& /*request*/) override {
+        return unsupported_capability_error("text completion");
+    }
+    virtual json responses(const json& /*request*/) {
+        return unsupported_capability_error("responses");
+    }
+
+    // Descriptor association (set by the backend registry at create() time). The
+    // effective_* hooks below default to the descriptor's declared values; a
+    // backend whose device or eviction rule depends on the chosen backend
+    // variant overrides them (e.g. whisper on npu vs cpu, llamacpp on cpu vs gpu).
+    void set_descriptor(const BackendDescriptor* descriptor) { descriptor_ = descriptor; }
+    const BackendDescriptor* get_descriptor() const { return descriptor_; }
+
+    // Device this load will actually run on. The router asks after resolving the
+    // "<recipe>_backend" option and before eviction. Base answer: the descriptor's
+    // default_device, or device_type_ with no descriptor; variant-dependent backends override.
+    virtual DeviceType effective_device(const RecipeOptions& /*options*/) const {
+        if (descriptor_ == nullptr) return device_type_;
+        return descriptor_->default_device;
+    }
+
+    // Slot/eviction policy for this load; the router switches on it to enforce NPU
+    // exclusivity and LRU slot accounting. Base answer: the descriptor's slot_policy,
+    // or SlotPolicy::Standard with no descriptor; variant-dependent backends override.
+    virtual SlotPolicy effective_slot_policy(const RecipeOptions& /*options*/) const {
+        if (descriptor_ == nullptr) return SlotPolicy::Standard;
+        return descriptor_->slot_policy;
+    }
+
+    // Runtime availability probe. An empty string means the backend can run on
+    // this system; otherwise it is a user-facing reason why it cannot. The base
+    // class reports available; backends with runtime-dependent availability (cloud) override.
+    virtual std::string availability() const { return std::string(); }
 
     // Forward streaming requests to the wrapped server (public for Router access)
     // Virtual so backends can transform request (e.g., FLM needs checkpoint in model field)
@@ -373,6 +410,17 @@ class WrappedServer : public ICompletionServer {
         BackendRequestKind kind_;
     };
 
+    // Builds the standard error payload a backend returns when asked for a
+    // capability it does not serve; `what` names that capability in the message.
+    json unsupported_capability_error(const std::string& what) const {
+        const std::string message = server_name_ + " does not support " + what +
+            ". Use the appropriate endpoint for this model type instead.";
+        json error_body = {{"message", message},
+                           {"type", "unsupported_operation"},
+                           {"code", "model_not_applicable"}};
+        return json{{"error", error_body}};
+    }
+
     static bool has_process_handle(const ProcessHandle& handle);
     ProcessHandle get_process_handle_snapshot() const;
     void set_process_handle(ProcessHandle handle);
@@ -420,6 +468,7 @@ class WrappedServer : public ICompletionServer {
     std::string log_level_;
     ModelManager* model_manager_;  // Non-owning pointer to ModelManager
     BackendManager* backend_manager_;  // Non-owning pointer to BackendManager
+    const BackendDescriptor* descriptor_ = nullptr;  // Non-owning; set by the backend registry at create()
 
     // Multi-model support fields
     std::string model_name_;
diff --git a/src/cpp/resources/defaults.json b/src/cpp/resources/defaults.json
index f79396266..ab86404dd 100644
--- a/src/cpp/resources/defaults.json
+++ b/src/cpp/resources/defaults.json
@@ -1,71 +1,71 @@
 {
+  "cloud_providers": [],
   "config_version": 2,
-  "port": 13305,
-  "host": "localhost",
-  "websocket_port": "auto",
-  "log_level": "info",
-  "global_timeout": 600,
-  "max_loaded_models": 1,
-  "no_broadcast": false,
-  "extra_models_dir": "",
-  "models_dir": "auto",
   "ctx_size": -1,
-  "offline": false,
-  "no_fetch_executables": false,
   "disable_model_filtering": false,
   "enable_dgpu_gtt": false,
-  "rocm_channel": "stable",
+  "extra_models_dir": "",
+  "flm": {
+    "args": ""
+  },
+  "global_timeout": 600,
+  "host": "localhost",
+  "kokoro": {
+    "cpu_bin": "builtin"
+  },
   "llamacpp": {
-    "backend": "auto",
     "args": "",
-    "rocm_args": "",
-    "vulkan_args": "",
+    "backend": "auto",
     "cpu_args": "",
+    "cpu_bin": "builtin",
+    "cuda_bin": "builtin",
     "prefer_system": true,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin",
-    "cuda_bin": "builtin",
-    "cpu_bin": "builtin"
+    "vulkan_args": "",
+    "vulkan_bin": "builtin"
   },
-  "whispercpp": {
-    "backend": "auto",
+  "log_level": "info",
+  "max_loaded_models": 1,
+  "models_dir": "auto",
+  "moonshine": {
     "args": "",
     "cpu_args": "",
-    "npu_args": "",
-    "cpu_bin": "builtin",
-    "npu_bin": "builtin"
+    "cpu_bin": "builtin"
+  },
+  "no_broadcast": false,
+  "no_fetch_executables": false,
+  "offline": false,
+  "port": 13305,
+  "rocm_channel": "stable",
+  "ryzenai": {
+    "server_bin": "builtin"
   },
   "sdcpp": {
-    "backend": "auto",
     "args": "",
-    "cpu_args": "",
-    "rocm_args": "",
-    "vulkan_args": "",
-    "steps": 20,
+    "backend": "auto",
     "cfg_scale": 7.0,
-    "width": 512,
-    "height": 512,
+    "cpu_args": "",
     "cpu_bin": "builtin",
+    "height": 512,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin"
-  },
-  "flm": {
-    "args": ""
+    "steps": 20,
+    "vulkan_args": "",
+    "vulkan_bin": "builtin",
+    "width": 512
   },
   "vllm": {
-    "backend": "auto",
-    "args": ""
-  },
-  "ryzenai": {
-    "server_bin": "builtin"
-  },
-  "kokoro": {
-    "cpu_bin": "builtin"
+    "args": "",
+    "backend": "auto"
   },
-  "moonshine": {
+  "websocket_port": "auto",
+  "whispercpp": {
     "args": "",
+    "backend": "auto",
     "cpu_args": "",
-    "cpu_bin": "builtin"
-  },
-  "cloud_providers": []
+    "cpu_bin": "builtin",
+    "npu_args": "",
+    "npu_bin": "builtin"
+  }
 }
diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp
index 120b61428..7f5452845 100644
--- a/src/cpp/server/backend_manager.cpp
+++ b/src/cpp/server/backend_manager.cpp
@@ -1,5 +1,6 @@
 #include "lemon/backend_manager.h"
 #include "lemon/backend_version_policy.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
@@ -36,7 +37,7 @@ std::string get_current_os() {
 }
 
 std::string normalize_backend_name(const std::string& recipe, const std::string& backend) {
-    if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") {
+    if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") {
         // Map "rocm" to the appropriate channel based on config
         std::string channel = "stable";  // default to stable for now
         if (auto* cfg = RuntimeConfig::global()) {
@@ -64,15 +65,6 @@ std::string get_backend_runtime_version(const json& backend_versions,
         return backend_versions[recipe][runtime_key].get<std::string>();
     }
 
-    // Only fall back to llamacpp runtime version if the recipe is llamacpp
-    if (recipe == "llamacpp" &&
-        backend_versions.contains("llamacpp") &&
-        backend_versions["llamacpp"].is_object() &&
-        backend_versions["llamacpp"].contains(runtime_key) &&
-        backend_versions["llamacpp"][runtime_key].is_string()) {
-        return backend_versions["llamacpp"][runtime_key].get<std::string>();
-    }
-
     throw std::runtime_error("backend_versions.json is missing runtime version for: " + recipe + ":" + runtime_key);
 }
 
@@ -484,7 +476,7 @@ void BackendManager::install_backend(const std::string& recipe, const std::strin
     // Do that here before inflating the install to a multi-file UX flow.
     const std::string os = get_current_os();
     const bool is_rocm_stable_backend =
-        (recipe == "llamacpp" || recipe == "sd-cpp") &&
+        backends::recipe_has_rocm_channels(recipe) &&
         resolved_backend == "rocm-stable";
     const bool therock_applicable =
         is_rocm_stable_backend && will_install_therock(os, backend_versions_);
diff --git a/src/cpp/server/backends/backend_descriptor_registry.cpp b/src/cpp/server/backends/backend_descriptor_registry.cpp
new file mode 100644
index 000000000..6d1741d87
--- /dev/null
+++ b/src/cpp/server/backends/backend_descriptor_registry.cpp
@@ -0,0 +1,34 @@
+#include "lemon/backends/backend_descriptor_registry.h"
+
+// Generated from LEMON_BACKENDS at configure time. Defines
+// lemon::backends::all_generated_descriptors() (descriptor data only).
+#include "backend_descriptors_generated.h"
+
+namespace lemon {
+namespace backends {
+
+const std::vector<const BackendDescriptor*>& all_descriptors() {
+    static const auto registry = all_generated_descriptors();  // built once, on first call
+    return registry;
+}
+
+const BackendDescriptor* descriptor_for(const std::string& recipe) {
+    // Linear scan of the generated list; the first descriptor whose recipe matches wins.
+    const auto& registry = all_descriptors();
+    for (auto it = registry.begin(); it != registry.end(); ++it) {
+        if ((*it)->recipe == recipe) return *it;
+    }
+    return nullptr;  // no descriptor declares this recipe
+}
+
+bool has_backend(const std::string& recipe) {
+    return nullptr != descriptor_for(recipe);  // true iff some descriptor declares this recipe
+}
+
+bool recipe_has_rocm_channels(const std::string& recipe) {
+    const BackendDescriptor* const found = descriptor_for(recipe);
+    return found ? !found->rocm_channels.empty() : false;  // unknown recipe => no channels
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_descriptors_generated.h.in b/src/cpp/server/backends/backend_descriptors_generated.h.in
new file mode 100644
index 000000000..3f6d7ec2a
--- /dev/null
+++ b/src/cpp/server/backends/backend_descriptors_generated.h.in
@@ -0,0 +1,19 @@
+#pragma once
+//
+// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt.
+// Do not edit by hand. Descriptor DATA only (CLI-safe; no server classes).
+//
+#include <vector>
+#include "lemon/backends/backend_descriptor.h"
+@LEMON_DESCRIPTOR_INCLUDES@
+namespace lemon {
+namespace backends {
+
+inline std::vector<const BackendDescriptor*> all_generated_descriptors() {
+    return {
+@LEMON_DESCRIPTOR_ENTRIES@
+    };
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_factories_generated.h.in b/src/cpp/server/backends/backend_factories_generated.h.in
new file mode 100644
index 000000000..d488ce014
--- /dev/null
+++ b/src/cpp/server/backends/backend_factories_generated.h.in
@@ -0,0 +1,21 @@
+#pragma once
+//
+// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt.
+// Do not edit by hand. Binds each descriptor to its server class's create()
+// (server-only: pulls in server classes, compiled into lemond not the CLI).
+//
+#include <vector>
+#include "lemon/backends/backend_registry.h"
+@LEMON_DESCRIPTOR_INCLUDES@
+@LEMON_FACTORY_INCLUDES@
+namespace lemon {
+namespace backends {
+
+inline std::vector<BackendRegistration> generated_registrations() {
+    return {
+@LEMON_FACTORY_ENTRIES@
+    };
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp
new file mode 100644
index 000000000..2f4cdf48c
--- /dev/null
+++ b/src/cpp/server/backends/backend_ops.cpp
@@ -0,0 +1,125 @@
+#include "lemon/backends/backend_ops.h"
+
+#include <algorithm>
+#include <filesystem>
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/utils/path_utils.h"
+
+namespace fs = std::filesystem;
+
+namespace lemon {
+namespace backends {
+
+using lemon::utils::path_from_utf8;
+using lemon::utils::path_to_utf8;
+
+// Default checkpoint resolution: the shared Hugging Face cache behavior.
+// Backends with bespoke layouts override resolve_checkpoint_path().
+//
+// Lookup order when ctx.variant is non-empty (first hit wins):
+//   1. <active snapshot>/<variant> as a direct relative path;
+//   2. a recursive scan of the active snapshot;
+//   3. a recursive scan of the whole model cache directory;
+//   4. a recursive scan of the main repo's cache directory, only when
+//      ctx.repo_id differs from ctx.main_repo_id (older downloads placed
+//      every file there).
+// A scan matches either a regular file whose name equals ctx.variant or a
+// directory that contains ctx.variant as a relative path.
+//
+// Parameters:
+//   info - unused by the default implementation.
+//   ctx  - checkpoint type, variant, repo ids and cache locations.
+// Returns the UTF-8 path of the match; "" for "npu_cache" checkpoints or when
+// the variant is not found (i.e. not downloaded); ctx.model_cache_path when no
+// variant is requested.
+std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info,
+                                                const CheckpointResolveContext& ctx) const {
+    (void)info;
+
+    // NPU side-cache checkpoints have no resolvable local file here (the backend
+    // that uses them resolves them itself at load time).
+    if (ctx.type == "npu_cache") {
+        return "";
+    }
+
+    // No variant: return the cache directory.
+    if (ctx.variant.empty()) {
+        return ctx.model_cache_path;
+    }
+
+    const fs::path model_cache_path_fs = path_from_utf8(ctx.model_cache_path);
+    const fs::path variant_rel = path_from_utf8(ctx.variant);
+
+    // Recursively search `root` for the variant. Iteration uses the error_code
+    // overloads throughout: an unreadable or vanished entry ends the scan
+    // instead of throwing filesystem_error out of a lookup whose not-found
+    // result is simply "". File names are compared as UTF-8 because
+    // ctx.variant is UTF-8 and path::string() is not guaranteed to be.
+    auto find_variant = [&](const fs::path& root) -> std::string {
+        std::error_code ec;
+        fs::recursive_directory_iterator it(root, hf_cache::dir_options(), ec);
+        const fs::recursive_directory_iterator end;
+        for (; !ec && it != end; it.increment(ec)) {
+            const fs::directory_entry& entry = *it;
+            std::error_code entry_ec;  // a per-entry status error just skips that entry
+            if (entry.is_regular_file(entry_ec)) {
+                if (path_to_utf8(entry.path().filename()) == ctx.variant) {
+                    return path_to_utf8(entry.path());
+                }
+            } else if (entry.is_directory(entry_ec)) {
+                fs::path nested = entry.path() / variant_rel;
+                if (hf_cache::exists(nested)) {
+                    return path_to_utf8(nested);
+                }
+            }
+        }
+        return std::string();
+    };
+
+    // Prefer refs/main for auxiliary checkpoints too (e.g. mmproj) so
+    // companion files stay on the active snapshot as the main model.
+    const fs::path active_snapshot = hf_cache::active_snapshot_path(model_cache_path_fs);
+    if (!active_snapshot.empty()) {
+        fs::path direct_variant_path = active_snapshot / variant_rel;
+        if (hf_cache::exists(direct_variant_path)) {
+            return path_to_utf8(direct_variant_path);
+        }
+        std::string hit = find_variant(active_snapshot);
+        if (!hit.empty()) return hit;
+    }
+
+    // Try to find the exact variant in the cache directory's subtree.
+    std::string hit = find_variant(model_cache_path_fs);
+    if (!hit.empty()) return hit;
+
+    // Backward-compat: older downloads placed all files in the main repo dir.
+    if (ctx.repo_id != ctx.main_repo_id) {
+        hit = find_variant(path_from_utf8(
+            ctx.hf_cache + "/" + hf_cache::repo_id_to_cache_dir_name(ctx.main_repo_id)));
+    }
+
+    // Empty when the variant was not found — signals "not downloaded".
+    return hit;
+}
+
+bool BackendOps::is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const {
+    if (ctx.model_manager == nullptr) return false;  // no manager to ask -> not downloaded
+    return ctx.model_manager->checkpoints_complete(info);  // default: shared HF completeness check
+}
+
+void BackendOps::download_model(const ModelInfo& info, bool do_not_upgrade,
+                                DownloadProgressCallback progress, const BackendOpsContext& ctx) const {
+    // Default: hand the model to the shared Hugging Face download engine.
+    // do_not_upgrade is not consulted on this default path.
+    (void)do_not_upgrade;
+    if (ctx.model_manager == nullptr) return;  // without a manager this is a no-op
+    ctx.model_manager->download_from_huggingface_engine(info, progress);
+}
+
+const BackendOps* default_backend_ops() {
+    static const BackendOps shared_default;  // function-local static: one instance for all callers
+    return &shared_default;
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp
new file mode 100644
index 000000000..abbeaf998
--- /dev/null
+++ b/src/cpp/server/backends/backend_registry.cpp
@@ -0,0 +1,49 @@
+#include "lemon/backends/backend_registry.h"
+#include "lemon/wrapped_server.h"
+
+// Generated from LEMON_BACKENDS at configure time. Defines
+// lemon::backends::generated_registrations(), pairing each descriptor with its
+// server class's create().
+#include "backend_factories_generated.h"
+
+namespace lemon {
+namespace backends {
+
+const std::vector<BackendRegistration>& all_registrations() {
+    static const auto registry = generated_registrations();  // generated list, built on first call
+    return registry;
+}
+
+const BackendSpec* spec_for(const std::string& recipe) {
+    // Linear scan of the registration table; nullptr when no backend claims the recipe.
+    for (const BackendRegistration& entry : all_registrations()) {
+        if (!(entry.descriptor->recipe == recipe)) continue;
+        return entry.spec;
+    }
+    return nullptr;
+}
+
+const BackendOps* ops_for(const std::string& recipe) {
+    // A registered recipe answers with its own ops; any other recipe gets the shared default.
+    for (const BackendRegistration& entry : all_registrations()) {
+        if (!(entry.descriptor->recipe == recipe)) continue;
+        return entry.ops;
+    }
+    return default_backend_ops();
+}
+
+std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx) {
+    // Build the server through the matching registration's factory and tag it
+    // with that registration's descriptor. Yields nullptr for an unknown
+    // recipe or when the factory itself produces no server.
+    for (const BackendRegistration& entry : all_registrations()) {
+        if (!(entry.descriptor->recipe == recipe)) continue;
+        std::unique_ptr<WrappedServer> instance = entry.create(ctx);
+        if (instance) instance->set_descriptor(entry.descriptor);
+        return instance;
+    }
+    return nullptr;
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
index bbed27684..42c0d1709 100644
--- a/src/cpp/server/backends/backend_utils.cpp
+++ b/src/cpp/server/backends/backend_utils.cpp
@@ -2,14 +2,7 @@
 #include "lemon/backends/install_staging.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
-#include "lemon/backends/llamacpp_server.h"
-#include "lemon/backends/whisper_server.h"
-#include "lemon/backends/sd_server.h"
-#include "lemon/backends/kokoro_server.h"
-#include "lemon/backends/ryzenaiserver.h"
-#include "lemon/backends/vllm_server.h"
-#include "lemon/backends/fastflowlm_server.h"
-#include "lemon/backends/moonshine_server.h"
+#include "lemon/backends/backend_registry.h"  // spec_for() — descriptor->install spec, no server includes
 #include "lemon/model_manager.h"  // For DownloadProgress, DownloadProgressCallback
 
 #include "lemon/utils/path_utils.h"
@@ -41,15 +34,9 @@ using json = nlohmann::json;
 namespace lemon::backends {
 
     const BackendSpec* try_get_spec_for_recipe(const std::string& recipe) {
-        if (recipe == "llamacpp") return &LlamaCppServer::SPEC;
-        if (recipe == "whispercpp") return &WhisperServer::SPEC;
-        if (recipe == "sd-cpp") return &SDServer::SPEC;
-        if (recipe == "kokoro") return &KokoroServer::SPEC;
-        if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC;
-        if (recipe == "vllm") return &VLLMServer::SPEC;
-        if (recipe == "flm") return &FastFlowLMServer::SPEC;
-        if (recipe == "moonshine") return &MoonshineServer::SPEC;
-        return nullptr;
+        // Each backend exposes its install/download spec through the registry
+        // (see <stem>::spec()); no per-recipe branches or server includes here.
+        return spec_for(recipe);
     }
 
     static std::string hash_string_from_json(const json& node) {
@@ -315,8 +302,8 @@ namespace lemon::backends {
                                               std::string& out_section,
                                               std::string& out_bin_key) {
         std::string config_backend = backend;
-        if ((recipe == "llamacpp" || recipe == "sd-cpp") &&
-            (backend == "rocm-stable" || backend == "rocm-nightly")) {
+        if ((recipe_has_rocm_channels(recipe) &&
+            (backend == "rocm-stable" || backend == "rocm-nightly"))) {
             config_backend = "rocm";
         }
         out_section = RuntimeConfig::recipe_to_config_section(recipe);
@@ -369,7 +356,7 @@ namespace lemon::backends {
 
         // Resolve "rocm" to actual channel for backends that support ROCm channels
         std::string resolved_backend = backend;
-        if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") {
+        if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") {
             std::string channel = "stable";  // default to stable
             if (auto* cfg = RuntimeConfig::global()) {
                 channel = cfg->rocm_channel_for_recipe(spec.recipe);
@@ -409,7 +396,7 @@ namespace lemon::backends {
         // directory or ROCm backends remain stuck in update_required after a
         // successful install.
         std::string resolved_backend = backend;
-        if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") {
+        if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") {
             std::string channel = "stable";
             if (auto* cfg = RuntimeConfig::global()) {
                 channel = cfg->rocm_channel_for_recipe(spec.recipe);
@@ -423,7 +410,7 @@ namespace lemon::backends {
 
     std::string BackendUtils::get_backend_version(const std::string& recipe, const std::string& backend) {
         std::string resolved_backend = backend;
-        if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") {
+        if (recipe_has_rocm_channels(recipe) && backend == "rocm") {
             // Map "rocm" to the appropriate channel based on config
             std::string channel = "stable";  // default to stable for now
             if (auto* cfg = RuntimeConfig::global()) {
diff --git a/src/cpp/server/backends/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
similarity index 90%
rename from src/cpp/server/backends/cloud_server.cpp
rename to src/cpp/server/backends/cloud/cloud_server.cpp
index b29fee4cc..3c61c213b 100644
--- a/src/cpp/server/backends/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -1,4 +1,6 @@
-#include "lemon/backends/cloud_server.h"
+#include "lemon/backends/cloud/cloud_server.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/model_manager.h"
 #include "lemon/cloud_provider_registry.h"
 #include "lemon/error_types.h"
 #include "lemon/runtime_config.h"
@@ -830,3 +832,81 @@ std::vector<ModelInfo> CloudServer::discover_models(const std::string& provider,
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace cloud {
+
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {  // builds a CloudServer from the context
+    return std::make_unique<CloudServer>(
+        ctx.model_info->cloud_provider, ctx.log_level,  // model_info is dereferenced unchecked: must be non-null
+        ctx.model_manager, ctx.backend_manager, ctx.cloud_registry);
+}
+
+
+namespace {
+class CloudOps : public BackendOps {
+public:
+    // Cloud-offloaded models have no local artifacts; the checkpoint is the
+    // upstream provider's model id, used directly when forwarding requests.
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext&) const override {
+        return "";
+    }
+
+    // Cloud models have no local artifacts — always "downloaded".
+    bool is_downloaded(const ModelInfo&, const BackendOpsContext&) const override { return true; }
+
+    // "Downloading" a cloud model is a no-op.
+    void download_model(const ModelInfo&, bool, DownloadProgressCallback,
+                        const BackendOpsContext&) const override {}
+
+    // Discover models from each installed cloud provider with a resolvable
+    // credential. Per AGENTS.md invariant #11 the registry persists only
+    // {provider, base_url}; keys come from env vars / process memory. Failures
+    // are logged, never propagated, so one offline provider can't block discovery.
+    std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const override {
+        std::vector<ModelInfo> out;
+        if (ctx.cloud_registry == nullptr) return out;  // no registry -> nothing to discover
+        for (const auto& rec : ctx.cloud_registry->list_installed()) {
+            const std::string api_key = ctx.cloud_registry->resolve_key(rec.name);
+            if (api_key.empty()) {
+                LOG(INFO, "CloudOps") << "Skipping cloud discovery for '" << rec.name
+                                      << "': no API key resolvable (set "
+                                      << CloudProviderRegistry::env_var_name(rec.name)
+                                      << " or POST /v1/cloud/auth)" << std::endl;
+                continue;
+            }
+            // A missing base_url is not a key problem; say so instead of asking for a key.
+            if (rec.base_url.empty()) {
+                LOG(INFO, "CloudOps") << "Skipping cloud discovery for '" << rec.name
+                                      << "': no base_url configured" << std::endl;
+                continue;
+            }
+            // Don't send the API key to a plaintext http:// endpoint unless the
+            // provider explicitly opted in (AGENTS.md invariant #11).
+            if (CloudProviderRegistry::is_http_base_url(rec.base_url) && !rec.allow_insecure_http) {
+                LOG(WARNING, "CloudOps") << "Skipping cloud discovery for '" << rec.name
+                                         << "': http:// with API key requires allow_insecure_http=true"
+                                         << std::endl;
+                continue;
+            }
+            try {
+                for (auto& m : CloudServer::discover_models(rec.name, api_key, rec.base_url)) {
+                    if (m.recipe == "cloud" && !m.model_name.empty()) out.push_back(std::move(m));
+                }
+            } catch (const std::exception& e) {
+                LOG(WARNING, "CloudOps") << "Cloud discovery threw for '" << rec.name
+                                         << "': " << e.what() << std::endl;
+            }
+        }
+        return out;
+    }
+};
+}  // namespace
+
+const BackendSpec* spec() { return nullptr; }  // the cloud backend exposes no BackendSpec
+const BackendOps* ops() { return single_ops<CloudOps>(); }  // CloudOps, obtained through single_ops<>
+}  // namespace cloud
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
new file mode 100644
index 000000000..83d2080bc
--- /dev/null
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -0,0 +1,753 @@
+#include "lemon/backends/fastflowlm/fastflowlm_models.h"
+
+#include <cstdlib>
+#include <vector>
+#include <nlohmann/json.hpp>
+#include "lemon/model_manager.h"
+#include "lemon/utils/aixlog.hpp"
+#include "lemon/utils/json_utils.h"
+#include "lemon/utils/path_utils.h"
+#include <sstream>
+#include <thread>
+#include <chrono>
+#include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_utils.h"
+#include "lemon/system_info.h"
+#include "lemon/utils/process_manager.h"
+
+namespace fs = std::filesystem;
+using json = nlohmann::json;
+
+namespace lemon {
+namespace backends {
+namespace fastflowlm {
+namespace {
+
+using lemon::utils::path_from_utf8;
+using lemon::utils::path_to_utf8;
+
+bool safe_exists(const fs::path& p) {
+    std::error_code ignored;  // non-throwing overload; the error itself is not inspected
+    return fs::exists(p, ignored);
+}
+
+// Ordered list of directories in which FLM may keep its models: the
+// FLM_MODEL_PATH override (set by the installer) comes first, followed by
+// the platform-default locations. Unset or empty variables contribute nothing.
+std::vector<fs::path> get_flm_models_dir_candidates() {
+    // Non-empty value of an environment variable, or nullptr.
+    auto env_value = [](const char* name) -> const char* {
+        const char* value = std::getenv(name);
+        return (value != nullptr && *value != '\0') ? value : nullptr;
+    };
+    std::vector<fs::path> candidates;
+    if (const char* override_root = env_value("FLM_MODEL_PATH")) {
+        candidates.push_back(path_from_utf8(override_root) / "models");
+    }
+
+#ifdef _WIN32
+    if (const char* profile = env_value("USERPROFILE")) {
+        const fs::path home_dir = path_from_utf8(profile);
+        candidates.push_back(home_dir / ".flm" / "models");              // current installer default
+        candidates.push_back(home_dir / "Documents" / "flm" / "models"); // legacy installer default
+        candidates.push_back(home_dir / "flm" / "models");
+    }
+#else
+    if (const char* xdg_root = env_value("XDG_CONFIG_HOME")) {
+        candidates.push_back(path_from_utf8(xdg_root) / "flm" / "models");
+    }
+    if (const char* home_env = env_value("HOME")) {
+        const fs::path home_dir = path_from_utf8(home_env);
+        candidates.push_back(home_dir / ".flm" / "models");
+        candidates.push_back(home_dir / ".config" / "flm" / "models");
+    }
+#endif
+
+    return candidates;
+}
+
+} // namespace
+
+fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) {  // first existing <root>/<repo_dir>/config.json, else empty
+    if (!repo_dir.empty()) {
+        for (const fs::path& models_root : get_flm_models_dir_candidates()) {
+            fs::path config_file = models_root / repo_dir / "config.json";
+            if (safe_exists(config_file)) return config_file;
+        }
+    }
+    return fs::path();
+}
+
+// Final path segment (the repo directory name) of a model repository URL.
+// The query/fragment is cut first, then a "/tree/..." or "/resolve/..."
+// suffix, then trailing slashes, so "https://h/org/repo/?x=1" yields "repo"
+// rather than "". A URL with no '/' is returned minus those suffixes.
+std::string repo_dir_from_url(const std::string& url) {
+    std::string clean = url.substr(0, url.find_first_of("?#"));
+    for (const char* marker : {"/tree/", "/resolve/"}) {
+        const size_t marker_pos = clean.find(marker);
+        if (marker_pos != std::string::npos) {
+            clean.erase(marker_pos);
+            break;
+        }
+    }
+    while (!clean.empty() && clean.back() == '/') clean.pop_back();
+    const size_t slash = clean.find_last_of('/');
+    return slash == std::string::npos ? clean : clean.substr(slash + 1);
+}
+
+int64_t read_flm_max_context_window(const ModelInfo& info) {
+    if (info.type != ModelType::LLM) return 0;  // only LLM entries are inspected
+    const std::string config_path = info.resolved_path("config");
+    if (config_path.empty()) return 0;
+    // Yields true when `node` holds an integer "max_position_embeddings";
+    // `limit` then receives it, with non-positive values clamped to 0.
+    auto read_limit = [](const json& node, int64_t& limit) -> bool {
+        if (!node.contains("max_position_embeddings")) return false;
+        const json& field = node["max_position_embeddings"];
+        if (!field.is_number_integer()) return false;
+        limit = field.get<int64_t>() > 0 ? field.get<int64_t>() : 0;
+        return true;
+    };
+    try {
+        const json config = lemon::utils::JsonUtils::load_from_file(config_path);
+        int64_t limit = 0;
+        if (read_limit(config, limit)) return limit;
+        const bool nested = config.contains("text_config") && config["text_config"].is_object();
+        if (nested && read_limit(config["text_config"], limit)) return limit;
+    } catch (const std::exception& e) {
+        LOG(DEBUG, "FastFlowLM") << "Could not read FLM config metadata for "
+                                 << info.model_name << ": " << e.what() << std::endl;
+    }
+    return 0;
+}
+
+std::string find_flm_binary() {
+    // "flm" recipe's "npu" binary; "" without a spec; exceptions take the fallback below.
+    try {
+        if (const backends::BackendSpec* flm_spec = try_get_spec_for_recipe("flm")) {
+            return BackendUtils::get_backend_binary_path(*flm_spec, "npu");
+        }
+        return "";
+    } catch (...) {
+#ifdef _WIN32
+        return "";
+#else
+        return find_flm_executable();
+#endif
+    }
+}
+
+std::vector<std::string> flm_installed_checkpoints() {  // checkpoint names the flm CLI reports as installed
+    std::vector<std::string> installed_models;
+
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) return installed_models;  // no FLM binary located -> empty list
+
+    // Run 'flm list --filter installed --quiet --json' to get only installed models
+    std::string output;
+#ifdef _WIN32
+    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);  // rc is not inspected; parsing decides
+#else
+    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null";
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        return installed_models;  // could not start the command -> empty list
+    }
+
+    char buffer[256];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {  // accumulate the command's stdout
+        output += buffer;
+    }
+
+    pclose(pipe);  // the exit status is not inspected
+#endif
+
+    // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] }
+    try {
+        json j = lemon::utils::JsonUtils::parse(output);
+        if (j.contains("models") && j["models"].is_array()) {
+            for (const auto& model : j["models"]) {
+                if (model.contains("name") && model["name"].is_string()) {
+                    installed_models.push_back(model["name"].get<std::string>());
+                }
+            }
+            return installed_models;  // JSON shape recognised: the legacy parser is skipped
+        }
+    } catch (...) {
+        // Fallback to legacy parsing if JSON parsing fails
+    }
+
+    // Legacy parsing - cleaner format without emojis. Also reached when the
+    // output is valid JSON that lacks a "models" array. Expected format:
+    //   Models:
+    //     - modelname:tag
+    //     - another:model
+    std::istringstream stream(output);
+    std::string line;
+    while (std::getline(stream, line)) {
+        // Trim whitespace
+        line.erase(0, line.find_first_not_of(" \t\r\n"));
+        line.erase(line.find_last_not_of(" \t\r\n") + 1);
+
+        // Skip the "Models:" header line or empty lines
+        if (line == "Models:" || line.empty()) {
+            continue;
+        }
+
+        // Parse model checkpoint (format: "  - modelname:tag")
+        if (line.find("- ") == 0) {  // only lines that start with "- " after trimming
+            std::string checkpoint = line.substr(2);
+            // Trim any remaining whitespace
+            checkpoint.erase(0, checkpoint.find_first_not_of(" \t"));
+            checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1);
+            if (!checkpoint.empty()) {
+                installed_models.push_back(checkpoint);
+            }
+        }
+    }
+
+    return installed_models;
+}
+
+std::vector<ModelInfo> flm_discover_models() {  // models listed by `flm list --json`, converted to ModelInfo
+    std::vector<ModelInfo> flm_models;
+    if (!SystemInfoCache::get_flm_status().is_ready()) {  // FLM not ready -> report no models
+        return flm_models;
+    }
+
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) return flm_models;  // no FLM binary located -> report no models
+
+    LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl;
+
+    // Run 'flm list --json' to get all available models
+    std::string output;
+#ifdef _WIN32
+    std::string command = "\"" + flm_path + "\" list --json";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);
+    LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc
+              << ", output length: " << output.size() << std::endl;
+    if (rc != 0 || output.empty()) {  // warn only; the parse below still runs on whatever was captured
+        LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. "
+                  << "Output: " << output.substr(0, 200) << std::endl;
+    }
+#else
+    std::string command = "\"" + flm_path + "\" list --json 2>/dev/null";
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        return flm_models;  // could not start the command -> report no models
+    }
+
+    char buffer[256];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {  // accumulate the command's stdout
+        output += buffer;
+    }
+
+    pclose(pipe);  // the exit status is not inspected
+#endif
+
+    // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] }
+    try {
+        json j = lemon::utils::JsonUtils::parse(output);
+        if (j.contains("models") && j["models"].is_array()) {
+            for (const auto& m : j["models"]) {
+                if (m.contains("name") && m["name"].is_string()) {  // entries without a string "name" are skipped
+                    std::string checkpoint = m["name"].get<std::string>();
+
+                    // Format display name: replace : with -, append -FLM
+                    // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM"
+                    std::string display_name = checkpoint;
+                    // Replace : with -
+                    std::replace(display_name.begin(), display_name.end(), ':', '-');
+
+                    std::string model_name = display_name + "-FLM";
+
+                    ModelInfo info;
+                    info.model_name = model_name;
+                    info.checkpoints["main"] = checkpoint;  // the raw FLM name, ':' kept
+                    info.recipe = "flm";
+                    info.suggested = true; // All official FLM models are suggested
+                    info.downloaded = lemon::utils::JsonUtils::get_or_default<bool>(m, "installed", false);
+
+                    if (lemon::utils::JsonUtils::get_or_default<bool>(m, "installed", false) && m.contains("url") && m["url"].is_string()) {
+                        fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir(
+                            backends::fastflowlm::repo_dir_from_url(m["url"].get<std::string>()));
+                        if (!config_path.empty()) {  // only recorded when a config.json was located on disk
+                            info.resolved_paths["config"] = path_to_utf8(config_path);
+                        }
+                    }
+
+                    // Size in GB (footprint field contains disk size in GB)
+                    if (m.contains("footprint") && m["footprint"].is_number()) {
+                        info.size = m["footprint"].get<double>();
+                    }
+
+                    // Labels from FLM metadata
+                    if (m.contains("label") && m["label"].is_array()) {
+                        for (const auto& l : m["label"]) {
+                            if (l.is_string()) {  // non-string labels are ignored
+                                info.labels.push_back(l.get<std::string>());
+                            }
+                        }
+                    }
+
+                    // Populate type and device fields (multi-model support)
+                    info.type = get_model_type_from_labels(info.labels);
+                    const BackendDescriptor* flm_desc = descriptor_for("flm");
+                    info.device = flm_desc ? flm_desc->default_device : DEVICE_NPU;  // DEVICE_NPU when no "flm" descriptor exists
+
+                    flm_models.push_back(info);
+                }
+            }
+        }
+    } catch (const std::exception& e) {
+        LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl;
+    } catch (...) {
+        LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl;
+    }
+
+    return flm_models;  // may hold the entries parsed before an exception was caught
+}
+
+
+void flm_download(const std::string& checkpoint, bool do_not_upgrade,
+                  DownloadProgressCallback progress_callback) {
+    LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl;
+
+    // Ensure FLM is ready (single source of truth)
+    auto status = SystemInfoCache::get_flm_status();
+    if (!status.is_ready()) {
+        throw std::runtime_error(status.error_string());
+    }
+
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) {
+        throw std::runtime_error("FLM executable not found");
+    }
+
+    // Prepare arguments
+    std::vector<std::string> args = {"pull", checkpoint};
+    if (!do_not_upgrade) {
+        args.push_back("--force");
+    }
+
+    LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\"";
+    for (const auto& arg : args) {
+        LOG(INFO, "ProcessManager") << " \"" << arg << "\"";
+    }
+    LOG(INFO, "ProcessManager") << std::endl;
+
+    // State for parsing FLM output
+    int total_files = 0;
+    int current_file_index = 0;
+    std::string current_filename;
+    bool cancelled = false;
+
+    // Run flm pull command and parse output
+    int exit_code = lemon::utils::ProcessManager::run_process_with_output(
+        flm_path, args,
+        [&](const std::string& line) -> bool {
+            // Always print the line to console
+            LOG(INFO, "FLM") << line << std::endl;
+
+            // Parse FLM output to extract progress information
+            // Pattern: "[FLM]  Downloading X/Y: filename"
+            if (line.find("[FLM]  Downloading ") != std::string::npos &&
+                line.find("/") != std::string::npos &&
+                line.find(":") != std::string::npos) {
+
+                // Extract "X/Y: filename" from "[FLM]  Downloading X/Y: filename"
+                size_t start = line.find("Downloading ") + 12;
+                size_t slash = line.find("/", start);
+                size_t colon = line.find(":", slash);
+
+                if (slash != std::string::npos && colon != std::string::npos) {
+                    try {
+                        current_file_index = std::stoi(line.substr(start, slash - start));
+                        total_files = std::stoi(line.substr(slash + 1, colon - slash - 1));
+                        current_filename = line.substr(colon + 2);  // Skip ": "
+
+                        // Send progress update
+                        if (progress_callback) {
+                            DownloadProgress progress;
+                            progress.file = current_filename;
+                            progress.file_index = current_file_index;
+                            progress.total_files = total_files;
+                            progress.bytes_downloaded = 0;
+                            progress.bytes_total = 0;
+                            progress.percent = (total_files > 0) ?
+                                ((current_file_index - 1) * 100 / total_files) : 0;
+
+                            if (!progress_callback(progress)) {
+                                cancelled = true;
+                                return false;  // Kill the process
+                            }
+                        }
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+            // Pattern: "[FLM]  Downloading: XX.X% (XXX.XMB / XXX.XMB)"
+            else if (line.find("[FLM]  Downloading: ") != std::string::npos &&
+                     line.find("%") != std::string::npos) {
+
+                // Extract percentage and bytes
+                size_t start = line.find("Downloading: ") + 13;
+                size_t pct_end = line.find("%", start);
+
+                if (pct_end != std::string::npos) {
+                    try {
+                        std::string pct_str = line.substr(start, pct_end - start);
+                        double file_percent = std::stod(pct_str);
+
+                        // Try to extract bytes (XXX.XMB / XXX.XMB)
+                        size_t open_paren = line.find("(", pct_end);
+                        size_t slash = line.find("/", open_paren);
+                        size_t close_paren = line.find(")", slash);
+
+                        size_t bytes_downloaded = 0;
+                        size_t bytes_total = 0;
+
+                        if (open_paren != std::string::npos && slash != std::string::npos) {
+                            std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1);
+                            std::string total_str = line.substr(slash + 1, close_paren - slash - 1);
+
+                            // Parse "XXX.XMB" format
+                            auto parse_size = [](const std::string& s) -> size_t {
+                                double val = 0;
+                                size_t mb_pos = s.find("MB");
+                                size_t gb_pos = s.find("GB");
+                                size_t kb_pos = s.find("KB");
+
+                                if (mb_pos != std::string::npos) {
+                                    val = std::stod(s.substr(0, mb_pos));
+                                    return static_cast<size_t>(val * 1024 * 1024);
+                                } else if (gb_pos != std::string::npos) {
+                                    val = std::stod(s.substr(0, gb_pos));
+                                    return static_cast<size_t>(val * 1024 * 1024 * 1024);
+                                } else if (kb_pos != std::string::npos) {
+                                    val = std::stod(s.substr(0, kb_pos));
+                                    return static_cast<size_t>(val * 1024);
+                                }
+                                return 0;
+                            };
+
+                            bytes_downloaded = parse_size(downloaded_str);
+                            bytes_total = parse_size(total_str);
+                        }
+
+                        // Send progress update with byte-level info
+                        if (progress_callback) {
+                            DownloadProgress progress;
+                            progress.file = current_filename;
+                            progress.file_index = current_file_index;
+                            progress.total_files = total_files;
+                            progress.bytes_downloaded = bytes_downloaded;
+                            progress.bytes_total = bytes_total;
+                            // Use intra-file percent when we have byte-level progress
+                            progress.percent = static_cast<int>(file_percent);
+
+                            if (!progress_callback(progress)) {
+                                cancelled = true;
+                                return false;  // Kill the process
+                            }
+                        }
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+            // Pattern: "[FLM]  Overall progress: XX.X% (X/Y files)"
+            else if (line.find("[FLM]  Overall progress: ") != std::string::npos) {
+                size_t start = line.find("progress: ") + 10;
+                size_t pct_end = line.find("%", start);
+
+                if (pct_end != std::string::npos) {
+                    try {
+                        int overall_percent = static_cast<int>(std::stod(line.substr(start, pct_end - start)));
+
+                        if (progress_callback) {
+                            DownloadProgress progress;
+                            progress.file = current_filename;
+                            progress.file_index = current_file_index;
+                            progress.total_files = total_files;
+                            progress.bytes_downloaded = 0;  // Not available for overall progress
+                            progress.bytes_total = 0;
+                            progress.percent = overall_percent;
+
+                            if (!progress_callback(progress)) {
+                                cancelled = true;
+                                return false;  // Kill the process
+                            }
+                        }
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+            // Pattern: "[FLM]  Missing files (N):"
+            else if (line.find("[FLM]  Missing files (") != std::string::npos) {
+                size_t start = line.find("(") + 1;
+                size_t end = line.find(")", start);
+                if (end != std::string::npos) {
+                    try {
+                        total_files = std::stoi(line.substr(start, end - start));
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+
+            return true;  // Continue
+        },
+        "",  // Working directory
+        3600  // 1 hour timeout for large model downloads
+    );
+
+    if (cancelled) {
+        LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl;
+        throw std::runtime_error("Download cancelled");
+    }
+
+    if (exit_code != 0) {
+        LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl;
+        throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code));
+    }
+
+    // Send completion event
+    if (progress_callback) {
+        DownloadProgress progress;
+        progress.complete = true;
+        progress.file_index = total_files;
+        progress.total_files = total_files;
+        progress.percent = 100;
+        (void)progress_callback(progress);  // Ignore return - download already complete
+    }
+
+    LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl;
+}
+
+
+std::string flm_version() {
+    // Returns the installed FLM version with a leading 'v' (e.g. "v0.9.34"), or
+    // "unknown" when it cannot be determined. Real version strings are cached to
+    // avoid spawning the subprocess twice per build_recipes_info() pass; "unknown"
+    // is NOT cached so that post-install verification in fastflowlm_server.cpp
+    // gets a fresh result after FLM is installed.
+    static std::string cached_version;
+    if (!cached_version.empty()) {
+        return cached_version;
+    }
+
+    // Find the flm executable using shared utility
+    std::string flm_path = find_flm_executable();
+    if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) {
+        return "unknown";
+    }
+
+    std::string output;
+    #ifdef _WIN32
+    std::string command = "\"" + flm_path + "\" version --json 2>NUL";
+    // Exit status is deliberately ignored: only the captured text is parsed.
+    (void)lemon::utils::ProcessManager::run_command(command, output);
+    #else
+    std::string command = "\"" + flm_path + "\" version --json 2>/dev/null";
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        return "unknown";
+    }
+
+    char buffer[256];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+    pclose(pipe);
+    #endif
+
+    // Parse JSON output: { "version": "0.9.34" }
+    try {
+        json j = lemon::utils::JsonUtils::parse(output);
+        if (j.contains("version") && j["version"].is_string()) {
+            std::string version = j["version"].get<std::string>();
+            // An empty version is not a real version: caching/returning "" would
+            // hand callers neither a version nor the "unknown" sentinel, so fall
+            // through to the legacy parser instead.
+            if (!version.empty()) {
+                // Prepend 'v' when missing, for backend_versions.json compatibility.
+                cached_version = (version[0] == 'v') ? version : "v" + version;
+                return cached_version;
+            }
+        }
+    } catch (...) {
+        // Fallback to legacy parsing if JSON parsing fails
+    }
+
+    // Legacy parsing from output like "FLM v0.9.4"
+    const size_t pos = output.find("FLM v");
+    if (pos != std::string::npos) {
+        // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34").
+        std::string version = output.substr(pos + 4);
+        // Trim at the first whitespace/newline after the version token.
+        version = version.substr(0, version.find_first_of(" \t\n\r"));
+        cached_version = version;
+        return cached_version;
+    }
+
+    return "unknown";
+}
+
+
+std::string find_flm_executable() {
+#ifdef _WIN32
+    // Windows: search only the Lemonade-managed install tree (auto-installed zip) for
+    // flm.exe. The system PATH is never consulted; FLM comes from install_backend().
+    const std::string flm_root = (fs::path(lemon::utils::get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string();
+    if (!fs::exists(flm_root)) {
+        return "";
+    }
+    for (const auto& candidate : fs::recursive_directory_iterator(flm_root)) {
+        if (!candidate.is_regular_file() || candidate.path().filename().string() != "flm.exe") {
+            continue;
+        }
+        // The first flm.exe that passes the executable-path safety check wins.
+        const std::string exe = candidate.path().string();
+        if (lemon::utils::is_safe_executable_path(exe)) {
+            return exe;
+        }
+    }
+    return "";
+#else
+    // Walks PATH directly instead of shelling out to `which`, which minimal containers may lack.
+    return lemon::utils::find_executable_in_path("flm").empty() ? "" : "flm";
+#endif
+}
+
+bool run_flm_validate(const std::string& flm_path, std::string& error_message) {
+    // Runs `flm validate --json` and reports whether the NPU stack is usable.
+    // flm_path: executable to run; when empty, find_flm_executable() is used.
+    // error_message: cleared on success, otherwise set to a human-readable reason.
+    // Returns true when the JSON object reports "ready": true; output that is not a
+    // usable JSON object falls back to the process exit status (0 means success).
+    std::string flm_exe = flm_path.empty() ? find_flm_executable() : flm_path;
+    if (flm_exe.empty()) {
+        error_message = "FLM executable not found";
+        return false;
+    }
+    if (!lemon::utils::is_safe_executable_path(flm_exe)) {
+        error_message = "FLM path contains invalid characters";
+        return false;
+    }
+
+    std::string command = "\"" + flm_exe + "\" validate --json";
+    std::string output;
+    int exit_code;
+#ifdef _WIN32
+    exit_code = lemon::utils::ProcessManager::run_command(command, output);
+#else
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        error_message = "Failed to execute " + flm_exe;
+        return false;
+    }
+    char buffer[1024];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+
+    // WEXITSTATUS is only meaningful after a normal exit. A child killed by a signal
+    // (e.g. a crash) typically decodes as status 0 and would pass as "validated".
+    const int wait_status = pclose(pipe);
+    exit_code = (wait_status != -1 && WIFEXITED(wait_status)) ? WEXITSTATUS(wait_status) : -1;
+#endif
+
+    try {
+        if (!output.empty()) {
+            json j = lemon::utils::JsonUtils::parse(output);
+            if (j.is_object()) {
+                // Check for overall status
+                bool validation_ok = false;
+                if (j.contains("ready")) {
+                    validation_ok = j["ready"].get<bool>();
+                }
+                if (validation_ok) {
+                    error_message.clear();
+                    return true;
+                }
+
+                // Not ready: translate each failed sub-check into a message.
+                std::vector<std::string> errors;
+                if (j.contains("amd_device_found") && !j["amd_device_found"].get<bool>()) {
+                    errors.push_back("No AMD NPU device found.");
+                }
+                if (j.contains("all_fw_ok") && !j["all_fw_ok"].get<bool>()) {
+                    errors.push_back("NPU firmware is incompatible.");
+                }
+                if (j.contains("kernel_ok") && !j["kernel_ok"].get<bool>()) {
+                    errors.push_back("Kernel version is incompatible.");
+                }
+                if (j.contains("memlock_ok") && !j["memlock_ok"].get<bool>()) {
+                    errors.push_back("Memlock limits are too low.");
+                }
+                if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get<bool>()) {
+                    errors.push_back("NPU driver version is too old.");
+                }
+
+                if (errors.empty()) {
+                    error_message = "NPU validation failed.";
+                } else {
+                    error_message = "";
+                    for (size_t i = 0; i < errors.size(); ++i) {
+                        error_message += errors[i] + (i == errors.size() - 1 ? "" : " ");
+                    }
+                }
+                return false;
+            }
+        }
+    } catch (...) {
+        // Fallback for non-JSON output or parsing error
+    }
+
+    if (exit_code != 0) {
+        error_message = "flm validate failed with exit code " + std::to_string(exit_code);
+        return false;
+    }
+
+    error_message.clear();
+    return true;
+}
+
+
+void flm_remove(const std::string& checkpoint) {
+    // Runs `flm remove <checkpoint>`; throws std::runtime_error if the checkpoint is empty,
+    // FLM is missing, the command exits non-zero, or it is still running after the wait.
+    if (checkpoint.empty()) {
+        throw std::runtime_error("FLM model has empty checkpoint field, cannot delete");
+    }
+    std::string flm_exe = find_flm_binary();
+    if (flm_exe.empty()) {
+        throw std::runtime_error("FLM executable not found");
+    }
+    std::vector<std::string> remove_args = {"remove", checkpoint};
+    auto proc = lemon::utils::ProcessManager::start_process(flm_exe, remove_args, "", false);
+    // Poll every 100 ms, at most 60 * 10 times (about 60 seconds), then give up.
+    for (int polls_left = 60 * 10; polls_left > 0; --polls_left) {
+        if (lemon::utils::ProcessManager::is_running(proc)) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            continue;
+        }
+        const int rc = lemon::utils::ProcessManager::get_exit_code(proc);
+        if (rc == 0) return;
+        throw std::runtime_error("FLM remove failed for " + checkpoint +
+                                 " (exit code " + std::to_string(rc) + ")");
+    }
+    lemon::utils::ProcessManager::stop_process(proc);
+    throw std::runtime_error("FLM remove timed out for " + checkpoint);
+}
+
+} // namespace fastflowlm
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
similarity index 80%
rename from src/cpp/server/backends/fastflowlm_server.cpp
rename to src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index dc38928e3..050b5a961 100644
--- a/src/cpp/server/backends/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -1,5 +1,10 @@
-#include "lemon/backends/fastflowlm_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm.h"
+#include "lemon/backends/fastflowlm/fastflowlm_models.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/model_manager.h"
 #include "lemon/system_info.h"
 #include "lemon/error_types.h"
 #include "lemon/utils/process_manager.h"
@@ -9,6 +14,7 @@
 #include <iostream>
 #include <filesystem>
 #include <cstdlib>
+#include <optional>
 #include <thread>
 #include <chrono>
 #include <fstream>
@@ -160,13 +166,13 @@ void FastFlowLMServer::load(const std::string& model_name,
 
 #ifdef _WIN32
     // On Windows, auto-install FLM binary if needed (downloads zip and extracts)
-    backend_manager_->install_backend(SPEC.recipe, "npu");
+    backend_manager_->install_backend(fastflowlm::spec()->recipe, "npu");
 #endif
 
     // Validate NPU hardware/drivers
     std::string flm_path = get_flm_path();
     std::string validate_error;
-    if (!utils::run_flm_validate(flm_path, validate_error)) {
+    if (!fastflowlm::run_flm_validate(flm_path, validate_error)) {
         throw std::runtime_error("FLM NPU validation failed: " + validate_error +
             "\nVisit " + DRIVER_INSTALL_URL + " for driver installation instructions.");
     }
@@ -444,7 +450,7 @@ std::string FastFlowLMServer::get_flm_path() {
 #ifdef _WIN32
     // On Windows, use the standard install directory (auto-installed zip)
     try {
-        std::string path = BackendUtils::get_backend_binary_path(SPEC, "npu");
+        std::string path = BackendUtils::get_backend_binary_path(*fastflowlm::spec(), "npu");
         LOG(INFO, "FastFlowLM") << "Found flm at: " << path << std::endl;
         return path;
     } catch (const std::exception& e) {
@@ -453,7 +459,7 @@ std::string FastFlowLMServer::get_flm_path() {
     }
 #else
     // On Linux, FLM is installed as a system package (in PATH)
-    std::string flm_path = utils::find_flm_executable();
+    std::string flm_path = fastflowlm::find_flm_executable();
     if (!flm_path.empty()) {
         LOG(INFO, "FastFlowLM") << "Found flm at: " << flm_path << std::endl;
     } else {
@@ -465,3 +471,101 @@ std::string FastFlowLMServer::get_flm_path() {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace fastflowlm {
+// Factory for this backend: builds a FastFlowLMServer from the given context via make_server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return make_server<FastFlowLMServer>(ctx);
+}
+
+namespace {
+// FLM model-management behavior, expressed as BackendOps overrides.
+class FlmOps : public BackendOps {
+public:
+    // Fills in max_context_window using read_flm_max_context_window(info).
+    void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override {
+        info.max_context_window = read_flm_max_context_window(info);
+    }
+
+    // FLM has no local model file: the checkpoint string (e.g. "gemma3:4b") is the path.
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext& ctx) const override {
+        return ctx.checkpoint;
+    }
+
+    std::vector<ModelInfo> discover_models(const BackendOpsContext&) const override {
+        return flm_discover_models();
+    }
+
+    // True when info.checkpoint() appears in flm_installed_checkpoints().
+    bool is_downloaded(const ModelInfo& info, const BackendOpsContext&) const override {
+        const auto installed = flm_installed_checkpoints();
+        const auto hit = std::find(installed.begin(), installed.end(), info.checkpoint());
+        return hit != installed.end();
+    }
+
+    void download_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress,
+                        const BackendOpsContext&) const override {
+        flm_download(info.checkpoint(), do_not_upgrade, progress);
+    }
+
+    bool invalidates_cache_after_download() const override { return true; }
+
+    // On Linux FLM is a system package with no version.txt, so an empty/"unknown"
+    // file version is replaced by whatever the CLI reports.
+    std::string resolve_version(const std::string&, const std::string& file_version) const override {
+        const bool have_file_version = !file_version.empty() && file_version != "unknown";
+        return have_file_version ? file_version : flm_version();
+    }
+
+    // On Linux FLM is a system package on PATH, not in the managed install dir, so
+    // a PATH hit counts as installed even when the managed binary is absent.
+    InstallCheck check_install(const std::string&, bool binary_found) const override {
+        const bool installed = binary_found || !find_flm_executable().empty();
+        return {installed, ""};
+    }
+
+    // FLM needs richer state to guide users through manual setup (installing the
+    // .deb, xrt drivers, etc.) rather than an automatic backend install.
+    std::optional<UnavailableState> classify_unavailable(
+        const std::string&, const std::string& install_error,
+        const std::string& default_install_command) const override {
+        const auto mentions = [&install_error](const char* needle) {
+            return install_error.find(needle) != std::string::npos;
+        };
+        const bool not_installed =
+            install_error.empty() || mentions("not installed") || mentions("not found");
+        const bool version_mismatch = mentions("requires");
+
+        UnavailableState result;
+        // "Not installed" takes precedence over a version mismatch.
+        result.state = not_installed      ? "installable"
+                     : version_mismatch   ? "update_required"
+                                          : "action_required";
+        result.message = install_error;
+        result.attach_installed_version = !not_installed;
+
+#ifdef __linux__
+        (void)default_install_command;
+        result.action = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot";
+#elif defined(_WIN32)
+        if (not_installed || version_mismatch) {
+            result.action = default_install_command;
+        } else {
+            result.action = "Visit https://lemonade-server.ai/driver_install.html";
+        }
+#else
+        result.action = default_install_command;
+#endif
+        return result;
+    }
+};
+}  // namespace
+// Accessors for this backend's BackendSpec (built from `descriptor`) and its BackendOps (FlmOps).
+const BackendSpec* spec() { return make_spec<FastFlowLMServer>(descriptor); }
+const BackendOps* ops() { return single_ops<FlmOps>(); }
+}  // namespace fastflowlm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/hf_cache_util.cpp b/src/cpp/server/backends/hf_cache_util.cpp
new file mode 100644
index 000000000..028b25ee4
--- /dev/null
+++ b/src/cpp/server/backends/hf_cache_util.cpp
@@ -0,0 +1,72 @@
+#include "lemon/backends/hf_cache_util.h"
+
+#include <fstream>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+namespace fs = std::filesystem;
+
+namespace lemon {
+namespace backends {
+namespace hf_cache {
+
+bool exists(const fs::path& p) {
+#ifdef _WIN32
+    // Query attributes through Win32 directly: the HF cache dedups with symlinks, and MSVC's
+    // std::filesystem refuses "untrusted" reparse points when the token lacks symlink privilege.
+    const DWORD attributes = GetFileAttributesW(p.c_str());
+    return attributes != INVALID_FILE_ATTRIBUTES;
+#else
+    std::error_code ignored;  // non-throwing overload: errors are reported as "does not exist"
+    return fs::exists(p, ignored);
+#endif
+}
+
+fs::directory_options dir_options() {  // options callers pass when iterating cache directories
+#ifndef _WIN32
+    return fs::directory_options::none;
+#else
+    return fs::directory_options::skip_permission_denied;  // Windows only: skip denied entries
+#endif
+}
+
+namespace {
+std::string read_ref_main(const fs::path& model_cache_path) {
+    // Returns the first line of <model_cache_path>/refs/main with surrounding
+    // whitespace removed, or "" when the file cannot be opened or the line is blank.
+    std::ifstream main_ref(model_cache_path / "refs" / "main");
+    if (!main_ref.is_open()) {
+        return "";
+    }
+    std::string line;
+    std::getline(main_ref, line);
+    // Trim leading/trailing whitespace (including a stray '\r').
+    const char* const blanks = " \t\r\n";
+    const size_t first = line.find_first_not_of(blanks);
+    const size_t last = line.find_last_not_of(blanks);
+    return first == std::string::npos ? std::string() : line.substr(first, last - first + 1);
+}
+} // namespace
+
+fs::path active_snapshot_path(const fs::path& model_cache_path) {
+    // Maps refs/main to snapshots/<ref>; an empty path means no ref or no such snapshot.
+    const std::string revision = read_ref_main(model_cache_path);
+    if (revision.empty()) return fs::path();
+    const fs::path candidate = model_cache_path / "snapshots" / revision;
+    if (!lemon::backends::hf_cache::exists(candidate)) return fs::path();
+    return candidate;
+}
+
+std::string repo_id_to_cache_dir_name(const std::string& repo_id) {
+    std::string dir_name("models--");  // e.g. "org/name" -> "models--org--name"
+    for (const char ch : repo_id) {
+        dir_name.append(ch == '/' ? 2 : 1, ch == '/' ? '-' : ch);  // '/' -> "--", else copy ch
+    }
+    return dir_name;
+}
+
+} // namespace hf_cache
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
similarity index 80%
rename from src/cpp/server/backends/kokoro_server.cpp
rename to src/cpp/server/backends/kokoro/kokoro_server.cpp
index 7a707cd7e..95d46de6a 100644
--- a/src/cpp/server/backends/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -1,5 +1,12 @@
-#include "lemon/backends/kokoro_server.h"
+#include "lemon/backends/kokoro/kokoro_server.h"
+#include "lemon/backends/kokoro/kokoro.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/model_manager.h"
+#include "lemon/utils/path_utils.h"
+#include <filesystem>
 #include "lemon/backend_manager.h"
 #include "lemon/utils/process_manager.h"
 #include "lemon/utils/json_utils.h"
@@ -68,7 +75,7 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
 
     // Install kokoros if needed
     const std::string backend = default_kokoro_backend();
-    backend_manager_->install_backend(SPEC.recipe, backend);
+    backend_manager_->install_backend(kokoro::spec()->recipe, backend);
 
     // Use pre-resolved model path
     fs::path model_path = fs::path(model_info.resolved_path());
@@ -88,7 +95,7 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
     LOG(INFO, "KokoroServer") << "Using model: " << model_index["model"] << std::endl;
 
     // Get koko executable path
-    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend);
+    std::string exe_path = BackendUtils::get_backend_binary_path(*kokoro::spec(), backend);
 
     // Choose a port
     port_ = choose_port();
@@ -203,3 +210,38 @@ void KokoroServer::audio_speech(const json& request, httplib::DataSink& sink) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace kokoro {
+// Factory for this backend: builds a KokoroServer from the given context via make_server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return make_server<KokoroServer>(ctx);
+}
+
+
+namespace {
+class KokoroOps : public BackendOps {
+public:
+    // A Kokoro model is a directory: resolve to the first index.json encountered beneath
+    // the model cache path, or to the cache path itself when there is none.
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext& ctx) const override {
+        const std::filesystem::path model_dir = lemon::utils::path_from_utf8(ctx.model_cache_path);
+        if (!hf_cache::exists(model_dir)) {
+            return ctx.model_cache_path;
+        }
+        for (const auto& item : std::filesystem::recursive_directory_iterator(model_dir, hf_cache::dir_options())) {
+            const bool is_index = item.is_regular_file() && item.path().filename() == "index.json";
+            if (is_index) return lemon::utils::path_to_utf8(item.path());
+        }
+        return ctx.model_cache_path;  // directory even if index not found
+    }
+};
+}  // namespace
+// Accessors for this backend's BackendSpec (built from `descriptor`) and its BackendOps (KokoroOps).
+const BackendSpec* spec() { return make_spec<KokoroServer>(descriptor); }
+const BackendOps* ops() { return single_ops<KokoroOps>(); }
+}  // namespace kokoro
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
new file mode 100644
index 000000000..81cc1c555
--- /dev/null
+++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
@@ -0,0 +1,250 @@
+#include "lemon/backends/llamacpp/llamacpp_gguf.h"
+
+#include <algorithm>
+#include <cctype>
+#include <filesystem>
+#include <map>
+#include <vector>
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/hf_variants.h"
+#include "lemon/utils/aixlog.hpp"
+#include "lemon/utils/path_utils.h"
+
+namespace fs = std::filesystem;
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+namespace {
+
+using lemon::utils::path_from_utf8;
+using lemon::utils::path_to_utf8;
+
+std::string to_lower(std::string s) {
+    for (char& ch : s) ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));  // per byte, in place
+    return s;
+}
+
+} // namespace
+
+std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant) {
+    fs::path model_cache_path_fs = path_from_utf8(model_cache_path);
+    if (!hf_cache::exists(model_cache_path_fs)) {
+        return model_cache_path;  // Return directory path even if not found
+    }
+
+    // Collect the (sorted, mmproj-excluded) GGUF files under a search root.
+    auto collect_gguf_files = [](const fs::path& search_root) {
+        std::vector<std::string> files;
+        if (search_root.empty() || !hf_cache::exists(search_root)) {
+            return files;
+        }
+
+        std::error_code ec;
+        for (const auto& entry : fs::recursive_directory_iterator(search_root, hf_cache::dir_options(), ec)) {
+            if (ec) break;
+            if (!entry.is_regular_file(ec)) {
+                ec.clear();
+                continue;
+            }
+
+            std::string filename = entry.path().filename().string();
+            std::string filename_lower = filename;
+            std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
+
+            if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) {
+                files.push_back(path_to_utf8(entry.path()));
+            }
+        }
+        // Sort for consistent ordering (important for sharded models) and so the
+        // active/whole-cache sets compare equal when they hold the same files.
+        std::sort(files.begin(), files.end());
+        return files;
+    };
+
+    const std::string variant_lower = to_lower(variant);
+
+    // Resolve the requested GGUF variant within a candidate list of files.
+    // Returns the matched absolute path, or "" if this candidate set does not
+    // contain the variant. Factored into a lambda so the search can be retried
+    // against a broader set of snapshots (see #2300 below) without duplicating
+    // the matching logic.
+    auto resolve_gguf_variant = [&](const std::vector<std::string>& gguf_files) -> std::string {
+        if (gguf_files.empty()) {
+            return "";
+        }
+
+        // Case 0: Wildcard (*) - return first file (llama-server auto-loads shards)
+        if (variant == "*") {
+            return gguf_files[0];
+        }
+
+        // Case 1: Empty variant - return first file
+        if (variant.empty()) {
+            return gguf_files[0];
+        }
+
+        // Case 2: Exact filename match (variant ends with .gguf)
+        if (variant.find(".gguf") != std::string::npos) {
+            for (const auto& filepath : gguf_files) {
+                if (path_from_utf8(filepath).filename().string() == variant) {
+                    return filepath;
+                }
+            }
+            return "";  // Exact variant not found in this candidate set
+        }
+
+        // Case 3: Files ending with {variant}.gguf (case insensitive)
+        const std::string suffix = variant_lower + ".gguf";
+        for (const auto& filepath : gguf_files) {
+            std::string filename_lower = to_lower(path_from_utf8(filepath).filename().string());
+            if (filename_lower.size() >= suffix.size() &&
+                filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) {
+                return filepath;
+            }
+        }
+
+        // Case 4: Folder-based sharding (files in variant/ folder)
+        const std::string folder_prefix_lower = variant_lower + "/";
+        for (const auto& filepath : gguf_files) {
+            std::string relative_lower = to_lower(path_to_utf8(
+                path_from_utf8(filepath).lexically_relative(model_cache_path_fs)));
+            std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/');
+            if (relative_lower.find(folder_prefix_lower) != std::string::npos) {
+                return filepath;
+            }
+        }
+
+        // Case 5: Local quant-token fallback.
+        //
+        // Keep the existing resolver cases above as the primary logic: exact
+        // filenames, suffix matches, and folder-based sharding are more
+        // specific and preserve the CHECKPOINT:VARIANT contract.
+        //
+        // Some GGUF repositories name files with the quant token in the middle,
+        // for example:
+        //   Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf
+        // for variant:
+        //   IQ4_XS
+        // That file does not end with IQ4_XS.gguf, so mirror the downloader's
+        // GGUF variant enumeration over the files that are already present in
+        // the local HF cache before declaring the model missing.
+        //
+        // HF cache paths have an extra snapshots/<revision>/ prefix that is not
+        // part of the repository-relative filename. Strip it before calling
+        // enumerate_gguf_variants(); otherwise the enumerator treats
+        // "snapshots" as a top-level sharded-folder variant and never extracts
+        // the quant token from the actual GGUF filename.
+        std::vector<std::string> relative_gguf_files;
+        std::map<std::string, std::string> absolute_by_relative;
+        auto repo_relative_from_cache_relative = [](std::string rel) {
+            std::replace(rel.begin(), rel.end(), '\\', '/');
+
+            static const std::string snapshots_prefix = "snapshots/";
+            if (rel.rfind(snapshots_prefix, 0) == 0) {
+                size_t revision_end = rel.find('/', snapshots_prefix.size());
+                if (revision_end != std::string::npos && revision_end + 1 < rel.size()) {
+                    rel = rel.substr(revision_end + 1);
+                }
+            }
+
+            return rel;
+        };
+
+        for (const auto& filepath : gguf_files) {
+            std::string relative_path = path_to_utf8(
+                path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
+            relative_path = repo_relative_from_cache_relative(relative_path);
+
+            // Multiple HF snapshots can contain the same repo-relative file.
+            // Keep the first absolute path from the sorted file list so
+            // duplicates do not create false ambiguity.
+            if (absolute_by_relative.emplace(relative_path, filepath).second) {
+                relative_gguf_files.push_back(relative_path);
+            }
+        }
+
+        std::vector<std::string> enumerated_matches;
+        auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files);
+        for (const auto& local_variant : local_variants.variants) {
+            if (to_lower(local_variant.name) != variant_lower) {
+                continue;
+            }
+
+            auto it = absolute_by_relative.find(local_variant.primary_file);
+            if (it != absolute_by_relative.end()) {
+                enumerated_matches.push_back(it->second);
+            }
+        }
+
+        if (enumerated_matches.size() == 1) {
+            LOG(INFO, "ModelManager")
+                << "Resolved local GGUF variant '" << variant
+                << "' via quant-token fallback: " << enumerated_matches[0] << std::endl;
+            return enumerated_matches[0];
+        }
+
+        if (enumerated_matches.size() > 1) {
+            LOG(WARNING, "ModelManager")
+                << "Multiple local GGUF files matched variant '" << variant
+                << "' via quant-token fallback; refusing to guess" << std::endl;
+            return "";
+        }
+
+        // No match in this candidate set. Do not fall back to another
+        // quantization in the same Hugging Face repo; otherwise a custom
+        // download with a different quant can make a built-in model appear
+        // downloaded and allow deleting the wrong file.
+        return "";
+    };
+
+    // Prefer the active refs/main snapshot so that when upstream only changed
+    // README/metadata Lemonade keeps using the previous snapshot's artifacts.
+    std::vector<std::string> active_gguf_files =
+        collect_gguf_files(hf_cache::active_snapshot_path(model_cache_path_fs));
+
+    // Whole-repo-cache candidates spanning every snapshot, populated on demand.
+    std::vector<std::string> all_cache_gguf_files;
+    bool all_cache_computed = false;
+    auto whole_cache_gguf_files = [&]() -> const std::vector<std::string>& {
+        if (!all_cache_computed) {
+            all_cache_gguf_files = collect_gguf_files(model_cache_path_fs);
+            all_cache_computed = true;
+        }
+        return all_cache_gguf_files;
+    };
+
+    if (active_gguf_files.empty() && whole_cache_gguf_files().empty()) {
+        return model_cache_path;  // Return directory if no GGUF found anywhere
+    }
+
+    std::string resolved_path = resolve_gguf_variant(active_gguf_files);
+
+    // #2300: a sibling variant that shares this HF repo can live in a snapshot
+    // other than the one refs/main points at. refs/main advances to the
+    // snapshot of whichever variant was pulled or updated last, leaving the
+    // other variants' symlinks behind in earlier snapshots; after a restart the
+    // refs/main-only search above then reports them as missing. If the active
+    // snapshot did not contain the requested variant, broaden the search to
+    // every snapshot in this repo's cache before declaring it missing. Blobs are
+    // content-addressed and shared, so reading an older snapshot is safe, and
+    // resolving against the active snapshot first preserves the CHECKPOINT:VARIANT
+    // contract (a different quant is never substituted while the exact one exists).
+    //
+    // The whole-cache set is a superset of the active set, so the two are equal
+    // only when refs/main's snapshot is the sole snapshot holding GGUFs — in
+    // which case the broader search is identical and skipped.
+    if (resolved_path.empty()) {
+        const std::vector<std::string>& all_files = whole_cache_gguf_files();
+        if (all_files != active_gguf_files) {
+            resolved_path = resolve_gguf_variant(all_files);
+        }
+    }
+
+    return resolved_path;
+}
+
+} // namespace llamacpp
+} // namespace backends
+} // namespace lemon
+
diff --git a/src/cpp/server/backends/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
similarity index 75%
rename from src/cpp/server/backends/llamacpp_server.cpp
rename to src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index a8b731f63..eb766e798 100644
--- a/src/cpp/server/backends/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -1,5 +1,16 @@
-#include "lemon/backends/llamacpp_server.h"
+#include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/llamacpp/llamacpp.h"
+#include "lemon/backends/llamacpp/llamacpp_gguf.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/gguf_capabilities.h"
+#include "lemon/gguf_reader.h"
+#include "lemon/model_manager.h"
+#include <algorithm>
+#include <filesystem>
+#include <regex>
+#include <system_error>
 #include "lemon/auto_tune.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -281,12 +292,12 @@ void LlamaCppServer::load(const std::string& model_name,
     device_type_ = use_gpu ? DEVICE_GPU : DEVICE_CPU;
 
     // Install llama-server if needed (use per-model backend)
-    backend_manager_->install_backend(SPEC.recipe, llamacpp_backend);
+    backend_manager_->install_backend(llamacpp::spec()->recipe, llamacpp_backend);
 
     // Use pre-resolved GGUF path. Skipped for hf_load models because llama-server
     // sources the weights itself via -hf; those models may not have local files.
     std::string gguf_path = model_info.resolved_path();
-    if (gguf_path.empty() && !model_info.hf_load) {
+    if (gguf_path.empty() && !model_info.extra<bool>("hf_load", false)) {
         throw std::runtime_error("GGUF file not found for checkpoint: " + model_info.checkpoint());
     }
 
@@ -302,7 +313,7 @@ void LlamaCppServer::load(const std::string& model_name,
     port_ = choose_port();
 
     // Get executable path
-    std::string executable = BackendUtils::get_backend_binary_path(SPEC, llamacpp_backend);
+    std::string executable = BackendUtils::get_backend_binary_path(*llamacpp::spec(), llamacpp_backend);
 
     // Check for embeddings and reranking support based on model type
     bool supports_embeddings = (model_info.type == ModelType::EMBEDDING);
@@ -323,7 +334,7 @@ void LlamaCppServer::load(const std::string& model_name,
     // is required for models like Qwen2.5-Omni where the manual -m + --mmproj
     // path rejects audio content parts in /v1/chat/completions — the -hf path
     // drives the dual-clip (vision+audio) context correctly.
-    if (model_info.hf_load) {
+    if (model_info.extra<bool>("hf_load", false)) {
         push_arg(args, reserved_flags, "-hf", model_info.checkpoint(),
                  std::vector<std::string>{"--hf-repo", "-mr", "--hf-file", "-mf"});
     } else {
@@ -345,7 +356,7 @@ void LlamaCppServer::load(const std::string& model_name,
 
     // Add mmproj file if present (for vision models). Skip when hf_load is set —
     // llama-server resolves the mmproj companion itself from the HF repo.
-    if (!mmproj_path.empty() && !model_info.hf_load) {
+    if (!mmproj_path.empty() && !model_info.extra<bool>("hf_load", false)) {
         push_arg(args, reserved_flags, "--mmproj", mmproj_path);
         if (!use_gpu) {
             LOG(DEBUG, "LlamaCpp") << "Skipping mmproj argument since GPU mode is not enabled" << std::endl;
@@ -651,3 +662,207 @@ json LlamaCppServer::responses(const json& request) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+// Factory entry point for the llamacpp backend: builds a LlamaCppServer from ctx via make_server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return make_server<LlamaCppServer>(ctx);
+}
+
+namespace {
+std::string system_llamacpp_version() {
+    // Asks the llama-server on PATH for its version. Returns "b<N>" when a build
+    // number parses, "detected" on any other output, "unknown" when nothing was read.
+    std::string output;
+    #ifdef _WIN32
+    std::string command = "llama-server --version 2>NUL";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);
+    #else
+    FILE* pipe = popen("llama-server --version 2>/dev/null", "r");
+    if (pipe == nullptr) {
+        return "unknown";
+    }
+    char first_line[256];  // only the first stdout line (at most 255 bytes) is kept
+    if (fgets(first_line, sizeof(first_line), pipe)) {
+        output.assign(first_line);
+    }
+    pclose(pipe);
+    #endif
+
+    if (output.empty()) {
+        return "unknown";
+    }
+    // Matches e.g. "version: 3432 (e2b2a632)" or "llama.cpp version b3432"; only
+    // the capture group belonging to the alternative that matched is set.
+    const std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))");
+    std::smatch found;
+    if (!std::regex_search(output, found, version_regex)) {
+        return "detected";
+    }
+    for (size_t group = 1; group < found.size(); ++group) {
+        if (found[group].matched) {
+            return "b" + found[group].str();
+        }
+    }
+    return "detected";
+}
+
+// True when the ggml HIP plugin (libggml-hip.so) is found; always false on non-Linux builds.
+bool is_ggml_hip_plugin_available() {
+#ifdef __linux__
+    // Allow distros/packagers that install outside the FHS paths below
+    // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so.
+    if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) {
+        // Require the basename to look like the HIP plugin (libggml-hip*.so*,
+        // case-insensitive, versioned sonames allowed). This is a sanity check,
+        // not a security boundary: the path is not forwarded to ggml's loader,
+        // so we cannot verify it is actually loadable. It only guards against an
+        // accidental override pointing at an unrelated existing file.
+        std::string name = fs::path(env).filename().string();
+        std::transform(name.begin(), name.end(), name.begin(),
+                       [](unsigned char c) { return std::tolower(c); });
+        const bool name_matches = name.rfind("libggml-hip", 0) == 0 &&
+                                  name.find(".so") != std::string::npos;
+        // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing
+        // filesystem overload: an odd or malformed path resolves to "not a
+        // regular file" (ec set) instead of raising a filesystem_error.
+        std::error_code hip_path_ec;
+        if (name_matches && fs::is_regular_file(env, hip_path_ec)) {
+            return true;
+        }
+    }
+    // On Linux x86_64, check common system library paths for the HIP plugin
+    std::vector<std::string> possible_paths = {
+        // Debian/Ubuntu multiarch path (most common)
+        "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so",
+        // Arch AUR path
+        "/usr/lib/libggml-hip.so",
+        // Standard Linux paths
+        "/usr/lib/ggml/backends0/libggml-hip.so",
+        "/usr/lib64/ggml/backends0/libggml-hip.so"
+    };
+
+    std::error_code probe_ec;  // non-throwing probe: a filesystem error just means "not found"
+    for (const auto& path : possible_paths) {
+        if (fs::exists(path, probe_ec)) {
+            return true;
+        }
+    }
+#endif
+
+    return false;
+}
+
+
+// llamacpp model-management behavior: GGUF metadata + capability labels.
+class LlamaCppOps : public BackendOps {
+public:
+    // Copies context length, GGUF metadata and (LLMs only) capability labels into `info`;
+    // `info` stays untouched unless an existing .gguf file passes read_gguf_metadata.
+    void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override {
+        const std::string gguf_path = info.resolved_path();
+        std::error_code ec;
+        if (!has_gguf_extension(gguf_path) ||
+            !std::filesystem::exists(lemon::utils::path_from_utf8(gguf_path), ec)) {
+            return;
+        }
+        GgufMetadata meta;
+        if (!read_gguf_metadata(meta, gguf_path)) {
+            return;
+        }
+        info.max_context_window = meta.context_length;
+        info.gguf = std::move(meta);
+        // GGUF vision/tool metadata are LLM capabilities. Don't apply them to
+        // embedding/reranking models, or labels like tool-calling would
+        // reclassify the model away from its endpoint type.
+        if (info.type == ModelType::LLM) {
+            apply_gguf_capability_labels(info.labels, info.gguf.caps);
+        }
+    }
+
+    std::string resolve_checkpoint_path(const ModelInfo& info,
+                                        const CheckpointResolveContext& ctx) const override {
+        // The main checkpoint is a GGUF file (with sharding/variant resolution);
+        // auxiliary checkpoints (mmproj, …) use the shared default.
+        if (ctx.type == "main") {
+            return resolve_gguf_path(ctx.model_cache_path, ctx.variant);
+        }
+        return BackendOps::resolve_checkpoint_path(info, ctx);
+    }
+
+    std::string find_imported_checkpoint(const std::string& import_dir) const override {
+        // The primary artifact is the (non-mmproj) GGUF file.
+        return resolve_gguf_path(import_dir, "");
+    }
+
+    std::string validate_registration_checkpoint(const std::string& checkpoint) const override {
+        // A GGUF checkpoint must name its quant via CHECKPOINT:VARIANT; "" means acceptable.
+        if (to_lower_copy(checkpoint).find("gguf") != std::string::npos &&
+            checkpoint.find(':') == std::string::npos) {
+            return "You are required to provide a 'variant' in the checkpoint field when "
+                   "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. "
+                   "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                   "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf";
+        }
+        return "";
+    }
+
+    std::string validate_checkpoint_file(const std::string& resolved_path) const override {
+        // A .gguf file in the cache must start with the GGUF magic, else it's a
+        // truncated/corrupt download. Directories and non-.gguf paths pass ("").
+        std::error_code ec;
+        std::filesystem::path p = lemon::utils::path_from_utf8(resolved_path);
+        if (std::filesystem::is_directory(p, ec) || !has_gguf_extension(resolved_path)) {
+            return "";
+        }
+        std::ifstream in(p, std::ios::binary);
+        char magic[4] = {};
+        in.read(magic, sizeof(magic));
+        bool ok = in.gcount() == static_cast<std::streamsize>(sizeof(magic)) &&
+                  magic[0] == 'G' && magic[1] == 'G' && magic[2] == 'U' && magic[3] == 'F';
+        return ok ? "" : "Invalid GGUF cache file";
+    }
+
+    std::string resolve_version(const std::string& backend,
+                                const std::string& file_version) const override {
+        // The PATH-installed "system" llama-server has no version.txt; query it.
+        if (backend == "system") {
+            return system_llamacpp_version();
+        }
+        return file_version;
+    }
+
+    InstallCheck check_install(const std::string& backend, bool binary_found) const override {
+        // The system llama-server also needs the ggml HIP plugin for ROCm GPU
+        // acceleration when an AMD GPU (KFD) is present.
+        if (binary_found && backend == "system") {
+#ifdef __linux__
+            std::error_code kfd_ec;  // non-throwing probe; an error reads as "no KFD"
+            if (std::filesystem::exists("/sys/class/kfd", kfd_ec) && !is_ggml_hip_plugin_available()) {
+                return {false, "HIP plugin libggml-hip.so not installed"};
+            }
+#endif
+        }
+        return {binary_found, ""};
+    }
+
+private:
+    // Lower-cases via unsigned char: std::tolower is undefined for negative char values.
+    static std::string to_lower_copy(std::string text) {
+        for (char& ch : text) ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+        return text;
+    }
+    // True when `path` ends in ".gguf", compared case-insensitively.
+    static bool has_gguf_extension(const std::string& path) {
+        return path.size() >= 5 && to_lower_copy(path.substr(path.size() - 5)) == ".gguf";
+    }
+};
+}  // namespace
+// spec(): BackendSpec from make_spec over `descriptor`; ops(): the LlamaCppOps object from single_ops.
+const BackendSpec* spec() { return make_spec<LlamaCppServer>(descriptor); }
+const BackendOps* ops() { return single_ops<LlamaCppOps>(); }
+}  // namespace llamacpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
similarity index 86%
rename from src/cpp/server/backends/moonshine_server.cpp
rename to src/cpp/server/backends/moonshine/moonshine_server.cpp
index d3729d435..bcf263d67 100644
--- a/src/cpp/server/backends/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -1,4 +1,6 @@
-#include "lemon/backends/moonshine_server.h"
+#include "lemon/backends/moonshine/moonshine_server.h"
+#include "lemon/backends/moonshine/moonshine.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -6,8 +8,10 @@
 #include "lemon/utils/http_client.h"
 #include "lemon/utils/process_manager.h"
 #include "lemon/error_types.h"
+#include <cctype>
 #include <iostream>
 #include <filesystem>
+#include <optional>
 #include <set>
 #include <vector>
 #include <lemon/utils/aixlog.hpp>
@@ -71,7 +75,7 @@ void MoonshineServer::load(const std::string& model_name,
     device_type_ = DEVICE_CPU;
 
     // Install moonshine-server if needed
-    backend_manager_->install_backend(SPEC.recipe, "cpu");
+    backend_manager_->install_backend(moonshine::spec()->recipe, "cpu");
 
     // Resolve model path from ModelManager (standard HF cache)
     std::string model_path = model_info.resolved_path();
@@ -83,7 +87,7 @@ void MoonshineServer::load(const std::string& model_name,
 
     // Resolve model architecture. Prefer the explicit registry field; fall back
     // to inferring from the checkpoint variant (onnx/tiny, onnx/small, etc.).
-    int model_arch = model_info.moonshine_arch;
+    int model_arch = model_info.extra<int>("moonshine_arch", -1);
     if (model_arch < 0) {
         std::string variant = model_info.checkpoint();
         std::transform(variant.begin(), variant.end(), variant.begin(), ::tolower);
@@ -97,7 +101,7 @@ void MoonshineServer::load(const std::string& model_name,
     }
 
     // Get executable path
-    std::string executable = BackendUtils::get_backend_binary_path(SPEC, "cpu");
+    std::string executable = BackendUtils::get_backend_binary_path(*moonshine::spec(), "cpu");
     LOG(INFO, "MoonshineServer") << "Using executable: " << executable << std::endl;
 
     // moonshine-server binds three consecutive ports: HTTP, WS (+1), TCP (+2).
@@ -358,3 +362,53 @@ json MoonshineServer::audio_transcriptions(const json& request) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace moonshine {
+// Factory entry point for the moonshine backend: builds a MoonshineServer from ctx via make_server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return make_server<MoonshineServer>(ctx);
+}
+
+
+namespace {
+class MoonshineOps : public BackendOps {
+public:
+    // Picks every repo file that lives under the variant's directory (e.g.
+    // "medium-streaming-en/quantized"), matching the prefix case-insensitively.
+    // Throws std::runtime_error when nothing in repo_files is under that folder.
+    std::optional<std::vector<std::string>> select_checkpoint_files(
+        const std::string& main_variant, const std::vector<std::string>& repo_files) const override {
+        // Normalise the variant to a directory prefix ending in '/'.
+        const bool needs_slash = !main_variant.empty() && main_variant.back() != '/';
+        const std::string dir_prefix = needs_slash ? main_variant + "/" : main_variant;
+
+        const auto same_char_ci = [](char lhs, char rhs) {
+            return std::tolower(static_cast<unsigned char>(lhs)) ==
+                   std::tolower(static_cast<unsigned char>(rhs));
+        };
+        const auto under_dir = [&](const std::string& repo_file) {
+            return repo_file.size() >= dir_prefix.size() &&
+                   std::equal(dir_prefix.begin(), dir_prefix.end(), repo_file.begin(), same_char_ci);
+        };
+
+        std::vector<std::string> selected;
+        for (const auto& repo_file : repo_files) {
+            if (under_dir(repo_file)) {
+                selected.push_back(repo_file);
+            }
+        }
+        if (selected.empty()) {
+            throw std::runtime_error("No Moonshine model files found in folder: " + main_variant);
+        }
+        return selected;
+    }
+};
+}  // namespace
+// spec(): BackendSpec from make_spec over `descriptor`; ops(): the MoonshineOps object from single_ops.
+const BackendSpec* spec() { return make_spec<MoonshineServer>(descriptor); }
+const BackendOps* ops() { return single_ops<MoonshineOps>(); }
+}  // namespace moonshine
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/ryzenaiserver.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
similarity index 67%
rename from src/cpp/server/backends/ryzenaiserver.cpp
rename to src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index 6e250fa35..69e1eed16 100644
--- a/src/cpp/server/backends/ryzenaiserver.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -1,4 +1,10 @@
-#include "lemon/backends/ryzenaiserver.h"
+#include "lemon/backends/ryzenai/ryzenai_server.h"
+#include "lemon/backends/ryzenai/ryzenai.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/model_manager.h"
+#include "lemon/backends/backend_ops.h"
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/utils/path_utils.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/utils/process_manager.h"
@@ -38,7 +44,7 @@ RyzenAIServer::~RyzenAIServer() {
 
 bool RyzenAIServer::is_available() {
     try {
-        return !backends::BackendUtils::get_backend_binary_path(SPEC, "npu").empty();
+        return !backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu").empty();
     } catch (...) {
         return false;
     }
@@ -55,7 +61,7 @@ void RyzenAIServer::load(const std::string& model_name,
     backend_manager_->install_backend("ryzenai-llm", "npu");
 
     // Get the path to ryzenai-server
-    std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(SPEC, "npu");
+    std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu");
     if (ryzenai_server_path.empty()) {
         throw std::runtime_error("RyzenAI-Server executable not found even after installation attempt");
     }
@@ -167,3 +173,54 @@ json RyzenAIServer::responses(const json& request) {
 }
 
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace ryzenai {
+
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    // RyzenAI is handed its resolved model path (set_model_path) before load,
+    // matching the original router factory's special-casing.
+    const bool debug_logging = (ctx.log_level == "debug");
+    auto ryzen = std::make_unique<::lemon::RyzenAIServer>(
+        ctx.model_info->model_name, debug_logging, ctx.model_manager, ctx.backend_manager);
+    ryzen->set_model_path(ctx.model_info->resolved_path());
+    return ryzen;
+}
+
+
+namespace {
+class RyzenAiOps : public BackendOps {
+public:
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext& ctx) const override {
+        // A RyzenAI model is the directory holding genai_config.json; else the cache path itself.
+        const std::string config_dir = find_imported_checkpoint(ctx.model_cache_path);
+        return !config_dir.empty() ? config_dir : ctx.model_cache_path;
+    }
+
+    std::string find_imported_checkpoint(const std::string& import_dir) const override {
+        // Returns the directory of the first genai_config.json met while walking
+        // import_dir recursively, or "" so the caller registers the directory itself.
+        std::filesystem::path root = lemon::utils::path_from_utf8(import_dir);
+        if (!hf_cache::exists(root)) return "";
+        for (const auto& item :
+             std::filesystem::recursive_directory_iterator(root, hf_cache::dir_options())) {
+            if (item.is_regular_file() && item.path().filename() == "genai_config.json") {
+                return lemon::utils::path_to_utf8(item.path().parent_path());
+            }
+        }
+        return "";
+    }
+};
+}  // namespace
+// spec(): function-local static BackendSpec named "ryzenai-server"; ops(): the RyzenAiOps object from single_ops.
+const BackendSpec* spec() {
+    static const BackendSpec kSpec("ryzenai-server", descriptor.binary,
+                                   ::lemon::RyzenAIServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
+const BackendOps* ops() { return single_ops<RyzenAiOps>(); }
+}  // namespace ryzenai
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/sd_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
similarity index 97%
rename from src/cpp/server/backends/sd_server.cpp
rename to src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index 734454c36..a4b1787f9 100644
--- a/src/cpp/server/backends/sd_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -1,4 +1,6 @@
-#include "lemon/backends/sd_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
+#include "lemon/backends/sdcpp/sdcpp.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -202,7 +204,7 @@ void SDServer::load(const std::string& model_name,
     RuntimeConfig::validate_backend_choice("sdcpp", backend);
 
     // Update device type based on the actual backend selected.
-    // get_device_type_from_recipe() defaults sd-cpp to CPU, but rocm/vulkan/metal/cuda are GPU backends.
+    // The descriptor defaults sd-cpp to CPU; rocm/vulkan/metal/cuda variants are GPU backends.
     if (backend == "rocm" || backend == "vulkan" || backend == "metal" || backend == "cuda") {
         device_type_ = DEVICE_GPU;
     } else {
@@ -210,7 +212,7 @@ void SDServer::load(const std::string& model_name,
     }
 
     // Install sd-server if needed
-    backend_manager_->install_backend(SPEC.recipe, backend);
+    backend_manager_->install_backend(sdcpp::spec()->recipe, backend);
 
     // Get model path
     std::string model_path = model_info.resolved_path("main");
@@ -232,7 +234,7 @@ void SDServer::load(const std::string& model_name,
     LOG(DEBUG, "SDServer") << "Using model: " << model_path << std::endl;
 
     // Get sd-server executable path
-    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend);
+    std::string exe_path = BackendUtils::get_backend_binary_path(*sdcpp::spec(), backend);
 
     // Choose a port
     port_ = choose_port();
@@ -746,3 +748,18 @@ std::string SDServer::upscale_via_cli(
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace sdcpp {
+// Factory entry point for the sdcpp backend: builds an SDServer from ctx via make_server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return make_server<SDServer>(ctx);
+}
+
+// spec(): BackendSpec from make_spec over `descriptor`; ops(): no overrides, the shared default ops.
+const BackendSpec* spec() { return make_spec<SDServer>(descriptor); }
+const BackendOps* ops() { return default_backend_ops(); }
+}  // namespace sdcpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
similarity index 95%
rename from src/cpp/server/backends/vllm_server.cpp
rename to src/cpp/server/backends/vllm/vllm_server.cpp
index 7584d56d9..60a79c95f 100644
--- a/src/cpp/server/backends/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -1,4 +1,6 @@
-#include "lemon/backends/vllm_server.h"
+#include "lemon/backends/vllm/vllm_server.h"
+#include "lemon/backends/vllm/vllm.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/model_manager.h"
 #include "lemon/runtime_config.h"
@@ -122,7 +124,7 @@ void VLLMServer::load(const std::string& model_name,
     RuntimeConfig::validate_backend_choice("vllm", vllm_backend);
 
     // Install vllm-server if needed
-    backend_manager_->install_backend(SPEC.recipe, vllm_backend);
+    backend_manager_->install_backend(vllm::spec()->recipe, vllm_backend);
 
     // vLLM uses HuggingFace model names, not local file paths.
     // The checkpoint field in server_models.json is the HF model ID.
@@ -137,7 +139,7 @@ void VLLMServer::load(const std::string& model_name,
     port_ = choose_port();
 
     // Get executable path
-    std::string executable = BackendUtils::get_backend_binary_path(SPEC, vllm_backend);
+    std::string executable = BackendUtils::get_backend_binary_path(*vllm::spec(), vllm_backend);
 
     // Build command line arguments
     std::vector<std::string> args;
@@ -311,3 +313,18 @@ void VLLMServer::forward_streaming_request(const std::string& endpoint,
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace vllm {
+// Factory entry point for the vllm backend: builds a VLLMServer from ctx via make_server.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return make_server<VLLMServer>(ctx);
+}
+
+// spec(): BackendSpec from make_spec over `descriptor` with split=true; ops(): the shared default ops.
+const BackendSpec* spec() { return make_spec<VLLMServer>(descriptor, /*split=*/true); }
+const BackendOps* ops() { return default_backend_ops(); }
+}  // namespace vllm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/whisper_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
similarity index 89%
rename from src/cpp/server/backends/whisper_server.cpp
rename to src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index 9f50da020..d1222e551 100644
--- a/src/cpp/server/backends/whisper_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -1,5 +1,10 @@
-#include "lemon/backends/whisper_server.h"
+#include "lemon/backends/whispercpp/whispercpp_server.h"
+#include "lemon/backends/whispercpp/whispercpp.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/model_manager.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
@@ -229,7 +234,7 @@ void WhisperServer::load(const std::string& model_name,
     RuntimeConfig::validate_backend_choice("whispercpp", whispercpp_backend);
 
     // Update device type based on the actual backend selected.
-    // get_device_type_from_recipe() defaults whispercpp to CPU, but npu/vulkan use different devices.
+    // The descriptor defaults whispercpp to CPU; npu/vulkan variants use different devices.
     if (whispercpp_backend == "npu") {
         device_type_ = DEVICE_NPU;
     } else if (whispercpp_backend == "vulkan" || whispercpp_backend == "metal") {
@@ -238,7 +243,7 @@ void WhisperServer::load(const std::string& model_name,
         device_type_ = DEVICE_CPU;
     }
 
-    backend_manager_->install_backend(SPEC.recipe, whispercpp_backend);
+    backend_manager_->install_backend(whispercpp::spec()->recipe, whispercpp_backend);
 
     std::string model_path = model_info.resolved_path();
     if (model_path.empty()) {
@@ -254,7 +259,7 @@ void WhisperServer::load(const std::string& model_name,
     }
 
     // Get whisper-server executable path
-    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, whispercpp_backend);
+    std::string exe_path = BackendUtils::get_backend_binary_path(*whispercpp::spec(), whispercpp_backend);
 
     // Choose a port
     port_ = choose_port();
@@ -701,3 +706,65 @@ json WhisperServer::audio_transcriptions(const json& request) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace whispercpp {
+
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<WhisperServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+namespace {
+// UTF-8 paths of every regular file under `dir` (recursive) whose name contains
+// ".bin", sorted: directory iteration order is unspecified, so sort for a stable pick.
+std::vector<std::string> sorted_bin_files(const std::filesystem::path& dir) {
+    std::vector<std::string> bin_files;
+    for (const auto& entry :
+         std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) {
+        if (entry.is_regular_file() &&
+            entry.path().filename().string().find(".bin") != std::string::npos) {
+            bin_files.push_back(lemon::utils::path_to_utf8(entry.path()));
+        }
+    }
+    std::sort(bin_files.begin(), bin_files.end());
+    return bin_files;
+}
+
+class WhisperOps : public BackendOps {
+public:
+    // No variant: first sorted .bin file in the model cache dir, or the dir path
+    // itself when it is missing or has no .bin. With a variant: shared default.
+    std::string resolve_checkpoint_path(const ModelInfo& info,
+                                        const CheckpointResolveContext& ctx) const override {
+        // NPU cache is handled by the whisper server itself; nothing to resolve here.
+        if (ctx.type == "npu_cache") return "";
+        if (!ctx.variant.empty()) {
+            return BackendOps::resolve_checkpoint_path(info, ctx);
+        }
+        std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path);
+        if (!hf_cache::exists(dir)) {
+            return ctx.model_cache_path;
+        }
+        const std::vector<std::string> bin_files = sorted_bin_files(dir);
+        if (bin_files.empty()) return ctx.model_cache_path;
+        return bin_files.front();
+    }
+
+    // First sorted .bin file under `import_dir`, or "" when there is none.
+    std::string find_imported_checkpoint(const std::string& import_dir) const override {
+        std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir);
+        if (!hf_cache::exists(dir)) {
+            return "";
+        }
+        const std::vector<std::string> bin_files = sorted_bin_files(dir);
+        return bin_files.empty() ? std::string() : bin_files.front();
+    }
+};
+}  // namespace
+
+const BackendSpec* spec() { return make_spec<WhisperServer>(descriptor); }
+const BackendOps* ops() { return single_ops<WhisperOps>(); }
+}  // namespace whispercpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/config_file.cpp b/src/cpp/server/config_file.cpp
index d8f6955af..2787c0167 100644
--- a/src/cpp/server/config_file.cpp
+++ b/src/cpp/server/config_file.cpp
@@ -1,4 +1,5 @@
 #include "lemon/config_file.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/utils/json_utils.h"
 #include "lemon/utils/path_utils.h"
 
@@ -27,10 +28,28 @@ static json load_json_file(const fs::path& path) {
     }
 }
 
-json ConfigFile::get_defaults() {
+json ConfigFile::base_defaults() {
     json defaults = load_json_file(utils::path_from_utf8(
         utils::get_resource_path("resources/defaults.json")));
 
+    // Seed each backend's config.json section from its descriptor. The per-recipe
+    // defaults are authored in the backend's descriptor; resources/defaults.json
+    // is the generated, committed mirror (see GET /internal/config/defaults and
+    // docs/tools/gen_backend_boilerplate.py). Re-seeding here keeps the descriptor
+    // authoritative even if the committed file lags. Empty result = no section.
+    for (const auto* d : backends::all_descriptors()) {
+        json block = d->config_defaults();
+        if (!block.empty()) {
+            defaults[d->effective_config_section()] = block;
+        }
+    }
+
+    return defaults;
+}
+
+json ConfigFile::get_defaults() {
+    json defaults = base_defaults();
+
 #ifndef _WIN32
     fs::path distro_defaults = "/usr/share/lemonade/defaults.json";
     if (fs::exists(distro_defaults)) {
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 0d51efb2c..02679e803 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1,17 +1,17 @@
 #include <lemon/model_manager.h>
 #include <lemon/runtime_config.h>
 #include <lemon/hf_variants.h>
-#include <lemon/gguf_capabilities.h>
-#include <lemon/gguf_reader.h>
 #include <lemon/utils/json_utils.h>
 #include <lemon/utils/http_client.h>
 #include <lemon/utils/process_manager.h>
 #include <lemon/utils/path_utils.h>
 #include <lemon/system_info.h>
+#include <lemon/backends/backend_descriptor_registry.h>
+#include <lemon/backends/backend_registry.h>
 #include <lemon/backends/backend_utils.h>
-#include <lemon/backends/cloud_server.h>
+#include <lemon/backends/cloud/cloud_server.h>
+#include <lemon/backends/fastflowlm/fastflowlm_models.h>
 #include <lemon/cloud_provider_registry.h>
-#include <lemon/backends/fastflowlm_server.h>
 #include <filesystem>
 #include <iostream>
 #include <fstream>
@@ -132,116 +132,15 @@ static std::string cache_key_to_canonical_id(const std::string& cache_key) {
 // launched from a parent process that predates the FLM install and therefore
 // doesn't see FLM_MODEL_PATH, so we also probe every documented default.
 // Order is most-specific to most-historical.
-static std::vector<fs::path> get_flm_models_dir_candidates() {
-    std::vector<fs::path> roots;
-
-    const char* flm_model_path = std::getenv("FLM_MODEL_PATH");
-    if (flm_model_path && *flm_model_path) {
-        roots.push_back(path_from_utf8(flm_model_path) / "models");
-    }
-
-#ifdef _WIN32
-    const char* userprofile = std::getenv("USERPROFILE");
-    if (userprofile && *userprofile) {
-        fs::path home = path_from_utf8(userprofile);
-        roots.push_back(home / ".flm" / "models");          // current installer default
-        roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default
-        roots.push_back(home / "flm" / "models");
-    }
-#else
-    const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME");
-    if (xdg_config_home && *xdg_config_home) {
-        roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models");
-    }
-    const char* home = std::getenv("HOME");
-    if (home && *home) {
-        fs::path home_path = path_from_utf8(home);
-        roots.push_back(home_path / ".flm" / "models");
-        roots.push_back(home_path / ".config" / "flm" / "models");
-    }
-#endif
-
-    return roots;
-}
-
-static fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) {
-    if (repo_dir.empty()) return fs::path();
-
-    for (const auto& root : get_flm_models_dir_candidates()) {
-        fs::path candidate = root / repo_dir / "config.json";
-        if (safe_exists(candidate)) return candidate;
-    }
-    return fs::path();
-}
-
-static std::string repo_dir_from_url(const std::string& url) {
-    std::string clean = url;
-    while (!clean.empty() && clean.back() == '/') clean.pop_back();
-    size_t query_pos = clean.find_first_of("?#");
-    if (query_pos != std::string::npos) clean = clean.substr(0, query_pos);
-
-    for (const std::string marker : {"/tree/", "/resolve/"}) {
-        size_t marker_pos = clean.find(marker);
-        if (marker_pos != std::string::npos) {
-            clean = clean.substr(0, marker_pos);
-            break;
-        }
-    }
-
-    size_t slash = clean.find_last_of('/');
-    return slash == std::string::npos ? clean : clean.substr(slash + 1);
-}
-
-static int64_t read_flm_max_context_window(const ModelInfo& info) {
-    if (info.type != ModelType::LLM) return 0;
-
-    std::string config_path = info.resolved_path("config");
-    if (config_path.empty()) return 0;
-
-    try {
-        json config = JsonUtils::load_from_file(config_path);
-        if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) {
-            int64_t value = config["max_position_embeddings"].get<int64_t>();
-            return value > 0 ? value : 0;
-        }
-        if (config.contains("text_config") && config["text_config"].is_object()) {
-            const auto& text_config = config["text_config"];
-            if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) {
-                int64_t value = text_config["max_position_embeddings"].get<int64_t>();
-                return value > 0 ? value : 0;
-            }
-        }
-    } catch (const std::exception& e) {
-        LOG(DEBUG, "ModelManager") << "Could not read FLM config metadata for "
-                                   << info.model_name << ": " << e.what() << std::endl;
-    }
-    return 0;
-}
 
 static void populate_model_metadata(ModelInfo& info) {
     info.max_context_window = 0;
     if (!info.downloaded) return;
 
-    if (info.recipe == "llamacpp") {
-        std::string gguf_path = info.resolved_path();
-        if (!gguf_path.empty() && gguf_reader_detail::ends_with_ignore_case(gguf_path, ".gguf") && safe_exists(path_from_utf8(gguf_path))) {
-            GgufMetadata meta;
-            if (read_gguf_metadata(meta, gguf_path)) {
-                info.max_context_window = meta.context_length;
-                info.gguf = std::move(meta);
-
-                // GGUF vision/tool metadata are LLM capabilities. Do not apply
-                // them to embedding/reranking models, otherwise labels such as
-                // tool-calling would reclassify the model away from its endpoint
-                // type and break /embeddings or /rerank.
-                if (info.type == ModelType::LLM) {
-                    apply_gguf_capability_labels(info.labels, info.gguf.caps);
-                }
-            }
-        }
-    } else if (info.recipe == "flm") {
-        info.max_context_window = read_flm_max_context_window(info);
-    }
+    // Per-backend metadata (GGUF arch/labels for llamacpp, config.json ctx for
+    // flm, …) is read by the backend's ops, not a recipe switchboard here.
+    backends::BackendOpsContext ctx;
+    backends::ops_for(info.recipe)->populate_metadata(info, ctx);
 }
 
 static bool is_user_model_name(const std::string& model_name) {
@@ -359,6 +258,35 @@ static void parse_image_defaults(ModelInfo& info, const json& model_json) {
     }
 }
 
+// Populate ModelInfo::extras with any model-JSON key not consumed by a typed
+// ModelInfo field. This lets a new backend read custom per-model fields in load()
+// without editing the shared ModelInfo struct. Keep this set in sync with the
+// keys read by the parse blocks in build_cache().
+static void parse_extras(ModelInfo& info, const json& model_json) {
+    // Keys already consumed by typed ModelInfo fields; anything else is an extra.
+    static const std::set<std::string> kKnownKeys = {
+        "checkpoint", "checkpoints", "components", "mmproj", "recipe", "suggested",
+        "source", "size", "cloud_provider", "labels", "image_defaults", "recipe_options"};
+    if (!model_json.is_object()) {
+        return;  // Only a JSON object has keyed fields to copy.
+    }
+    for (auto& [key, value] : model_json.items()) {
+        if (kKnownKeys.find(key) != kKnownKeys.end()) continue;
+        info.extras[key] = value;
+    }
+}
+
+// Default device for a recipe: the backend descriptor is authoritative for
+// registered backends; collection/unknown recipes fall back to the recipe map.
+// (A backend whose device depends on the chosen backend variant resolves the
+// final device at load time via WrappedServer::effective_device.)
+static DeviceType device_type_for_recipe(const std::string& recipe) {
+    const auto* descriptor = lemon::backends::descriptor_for(recipe);
+    // No descriptor registered for this recipe: use the recipe map instead.
+    if (descriptor == nullptr) return get_device_type_from_recipe(recipe);
+    return descriptor->default_device;
+}
+
 // Build merged recipe options: image_defaults -> JSON recipe_options -> user-saved overrides.
 // json_recipe_options: pre-extracted recipe_options for this model (from build_cache's
 // two-phase pattern). Pass a null json if the model JSON should be read directly instead.
@@ -1017,7 +945,7 @@ std::map<std::string, ModelInfo> ModelManager::discover_extra_models() const {
         info.downloaded = true;
         info.source = EXTRA_MODEL_SOURCE;
         info.labels.push_back("custom");
-        info.device = get_device_type_from_recipe(EXTRA_MODEL_RECIPE);
+        info.device = device_type_for_recipe(EXTRA_MODEL_RECIPE);
         return info;
     };
 
@@ -1135,426 +1063,38 @@ std::map<std::string, ModelInfo> ModelManager::discover_extra_models() const {
 }
 
 std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::string& type, const std::string& checkpoint) const {
-    // Collections are virtual entries with no direct checkpoint to resolve
+    // Collections are virtual entries with no direct checkpoint to resolve.
     if (is_collection_recipe(info.recipe)) {
         return "";
     }
 
-    // Cloud-offloaded models have no local artifacts; checkpoint is the
-    // upstream provider's model id, used directly when forwarding requests.
-    if (info.recipe == "cloud") {
-        return "";
-    }
-
-    // FLM models use checkpoint as-is (e.g., "gemma3:4b")
-    if (info.recipe == "flm") {
-        return checkpoint;
-    }
-
-    // Local path models use checkpoint as-is (absolute path to file)
+    // Local-path models use the checkpoint as-is (absolute path to a file).
     if (info.source == "local_path") {
         return checkpoint;
     }
 
     std::string hf_cache = get_hf_cache_dir();
 
-    // Local uploads: checkpoint is relative path from HF cache
+    // Local uploads: checkpoint is a relative path from the HF cache.
     if (info.source == "local_upload") {
         std::string normalized = checkpoint;
         std::replace(normalized.begin(), normalized.end(), '\\', '/');
         return hf_cache + "/" + normalized;
     }
 
-    // For now, NPU cache is handled directly in whisper.cpp
-    if (type == "npu_cache") {
-        return "";
-    }
-
-    // HuggingFace models: need to find the GGUF file in cache
-    // Parse checkpoint to get repo_id and variant
-    // Use the checkpoint's own repo, falling back to main repo for backward compatibility
-    std::string checkpoint_repo_id = checkpoint_to_repo_id(checkpoint);
-    std::string main_repo_id = checkpoint_to_repo_id(info.checkpoint("main"));
-    std::string repo_id = checkpoint_repo_id;
-    std::string variant = checkpoint_to_variant(checkpoint);
-
-    std::string model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(repo_id);
-    fs::path model_cache_path_fs = path_from_utf8(model_cache_path);
-
-    // For RyzenAI LLM models, look for genai_config.json directory
-    if (info.recipe == "ryzenai-llm") {
-        if (safe_exists(model_cache_path_fs)) {
-            for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-                if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") {
-                    return path_to_utf8(entry.path().parent_path());
-                }
-            }
-        }
-        return model_cache_path;  // Return directory even if genai_config not found
-    }
-
-    // For kokoro models, look for index.json directory
-    if (info.recipe == "kokoro") {
-        if (safe_exists(model_cache_path_fs)) {
-            for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-                if (entry.is_regular_file() && entry.path().filename() == "index.json") {
-                    return path_to_utf8(entry.path());
-                }
-            }
-        }
-
-        return model_cache_path;  // Return directory even if index not found
-    }
-
-    // For whispercpp, find the .bin model file
-    if (info.recipe == "whispercpp" && variant.empty()) {
-        // No variant specified - use fallback logic to find any .bin file
-        if (!safe_exists(model_cache_path_fs)) {
-            return model_cache_path;  // Return directory path even if not found
-        }
-
-        // Collect all .bin files
-        std::vector<std::string> all_bin_files;
-        for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-            if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                if (filename.find(".bin") != std::string::npos) {
-                    all_bin_files.push_back(path_to_utf8(entry.path()));
-                }
-            }
-        }
-
-        if (all_bin_files.empty()) {
-            return model_cache_path;  // Return directory if no .bin found
-        }
-
-        // Sort files for consistent ordering
-        std::sort(all_bin_files.begin(), all_bin_files.end());
-
-        // Return first .bin file as fallback (only when no variant specified)
-        return all_bin_files[0];
-    }
-
-    // For llamacpp, find the GGUF file with advanced sharded model support
-    if (info.recipe == "llamacpp" && type == "main") {
-        if (!safe_exists(model_cache_path_fs)) {
-            return model_cache_path;  // Return directory path even if not found
-        }
-
-        // Prefer the active HF snapshot recorded in refs/main. This lets
-        // Lemonade keep using the previous snapshot when upstream only changed
-        // README/metadata and the requested model artifacts are unchanged.
-        auto collect_gguf_files = [](const fs::path& search_root) {
-            std::vector<std::string> files;
-            if (search_root.empty() || !safe_exists(search_root)) {
-                return files;
-            }
-
-            std::error_code ec;
-            for (const auto& entry : fs::recursive_directory_iterator(search_root, safe_dir_options, ec)) {
-                if (ec) break;
-                if (!entry.is_regular_file(ec)) {
-                    ec.clear();
-                    continue;
-                }
-
-                std::string filename = entry.path().filename().string();
-                std::string filename_lower = filename;
-                std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
-
-                if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) {
-                    files.push_back(path_to_utf8(entry.path()));
-                }
-            }
-            return files;
-        };
-
-        // Resolve the requested GGUF variant within a candidate list of files.
-        // Returns the matched absolute path, or "" if this candidate set does not
-        // contain the variant. Factored into a lambda so the search can be retried
-        // against a broader set of snapshots (see #2300 below) without duplicating
-        // the matching logic.
-        auto resolve_gguf_variant =
-            [&](const std::vector<std::string>& gguf_files) -> std::string {
-            if (gguf_files.empty()) {
-                return "";
-            }
-
-            // Case 0: Wildcard (*) - return first file (llama-server will auto-load shards)
-            if (variant == "*") {
-                return gguf_files[0];
-            }
-
-            // Case 1: Empty variant - return first file
-            if (variant.empty()) {
-                return gguf_files[0];
-            }
-
-            // Case 2: Exact filename match (variant ends with .gguf)
-            if (variant.find(".gguf") != std::string::npos) {
-                for (const auto& filepath : gguf_files) {
-                    std::string filename = path_from_utf8(filepath).filename().string();
-                    if (filename == variant) {
-                        return filepath;
-                    }
-                }
-                return "";  // Exact variant not found in this candidate set
-            }
-
-            // Case 3: Files ending with {variant}.gguf (case insensitive)
-            std::string variant_lower = variant;
-            std::transform(variant_lower.begin(), variant_lower.end(), variant_lower.begin(), ::tolower);
-            std::string suffix = variant_lower + ".gguf";
-
-            std::vector<std::string> matching_files;
-            for (const auto& filepath : gguf_files) {
-                std::string filename = path_from_utf8(filepath).filename().string();
-                std::string filename_lower = filename;
-                std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
-
-                if (filename_lower.size() >= suffix.size() &&
-                    filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) {
-                    matching_files.push_back(filepath);
-                }
-            }
-
-            if (!matching_files.empty()) {
-                return matching_files[0];
-            }
-
-            // Case 4: Folder-based sharding (files in variant/ folder)
-            std::string folder_prefix_lower = variant_lower + "/";
-
-            for (const auto& filepath : gguf_files) {
-                // Get relative path from model cache path
-                std::string relative_path = path_to_utf8(
-                    path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
-                std::string relative_lower = relative_path;
-                // Normalize path separators and case so folder-variant matching works cross-platform.
-                std::transform(relative_lower.begin(), relative_lower.end(), relative_lower.begin(), ::tolower);
-                std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/');
-
-                if (relative_lower.find(folder_prefix_lower) != std::string::npos) {
-                    return filepath;
-                }
-            }
-
-            // Case 5: Local quant-token fallback.
-            //
-            // Keep the existing resolver cases above as the primary logic: exact
-            // filenames, suffix matches, and folder-based sharding are more
-            // specific and preserve the CHECKPOINT:VARIANT contract.
-            //
-            // Some GGUF repositories name files with the quant token in the middle,
-            // for example:
-            //   Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf
-            // for variant:
-            //   IQ4_XS
-            // That file does not end with IQ4_XS.gguf, so mirror the downloader's
-            // GGUF variant enumeration over the files that are already present in
-            // the local HF cache before declaring the model missing.
-            //
-            // HF cache paths have an extra snapshots/<revision>/ prefix that is not
-            // part of the repository-relative filename. Strip it before calling
-            // enumerate_gguf_variants(); otherwise the enumerator treats
-            // "snapshots" as a top-level sharded-folder variant and never extracts
-            // the quant token from the actual GGUF filename.
-            std::vector<std::string> relative_gguf_files;
-            std::map<std::string, std::string> absolute_by_relative;
-            auto repo_relative_from_cache_relative = [](std::string rel) {
-                std::replace(rel.begin(), rel.end(), '\\', '/');
-
-                static const std::string snapshots_prefix = "snapshots/";
-                if (rel.rfind(snapshots_prefix, 0) == 0) {
-                    size_t revision_end = rel.find('/', snapshots_prefix.size());
-                    if (revision_end != std::string::npos && revision_end + 1 < rel.size()) {
-                        rel = rel.substr(revision_end + 1);
-                    }
-                }
-
-                return rel;
-            };
-
-            for (const auto& filepath : gguf_files) {
-                std::string relative_path = path_to_utf8(
-                    path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
-                relative_path = repo_relative_from_cache_relative(relative_path);
-
-                // Multiple HF snapshots can contain the same repo-relative file.
-                // Keep the first absolute path from the sorted gguf_files list
-                // so duplicates do not create false ambiguity.
-                if (absolute_by_relative.emplace(relative_path, filepath).second) {
-                    relative_gguf_files.push_back(relative_path);
-                }
-            }
-
-            std::vector<std::string> enumerated_matches;
-            auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files);
-            for (const auto& local_variant : local_variants.variants) {
-                if (gguf_reader_detail::to_lower(local_variant.name) != variant_lower) {
-                    continue;
-                }
-
-                auto it = absolute_by_relative.find(local_variant.primary_file);
-                if (it != absolute_by_relative.end()) {
-                    enumerated_matches.push_back(it->second);
-                }
-            }
-
-            if (enumerated_matches.size() == 1) {
-                LOG(INFO, "ModelManager")
-                    << "Resolved local GGUF variant '" << variant
-                    << "' via quant-token fallback: " << enumerated_matches[0] << std::endl;
-                return enumerated_matches[0];
-            }
-
-            if (enumerated_matches.size() > 1) {
-                LOG(WARNING, "ModelManager")
-                    << "Multiple local GGUF files matched variant '" << variant
-                    << "' via quant-token fallback; refusing to guess" << std::endl;
-                return "";
-            }
-
-            // No match in this candidate set. Do not fall back to another
-            // quantization in the same Hugging Face repo; otherwise a custom
-            // download with a different quant can make a built-in model appear
-            // downloaded and allow deleting the wrong file.
-            return "";
-        };
-
-        // Prefer the active refs/main snapshot so that when upstream only changed
-        // README/metadata Lemonade keeps using the previous snapshot's artifacts.
-        // (Sorted for consistent ordering, important for sharded models.)
-        std::vector<std::string> active_gguf_files =
-            collect_gguf_files(active_hf_snapshot_path(model_cache_path_fs));
-        std::sort(active_gguf_files.begin(), active_gguf_files.end());
-
-        // Whole-repo-cache candidates spanning every snapshot, populated on demand.
-        std::vector<std::string> all_cache_gguf_files;
-        bool all_cache_collected = false;
-        auto whole_cache_gguf_files = [&]() -> const std::vector<std::string>& {
-            if (!all_cache_collected) {
-                all_cache_gguf_files = collect_gguf_files(model_cache_path_fs);
-                std::sort(all_cache_gguf_files.begin(), all_cache_gguf_files.end());
-                all_cache_collected = true;
-            }
-            return all_cache_gguf_files;
-        };
-
-        if (active_gguf_files.empty() && whole_cache_gguf_files().empty()) {
-            return model_cache_path;  // Return directory if no GGUF found anywhere
-        }
-
-        std::string resolved_path = resolve_gguf_variant(active_gguf_files);
-
-        // #2300: a sibling variant that shares this HF repo can live in a snapshot
-        // other than the one refs/main points at. refs/main advances to the
-        // snapshot of whichever variant was pulled or updated last, leaving the
-        // other variants' symlinks behind in earlier snapshots; after a restart the
-        // refs/main-only search above then reports them as missing. If the active
-        // snapshot did not contain the requested variant, broaden the search to
-        // every snapshot in this repo's cache before declaring it missing. Blobs are
-        // content-addressed and shared, so reading an older snapshot is safe, and
-        // resolving against the active snapshot first preserves the CHECKPOINT:VARIANT
-        // contract (a different quant is never substituted while the exact one exists).
-        //
-        // The whole-cache set is a superset of the active set (it recurses the repo
-        // cache, which contains the active snapshot dir), so the two are equal only
-        // when refs/main's snapshot is the sole snapshot holding GGUFs — in which case
-        // the broader search is identical and skipped. Comparing the (sorted) sets,
-        // rather than just their sizes, makes that intent explicit and stays correct
-        // even if that superset relationship ever changes.
-        //
-        // When more than one inactive snapshot holds the requested variant, the
-        // existing first-by-sorted-path dedup (see Case 5) picks one deterministically;
-        // every such copy is a valid GGUF of that quant, so this is safe for the
-        // resolve/downloaded-status purpose. Preferring the newest snapshot per variant
-        // would need per-variant snapshot state the HF cache does not record today and
-        // is left as a follow-up (out of scope for this fix).
-        if (resolved_path.empty()) {
-            const std::vector<std::string>& all_files = whole_cache_gguf_files();
-            if (all_files != active_gguf_files) {
-                resolved_path = resolve_gguf_variant(all_files);
-            }
-        }
-
-        return resolved_path;
-    }
-
-    // Everything else
-    if (!variant.empty()) {
-        // Prefer refs/main for auxiliary checkpoints too (for example mmproj),
-        // so companion files stay on the same active snapshot as the main model
-        // when unchanged artifacts are reused across README-only commits.
-        fs::path active_snapshot = active_hf_snapshot_path(model_cache_path_fs);
-        if (!active_snapshot.empty()) {
-            fs::path direct_variant_path = active_snapshot / path_from_utf8(variant);
-            if (safe_exists(direct_variant_path)) {
-                return path_to_utf8(direct_variant_path);
-            }
-
-            std::error_code ec;
-            for (const auto& entry : fs::recursive_directory_iterator(active_snapshot, safe_dir_options, ec)) {
-                if (ec) break;
-                if (entry.is_regular_file(ec)) {
-                    std::string filename = entry.path().filename().string();
-                    if (filename == variant) {
-                        return path_to_utf8(entry.path());
-                    }
-                } else if (entry.is_directory(ec)) {
-                    fs::path variant_path = entry.path() / path_from_utf8(variant);
-                    if (safe_exists(variant_path)) {
-                        return path_to_utf8(variant_path);
-                    }
-                }
-                ec.clear();
-            }
-        }
-
-        // Try to find the exact variant in snapshots subdirectories
-        if (safe_exists(model_cache_path_fs)) {
-            for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-                if (entry.is_regular_file()) {
-                    std::string filename = entry.path().filename().string();
-                    if (filename == variant) {
-                        return path_to_utf8(entry.path());
-                    }
-                } else if (entry.is_directory()) {
-                    fs::path variant_path = entry.path() / path_from_utf8(variant);
-                    if (safe_exists(variant_path)) {
-                        return path_to_utf8(variant_path);
-                    }
-                }
-            }
-        }
-        // Variant not found in checkpoint's own repo - try main repo as fallback
-        // (backward compat: older downloads placed all files in the main repo dir)
-        if (checkpoint_repo_id != main_repo_id) {
-            std::string main_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(main_repo_id);
-            fs::path main_cache_path_fs = path_from_utf8(main_cache_path);
-            if (fs::exists(main_cache_path_fs)) {
-                for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) {
-                    if (entry.is_regular_file()) {
-                        std::string filename = entry.path().filename().string();
-                        if (filename == variant) {
-                            return path_to_utf8(entry.path());
-                        }
-                    } else if (entry.is_directory()) {
-                        fs::path variant_path = entry.path() / path_from_utf8(variant);
-                        if (fs::exists(variant_path)) {
-                            return path_to_utf8(variant_path);
-                        }
-                    }
-                }
-            }
-        }
-
-        // Variant not found - return empty string to indicate model not downloaded
-        return "";
-    }
+    // Compute the HF cache location for this checkpoint's repo, then let the
+    // backend's ops find its artifact within (a .gguf file, a genai_config.json
+    // directory, a .bin, …) — no per-recipe switchboard here.
+    backends::CheckpointResolveContext ctx;
+    ctx.hf_cache = hf_cache;
+    ctx.repo_id = checkpoint_to_repo_id(checkpoint);
+    ctx.main_repo_id = checkpoint_to_repo_id(info.checkpoint("main"));
+    ctx.variant = checkpoint_to_variant(checkpoint);
+    ctx.model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(ctx.repo_id);
+    ctx.type = type;
+    ctx.checkpoint = checkpoint;
 
-    // Fallback: return directory path
-    return model_cache_path;
+    return backends::ops_for(info.recipe)->resolve_checkpoint_path(info, ctx);
 }
 
 void ModelManager::resolve_all_model_paths(ModelInfo& info) {
@@ -1729,21 +1269,6 @@ static bool has_partial_files(const fs::path& dir) {
     return false;
 }
 
-static bool is_valid_gguf_file_for_cache(const std::string& path) {
-    std::ifstream in(path_from_utf8(path), std::ios::binary);
-    if (!in.is_open()) {
-        return false;
-    }
-
-    char magic[4] = {};
-    in.read(magic, sizeof(magic));
-    return in.gcount() == static_cast<std::streamsize>(sizeof(magic)) &&
-           magic[0] == 'G' &&
-           magic[1] == 'G' &&
-           magic[2] == 'U' &&
-           magic[3] == 'F';
-}
-
 static bool is_checkpoint_path_complete(const std::string& path_str) {
     if (path_str.empty()) return false;
 
@@ -1779,20 +1304,26 @@ static bool are_required_checkpoints_complete(const ModelInfo& info) {
             return false;
         }
 
-        fs::path resolved = path_from_utf8(resolved_path);
-        if (info.recipe == "llamacpp" &&
-            !safe_is_directory(resolved) &&
-            gguf_reader_detail::ends_with_ignore_case(resolved_path, ".gguf") &&
-            !is_valid_gguf_file_for_cache(resolved_path)) {
+        // Per-backend file validation (e.g. llamacpp checks GGUF magic).
+        std::string invalid = backends::ops_for(info.recipe)->validate_checkpoint_file(resolved_path);
+        if (!invalid.empty()) {
             LOG(WARNING, "ModelManager")
-                << "Invalid GGUF cache file; marking model as not downloaded: "
-                << resolved_path << std::endl;
+                << invalid << "; marking model as not downloaded: " << resolved_path << std::endl;
             return false;
         }
     }
     return true;
 }
 
+bool ModelManager::checkpoints_complete(const ModelInfo& info) const {
+    return are_required_checkpoints_complete(info);  // forwards to the file-static helper defined above
+}
+
+void ModelManager::download_from_huggingface_engine(const ModelInfo& info,
+                                                    DownloadProgressCallback progress_callback) {
+    download_from_huggingface(info, progress_callback);  // thin forwarder; adds no behavior of its own
+}
+
 void ModelManager::build_cache() {
     std::lock_guard<std::mutex> lock(models_cache_mutex_);
 
@@ -1816,11 +1347,9 @@ void ModelManager::build_cache() {
         parse_components(info, value);
         info.recipe = JsonUtils::get_or_default<std::string>(value, "recipe", "");
         info.suggested = JsonUtils::get_or_default<bool>(value, "suggested", false);
-        info.hf_load = JsonUtils::get_or_default<bool>(value, "hf_load", false);
         info.source = JsonUtils::get_or_default<std::string>(value, "source", "");
         info.size = JsonUtils::get_or_default<double>(value, "size", 0.0);
         info.cloud_provider = JsonUtils::get_or_default<std::string>(value, "cloud_provider", "");
-        info.moonshine_arch = JsonUtils::get_or_default<int>(value, "moonshine_arch", -1);
 
         // HF-backed collections store their components on Hugging Face — the
         // cached manifest is the single source of truth. Rebuild the component
@@ -1842,6 +1371,7 @@ void ModelManager::build_cache() {
         }
 
         parse_image_defaults(info, value);
+        parse_extras(info, value);
 
         // Parse recipe_options if present (for per-model runtime config like sdcpp_args)
         if (value.contains("recipe_options") && value["recipe_options"].is_object()) {
@@ -1850,7 +1380,7 @@ void ModelManager::build_cache() {
 
         // Populate type and device fields (multi-model support)
         info.type = get_model_type_from_labels(info.labels);
-        info.device = get_device_type_from_recipe(info.recipe);
+        info.device = device_type_for_recipe(info.recipe);
 
         try {
             resolve_all_model_paths(info);
@@ -1870,11 +1400,9 @@ void ModelManager::build_cache() {
         parse_components(info, value);
         info.recipe = JsonUtils::get_or_default<std::string>(value, "recipe", "");
         info.suggested = JsonUtils::get_or_default<bool>(value, "suggested", true);
-        info.hf_load = JsonUtils::get_or_default<bool>(value, "hf_load", false);
         info.source = JsonUtils::get_or_default<std::string>(value, "source", "");
         info.size = JsonUtils::get_or_default<double>(value, "size", 0.0);
         info.cloud_provider = JsonUtils::get_or_default<std::string>(value, "cloud_provider", "");
-        info.moonshine_arch = JsonUtils::get_or_default<int>(value, "moonshine_arch", -1);
 
         // HF-backed user collections (created by `lemonade pull <org>/<repo>`)
         // keep only a repo pointer in user_models.json; their components live in
@@ -1895,6 +1423,7 @@ void ModelManager::build_cache() {
         }
 
         parse_image_defaults(info, value);
+        parse_extras(info, value);
 
         // Parse recipe_options if present (for per-model runtime config like sdcpp_args)
         if (value.contains("recipe_options") && value["recipe_options"].is_object()) {
@@ -1903,7 +1432,7 @@ void ModelManager::build_cache() {
 
         // Populate type and device fields (multi-model support)
         info.type = get_model_type_from_labels(info.labels);
-        info.device = get_device_type_from_recipe(info.recipe);
+        info.device = device_type_for_recipe(info.recipe);
 
         try {
             resolve_all_model_paths(info);
@@ -1928,56 +1457,20 @@ void ModelManager::build_cache() {
         all_models[name] = info;
     }
 
-    // Step 1.6: Discover FLM models from 'flm list --json'
-    // Only discover FLM models if FLM is fully installed
-    // Precedence: server_models.json > user_models.json > extra_models > flm_list
-    auto flm_status = SystemInfoCache::get_flm_status();
-    if (flm_status.is_ready()) {
-        auto flm_available = get_flm_available_models();
-        for (const auto& info : flm_available) {
-            // Use emplace to only add if key doesn't exist (respect precedence)
-            all_models.emplace(info.model_name, info);
-        }
-    }
-
-    // Cloud-offload discovery is server-side and automatic. For each
-    // installed cloud provider with a resolvable credential (env var or
-    // runtime-auth POST), call discover_models and merge the results into
-    // all_models. Per AGENTS.md invariant #11, the registry persists only
-    // {provider, base_url} pairs — API keys live in env vars or process
-    // memory, never on disk. Failures are logged, never propagated, so a
-    // single offline provider can't block the rest of cache build.
-    if (cloud_registry_ != nullptr) {
-        auto installed = cloud_registry_->list_installed();
-        for (const auto& rec : installed) {
-            const std::string api_key = cloud_registry_->resolve_key(rec.name);
-            if (api_key.empty() || rec.base_url.empty()) {
-                LOG(INFO, "ModelManager") << "Skipping cloud discovery for '"
-                                           << rec.name << "': no API key resolvable"
-                                           << " (set " << CloudProviderRegistry::env_var_name(rec.name)
-                                           << " or POST /v1/cloud/auth)" << std::endl;
-                continue;
-            }
-            if (CloudProviderRegistry::is_http_base_url(rec.base_url) &&
-                !rec.allow_insecure_http) {
-                LOG(WARNING, "ModelManager") << "Skipping cloud discovery for '"
-                                             << rec.name << "': http:// with API key "
-                                             << "requires allow_insecure_http=true"
-                                             << std::endl;
-                continue;
-            }
-            std::vector<ModelInfo> discovered;
-            try {
-                discovered = backends::CloudServer::discover_models(rec.name, api_key, rec.base_url);
-            } catch (const std::exception& e) {
-                LOG(WARNING, "ModelManager") << "Cloud discovery threw for '"
-                                              << rec.name << "': " << e.what()
-                                              << std::endl;
+    // Step 1.6: Dynamic discovery. Backends whose models are supplied at runtime
+    // (descriptor dynamic_models = true — flm from `flm list`, cloud from each
+    // provider) contribute models via ops->discover_models(), each carrying its own
+    // downloaded status. Precedence: server/user/extra models win, so we emplace
+    // (don't overwrite). Nothing is caught here; each backend's ops handles its failures.
+    {
+        backends::BackendOpsContext octx;
+        octx.model_manager = this;
+        octx.cloud_registry = cloud_registry_;
+        for (const auto* desc : backends::all_descriptors()) {
+            if (!desc->dynamic_models) {
                 continue;
             }
-            for (auto& m : discovered) {
-                if (m.recipe != "cloud" || m.model_name.empty()) continue;
-                // Same merge precedence as FLM: emplace, don't overwrite.
+            for (auto& m : backends::ops_for(desc->recipe)->discover_models(octx)) {
                 all_models.emplace(m.model_name, std::move(m));
             }
         }
@@ -1994,21 +1487,21 @@ void ModelManager::build_cache() {
     // Step 2: Filter by backend availability
     all_models = filter_models_by_backend(all_models);
 
-    // Step 3: Check download status ONCE for all models
-    auto flm_models = get_flm_installed_models();
-    std::unordered_set<std::string> flm_set(flm_models.begin(), flm_models.end());
+    // Step 3: Check download status. Dynamic-discovery backends (flm, cloud) set it
+    // during discovery; everyone else asks its backend ops (default = shared HF
+    // completeness check). NOTE(review): JSON-defined flm/cloud entries seem to get neither.
+    backends::BackendOpsContext status_ctx;
+    status_ctx.model_manager = this;
 
     int downloaded_count = 0;
     // First pass: determine download status for non-collection models
     for (auto& [name, info] : all_models) {
         if (is_collection_recipe(info.recipe)) {
             continue;  // Handled in second pass after components are resolved
-        } else if (info.recipe == "flm") {
-            info.downloaded = flm_set.count(info.checkpoint()) > 0;
-        } else if (info.recipe == "cloud") {
-            info.downloaded = true;  // Cloud-offloaded models have no local artifacts
-        } else {
-            info.downloaded = are_required_checkpoints_complete(info);
+        }
+        const auto* desc = backends::descriptor_for(info.recipe);
+        if (!(desc && desc->dynamic_models)) {
+            info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, status_ctx);
         }
 
         if (info.downloaded) {
@@ -2076,12 +1569,12 @@ void ModelManager::add_model_to_cache(const std::string& model_name) {
     info.cloud_provider = JsonUtils::get_or_default<std::string>(*model_json, "cloud_provider", "");
 
     parse_image_defaults(info, *model_json);
+    parse_extras(info, *model_json);
     json jro = (model_json->contains("recipe_options") && (*model_json)["recipe_options"].is_object())
         ? (*model_json)["recipe_options"] : json(nullptr);
     info.recipe_options = build_recipe_options(info, jro, cache_key_to_canonical_id(model_name), recipe_options_);
 
     info.suggested = JsonUtils::get_or_default<bool>(*model_json, "suggested", is_user_model);
-    info.hf_load = JsonUtils::get_or_default<bool>(*model_json, "hf_load", false);
     info.source = JsonUtils::get_or_default<std::string>(*model_json, "source", "");
 
     if (model_json->contains("labels") && (*model_json)["labels"].is_array()) {
@@ -2092,7 +1585,7 @@ void ModelManager::add_model_to_cache(const std::string& model_name) {
 
     // Populate type and device fields (multi-model support)
     info.type = get_model_type_from_labels(info.labels);
-    info.device = get_device_type_from_recipe(info.recipe);
+    info.device = device_type_for_recipe(info.recipe);
 
     resolve_all_model_paths(info);
 
@@ -2105,16 +1598,14 @@ void ModelManager::add_model_to_cache(const std::string& model_name) {
         return; // Backend not available, don't add to cache
     }
 
-    // Check download status
+    // Check download status (collections aggregate their components; everyone
+    // else asks its backend ops).
     if (is_collection_recipe(info.recipe)) {
         info.downloaded = check_component_downloaded(info, models_cache_);
-    } else if (info.recipe == "flm") {
-        auto flm_models = get_flm_installed_models();
-        info.downloaded = std::find(flm_models.begin(), flm_models.end(), info.checkpoint()) != flm_models.end();
-    } else if (info.recipe == "cloud") {
-        info.downloaded = true;  // Cloud-offloaded models have no local artifacts
     } else {
-        info.downloaded = are_required_checkpoints_complete(info);
+        backends::BackendOpsContext octx;
+        octx.model_manager = this;
+        info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, octx);
     }
 
     populate_model_metadata(info);
@@ -2153,10 +1644,10 @@ void ModelManager::update_model_in_cache(const std::string& model_name, bool dow
         // The path changes now that files exist on disk
         if (downloaded) {
             resolve_all_model_paths(it->second);
-            if (it->second.recipe == "flm") {
+            if (backends::ops_for(it->second.recipe)->invalidates_cache_after_download()) {
                 cache_valid_ = false;
-                LOG(INFO, "ModelManager") << "Invalidated model cache after FLM download for '"
-                          << model_name << "'" << std::endl;
+                LOG(INFO, "ModelManager") << "Invalidated model cache after download for '"
+                          << model_name << "' (backend rebuilds its model list)" << std::endl;
                 return;
             }
             populate_model_metadata(it->second);
@@ -2668,16 +2159,12 @@ void ModelManager::register_user_model(const std::string& model_name,
     // loop above; this local is just for the label inference below.
     std::string recipe = model_data.value("recipe", "");
 
-    if (recipe == "sd-cpp") {
-        labels.insert("image");
-    }
-    if (recipe == "whispercpp") {
-        labels.insert("transcription");
-        labels.insert("realtime-transcription");
-    }
-    if (recipe == "moonshine") {
-        labels.insert("transcription");
-        labels.insert("realtime-transcription");
+    // Inject the backend's default labels for models that omit them (e.g. sd-cpp
+    // -> image, whispercpp/moonshine -> transcription). Sourced from the descriptor.
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
+        for (const auto& label : desc->default_labels) {
+            labels.insert(label);
+        }
     }
 
     model_entry["labels"] = labels;
@@ -2738,188 +2225,8 @@ void ModelManager::unregister_user_model(const std::string& model_name) {
     cache_valid_ = false;
 }
 
-// Find the FLM executable: install dir on Windows, system PATH on Linux.
-// Returns empty string if not found.
-static std::string find_flm_binary() {
-    try {
-        return backends::BackendUtils::get_backend_binary_path(
-            backends::FastFlowLMServer::SPEC, "npu");
-    } catch (...) {
-#ifndef _WIN32
-        return utils::find_flm_executable();
-#else
-        return "";
-#endif
-    }
-}
-
-// Helper function to get FLM installed models by calling 'flm list --filter installed --quiet'
-std::vector<std::string> ModelManager::get_flm_installed_models() {
-    std::vector<std::string> installed_models;
-
-    std::string flm_path = find_flm_binary();
-    if (flm_path.empty()) return installed_models;
 
-    // Run 'flm list --filter installed --quiet --json' to get only installed models
-    std::string output;
-#ifdef _WIN32
-    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-#else
-    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null";
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        return installed_models;
-    }
-
-    char buffer[256];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    pclose(pipe);
-#endif
-
-    // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] }
-    try {
-        json j = JsonUtils::parse(output);
-        if (j.contains("models") && j["models"].is_array()) {
-            for (const auto& model : j["models"]) {
-                if (model.contains("name") && model["name"].is_string()) {
-                    installed_models.push_back(model["name"].get<std::string>());
-                }
-            }
-            return installed_models;
-        }
-    } catch (...) {
-        // Fallback to legacy parsing if JSON parsing fails
-    }
-
-    // Legacy parsing - cleaner format without emojis
-    // Expected format:
-    //   Models:
-    //     - modelname:tag
-    //     - another:model
-    std::istringstream stream(output);
-    std::string line;
-    while (std::getline(stream, line)) {
-        // Trim whitespace
-        line.erase(0, line.find_first_not_of(" \t\r\n"));
-        line.erase(line.find_last_not_of(" \t\r\n") + 1);
-
-        // Skip the "Models:" header line or empty lines
-        if (line == "Models:" || line.empty()) {
-            continue;
-        }
-
-        // Parse model checkpoint (format: "  - modelname:tag")
-        if (line.find("- ") == 0) {
-            std::string checkpoint = line.substr(2);
-            // Trim any remaining whitespace
-            checkpoint.erase(0, checkpoint.find_first_not_of(" \t"));
-            checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1);
-            if (!checkpoint.empty()) {
-                installed_models.push_back(checkpoint);
-            }
-        }
-    }
-
-    return installed_models;
-}
-
-std::vector<ModelInfo> ModelManager::get_flm_available_models() {
-    std::vector<ModelInfo> flm_models;
-
-    std::string flm_path = find_flm_binary();
-    if (flm_path.empty()) return flm_models;
-
-    LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl;
-
-    // Run 'flm list --json' to get all available models
-    std::string output;
-#ifdef _WIN32
-    std::string command = "\"" + flm_path + "\" list --json";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-    LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc
-              << ", output length: " << output.size() << std::endl;
-    if (rc != 0 || output.empty()) {
-        LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. "
-                  << "Output: " << output.substr(0, 200) << std::endl;
-    }
-#else
-    std::string command = "\"" + flm_path + "\" list --json 2>/dev/null";
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        return flm_models;
-    }
-
-    char buffer[256];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    pclose(pipe);
-#endif
-
-    // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] }
-    try {
-        json j = JsonUtils::parse(output);
-        if (j.contains("models") && j["models"].is_array()) {
-            for (const auto& m : j["models"]) {
-                if (m.contains("name") && m["name"].is_string()) {
-                    std::string checkpoint = m["name"].get<std::string>();
-
-                    // Format display name: replace : with -, append -FLM
-                    // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM"
-                    std::string display_name = checkpoint;
-                    // Replace : with -
-                    std::replace(display_name.begin(), display_name.end(), ':', '-');
-
-                    std::string model_name = display_name + "-FLM";
-
-                    ModelInfo info;
-                    info.model_name = model_name;
-                    info.checkpoints["main"] = checkpoint;
-                    info.recipe = "flm";
-                    info.suggested = true; // All official FLM models are suggested
-
-                    if (JsonUtils::get_or_default<bool>(m, "installed", false) && m.contains("url") && m["url"].is_string()) {
-                        fs::path config_path = find_flm_config_path_from_repo_dir(repo_dir_from_url(m["url"].get<std::string>()));
-                        if (!config_path.empty()) {
-                            info.resolved_paths["config"] = path_to_utf8(config_path);
-                        }
-                    }
-
-                    // Size in GB (footprint field contains disk size in GB)
-                    if (m.contains("footprint") && m["footprint"].is_number()) {
-                        info.size = m["footprint"].get<double>();
-                    }
-
-                    // Labels from FLM metadata
-                    if (m.contains("label") && m["label"].is_array()) {
-                        for (const auto& l : m["label"]) {
-                            if (l.is_string()) {
-                                info.labels.push_back(l.get<std::string>());
-                            }
-                        }
-                    }
-
-                    // Populate type and device fields (multi-model support)
-                    info.type = get_model_type_from_labels(info.labels);
-                    info.device = get_device_type_from_recipe(info.recipe);
-
-                    flm_models.push_back(info);
-                }
-            }
-        }
-    } catch (const std::exception& e) {
-        LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl;
-    } catch (...) {
-        LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl;
-    }
 
-    return flm_models;
-}
 
 bool ModelManager::is_model_downloaded(const std::string& model_name) {
     // Build cache if needed
@@ -2943,19 +2250,17 @@ bool ModelManager::is_model_downloaded(const std::string& model_name) {
     return false;
 }
 
-void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) {
-    // Cloud models have no local artifacts; "downloading" is a no-op.
-    if (info.recipe == "cloud") {
-        update_model_in_cache(info.model_name, true);
-        return;
-    }
+bool ModelManager::backend_self_manages_downloads(const std::string& recipe) const {
+    const auto* descriptor = backends::descriptor_for(recipe);  // may be nullptr; checked below
+    return descriptor != nullptr && descriptor->self_manages_downloads;
+}
 
-    // Use recipe-specific download paths
-    if (info.recipe == "flm") {
-        download_from_flm(info.checkpoint(), do_not_upgrade, progress_callback);
-    } else {
-        download_from_huggingface(info, progress_callback);
-    }
+void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) {
+    // The backend's ops own the download (shared HF engine by default; flm pulls
+    // via the flm CLI; cloud is a no-op).
+    backends::BackendOpsContext octx;
+    octx.model_manager = this;
+    backends::ops_for(info.recipe)->download_model(info, do_not_upgrade, progress_callback, octx);
 
     // Update cache after successful download
     update_model_in_cache(info.model_name, true);
@@ -3349,20 +2654,11 @@ void ModelManager::download_model(const std::string& model_name,
                 );
             }
 
-            // Validate GGUF models (llamacpp recipe) require a variant
-            if (actual_recipe == "llamacpp") {
-                std::string checkpoint_lower = actual_checkpoint;
-                std::transform(checkpoint_lower.begin(), checkpoint_lower.end(),
-                              checkpoint_lower.begin(), ::tolower);
-                if (checkpoint_lower.find("gguf") != std::string::npos &&
-                    actual_checkpoint.find(':') == std::string::npos) {
-                    throw std::runtime_error(
-                        "You are required to provide a 'variant' in the checkpoint field when "
-                        "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. "
-                        "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
-                        "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
-                    );
-                }
+            // Backend-specific checkpoint validation (llamacpp: GGUF needs :variant).
+            if (auto err = backends::ops_for(actual_recipe)->validate_registration_checkpoint(
+                    actual_checkpoint);
+                !err.empty()) {
+                throw std::runtime_error(err);
             }
 
             LOG(INFO, "ModelManager") << "Registering new user model: " << model_name << std::endl;
@@ -4185,7 +3481,11 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
         bool is_direct_file = ends_with(main_variant, ".safetensors") ||
                               ends_with(main_variant, ".pth") ||
                               ends_with(main_variant, ".ckpt");
-        bool is_moonshine = info.recipe == "moonshine";
+
+        // Backends with a bespoke artifact layout (moonshine = a directory of
+        // files) select their own download set; nullopt = the default paths.
+        auto backend_files =
+            backends::ops_for(info.recipe)->select_checkpoint_files(main_variant, repo_files);
 
         if (is_direct_file) {
             // For non-GGUF model files, download the specified file directly
@@ -4195,22 +3495,10 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
             } else {
                 throw std::runtime_error("Model file not found in repository: " + main_variant);
             }
-        } else if (is_moonshine) {
-            // Moonshine variant is a directory path (e.g., "medium-streaming-en/quantized")
-            // Download all files under that directory
-            std::string folder_prefix = main_variant;
-            if (!folder_prefix.empty() && folder_prefix.back() != '/') {
-                folder_prefix += "/";
-            }
-            for (const auto& file : repo_files) {
-                if (gguf_reader_detail::starts_with_ignore_case(file, folder_prefix)) {
-                    files_to_download[main_repo_id].push_back(file);
-                }
-            }
-            if (files_to_download[main_repo_id].empty()) {
-                throw std::runtime_error("No Moonshine model files found in folder: " + main_variant);
-            }
-            LOG(INFO, "ModelManager") << "Moonshine: downloading " << files_to_download[main_repo_id].size()
+        } else if (backend_files) {
+            files_to_download[main_repo_id] = std::move(*backend_files);
+            LOG(INFO, "ModelManager") << info.recipe << ": downloading "
+                                      << files_to_download[main_repo_id].size()
                                       << " files from " << main_variant << std::endl;
         } else {
             // GGUF model: Use identify_gguf_models to determine which files to download
@@ -4440,224 +3728,6 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
     LOG(INFO, "ModelManager") << "Download location: " << reported_download_path << std::endl;
 }
 
-void ModelManager::download_from_flm(const std::string& checkpoint,
-                                     bool do_not_upgrade,
-                                     DownloadProgressCallback progress_callback) {
-    LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl;
-
-    // Ensure FLM is ready (single source of truth)
-    auto status = SystemInfoCache::get_flm_status();
-    if (!status.is_ready()) {
-        throw std::runtime_error(status.error_string());
-    }
-
-    std::string flm_path = find_flm_binary();
-    if (flm_path.empty()) {
-        throw std::runtime_error("FLM executable not found");
-    }
-
-    // Prepare arguments
-    std::vector<std::string> args = {"pull", checkpoint};
-    if (!do_not_upgrade) {
-        args.push_back("--force");
-    }
-
-    LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\"";
-    for (const auto& arg : args) {
-        LOG(INFO, "ProcessManager") << " \"" << arg << "\"";
-    }
-    LOG(INFO, "ProcessManager") << std::endl;
-
-    // State for parsing FLM output
-    int total_files = 0;
-    int current_file_index = 0;
-    std::string current_filename;
-    bool cancelled = false;
-
-    // Run flm pull command and parse output
-    int exit_code = utils::ProcessManager::run_process_with_output(
-        flm_path, args,
-        [&](const std::string& line) -> bool {
-            // Always print the line to console
-            LOG(INFO, "FLM") << line << std::endl;
-
-            // Parse FLM output to extract progress information
-            // Pattern: "[FLM]  Downloading X/Y: filename"
-            if (line.find("[FLM]  Downloading ") != std::string::npos &&
-                line.find("/") != std::string::npos &&
-                line.find(":") != std::string::npos) {
-
-                // Extract "X/Y: filename" from "[FLM]  Downloading X/Y: filename"
-                size_t start = line.find("Downloading ") + 12;
-                size_t slash = line.find("/", start);
-                size_t colon = line.find(":", slash);
-
-                if (slash != std::string::npos && colon != std::string::npos) {
-                    try {
-                        current_file_index = std::stoi(line.substr(start, slash - start));
-                        total_files = std::stoi(line.substr(slash + 1, colon - slash - 1));
-                        current_filename = line.substr(colon + 2);  // Skip ": "
-
-                        // Send progress update
-                        if (progress_callback) {
-                            DownloadProgress progress;
-                            progress.file = current_filename;
-                            progress.file_index = current_file_index;
-                            progress.total_files = total_files;
-                            progress.bytes_downloaded = 0;
-                            progress.bytes_total = 0;
-                            progress.percent = (total_files > 0) ?
-                                ((current_file_index - 1) * 100 / total_files) : 0;
-
-                            if (!progress_callback(progress)) {
-                                cancelled = true;
-                                return false;  // Kill the process
-                            }
-                        }
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-            // Pattern: "[FLM]  Downloading: XX.X% (XXX.XMB / XXX.XMB)"
-            else if (line.find("[FLM]  Downloading: ") != std::string::npos &&
-                     line.find("%") != std::string::npos) {
-
-                // Extract percentage and bytes
-                size_t start = line.find("Downloading: ") + 13;
-                size_t pct_end = line.find("%", start);
-
-                if (pct_end != std::string::npos) {
-                    try {
-                        std::string pct_str = line.substr(start, pct_end - start);
-                        double file_percent = std::stod(pct_str);
-
-                        // Try to extract bytes (XXX.XMB / XXX.XMB)
-                        size_t open_paren = line.find("(", pct_end);
-                        size_t slash = line.find("/", open_paren);
-                        size_t close_paren = line.find(")", slash);
-
-                        size_t bytes_downloaded = 0;
-                        size_t bytes_total = 0;
-
-                        if (open_paren != std::string::npos && slash != std::string::npos) {
-                            std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1);
-                            std::string total_str = line.substr(slash + 1, close_paren - slash - 1);
-
-                            // Parse "XXX.XMB" format
-                            auto parse_size = [](const std::string& s) -> size_t {
-                                double val = 0;
-                                size_t mb_pos = s.find("MB");
-                                size_t gb_pos = s.find("GB");
-                                size_t kb_pos = s.find("KB");
-
-                                if (mb_pos != std::string::npos) {
-                                    val = std::stod(s.substr(0, mb_pos));
-                                    return static_cast<size_t>(val * 1024 * 1024);
-                                } else if (gb_pos != std::string::npos) {
-                                    val = std::stod(s.substr(0, gb_pos));
-                                    return static_cast<size_t>(val * 1024 * 1024 * 1024);
-                                } else if (kb_pos != std::string::npos) {
-                                    val = std::stod(s.substr(0, kb_pos));
-                                    return static_cast<size_t>(val * 1024);
-                                }
-                                return 0;
-                            };
-
-                            bytes_downloaded = parse_size(downloaded_str);
-                            bytes_total = parse_size(total_str);
-                        }
-
-                        // Send progress update with byte-level info
-                        if (progress_callback) {
-                            DownloadProgress progress;
-                            progress.file = current_filename;
-                            progress.file_index = current_file_index;
-                            progress.total_files = total_files;
-                            progress.bytes_downloaded = bytes_downloaded;
-                            progress.bytes_total = bytes_total;
-                            // Use intra-file percent when we have byte-level progress
-                            progress.percent = static_cast<int>(file_percent);
-
-                            if (!progress_callback(progress)) {
-                                cancelled = true;
-                                return false;  // Kill the process
-                            }
-                        }
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-            // Pattern: "[FLM]  Overall progress: XX.X% (X/Y files)"
-            else if (line.find("[FLM]  Overall progress: ") != std::string::npos) {
-                size_t start = line.find("progress: ") + 10;
-                size_t pct_end = line.find("%", start);
-
-                if (pct_end != std::string::npos) {
-                    try {
-                        int overall_percent = static_cast<int>(std::stod(line.substr(start, pct_end - start)));
-
-                        if (progress_callback) {
-                            DownloadProgress progress;
-                            progress.file = current_filename;
-                            progress.file_index = current_file_index;
-                            progress.total_files = total_files;
-                            progress.bytes_downloaded = 0;  // Not available for overall progress
-                            progress.bytes_total = 0;
-                            progress.percent = overall_percent;
-
-                            if (!progress_callback(progress)) {
-                                cancelled = true;
-                                return false;  // Kill the process
-                            }
-                        }
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-            // Pattern: "[FLM]  Missing files (N):"
-            else if (line.find("[FLM]  Missing files (") != std::string::npos) {
-                size_t start = line.find("(") + 1;
-                size_t end = line.find(")", start);
-                if (end != std::string::npos) {
-                    try {
-                        total_files = std::stoi(line.substr(start, end - start));
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-
-            return true;  // Continue
-        },
-        "",  // Working directory
-        3600  // 1 hour timeout for large model downloads
-    );
-
-    if (cancelled) {
-        LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl;
-        throw std::runtime_error("Download cancelled");
-    }
-
-    if (exit_code != 0) {
-        LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl;
-        throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code));
-    }
-
-    // Send completion event
-    if (progress_callback) {
-        DownloadProgress progress;
-        progress.complete = true;
-        progress.file_index = total_files;
-        progress.total_files = total_files;
-        progress.percent = 100;
-        (void)progress_callback(progress);  // Ignore return - download already complete
-    }
-
-    LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl;
-}
 
 void ModelManager::delete_model(const std::string& model_name) {
     auto info = get_model_info(model_name);
@@ -4673,55 +3743,9 @@ void ModelManager::delete_model(const std::string& model_name) {
                                  "Delete the file directly from: " + info.checkpoint());
     }
 
-    // Handle FLM models separately
+    // FLM models have no local HF cache; deletion is the backend's `flm remove`.
     if (info.recipe == "flm") {
-        LOG(INFO, "ModelManager") << "Deleting FLM model: " << info.checkpoint() << std::endl;
-
-        // Validate checkpoint is not empty
-        if (info.checkpoint().empty()) {
-            throw std::runtime_error("FLM model has empty checkpoint field, cannot delete");
-        }
-
-        // Find flm executable — on Windows flm.exe lives under the lemonade
-        // cache dir, not on PATH, so we must resolve the full path.
-        std::string flm_path = find_flm_binary();
-        if (flm_path.empty()) {
-            throw std::runtime_error("FLM executable not found");
-        }
-
-        // Prepare arguments for 'flm remove' command
-        std::vector<std::string> args = {"remove", info.checkpoint()};
-
-        LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\"";
-        for (const auto& arg : args) {
-            LOG(INFO, "ProcessManager") << " \"" << arg << "\"";
-        }
-        LOG(INFO, "ProcessManager") << std::endl;
-
-        // Run flm remove command
-        auto handle = utils::ProcessManager::start_process(flm_path, args, "", false);
-
-        // Wait for process to complete
-        int timeout_seconds = 60; // 1 minute timeout for removal
-        for (int i = 0; i < timeout_seconds * 10; ++i) {
-            if (!utils::ProcessManager::is_running(handle)) {
-                int exit_code = utils::ProcessManager::get_exit_code(handle);
-                if (exit_code != 0) {
-                    LOG(ERROR, "ModelManager") << "FLM remove failed with exit code: " << exit_code << std::endl;
-                    throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove failed with exit code " + std::to_string(exit_code));
-                }
-                break;
-            }
-            std::this_thread::sleep_for(std::chrono::milliseconds(100));
-        }
-
-        // Check if process is still running (timeout)
-        if (utils::ProcessManager::is_running(handle)) {
-            LOG(ERROR, "ModelManager") << "FLM remove timed out" << std::endl;
-            throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove timed out");
-        }
-
-        LOG(INFO, "ModelManager") << "Successfully deleted FLM model: " << canonical_model_name << std::endl;
+        backends::fastflowlm::flm_remove(info.checkpoint());
 
         // Remove from user models if it's a user model
         if (is_user_model_name(canonical_model_name)) {
@@ -5214,7 +4238,6 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name)
     parse_components(info, *model_json);
     info.recipe = JsonUtils::get_or_default<std::string>(*model_json, "recipe", "");
     info.suggested = JsonUtils::get_or_default<bool>(*model_json, "suggested", false);
-    info.hf_load = JsonUtils::get_or_default<bool>(*model_json, "hf_load", false);
     info.source = JsonUtils::get_or_default<std::string>(*model_json, "source", "");
 
     // Parse labels array
@@ -5233,10 +4256,7 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name)
         }
     }
 
-    // Parse moonshine_arch
-    if (model_json->contains("moonshine_arch") && (*model_json)["moonshine_arch"].is_number_integer()) {
-        info.moonshine_arch = (*model_json)["moonshine_arch"].get<int>();
-    }
+    parse_extras(info, *model_json);
 
     return info;
 }
diff --git a/src/cpp/server/ollama_api.cpp b/src/cpp/server/ollama_api.cpp
index 7687caab4..0604a3935 100644
--- a/src/cpp/server/ollama_api.cpp
+++ b/src/cpp/server/ollama_api.cpp
@@ -238,8 +238,9 @@ void OllamaApi::auto_load_model(const std::string& model) {
 
     auto info = model_manager_->get_model_info(name);
 
-    // Download if not cached
-    if (info.recipe != "flm" && !model_manager_->is_model_downloaded(name)) {
+    // Download if not cached (backends that self-manage downloads pull on load)
+    if (!model_manager_->backend_self_manages_downloads(info.recipe) &&
+        !model_manager_->is_model_downloaded(name)) {
         LOG(INFO, "OllamaApi") << "Model not cached, downloading..." << std::endl;
         model_manager_->download_registered_model(info, true);
         info = model_manager_->get_model_info(name);
diff --git a/src/cpp/server/prometheus_metrics.cpp b/src/cpp/server/prometheus_metrics.cpp
index 8ecfdb288..88f7bdaf3 100644
--- a/src/cpp/server/prometheus_metrics.cpp
+++ b/src/cpp/server/prometheus_metrics.cpp
@@ -1,5 +1,6 @@
 #include "lemon/prometheus_metrics.h"
 
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/version.h"
 
 #include <algorithm>
@@ -274,7 +275,8 @@ void append_llamacpp_backend_metrics(PrometheusBuilder& metrics,
                                      const json& model,
                                      const std::map<std::string, std::string>& labels,
                                      std::set<std::string>& described_backend_metrics) {
-    if (model.value("recipe", "") != "llamacpp") {
+    const auto* desc = backends::descriptor_for(model.value("recipe", ""));
+    if (desc == nullptr || !desc->exposes_prometheus_metrics) {
         return;
     }
 
diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp
index 65d4bb676..70c188e34 100644
--- a/src/cpp/server/recipe_options.cpp
+++ b/src/cpp/server/recipe_options.cpp
@@ -1,4 +1,5 @@
 #include <lemon/recipe_options.h>
+#include <lemon/backends/backend_descriptor_registry.h>
 #include <lemon/utils/custom_args.h>
 #include <nlohmann/json.hpp>
 #include <map>
@@ -12,78 +13,68 @@ namespace lemon {
 
 using json = nlohmann::json;
 
-static const json DEFAULTS = {
-    {"ctx_size", -1},  // -1 triggers auto-resolution (memory + arch metadata)
-    {"merge_args", true},
-    {"llamacpp_device", ""},
-    {"llamacpp_backend", ""},  // Will be overridden dynamically
-    {"llamacpp_args", ""},
-    {"sd-cpp_backend", ""},   // "" means auto-detect (mapped from "auto" in config.json)
-    {"sdcpp_args", ""},
-    {"whispercpp_backend", ""},  // "" means auto-detect (mapped from "auto" in config.json)
-    {"whispercpp_args", ""},
-    {"moonshine_args", ""},      // Custom arguments to pass to moonshine-server
-    // Image generation defaults (for sd-cpp recipe)
-    // These are recipe-level defaults only, not CLI arguments — per reviewer guidance,
-    // there are too many image gen params for CLI flags, and no universal defaults.
-    {"steps", 20},
-    {"cfg_scale", 7.0},
-    {"width", 512},
-    {"height", 512},
-    {"sampling_method", ""},
-    {"flow_shift", 0.0},
-    // vLLM-specific options
-    {"vllm_backend", ""},  // "" means auto-detect
-    {"vllm_args", ""},     // Custom arguments to pass to vllm-server
-    // Cloud recipe has no backend variants (provider selection lives on the
-    // per-model cloud_provider field). The empty string satisfies Router's
-    // per-backend-args lookup; cloud reads no backend-specific config.
-    {"cloud_backend", ""},
-
-    // Auto-eviction options
-    {"auto_evict", nullptr},          // nullptr means fallback to global config
-    {"evict_idle_timeout", 300},      // Default hard idle timeout (5 mins)
-    {"downsize_idle_timeout", 60},    // Default soft idle timeout (1 min)
-    {"evict_weight_factor", 1.0},     // Eviction-protection weight (higher = more protected)
-    {"pinned", false}
-};
-
-
-// Mapping from flat option names to CLI flags (used by to_cli_options)
-// Note: Image generation params (steps, cfg_scale, width, height, sampling_method,
-// flow_shift) are recipe-level defaults only — not exposed as CLI arguments.
-// Runtime options (diffusion_fa, offload_to_cpu) go through --sdcpp-args.
-static const std::map<std::string, std::string> OPTION_TO_CLI_FLAG = {
-    {"ctx_size", "--ctx-size"},
-    {"merge_args", "--merge-args"},
-    {"llamacpp_backend", "--llamacpp"},
-    {"llamacpp_device", "--llamacpp-device"},
-    {"llamacpp_args", "--llamacpp-args"},
-    {"sd-cpp_backend", "--sdcpp"},
-    {"sdcpp_args", "--sdcpp-args"},
-    {"whispercpp_backend", "--whispercpp"},
-    {"whispercpp_args", "--whispercpp-args"},
-    {"moonshine_args", "--moonshine-args"},
-    {"vllm_backend", "--vllm"},
-    {"vllm_args", "--vllm-args"}
-};
+// Defaults shared by every backend. Backend-specific options (and whether a
+// backend opts in to ctx_size) are declared by each backend's descriptor.
+static const json& common_defaults() {
+    static const json d = {
+        {"ctx_size", -1},  // -1 triggers auto-resolution (memory + arch metadata)
+        {"merge_args", true},
+        // Auto-eviction options (apply to every recipe)
+        {"auto_evict", nullptr},          // nullptr means fallback to global config
+        {"evict_idle_timeout", 300},      // Default hard idle timeout (5 mins)
+        {"downsize_idle_timeout", 60},    // Default soft idle timeout (1 min)
+        {"evict_weight_factor", 1.0},     // Eviction-protection weight (higher = more protected)
+        {"pinned", false},
+    };
+    return d;
+}
+
+// Default value of every known option: the shared kit from common_defaults()
+// overlaid with the options each registered backend descriptor declares. Built
+// once on first use so defaults, CLI flags, and load-time resolution stay in sync.
+static const json& get_defaults() {
+    static const json merged = [] {
+        json table = common_defaults();
+        for (const auto* backend : lemon::backends::all_descriptors()) {
+            for (const auto& option : backend->options) {
+                table[option.name] = option.default_value;
+            }
+        }
+        return table;
+    }();
+    return merged;
+}
+
+// Maps a flat option name to its CLI flag for to_cli_options(). The two common
+// flags are seeded first; descriptor options are added only if they name a flag.
+static const std::map<std::string, std::string>& get_option_to_cli_flag() {
+    static const std::map<std::string, std::string> flags_by_option = [] {
+        std::map<std::string, std::string> table{
+            {"ctx_size", "--ctx-size"},
+            {"merge_args", "--merge-args"},
+        };
+        for (const auto* backend : lemon::backends::all_descriptors()) {
+            for (const auto& option : backend->options) {
+                if (!option.cli_flag.empty()) {
+                    table[option.name] = option.cli_flag;
+                }
+            }
+        }
+        return table;
+    }();
+    return flags_by_option;
+}
 
 static std::vector<std::string> get_keys_for_recipe(const std::string& recipe) {
     std::vector<std::string> keys;
-    if (recipe == "llamacpp") {
-        keys = {"ctx_size", "llamacpp_device", "llamacpp_backend", "llamacpp_args", "merge_args"};
-    } else if (recipe == "whispercpp") {
-        keys = {"whispercpp_backend", "whispercpp_args", "merge_args"};
-    } else if (recipe == "moonshine") {
-        keys = {"moonshine_args", "merge_args"};
-    } else if (recipe == "flm") {
-        return {"ctx_size", "merge_args"};
-    } else if (recipe == "ryzenai-llm") {
-        keys = {"ctx_size"};
-    } else if (recipe == "sd-cpp") {
-        keys = {"sd-cpp_backend", "sdcpp_args", "steps", "cfg_scale", "width", "height", "sampling_method", "flow_shift", "merge_args"};
-    } else if (recipe == "vllm") {
-        keys = {"ctx_size", "vllm_backend", "vllm_args", "merge_args"};
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
+        if (desc->uses_ctx_size) {
+            keys.push_back("ctx_size");
+        }
+        for (const auto& opt : desc->options) {
+            keys.push_back(opt.name);
+        }
+        keys.push_back("merge_args");
     }
 
     // Add auto-eviction options for all recipes
@@ -125,7 +116,7 @@ static bool try_get_backend_options(const std::string& opt_name, SystemInfo::Sup
 std::vector<std::string> RecipeOptions::to_cli_options(const json& raw_options) {
     std::vector<std::string> cli;
 
-    for (auto& [opt_name, cli_flag] : OPTION_TO_CLI_FLAG) {
+    for (auto& [opt_name, cli_flag] : get_option_to_cli_flag()) {
         if (raw_options.contains(opt_name)) {
             auto val = raw_options[opt_name];
             if (!val.is_null() && val != "") {
@@ -146,7 +137,7 @@ std::vector<std::string> RecipeOptions::to_cli_options(const json& raw_options)
 
 std::vector<std::string> RecipeOptions::known_keys() {
     std::vector<std::string> keys;
-    for (auto& [key, value] : DEFAULTS.items()) {
+    for (auto& [key, value] : get_defaults().items()) {
         keys.push_back(key);
     }
     return keys;
@@ -239,7 +230,7 @@ json RecipeOptions::get_option(const std::string& opt) const {
         }
     }
 #endif
-    return DEFAULTS.contains(opt) ? DEFAULTS[opt] : json();
+    return get_defaults().contains(opt) ? get_defaults()[opt] : json();
 }
 
 void RecipeOptions::set_option(const std::string& opt, const json& value) {
@@ -247,29 +238,38 @@ void RecipeOptions::set_option(const std::string& opt, const json& value) {
 }
 
 #ifdef LEMONADE_CLI
-// CLI_OPTIONS used only by the lemonade CLI client for add_cli_options
-static const json CLI_OPTIONS = {
-    {"--ctx-size", {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}}},
-    {"--merge-args", {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}}},
-    {"--llamacpp", {{"option_name", "llamacpp_backend"}, {"type_name", "BACKEND"}, {"help", "LlamaCpp backend to use"}, {"group", "Llama.cpp Backend Options"}}},
-    {"--llamacpp-device", {{"option_name", "llamacpp_device"}, {"type_name", "DEVICES"}, {"help", "Comma-separated list of accelerator devices to use (e.g. Vulkan0)"}, {"group", "Llama.cpp Backend Options"}}},
-    {"--llamacpp-args", {{"option_name", "llamacpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to llama-server"}, {"group", "Llama.cpp Backend Options"}}},
-    {"--sdcpp", {{"option_name", "sd-cpp_backend"}, {"type_name", "BACKEND"}, {"help", "SD.cpp backend to use"}, {"group", "Stable Diffusion Options"}}},
-    {"--sdcpp-args", {{"option_name", "sdcpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to sd-server (must not conflict with managed args)"}, {"group", "Stable Diffusion Options"}}},
-    {"--whispercpp", {{"option_name", "whispercpp_backend"}, {"type_name", "BACKEND"}, {"help", "WhisperCpp backend to use"}, {"group", "Whisper.cpp Options"}}},
-    {"--whispercpp-args", {{"option_name", "whispercpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to whisper-server"}, {"group", "Whisper.cpp Options"}}},
-    {"--moonshine-args", {{"option_name", "moonshine_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to moonshine-server"}}},
-    {"--vllm", {{"option_name", "vllm_backend"}, {"type_name", "BACKEND"}, {"help", "vLLM backend to use"}, {"group", "vLLM Options"}}},
-    {"--vllm-args", {{"option_name", "vllm_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to vllm-server"}, {"group", "vLLM Options"}}},
-    // Note: Image gen params (--steps, --cfg-scale, --width, --height) removed — recipe-level only.
-    // Runtime options (--diffusion-fa, --offload-to-cpu) go through --sdcpp-args.
-};
+// Option table used only by the lemonade CLI client, via add_cli_options.
+// ctx_size/merge_args are the common flags; every other entry is derived from
+// a descriptor option that declares a CLI flag, so the CLI never needs editing
+// when a backend is added. Image-gen params (steps/cfg_scale/width/height) have
+// no cli_flag in their descriptor, so they stay recipe-level only as before.
+static const json& get_cli_options() {
+    static const json table = [] {
+        json flags = json::object();
+        flags["--ctx-size"] = {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}};
+        flags["--merge-args"] = {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}};
+        for (const auto* backend : lemon::backends::all_descriptors()) {
+            for (const auto& option : backend->options) {
+                if (!option.cli_flag.empty()) {
+                    // The help group is optional; the key is omitted when it is empty.
+                    json meta = {{"option_name", option.name}, {"type_name", option.type_name}, {"help", option.help}};
+                    if (!option.group.empty()) {
+                        meta["group"] = option.group;
+                    }
+                    flags[option.cli_flag] = meta;
+                }
+            }
+        }
+        return flags;
+    }();
+    return table;
+}
 
 void RecipeOptions::add_cli_options(CLI::App& app, json& storage) {
-    for (auto& [key, opt] : CLI_OPTIONS.items()) {
+    for (auto& [key, opt] : get_cli_options().items()) {
         const std::string opt_name = opt["option_name"];
         CLI::Option* o;
-        json defval = DEFAULTS[opt_name];
+        json defval = get_defaults()[opt_name];
 
         if (defval.is_number_float()) {
             o = app.add_option_function<double>(key, [opt_name, &storage = storage](double val) { storage[opt_name] = val; }, opt["help"]);
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index b3ec22c3b..514a9773e 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -1,14 +1,15 @@
 #include "lemon/router.h"
 #include "lemon/cloud_provider_registry.h"
-#include "lemon/backends/cloud_server.h"
-#include "lemon/backends/llamacpp_server.h"
-#include "lemon/backends/fastflowlm_server.h"
-#include "lemon/backends/ryzenaiserver.h"
-#include "lemon/backends/whisper_server.h"
-#include "lemon/backends/moonshine_server.h"
-#include "lemon/backends/kokoro_server.h"
-#include "lemon/backends/sd_server.h"
-#include "lemon/backends/vllm_server.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/cloud/cloud_server.h"
+#include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/ryzenai/ryzenai_server.h"
+#include "lemon/backends/whispercpp/whispercpp_server.h"
+#include "lemon/backends/moonshine/moonshine_server.h"
+#include "lemon/backends/kokoro/kokoro_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
+#include "lemon/backends/vllm/vllm_server.h"
 #include "lemon/server_capabilities.h"
 #include "lemon/error_types.h"
 #include "lemon/recipe_options.h"
@@ -143,12 +144,26 @@ bool Router::reload_model_after_watchdog_reset(const std::string& requested_mode
     }
 }
 
+// Slot/eviction policy for a recipe, taken from its backend descriptor;
+// recipes without a registered descriptor get SlotPolicy::Standard. Being
+// recipe-static, it can drive pre-load slot decisions, mirroring the
+// historical use of get_device_type_from_recipe at load time.
+static SlotPolicy slot_policy_for_recipe(const std::string& recipe) {
+    const auto* descriptor = backends::descriptor_for(recipe);
+    return descriptor != nullptr ? descriptor->slot_policy : SlotPolicy::Standard;
+}
+
+// True when the recipe's descriptor policy is SlotPolicy::Unmetered.
+static bool is_unmetered_recipe(const std::string& recipe) {
+    return slot_policy_for_recipe(recipe) == SlotPolicy::Unmetered;
+}
+
 int Router::count_servers_by_type(ModelType type) const {
     int count = 0;
     for (const auto& server : loaded_servers_) {
-        // Cloud servers consume no local memory and stay loaded for free, so
-        // they are excluded from the slot accounting that drives LRU eviction.
-        if (server->get_recipe_options().get_recipe() == "cloud") {
+        // Unmetered backends (cloud) consume no local memory and stay loaded for
+        // free, so they are excluded from the slot accounting that drives LRU eviction.
+        if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) {
             continue;
         }
         if (server->is_backend_alive() && server->get_model_type() == type) {
@@ -162,10 +177,10 @@ WrappedServer* Router::find_lru_server_by_type(ModelType type) const {
     WrappedServer* lru = nullptr;
 
     for (const auto& server : loaded_servers_) {
-        // Cloud servers are not eviction candidates; they have no memory cost
-        // and reloading them is essentially free, but evicting them throws
-        // away the cached api key/upstream-id binding for no benefit.
-        if (server->get_recipe_options().get_recipe() == "cloud") {
+        // Unmetered backends (cloud) are not eviction candidates; they have no
+        // memory cost and reloading them is essentially free, but evicting them
+        // throws away the cached api key/upstream-id binding for no benefit.
+        if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) {
             continue;
         }
         if (server->is_backend_alive() && server->get_model_type() == type) {
@@ -210,10 +225,11 @@ WrappedServer* Router::find_npu_server_by_recipe(const std::string& recipe) cons
     return nullptr;
 }
 
-WrappedServer* Router::find_flm_server_by_type(ModelType type) const {
+WrappedServer* Router::find_coexisting_server_by_type(ModelType type) const {
     for (const auto& server : loaded_servers_) {
         if (server->is_backend_alive() &&
-            server->get_recipe_options().get_recipe() == "flm" &&
+            slot_policy_for_recipe(server->get_recipe_options().get_recipe()) ==
+                SlotPolicy::CoexistByType &&
             server->get_model_type() == type) {
             return server.get();
         }
@@ -299,49 +315,28 @@ void Router::simulate_vram_pressure(double pct) {
 }
 
 std::unique_ptr<WrappedServer> Router::create_backend_server(const ModelInfo& model_info) {
-    std::unique_ptr<WrappedServer> new_server;
     std::string log_level = config_->log_level();
 
-    if (model_info.recipe == "cloud") {
-        LOG(DEBUG, "Router") << "Creating CloudServer backend (provider: "
-                             << model_info.cloud_provider << ")" << std::endl;
-        new_server = std::make_unique<backends::CloudServer>(model_info.cloud_provider, log_level,
-                                                              model_manager_, backend_manager_,
-                                                              cloud_registry_);
-    } else if (model_info.recipe == "whispercpp") {
-        LOG(DEBUG, "Router") << "Creating WhisperServer backend" << std::endl;
-        new_server = std::make_unique<backends::WhisperServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "moonshine") {
-        LOG(DEBUG, "Router") << "Creating MoonshineServer backend" << std::endl;
-        new_server = std::make_unique<backends::MoonshineServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "kokoro") {
-        LOG(DEBUG, "Router") << "Creating Kokoro backend" << std::endl;
-        new_server = std::make_unique<backends::KokoroServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "sd-cpp") {
-        LOG(DEBUG, "Router") << "Creating SDServer backend" << std::endl;
-        new_server = std::make_unique<backends::SDServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "flm") {
-        LOG(DEBUG, "Router") << "Creating FastFlowLM backend" << std::endl;
-        new_server = std::make_unique<backends::FastFlowLMServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "ryzenai-llm") {
-        LOG(DEBUG, "Router") << "Creating RyzenAI-Server backend" << std::endl;
-
-        std::string model_path = model_info.resolved_path();
-        LOG(DEBUG, "Router") << "Using model path: " << model_path << std::endl;
-
-        auto* ryzenai_server = new RyzenAIServer(model_info.model_name,
-                                                  log_level == "debug", model_manager_, backend_manager_);
-        ryzenai_server->set_model_path(model_path);
-        new_server.reset(ryzenai_server);
-    } else if (model_info.recipe == "vllm") {
-        LOG(DEBUG, "Router") << "Creating vLLM backend" << std::endl;
-        new_server = std::make_unique<backends::VLLMServer>(log_level, model_manager_, backend_manager_);
-    } else {
-        LOG(DEBUG, "Router") << "Creating LlamaCpp backend" << std::endl;
-        new_server = std::make_unique<backends::LlamaCppServer>(log_level, model_manager_, backend_manager_);
+    backends::BackendContext ctx;
+    ctx.log_level = log_level;
+    ctx.model_manager = model_manager_;
+    ctx.backend_manager = backend_manager_;
+    ctx.cloud_registry = cloud_registry_;
+    ctx.model_info = &model_info;
+
+    // The backend registry binds each recipe's descriptor to its create(). It is
+    // the single source of truth for backend construction (see LEMON_BACKENDS).
+    std::unique_ptr<WrappedServer> new_server = backends::create_server(model_info.recipe, ctx);
+    if (new_server) {
+        LOG(DEBUG, "Router") << "Created backend for recipe '" << model_info.recipe
+                             << "' via registry" << std::endl;
+        return new_server;
     }
 
-    return new_server;
+    // Unknown recipe: fall back to llamacpp, preserving the historical default.
+    LOG(DEBUG, "Router") << "No registered backend for recipe '" << model_info.recipe
+                         << "', defaulting to LlamaCpp" << std::endl;
+    return std::make_unique<backends::LlamaCppServer>(log_level, model_manager_, backend_manager_);
 }
 
 void Router::load_model(const std::string& model_name,
@@ -427,52 +422,61 @@ void Router::load_model(const std::string& model_name,
         // Get max models for this type (same limit for all types)
         int max_models = config_->max_loaded_models();
 
-        // NPU EXCLUSIVITY CHECK (recipe-aware rules)
-        // FLM can run up to 3 concurrent NPU processes (1 LLM + 1 transcription + 1 embedding)
-        // RyzenAI and WhisperCpp lock the entire NPU exclusively
-        if (device_type & DEVICE_NPU) {
-            if (model_info.recipe == "ryzenai-llm" || model_info.recipe == "whispercpp") {
-                // Exclusive NPU recipes - evict ALL NPU servers
+        // NPU EXCLUSIVITY CHECK — driven by the backend's slot policy (descriptor),
+        // but only when this load targets the NPU. The pre-refactor code guarded
+        // this with `device_type & DEVICE_NPU`; without the guard an ExclusiveNpu
+        // recipe loading on CPU/GPU would needlessly evict every NPU server.
+        //   ExclusiveNpu (ryzenai-llm, whisper-on-npu): evict ALL NPU servers first.
+        //   CoexistByType (flm): one server per model type; evicts exclusive-NPU peers.
+        switch ((device_type & DEVICE_NPU) ? slot_policy_for_recipe(model_info.recipe) : SlotPolicy::Standard) {
+            case SlotPolicy::ExclusiveNpu: {
                 if (has_npu_server()) {
                     LOG(INFO, "Router") << model_info.recipe
                               << " requires exclusive NPU access, evicting all NPU servers..." << std::endl;
                     evict_all_npu_servers();
                 }
-            } else if (model_info.recipe == "flm") {
-                // FLM can coexist with other FLM types, but not with exclusive-NPU recipes
-                // 1. Evict any exclusive-NPU server (mutually exclusive)
-                for (const std::string& exclusive_recipe : {"ryzenai-llm", "whispercpp"}) {
-                    WrappedServer* exclusive_server = find_npu_server_by_recipe(exclusive_recipe);
-                    if (exclusive_server) {
-                        LOG(INFO, "Router") << "FLM cannot coexist with " << exclusive_recipe
-                                  << ", evicting: " << exclusive_server->get_model_name() << std::endl;
-                        evict_server(exclusive_server);
+                break;
+            }
+            case SlotPolicy::CoexistByType: {
+                // 1. Evict every NPU holder that is not itself a coexisting (FLM)
+                //    backend — i.e. exclusive-NPU peers like ryzenai-llm and
+                //    whisper-on-npu. Collect first; evict_server mutates loaded_servers_.
+                std::vector<WrappedServer*> exclusive_peers;
+                for (const auto& server : loaded_servers_) {
+                    if (server->is_backend_alive() && (server->get_device_type() & DEVICE_NPU) &&
+                        slot_policy_for_recipe(server->get_recipe_options().get_recipe()) !=
+                            SlotPolicy::CoexistByType) {
+                        exclusive_peers.push_back(server.get());
                     }
                 }
+                for (auto* peer : exclusive_peers) {
+                    LOG(INFO, "Router") << "FLM cannot coexist with "
+                              << peer->get_recipe_options().get_recipe()
+                              << ", evicting: " << peer->get_model_name() << std::endl;
+                    evict_server(peer);
+                }
                 // 2. Evict FLM of the SAME model type (max 1 per type: 1 LLM, 1 transcription, 1 embed)
-                WrappedServer* same_type_flm = find_flm_server_by_type(model_type);
+                WrappedServer* same_type_flm = find_coexisting_server_by_type(model_type);
                 if (same_type_flm) {
                     LOG(INFO, "Router") << "FLM " << model_type_to_string(model_type)
                               << " slot occupied by: " << same_type_flm->get_model_name()
                               << ", evicting..." << std::endl;
                     evict_server(same_type_flm);
                 }
-            } else {
-                // Unknown NPU recipe - default to exclusive access
-                if (has_npu_server()) {
-                    LOG(INFO, "Router") << "Unknown NPU recipe, evicting all NPU servers..." << std::endl;
-                    evict_all_npu_servers();
-                }
+                break;
             }
+            case SlotPolicy::Standard:
+            case SlotPolicy::Unmetered:
+                break;
         }
 
         // LRU EVICTION CHECK (from spec: Least Recently Used Cache)
-        // Skip eviction if unlimited (-1). Cloud-recipe loads also skip the
+        // Skip eviction if unlimited (-1). Unmetered (cloud) loads also skip the
         // check entirely: they consume no local resources, so they have no
         // business kicking a warm local model out of memory.
-        bool is_cloud_load = (model_info.recipe == "cloud");
+        bool is_unmetered_load = is_unmetered_recipe(model_info.recipe);
         int current_count = count_servers_by_type(model_type);
-        if (!is_cloud_load && max_models != -1 && current_count >= max_models) {
+        if (!is_unmetered_load && max_models != -1 && current_count >= max_models) {
             WrappedServer* lru = find_lru_server_by_type(model_type);
             if (lru) {
                 LOG(INFO, "Router") << "Slot limit reached for type "
@@ -1446,7 +1450,8 @@ void Router::responses_stream(const std::string& request_body, httplib::DataSink
 int Router::count_pinned_servers_by_type(ModelType type) const {
     int count = 0;
     for (const auto& server : loaded_servers_) {
-        if (server->get_recipe_options().get_recipe() == "cloud") {
+        // Unmetered servers (cloud) never occupy a slot, so they don't count.
+        if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) {
             continue;
         }
         if (server->is_backend_alive() && server->get_model_type() == type && server->is_pinned()) {
diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp
index 261de4477..cc9bd6189 100644
--- a/src/cpp/server/runtime_config.cpp
+++ b/src/cpp/server/runtime_config.cpp
@@ -1,4 +1,5 @@
 #include "lemon/runtime_config.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/system_info.h"
 #include "lemon/utils/aixlog.hpp"
 #include "lemon/utils/path_utils.h"
@@ -29,22 +30,26 @@ RuntimeConfig* RuntimeConfig::global() {
     return s_global_instance.load(std::memory_order_acquire);
 }
 
-static const std::vector<std::string> s_backend_names = {
-    "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro"
-};
-
+// A valid config.json backend section is the config_section of any descriptor
+// that runs a local subprocess (binary != ""). Cloud has no binary, so it is not
+// a backend section. Derived from descriptors — no hand-maintained list.
 static bool is_backend_name(const std::string& key) {
-    return std::find(s_backend_names.begin(), s_backend_names.end(), key) != s_backend_names.end();
+    // Match `key` against the section of each descriptor that ships a binary.
+    for (const auto* d : lemon::backends::all_descriptors()) {
+        const bool has_binary = !d->binary.empty();
+        if (has_binary && d->effective_config_section() == key) return true;
+    }
+    return false;
 }
 
-// Backends that have a selectable "backend" key
-static const std::vector<std::string> s_selectable_backends = {
-    "llamacpp", "whispercpp", "sdcpp", "vllm"
-};
-
+// A config section has a selectable "backend" key iff its descriptor opts in.
 static bool has_backend_selection(const std::string& config_section) {
-    return std::find(s_selectable_backends.begin(), s_selectable_backends.end(),
-                     config_section) != s_selectable_backends.end();
+    // True when a descriptor flagged selectable_backend owns `config_section`.
+    for (const auto* d : lemon::backends::all_descriptors()) {
+        if (!d->selectable_backend) continue;
+        if (d->effective_config_section() == config_section) return true;
+    }
+    return false;
 }
 
 static std::pair<json, std::string> normalize_config_set_changes(const json& changes) {
@@ -106,12 +111,18 @@ static std::pair<json, std::string> normalize_config_set_changes(const json& cha
 }
 
 std::string RuntimeConfig::config_section_to_recipe(const std::string& config_section) {
-    if (config_section == "sdcpp") return "sd-cpp";
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        if (desc->effective_config_section() == config_section) {
+            return desc->recipe;
+        }
+    }
     return config_section;
 }
 
 std::string RuntimeConfig::recipe_to_config_section(const std::string& recipe) {
-    if (recipe == "sd-cpp") return "sdcpp";
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
+        return desc->effective_config_section();
+    }
     return recipe;
 }
 
@@ -278,9 +289,16 @@ std::string RuntimeConfig::rocm_channel() const {
 
 std::string RuntimeConfig::rocm_channel_for_recipe(const std::string& recipe) const {
     std::string channel = rocm_channel();
-    // sd-cpp currently has no nightly artifacts; use stable builds.
-    if (recipe == "sd-cpp" && channel == "nightly") {
-        return "stable";
+    // Clamp to a channel the backend actually publishes. A backend that lists
+    // only {"stable"} (e.g. sd-cpp, which has no nightly artifacts) falls back to
+    // its first channel when "nightly" is requested. Driven by the descriptor's
+    // rocm_channels, so no per-recipe special case lives here.
+    const auto* desc = lemon::backends::descriptor_for(recipe);
+    if (desc && !desc->rocm_channels.empty()) {
+        const auto& channels = desc->rocm_channels;
+        if (std::find(channels.begin(), channels.end(), channel) == channels.end()) {
+            return channels.front();
+        }
     }
     return channel;
 }
@@ -340,56 +358,43 @@ json RuntimeConfig::recipe_options(const std::string& backend) const {
         return val;
     };
 
-    const std::string backend_args = backend + "_args";
-
-    if (config_.contains("llamacpp")) {
-        const auto& lc = config_["llamacpp"];
-        if (lc.contains("backend")) result["llamacpp_backend"] = resolve_auto(lc["backend"]);
-        if (lc.contains(backend_args) && lc[backend_args] != "") {
-            result["llamacpp_args"] = lc[backend_args];
-        } else if (lc.contains("args")) {
-            result["llamacpp_args"] = lc["args"];
-        }
-        if (lc.contains("device")) result["llamacpp_device"] = lc["device"];
-    }
-
-    if (config_.contains("whispercpp")) {
-        const auto& wc = config_["whispercpp"];
-        if (wc.contains("backend")) result["whispercpp_backend"] = resolve_auto(wc["backend"]);
-        if (wc.contains(backend_args) && wc[backend_args] != "") {
-            result["whispercpp_args"] = wc[backend_args];
-        } else if (wc.contains("args")) {
-            result["whispercpp_args"] = wc["args"];
-        }
-    }
+    auto ends_with = [](const std::string& s, const std::string& suf) {
+        return s.size() >= suf.size() && s.compare(s.size() - suf.size(), suf.size(), suf) == 0;
+    };
 
-    if (config_.contains("moonshine")) {
-        const auto& ms = config_["moonshine"];
-        if (ms.contains(backend_args) && ms[backend_args] != "") {
-            result["moonshine_args"] = ms[backend_args];
-        } else if (ms.contains("args")) {
-            result["moonshine_args"] = ms["args"];
-        }
-    }
+    const std::string backend_args = backend + "_args";
 
-    if (config_.contains("sdcpp")) {
-        const auto& sd = config_["sdcpp"];
-        if (sd.contains("backend")) result["sd-cpp_backend"] = resolve_auto(sd["backend"]);
-        if (sd.contains(backend_args) && sd[backend_args] != "") {
-            result["sdcpp_args"] = sd[backend_args];
-        } else if (sd.contains("args")) {
-            result["sdcpp_args"] = sd["args"];
+    // Translate each backend's nested config.json section into the flat
+    // recipe_options format, driven by the descriptor's option list — no
+    // per-recipe block. The flat key is the descriptor option name; the
+    // config.json key is derived from the option's role (its name suffix):
+    //   *_backend -> "backend"   *_args -> variant "<backend>_args" then "args"
+    //   *_device  -> "device"    everything else -> the option name verbatim
+    //                            (sd-cpp's steps/cfg_scale/width/height/…)
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        const std::string section = desc->effective_config_section();
+        if (!config_.contains(section) || !config_[section].is_object()) {
+            continue;
+        }
+        const auto& cfg = config_[section];
+        for (const auto& opt : desc->options) {
+            if (ends_with(opt.name, "_backend")) {
+                if (cfg.contains("backend")) {
+                    result[opt.name] = resolve_auto(cfg["backend"]);
+                }
+            } else if (ends_with(opt.name, "_args")) {
+                if (cfg.contains(backend_args) && cfg[backend_args] != "") {
+                    result[opt.name] = cfg[backend_args];
+                } else if (cfg.contains("args")) {
+                    result[opt.name] = cfg["args"];
+                }
+            } else {
+                const std::string ckey = ends_with(opt.name, "_device") ? "device" : opt.name;
+                if (cfg.contains(ckey)) {
+                    result[opt.name] = cfg[ckey];
+                }
+            }
         }
-        if (sd.contains("steps")) result["steps"] = sd["steps"];
-        if (sd.contains("cfg_scale")) result["cfg_scale"] = sd["cfg_scale"];
-        if (sd.contains("width")) result["width"] = sd["width"];
-        if (sd.contains("height")) result["height"] = sd["height"];
-    }
-
-    if (config_.contains("vllm")) {
-        const auto& vl = config_["vllm"];
-        if (vl.contains("backend")) result["vllm_backend"] = resolve_auto(vl["backend"]);
-        if (vl.contains("args")) result["vllm_args"] = vl["args"];
     }
 
     if (config_.contains("ctx_size")) result["ctx_size"] = config_["ctx_size"];
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index b2b0327ab..511aa080c 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -5,8 +5,8 @@
 #include "lemon/config_file.h"
 #include "lemon/mcp_server.h"
 #include "lemon/ollama_api.h"
-#include "lemon/backends/cloud_server.h"
-#include "lemon/backends/sd_server.h"
+#include "lemon/backends/cloud/cloud_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
 #include "lemon/backends/backend_utils.h"
 #include <cstring>
 #include "lemon/utils/json_utils.h"
@@ -677,6 +677,9 @@ void Server::setup_routes(httplib::Server &web_server) {
     web_server.Get("/internal/config", [this](const httplib::Request& req, httplib::Response& res) {
         handle_config_get(req, res);
     });
+    web_server.Get("/internal/config/defaults", [this](const httplib::Request& req, httplib::Response& res) {
+        handle_config_defaults_get(req, res);
+    });
     web_server.Post("/internal/cleanup-cache", [this](const httplib::Request& req, httplib::Response& res) {
         handle_cleanup_cache(req, res);
     });
@@ -1698,7 +1701,8 @@ void Server::auto_load_model_if_needed(const std::string& requested_model) {
     //   - If model is NOT downloaded: Download it from HuggingFace
     //   - If model IS downloaded: Skip HuggingFace API check entirely (use cached version)
     // Only the /pull endpoint should check for updates (uses do_not_upgrade=false)
-    if (info.recipe != "flm" && !model_manager_->is_model_downloaded(requested_model)) {
+    if (!model_manager_->backend_self_manages_downloads(info.recipe) &&
+        !model_manager_->is_model_downloaded(requested_model)) {
         LOG(INFO, "Server") << "Model not cached, downloading from Hugging Face..." << std::endl;
         LOG(INFO, "Server") << "This may take several minutes for large models." << std::endl;
         model_manager_->download_registered_model(info, true);
@@ -3244,7 +3248,7 @@ void Server::handle_image_upscale(const httplib::Request& req, httplib::Response
         // as a separate request from generation, which lets the frontend show
         // the original and upscaled images side by side with independent timing.
         std::string exe_dir = lemon::backends::BackendUtils::get_backend_binary_path(
-            lemon::backends::SDServer::SPEC, backend);
+            *lemon::backends::try_get_spec_for_recipe("sd-cpp"), backend);
         std::filesystem::path cli_exe = std::filesystem::path(exe_dir).parent_path() /
 #ifdef _WIN32
             "sd-cli.exe";
@@ -4146,60 +4150,12 @@ void Server::resolve_and_register_local_model(
     std::string recipe = model_data.value("recipe", "");
     bool vision = model_data.value("vision", false);
 
-    std::string resolved_checkpoint;
+    // The backend's ops locate its primary artifact within the imported
+    // directory (.gguf / .bin file, genai_config.json dir, …); "" means register
+    // the directory itself.
+    std::string resolved_checkpoint = backends::ops_for(recipe)->find_imported_checkpoint(dest_path);
     std::string resolved_mmproj;
 
-    // For RyzenAI LLM models, find genai_config.json
-    if (recipe == "ryzenai-llm") {
-        for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
-            if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") {
-                resolved_checkpoint = entry.path().parent_path().string();
-                break;
-            }
-        }
-        if (resolved_checkpoint.empty()) {
-            resolved_checkpoint = dest_path;
-        }
-    }
-    // For llamacpp models, find the GGUF file
-    else if (recipe == "llamacpp") {
-        std::string gguf_file_found;
-
-        // If no variant or variant not found, search for any .gguf file (excluding mmproj)
-        if (gguf_file_found.empty()) {
-            for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
-                if (entry.is_regular_file()) {
-                    std::string filename = entry.path().filename().string();
-                    std::string filename_lower = filename;
-                    std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
-
-                    if (filename_lower.find(".gguf") != std::string::npos &&
-                        filename_lower.find("mmproj") == std::string::npos) {
-                        gguf_file_found = entry.path().string();
-                        break;
-                    }
-                }
-            }
-        }
-
-        resolved_checkpoint = gguf_file_found.empty() ? dest_path : gguf_file_found;
-    }
-    // For whispercpp, find .bin file
-    else if (recipe == "whispercpp") {
-        for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
-            if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                if (filename.find(".bin") != std::string::npos) {
-                    resolved_checkpoint = entry.path().string();
-                    break;
-                }
-            }
-        }
-        if (resolved_checkpoint.empty()) {
-            resolved_checkpoint = dest_path;
-        }
-    }
-
     // Search for mmproj file if vision is enabled or mmproj hint provided
     if (vision || !mmproj.empty()) {
         for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
@@ -4515,6 +4471,20 @@ void Server::handle_config_get(const httplib::Request& /*req*/, httplib::Respons
     }
 }
 
+void Server::handle_config_defaults_get(const httplib::Request& /*req*/, httplib::Response& res) {
+    // Replies with ConfigFile::base_defaults() as indented JSON; any exception
+    // is logged and reported to the client as a 500 with an {"error": ...} body.
+    try {
+        const std::string defaults_json = ConfigFile::base_defaults().dump(2);
+        res.set_content(defaults_json, "application/json");
+    } catch (const std::exception& e) {
+        LOG(ERROR, "Server") << "ERROR in handle_config_defaults_get: " << e.what() << std::endl;
+        res.status = 500;
+        nlohmann::json payload = {{"error", e.what()}};
+        res.set_content(payload.dump(), "application/json");
+    }
+}
+
 void Server::handle_bin_change(const std::string& section,
                                 const std::string& bin_key,
                                 const std::string& new_value) {
@@ -4525,9 +4495,8 @@ void Server::handle_bin_change(const std::string& section,
     std::string backend = bin_key.substr(0, bin_key.size() - 4);
 
     // The "server_bin" key (as in ryzenai.server_bin) is not consumed by the
-    // current install flow — find_external_backend_binary uses recipe-based
-    // section lookup and there is no recipe whose section equals "ryzenai".
-    // Skip the hot-swap rather than attempt an install that won't help.
+    // current install flow, so skip the hot-swap rather than attempt an install
+    // that won't help.
     if (backend == "server") {
         LOG(WARNING, "Server") << section << "." << bin_key
                                << " is not consumed by the install flow; "
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index cf0adfc52..f7cccc162 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -7,6 +7,9 @@
 #include "lemon/utils/json_utils.h"
 #include "lemon/utils/process_manager.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/recipe_backend_def.h"
 #include <filesystem>
 #include <fstream>
 #include <sstream>
@@ -404,15 +407,8 @@ std::vector<GPUInfo> query_dxg_amd_gpus(const std::string& gpu_type) {
 // Recipe/Backend definition table - single source of truth for support matrix
 // ============================================================================
 
-// Device constraints: device_type -> set of allowed families (empty = all families)
-using DeviceConstraints = std::map<std::string, std::set<std::string>>;
-
-struct RecipeBackendDef {
-    std::string recipe;
-    std::string backend;
-    std::set<std::string> supported_os;
-    DeviceConstraints devices;
-};
+// RecipeBackendDef and DeviceConstraints are declared in lemon/recipe_backend_def.h
+// so backend descriptors can carry their own support rows.
 
 // Recipe definitions table - single source of truth for all recipe/backend support
 // Format: {recipe, backend, {supported_os}, {{device_type, {allowed_families}}}}
@@ -422,115 +418,23 @@ struct RecipeBackendDef {
 // Example: metal is listed before vulkan on macOS, vulkan before cpu elsewhere.
 //
 // Empty family set {} means "all families of that device type"
-static const std::vector<RecipeBackendDef> RECIPE_DEFS = {
-    // llamacpp with multiple backends (order = preference)
-    {"llamacpp", "system", {"linux"}, {
-        {"cpu", {"x86_64", "arm64"}}, // Placeholder, actual check is PATH-based
-    }},
-    {"llamacpp", "metal", {"macos"},
-    {
-        {"metal", {}},
-    }},
-    {"llamacpp", "cuda", {"windows", "linux"}, {
-        {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}},
-    }},
-    {"llamacpp", "vulkan", {"windows", "linux"}, {
-        {"cpu", {"x86_64", "arm64"}},
-        {"amd_gpu", {}},      // all AMD GPU families
-    }},
-    {"llamacpp", "rocm", {"windows", "linux"}, {
-        {"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}},  // STX iGPUs + RDNA2/3/4 dGPUs
-    }},
-    {"llamacpp", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64", "arm64"}},
-    }},
-
-    // whisper.cpp - NPU, ROCm GPU, Vulkan, CPU, Metal
-    {"whispercpp", "npu", {"windows"}, {
-        {"amd_npu", {"XDNA2"}},
-    }},
-    {"whispercpp", "rocm", {"windows", "linux"}, {
-        // gfx103X omitted: lemonade-sdk/whisper.cpp-rocm publishes no gfx103X
-        // ROCm whisper build, so advertising it would yield a 404 on install.
-        {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}},
-    }},
-    {"whispercpp", "vulkan", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-        {"amd_gpu", {}},
-    }},
-    {"whispercpp", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-    }},
-    {"whispercpp", "metal", {"macos"}, {
-        {"metal", {}},
-    }},
-
-    // kokoro - Windows/Linux x86_64; macOS arm64 (Metal)
-    {"kokoro", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-    }},
-    {"kokoro", "metal", {"macos"}, {
-        {"metal", {}},
-    }},
-
-    // stable-diffusion.cpp - ROCm backend for AMD GPUs
-    {"sd-cpp", "rocm", {"windows", "linux"}, {
-        {"amd_gpu", {
-            "gfx1150", "gfx1151", "gfx1152",
-            "gfx103X", "gfx110X", "gfx120X"
-        }},
-    }},
-
-    // stable-diffusion.cpp - CUDA backend for NVIDIA GPUs (Linux)
-    {"sd-cpp", "cuda", {"linux"}, {
-        {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}},
-    }},
-
-    // stable-diffusion.cpp - Vulkan backend (Windows/Linux x86_64)
-    {"sd-cpp", "vulkan", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-        {"amd_gpu", {}},
-        {"nvidia_gpu", {}},
-    }},
-
-    // stable-diffusion.cpp - CPU backend (Windows/Linux x86_64)
-    {"sd-cpp", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-    }},
-
-    // stable-diffusion.cpp - Metal backend (macOS arm64)
-    {"sd-cpp", "metal", {"macos"}, {
-        {"metal", {}},
-    }},
-
-    // FLM - NPU (XDNA2)
-    {"flm", "npu", {"windows", "linux"}, {
-        {"amd_npu", {"XDNA2"}},
-    }},
-
-    // RyzenAI LLM - Windows NPU (XDNA2)
-    {"ryzenai-llm", "npu", {"windows"}, {
-        {"amd_npu", {"XDNA2"}},
-    }},
-
-    // vLLM - ROCm backend for AMD GPUs (Linux only)
-    {"vllm", "rocm", {"linux"}, {
-        {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}},
-    }},
-
-    // Moonshine - CPU-only streaming STT. Platforms match the published
-    // moonshine-server-rocm bundles (moonshine-voice wheels): Windows x64,
-    // Linux x64/arm64, macOS arm64. No Intel macOS or Windows-arm64 wheel.
-    {"moonshine", "cpu", {"windows"}, {
-        {"cpu", {"x86_64"}},
-    }},
-    {"moonshine", "cpu", {"linux"}, {
-        {"cpu", {"x86_64", "arm64"}},
-    }},
-    {"moonshine", "cpu", {"macos"}, {
-        {"cpu", {"arm64"}},
-    }},
-};
+// The recipe/backend support matrix is assembled from every backend descriptor's
+// `support` rows (see lemon/backends/*_descriptor.cpp). Concatenated in registry
+// order; within a recipe, row order is the backend preference order. This is the
+// single source of truth — there is no separate hand-maintained table.
+static const std::vector<RecipeBackendDef>& recipe_defs() {
+    // Built once on first call; each row takes its recipe from its descriptor.
+    static const std::vector<RecipeBackendDef> table = [] {
+        std::vector<RecipeBackendDef> rows;
+        for (const auto* descriptor : lemon::backends::all_descriptors()) {
+            for (const auto& entry : descriptor->support) {
+                rows.push_back({descriptor->recipe, entry.backend, entry.supported_os, entry.devices, entry.device_summary});
+            }
+        }
+        return rows;
+    }();
+    return table;
+}
 
 // ============================================================================
 // Device family to human-readable name mapping
@@ -592,7 +496,7 @@ std::string SystemInfo::get_unsupported_backend_error(const std::string& recipe,
     std::string error;
 
-    // Find the recipe/backend in RECIPE_DEFS
-    for (const auto& def : RECIPE_DEFS) {
+    // Find the recipe/backend in the descriptor-derived support matrix
+    for (const auto& def : recipe_defs()) {
         if (def.recipe == recipe && def.backend == backend) {
             // Collect all required family names
             std::vector<std::string> family_names;
@@ -674,76 +578,49 @@ static bool device_matches_constraint(const std::string& device_family,
 
 // Generic installation check
 static bool is_recipe_installed(const std::string& recipe, const std::string& backend, std::string& error_message) {
-    bool is_llamacpp_rocm_backend = recipe == "llamacpp" && backend == "rocm";
-
-    // Special handling for ROCm backends on gfx1151 (Strix Halo) if kernel CWSR fix is missing
-    bool is_vllm_rocm_backend = recipe == "vllm" && backend == "rocm";
-    if ((recipe == "sd-cpp" && backend == "rocm") || is_llamacpp_rocm_backend || is_vllm_rocm_backend) {
-        if (needs_gfx1151_cwsr_fix()) {
-            error_message = "Linux kernel missing support";
-            return false;
-        }
+    // Special handling for ROCm backends on gfx1151 (Strix Halo) if the kernel
+    // CWSR fix is missing — which backends' rocm build needs it is a descriptor flag.
+    const auto* cwsr_desc = backends::descriptor_for(recipe);
+    if (backend == "rocm" && cwsr_desc && cwsr_desc->rocm_requires_cwsr_fix &&
+        needs_gfx1151_cwsr_fix()) {
+        error_message = "Linux kernel missing support";
+        return false;
     }
-    auto* spec = try_get_spec_for_recipe(recipe);
-    if (spec) {
+    // Find the managed binary, then let the backend's ops decide installed-ness
+    // (llamacpp "system" also needs the HIP plugin; flm can be a PATH package).
+    bool binary_found = false;
+    if (auto* spec = try_get_spec_for_recipe(recipe)) {
         try {
             BackendUtils::get_backend_binary_path(*spec, backend);
-
-            // For system llamacpp backend, also verify the HIP plugin is available
-            // This is required for ROCm GPU acceleration with dynamically loaded backends
-            if (recipe == "llamacpp" && backend == "system") {
-#ifdef __linux__
-                // Check if AMD GPU driver is loaded (KFD indicates amdgpu driver)
-                if (fs::exists("/sys/class/kfd")) {
-                    // System has AMD GPU(s), so we need the HIP plugin
-                    if (!is_ggml_hip_plugin_available()) {
-                        error_message = "HIP plugin libggml-hip.so not installed";
-                        return false;
-                    }
-                }
-#endif
-            }
-
-            return true;
+            binary_found = true;
         } catch (...) {
-#ifndef _WIN32
-            // On Linux, FLM is installed as a system package (in PATH, not install dir)
-            if (recipe == "flm" && !utils::find_flm_executable().empty()) {
-                return true;
-            }
-#endif
-            return false;
+            binary_found = false;
         }
     }
-    return false;
+    auto check = backends::ops_for(recipe)->check_install(backend, binary_found);
+    if (!check.installed && !check.error.empty()) {
+        error_message = check.error;
+    }
+    return check.installed;
 }
 
 static std::string get_recipe_version(const std::string& recipe, const std::string& backend) {
-    if (recipe == "llamacpp" && backend == "system") {
-        return SystemInfo::get_system_llamacpp_version();
-    }
+    // Read the on-disk version.txt generically, then let the backend's ops
+    // override (llamacpp "system" runs llama-server --version; flm queries the
+    // CLI when no file is present). No per-recipe branches here.
     auto* spec = try_get_spec_for_recipe(recipe);
+    std::string file_version;
     if (spec) {
         std::string version_file = BackendUtils::get_installed_version_file(*spec, backend);
-        if (version_file.empty()) {
-#ifndef _WIN32
-            // On Linux, FLM is a system package with no version.txt - query directly
-            if (recipe == "flm") {
-                return SystemInfo::get_flm_version();
-            }
-#endif
-            return "unknown";
-        }
-        std::string version = read_version_file(version_file);
-#ifndef _WIN32
-        // On Linux, version.txt may not exist on disk for system-installed FLM
-        if (recipe == "flm" && (version.empty() || version == "unknown")) {
-            return SystemInfo::get_flm_version();
+        if (!version_file.empty()) {
+            file_version = read_version_file(version_file);
         }
-#endif
-        return version;
     }
-    return "";
+    std::string resolved = backends::ops_for(recipe)->resolve_version(backend, file_version);
+    if (!spec && resolved.empty()) {
+        return "";
+    }
+    return resolved.empty() ? "unknown" : resolved;
 }
 
 static std::string get_install_command(const std::string& recipe, const std::string& backend) {
@@ -828,7 +705,7 @@ static std::string get_expected_backend_version(const std::string& recipe, const
     // version pins ("rocm-stable", "rocm-nightly") in backend_versions.json.
     // Mirror the resolution done by BackendUtils::get_backend_version().
     std::string resolved_backend = backend;
-    if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") {
+    if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") {
         std::string channel = "stable";
         if (auto* cfg = RuntimeConfig::global()) {
             channel = cfg->rocm_channel_for_recipe(recipe);
@@ -1215,12 +1092,12 @@ json SystemInfo::build_recipes_info(const json& devices) {
     std::map<std::string, std::string> configured_default_backends;
     if (auto* cfg = RuntimeConfig::global()) {
         std::set<std::string> processed_recipes;
-        for (const auto& def : RECIPE_DEFS) {
+        for (const auto& def : recipe_defs()) {
             if (!processed_recipes.insert(def.recipe).second) continue;
             std::string section = RuntimeConfig::recipe_to_config_section(def.recipe);
             std::string backend = cfg->backend_string(section, "backend");
             if (backend.empty() || backend == "auto") continue;
-            bool known = std::any_of(RECIPE_DEFS.begin(), RECIPE_DEFS.end(),
+            bool known = std::any_of(recipe_defs().begin(), recipe_defs().end(),
                 [&](const RecipeBackendDef& d) {
                     return d.recipe == def.recipe && d.backend == backend;
                 });
@@ -1280,7 +1157,7 @@ json SystemInfo::build_recipes_info(const json& devices) {
     }
 
     // Build recipes from the definition table
-    for (const auto& def : RECIPE_DEFS) {
+    for (const auto& def : recipe_defs()) {
         // Skip if not supported on current OS
         if (def.supported_os.count(current_os) == 0) {
             // Helper to format OS name nicely
@@ -1439,41 +1316,21 @@ json SystemInfo::build_recipes_info(const json& devices) {
             backend["message"] = message;
             backend["action"] = "";
         } else if (!available) {
-            // FLM on Linux needs richer state to guide users through manual setup
-            // (installing .deb, xrt drivers, etc.)
-            if (def.recipe == "flm") {
-                bool is_not_installed = install_error.empty()
-                                     || install_error.find("not installed") != std::string::npos
-                                     || install_error.find("not found") != std::string::npos;
-                bool is_version_mismatch = install_error.find("requires") != std::string::npos;
-
-                if (is_not_installed) {
-                    backend["state"] = "installable";
-                } else if (is_version_mismatch) {
-                    backend["state"] = "update_required";
-                } else {
-                    backend["state"] = "action_required";
-                }
-                backend["message"] = install_error;
-
-                if (!is_not_installed) {
+            // Backends with bespoke unavailable-state guidance (flm: a system .deb
+            // + drivers needing manual setup) classify themselves; everyone else
+            // uses the generic installable/no-fetch default below.
+            const std::string default_install_command = get_install_command(def.recipe, def.backend);
+            if (auto st = backends::ops_for(def.recipe)->classify_unavailable(
+                    def.backend, install_error, default_install_command)) {
+                backend["state"] = st->state;
+                backend["message"] = st->message;
+                backend["action"] = st->action;
+                if (st->attach_installed_version) {
                     std::string installed_version = get_recipe_version(def.recipe, def.backend);
                     if (!installed_version.empty() && installed_version != "unknown") {
                         backend["version"] = installed_version;
                     }
                 }
-
-#ifdef __linux__
-                backend["action"] = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot";
-#elif defined(_WIN32)
-                if (!is_not_installed && !is_version_mismatch) {
-                    backend["action"] = "Visit https://lemonade-server.ai/driver_install.html";
-                } else {
-                    backend["action"] = get_install_command(def.recipe, def.backend);
-                }
-#else
-                backend["action"] = get_install_command(def.recipe, def.backend);
-#endif
             } else {
                 auto* cfg = RuntimeConfig::global();
                 bool no_fetch = cfg && cfg->no_fetch_executables();
@@ -1483,16 +1340,16 @@ json SystemInfo::build_recipes_info(const json& devices) {
                     : "Backend is supported but not installed.";
                 backend["message"] = install_error.empty() ? default_message : install_error;
 
-                bool is_rocm_backend = (def.recipe == "sd-cpp" && def.backend == "rocm") ||
-                    (def.recipe == "llamacpp" && def.backend == "rocm") ||
-                    (def.recipe == "vllm" && def.backend == "rocm");
+                const auto* cwsr_desc = backends::descriptor_for(def.recipe);
+                bool is_rocm_backend = def.backend == "rocm" && cwsr_desc &&
+                                       cwsr_desc->rocm_requires_cwsr_fix;
 
-                // Special action for ROCm backends on llamacpp/sd-cpp/vllm if CWSR fix is missing
+                // Special action for ROCm backends that need the gfx1151 CWSR fix.
                 if (is_rocm_backend
                     && !install_error.empty() && needs_gfx1151_cwsr_fix()) {
                     backend["action"] = "Visit https://lemonade-server.ai/gfx1151_linux.html";
                 } else {
-                    backend["action"] = get_install_command(def.recipe, def.backend);
+                    backend["action"] = default_install_command;
                 }
             }
         } else {
@@ -1537,9 +1394,10 @@ json SystemInfo::build_recipes_info(const json& devices) {
                 return installed.compare(0, prefix.size(), prefix) == 0;
             };
 #if !defined(_WIN32)
-            // On non-Windows, FLM is a system-managed package; a version newer
-            // than the minimum required is acceptable.
-            if (def.recipe == "flm") {
+            // System-managed packages (e.g. flm on Linux) accept a version newer
+            // than the minimum required.
+            const auto* ver_desc = backends::descriptor_for(def.recipe);
+            if (ver_desc && ver_desc->version_policy == VersionPolicy::AtLeast) {
                 auto installed_ver = utils::Version::parse(installed_version);
                 auto expected_ver = utils::Version::parse(expected_version);
                 // If either version cannot be parsed, fall back to exact equality check
@@ -1611,6 +1469,60 @@ json SystemInfo::build_recipes_info(const json& devices) {
         }
     }
 
+    // Enrich each recipe entry with descriptor metadata so clients (the desktop
+    // app, the docs generator) can render display names and per-recipe option
+    // schemas without hardcoding them. This is the single source the frontend
+    // reads instead of its own per-recipe TypeScript tables.
+    int recipe_order = 0;
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        auto it = recipes.find(desc->recipe);
+        if (it == recipes.end()) {
+            ++recipe_order;
+            continue;  // recipe not surfaced on this system (e.g. cloud has no support rows)
+        }
+        json& entry = it.value();
+        entry["order"] = recipe_order++;  // descriptor registry order, for deterministic doc rendering
+        entry["display_name"] = desc->display_name;
+        entry["selectable_backend"] = desc->selectable_backend;
+        entry["uses_ctx_size"] = desc->uses_ctx_size;
+        entry["modality"] = desc->modality;
+        entry["experimental"] = desc->experimental;
+        entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name;
+        entry["web_priority"] = desc->web_priority;
+        entry["slot_policy"] = slot_policy_to_string(desc->slot_policy);
+        // Machine-independent support matrix (OS + device families + friendly device
+        // summary per backend), straight from the descriptor; the docs generator renders
+        // the README matrix from it. `row_devices` avoids shadowing the `devices` param.
+        json support = json::array();
+        for (const auto& row : desc->support) {
+            json row_devices = json::array();
+            for (const auto& [device, families] : row.devices) {
+                row_devices.push_back({{"device", device},
+                                       {"families", std::vector<std::string>(families.begin(), families.end())}});
+            }
+            support.push_back({
+                {"backend", row.backend},
+                {"os", std::vector<std::string>(row.supported_os.begin(), row.supported_os.end())},
+                {"devices", row_devices},
+                {"device_summary", row.device_summary},
+            });
+        }
+        entry["support"] = support;
+        json options = json::array();
+        for (const auto& opt : desc->options) {
+            json o = {
+                {"name", opt.name},
+                {"cli_flag", opt.cli_flag},
+                {"default", opt.default_value},
+                {"type_name", opt.type_name},
+                {"help", opt.help},
+                {"group", opt.group},
+            };
+            options.push_back(o);
+        }
+        entry["options"] = options;
+    }
+
     return recipes;
 }
 
@@ -1643,7 +1555,7 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std
     }
 
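-    // Collect remaining supported backends and capture first error (in preference order from RECIPE_DEFS)
+    // Collect remaining supported backends and capture first error (in preference order from recipe_defs())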
-    for (const auto& def : RECIPE_DEFS) {
+    for (const auto& def : recipe_defs()) {
         if (def.recipe == recipe) {
             // Skip the default_backend since we already added it
             if (def.backend == default_backend) {
@@ -1672,11 +1584,12 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std
 }
 
 std::string SystemInfo::check_recipe_supported(const std::string& recipe) {
-    // Cloud offload has no local hardware/OS requirements; availability is
-    // gated by the CloudProviderRegistry (config.json "cloud_providers") and
-    // a resolvable API key (env var or runtime auth), checked elsewhere in
-    // filter_models_by_backend / CloudServer::load.
-    if (recipe == "cloud") {
+    // A backend whose descriptor declares no support rows has no local
+    // hardware/OS gating (e.g. cloud offload): availability is determined at
+    // runtime (provider creds via the CloudProviderRegistry / API key), checked
+    // elsewhere in filter_models_by_backend / CloudServer::load.
+    const auto* desc = lemon::backends::descriptor_for(recipe);
+    if (desc && desc->support.empty()) {
         return "";
     }
     auto result = get_supported_backends(recipe);
@@ -1697,7 +1610,7 @@ std::vector<SystemInfo::RecipeStatus> SystemInfo::get_all_recipe_statuses() {
 
         if (recipe_info.contains("backends") && recipe_info["backends"].is_object()) {
-            // Iterate in preference order (from RECIPE_DEFS table)
+            // Iterate in preference order (from the recipe_defs() table)
-            for (const auto& def : RECIPE_DEFS) {
+            for (const auto& def : recipe_defs()) {
                 if (def.recipe != recipe_name) continue;
 
                 if (!recipe_info["backends"].contains(def.backend)) continue;
@@ -1736,43 +1649,6 @@ static std::string read_version_file(const fs::path& version_file) {
     return "unknown";
 }
 
-std::string SystemInfo::get_system_llamacpp_version() {
-    std::string output;
-    #ifdef _WIN32
-    std::string command = "llama-server --version 2>NUL";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-    #else
-    FILE* pipe = popen("llama-server --version 2>/dev/null", "r");
-    if (!pipe) {
-        return "unknown";
-    }
-
-    char buffer[256];
-    if (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output = buffer;
-    }
-
-    pclose(pipe);
-    #endif
-
-    // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432"
-    if (!output.empty()) {
-        // Try to find a version number
-        std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))");
-        std::smatch match;
-        if (std::regex_search(output, match, version_regex)) {
-            for (size_t i = 1; i < match.size(); ++i) {
-                if (match[i].matched) {
-                    return "b" + match[i].str();
-                }
-            }
-        }
-        return "detected";
-    }
-
-    return "unknown";
-}
-
 // Map a CUDA Compute Capability "MAJOR.MINOR" string (as reported by nvidia-smi
 // --query-gpu=compute_cap) to the sm_XX token used in llamacpp-cuda release filenames.
 // Returns empty if the value cannot be parsed.
@@ -2321,74 +2197,6 @@ bool SystemInfo::get_has_igpu() {
     return false;  // No iGPU detected
 }
 
-std::string SystemInfo::get_flm_version() {
-    // Cache real version strings to avoid spawning the subprocess twice per
-    // build_recipes_info() pass. "unknown" is NOT cached so that post-install
-    // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed.
-    static std::string cached_version;
-    if (!cached_version.empty()) {
-        return cached_version;
-    }
-
-    // Find the flm executable using shared utility
-    std::string flm_path = utils::find_flm_executable();
-    if (flm_path.empty() || !utils::is_safe_executable_path(flm_path)) {
-        return "unknown";
-    }
-
-    std::string output;
-    #ifdef _WIN32
-    std::string command = "\"" + flm_path + "\" version --json 2>NUL";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-    #else
-    std::string command = "\"" + flm_path + "\" version --json 2>/dev/null";
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        return "unknown";
-    }
-
-    char buffer[256];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    pclose(pipe);
-    #endif
-
-    // Parse JSON output: { "version": "0.9.34" }
-    try {
-        json j = JsonUtils::parse(output);
-        if (j.contains("version") && j["version"].is_string()) {
-            std::string version = j["version"].get<std::string>();
-            // If the version doesn't start with 'v', prepend it
-            // for backend_versions.json compatibility (e.g. "v0.9.34").
-            if (!version.empty() && version[0] != 'v') {
-                version = "v" + version;
-            }
-            cached_version = version;
-            return cached_version;
-        }
-    } catch (...) {
-        // Fallback to legacy parsing if JSON parsing fails
-    }
-
-    // Legacy parsing from output like "FLM v0.9.4"
-    if (output.find("FLM v") != std::string::npos) {
-        size_t pos = output.find("FLM v");
-        // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34").
-        std::string version = output.substr(pos + 4);
-        // Trim whitespace and newlines
-        size_t end = version.find_first_of(" \t\n\r");
-        if (end != std::string::npos) {
-            version = version.substr(0, end);
-        }
-        cached_version = version;
-        return cached_version;
-    }
-
-    return "unknown";
-}
-
 // ============================================================================
 // Factory function
 // ============================================================================
diff --git a/src/cpp/server/utils/path_utils.cpp b/src/cpp/server/utils/path_utils.cpp
index dc7492295..fb8591337 100644
--- a/src/cpp/server/utils/path_utils.cpp
+++ b/src/cpp/server/utils/path_utils.cpp
@@ -103,30 +103,6 @@ bool looks_like_path(const std::string& v) {
     }
 }
 
-std::string find_flm_executable() {
-#ifdef _WIN32
-    // On Windows, only check the Lemonade install directory (auto-installed zip).
-    // No system PATH fallback - FLM should be installed via install_backend().
-    std::string install_dir = (fs::path(get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string();
-    if (fs::exists(install_dir)) {
-        for (const auto& entry : fs::recursive_directory_iterator(install_dir)) {
-            if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") {
-                std::string path = entry.path().string();
-                if (is_safe_executable_path(path)) {
-                    return path;
-                }
-            }
-        }
-    }
-    return "";
-#else
-    // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`.
-    if (!find_executable_in_path("flm").empty()) {
-        return "flm";
-    }
-    return "";
-#endif
-}
 
 std::string find_executable_in_path(const std::string& executable_name) {
     if (!is_safe_executable_path(executable_name)) {
@@ -180,50 +156,6 @@ std::string find_executable_in_path(const std::string& executable_name) {
 #endif
 }
 
-bool is_ggml_hip_plugin_available() {
-#ifdef __linux__
-    // Allow distros/packagers that install outside the FHS paths below
-    // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so.
-    if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) {
-        // Require the basename to look like the HIP plugin (libggml-hip*.so*,
-        // case-insensitive, versioned sonames allowed). This is a sanity check,
-        // not a security boundary: the path is not forwarded to ggml's loader,
-        // so we cannot verify it is actually loadable. It only guards against an
-        // accidental override pointing at an unrelated existing file.
-        std::string name = fs::path(env).filename().string();
-        std::transform(name.begin(), name.end(), name.begin(),
-                       [](unsigned char c) { return std::tolower(c); });
-        const bool name_matches = name.rfind("libggml-hip", 0) == 0 &&
-                                  name.find(".so") != std::string::npos;
-        // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing
-        // filesystem overload: an odd or malformed path resolves to "not a
-        // regular file" (ec set) instead of raising a filesystem_error.
-        std::error_code hip_path_ec;
-        if (name_matches && fs::is_regular_file(env, hip_path_ec)) {
-            return true;
-        }
-    }
-    // On Linux x86_64, check common system library paths for the HIP plugin
-    std::vector<std::string> possible_paths = {
-        // Debian/Ubuntu multiarch path (most common)
-        "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so",
-	// Arch AUR path
-	"/usr/lib/libggml-hip.so",
-        // Standard Linux paths
-        "/usr/lib/ggml/backends0/libggml-hip.so",
-        "/usr/lib64/ggml/backends0/libggml-hip.so"
-    };
-
-    // Check all possible paths
-    for (const auto& path : possible_paths) {
-        if (fs::exists(path)) {
-            return true;
-        }
-    }
-#endif
-
-    return false;
-}
 
 std::string get_cache_dir() {
     // If set_cache_dir() was called at startup, use that
@@ -295,98 +227,5 @@ std::string get_downloaded_bin_dir() {
     return bin_dir;
 }
 
-bool run_flm_validate(const std::string& flm_path, std::string& error_message) {
-    std::string flm_exe = flm_path.empty() ? find_flm_executable() : flm_path;
-    if (flm_exe.empty()) {
-        error_message = "FLM executable not found";
-        return false;
-    }
-    if (!is_safe_executable_path(flm_exe)) {
-        error_message = "FLM path contains invalid characters";
-        return false;
-    }
-
-    std::string command = "\"" + flm_exe + "\" validate --json";
-    std::string output;
-    int exit_code;
-#ifdef _WIN32
-    exit_code = ProcessManager::run_command(command, output);
-#else
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        error_message = "Failed to execute " + flm_exe;
-        return false;
-    }
-
-    char buffer[1024];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    exit_code = pclose(pipe);
-    if (exit_code != -1) {
-        exit_code = WEXITSTATUS(exit_code);
-    }
-#endif
-
-    try {
-        if (!output.empty()) {
-            json j = JsonUtils::parse(output);
-            if (j.is_object()) {
-                // Check for overall status
-                bool validation_ok = false;
-                if (j.contains("ready")) {
-                    validation_ok = j["ready"].get<bool>();
-                }
-
-                if (validation_ok) {
-                    error_message.clear();
-                    return true;
-                }
-
-                std::vector<std::string> errors;
-
-                if (j.contains("amd_device_found") && !j["amd_device_found"].get<bool>()) {
-                    errors.push_back("No AMD NPU device found.");
-                }
-
-                if (j.contains("all_fw_ok") && !j["all_fw_ok"].get<bool>()) {
-                    errors.push_back("NPU firmware is incompatible.");
-                }
-                if (j.contains("kernel_ok") && !j["kernel_ok"].get<bool>()) {
-                    errors.push_back("Kernel version is incompatible.");
-                }
-
-                if (j.contains("memlock_ok") && !j["memlock_ok"].get<bool>()) {
-                    errors.push_back("Memlock limits are too low.");
-                }
-
-                if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get<bool>()) {
-                    errors.push_back("NPU driver version is too old.");
-                }
-
-                if (errors.empty()) {
-                    error_message = "NPU validation failed.";
-                } else {
-                    error_message = "";
-                    for (size_t i = 0; i < errors.size(); ++i) {
-                        error_message += errors[i] + (i == errors.size() - 1 ? "" : " ");
-                    }
-                }
-                return false;
-            }
-        }
-    } catch (...) {
-        // Fallback for non-JSON output or parsing error
-    }
-
-    if (exit_code != 0) {
-        error_message = "flm validate failed with exit code " + std::to_string(exit_code);
-        return false;
-    }
-
-    error_message.clear();
-    return true;
-}
 
 } // namespace utils::lemon

Text generation	`llamacpp`	`vulkan`	`x86_64` CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)	Windows, Linux	`system`	`x86_64`/ARM64 CPU, GPU	Linux
		`rocm`	Supported AMD ROCm iGPU/dGPU families*	Windows, Linux	`metal`	Apple Silicon GPU	macOS
		`cuda`	Windows, Linux
		`cpu`	`x86_64` CPU; ARM64 CPU (Linux)	`vulkan`	`x86_64` CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)	Windows, Linux
		`metal`	Apple Silicon GPU	macOS	`rocm`	Supported AMD ROCm iGPU/dGPU families*	Windows, Linux
		`system`	`x86_64`/ARM64 CPU, GPU	Linux	`cpu`	`x86_64` CPU; ARM64 CPU (Linux)	Windows, Linux
	`flm`	`flm`	`npu`	XDNA2 NPU	Windows, Linux
	`ryzenai-llm`	`ryzenai-llm`	`npu`	XDNA2 NPU	Windows
	`vllm` (experimental)	`vllm` (experimental)	`rocm`	Strix Halo iGPU (gfx1151)	Linux
Speech-to-text	`whispercpp`	Speech-to-text	`whispercpp`	`npu`	XDNA2 NPU	Windows
				`rocm`	Supported AMD ROCm iGPU/dGPU families*	Windows, Linux
				`vulkan`	`x86_64` CPU	Linux	Windows, Linux
	`cpu`			Windows, Linux
`moonshine`	`metal`			Apple Silicon GPU	macOS
`moonshine`	`cpu`		`x86_64`/`arm64` CPU	Windows, Linux, macOS
Text-to-speech	`kokoro`	Text-to-speech	`kokoro`	`cpu`	`x86_64` CPU	Windows, Linux
Image generation	`sd-cpp`	Text-to-speech	`kokoro`	`rocm`	Supported AMD ROCm iGPU/dGPU families*	Windows, Linux	`metal`	Apple Silicon GPU	macOS
		`vulkan`	Vulkan-capable GPUs	Image generation	`sd-cpp`	`rocm`	Supported AMD ROCm iGPU/dGPU families*	Windows, Linux
		NVIDIA GPUs (Turing or newer)**	Linux
		`vulkan`	Vulkan-capable GPUs			Windows, Linux
`cpu`	`x86_64` CPU	Windows, Linux
`metal`	Apple Silicon GPU	macOS