From 118f54a632ac1d0cdbf93754da795d5d38e75d7b Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 19 Jun 2026 12:48:14 -0400
Subject: [PATCH 01/39] refactor(backends): self-describing WrappedServer
 backends (#2287)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make each inference backend describe itself with a plain-data descriptor plus
a server class, and rewrite the scattered `if (recipe == "...")` sites to read
a registry built from those descriptors. Adding a backend becomes one
LEMON_BACKENDS line plus a descriptor + factory file — no router, CLI, docs, or
support-matrix edits.

- Descriptor types (BackendDescriptor/BackendOption/SlotPolicy) + a CLI-safe
  data registry and a server-only factory registry, generated from the
  LEMON_BACKENDS list at CMake configure time.
- All 9 backends carry a descriptor (device, slot policy, options, support
  matrix, labels, binary) and a create().
- Descriptor-driven: router creation, NPU/slot eviction, device type, recipe
  options/CLI flags, config-section identity, support matrix, recipe labels,
  cloud availability.
- /system-info recipes enriched with display_name/selectable_backend/options/
  support; the app reads recipe display names from it instead of hardcoded TS.
- docs/tools/gen_backend_docs.py generates docs/dev/backends-reference.md from
  /system-info; a CI step fails on drift. Authoring guide in
  docs/dev/adding-a-backend.md.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/docs_and_style.yml          |  17 +
 CMakeLists.txt                                |  69 ++++
 docs/dev/adding-a-backend.md                  | 145 ++++++++
 docs/dev/backends-reference.md                | 325 ++++++++++++++++++
 docs/dev/contribute.md                        |   4 +
 docs/tools/gen_backend_docs.py                | 309 +++++++++++++++++
 src/app/src/renderer/utils/recipeNames.ts     |  29 +-
 src/app/src/renderer/utils/systemData.ts      |  20 ++
 src/cpp/cli/CMakeLists.txt                    |   4 +
 .../lemon/backends/backend_descriptor.h       |  58 ++++
 .../backends/backend_descriptor_registry.h    |  25 ++
 .../include/lemon/backends/backend_registry.h |  47 +++
 .../include/lemon/backends/cloud_descriptor.h |  13 +
 .../include/lemon/backends/cloud_factory.h    |  14 +
 .../lemon/backends/fastflowlm_descriptor.h    |  13 +
 .../lemon/backends/fastflowlm_factory.h       |  14 +
 .../lemon/backends/kokoro_descriptor.h        |  13 +
 .../include/lemon/backends/kokoro_factory.h   |  14 +
 .../lemon/backends/llamacpp_descriptor.h      |  13 +
 .../include/lemon/backends/llamacpp_factory.h |  14 +
 .../lemon/backends/moonshine_descriptor.h     |  13 +
 .../lemon/backends/moonshine_factory.h        |  14 +
 .../lemon/backends/ryzenai_descriptor.h       |  13 +
 .../include/lemon/backends/ryzenai_factory.h  |  14 +
 .../include/lemon/backends/sdcpp_descriptor.h |  13 +
 .../include/lemon/backends/sdcpp_factory.h    |  14 +
 .../include/lemon/backends/vllm_descriptor.h  |  13 +
 src/cpp/include/lemon/backends/vllm_factory.h |  14 +
 .../lemon/backends/whispercpp_descriptor.h    |  13 +
 .../lemon/backends/whispercpp_factory.h       |  14 +
 src/cpp/include/lemon/model_manager.h         |  13 +
 src/cpp/include/lemon/recipe_backend_def.h    |  26 ++
 src/cpp/include/lemon/wrapped_server.h        |  57 ++-
 .../backends/backend_descriptor_registry.cpp  |  29 ++
 .../backend_descriptors_generated.h.in        |  19 +
 .../backends/backend_factories_generated.h.in |  21 ++
 src/cpp/server/backends/backend_registry.cpp  |  31 ++
 src/cpp/server/backends/cloud_descriptor.cpp  |  23 ++
 src/cpp/server/backends/cloud_factory.cpp     |  16 +
 .../server/backends/fastflowlm_descriptor.cpp |  29 ++
 .../server/backends/fastflowlm_factory.cpp    |  13 +
 src/cpp/server/backends/kokoro_descriptor.cpp |  30 ++
 src/cpp/server/backends/kokoro_factory.cpp    |  13 +
 .../server/backends/llamacpp_descriptor.cpp   |  43 +++
 src/cpp/server/backends/llamacpp_factory.cpp  |  13 +
 .../server/backends/moonshine_descriptor.cpp  |  30 ++
 src/cpp/server/backends/moonshine_factory.cpp |  13 +
 .../server/backends/ryzenai_descriptor.cpp    |  29 ++
 src/cpp/server/backends/ryzenai_factory.cpp   |  20 ++
 src/cpp/server/backends/sdcpp_descriptor.cpp  |  47 +++
 src/cpp/server/backends/sdcpp_factory.cpp     |  13 +
 src/cpp/server/backends/vllm_descriptor.cpp   |  30 ++
 src/cpp/server/backends/vllm_factory.cpp      |  13 +
 .../server/backends/whispercpp_descriptor.cpp |  39 +++
 .../server/backends/whispercpp_factory.cpp    |  13 +
 src/cpp/server/model_manager.cpp              |  58 +++-
 src/cpp/server/recipe_options.cpp             | 182 +++++-----
 src/cpp/server/router.cpp                     | 143 ++++----
 src/cpp/server/runtime_config.cpp             |  39 ++-
 src/cpp/server/server.cpp                     |   5 +-
 src/cpp/server/system_info.cpp                | 205 ++++-------
 61 files changed, 2216 insertions(+), 334 deletions(-)
 create mode 100644 docs/dev/adding-a-backend.md
 create mode 100644 docs/dev/backends-reference.md
 create mode 100644 docs/tools/gen_backend_docs.py
 create mode 100644 src/cpp/include/lemon/backends/backend_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/backend_descriptor_registry.h
 create mode 100644 src/cpp/include/lemon/backends/backend_registry.h
 create mode 100644 src/cpp/include/lemon/backends/cloud_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/cloud_factory.h
 create mode 100644 src/cpp/include/lemon/backends/fastflowlm_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/fastflowlm_factory.h
 create mode 100644 src/cpp/include/lemon/backends/kokoro_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/kokoro_factory.h
 create mode 100644 src/cpp/include/lemon/backends/llamacpp_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/llamacpp_factory.h
 create mode 100644 src/cpp/include/lemon/backends/moonshine_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/moonshine_factory.h
 create mode 100644 src/cpp/include/lemon/backends/ryzenai_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/ryzenai_factory.h
 create mode 100644 src/cpp/include/lemon/backends/sdcpp_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/sdcpp_factory.h
 create mode 100644 src/cpp/include/lemon/backends/vllm_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/vllm_factory.h
 create mode 100644 src/cpp/include/lemon/backends/whispercpp_descriptor.h
 create mode 100644 src/cpp/include/lemon/backends/whispercpp_factory.h
 create mode 100644 src/cpp/include/lemon/recipe_backend_def.h
 create mode 100644 src/cpp/server/backends/backend_descriptor_registry.cpp
 create mode 100644 src/cpp/server/backends/backend_descriptors_generated.h.in
 create mode 100644 src/cpp/server/backends/backend_factories_generated.h.in
 create mode 100644 src/cpp/server/backends/backend_registry.cpp
 create mode 100644 src/cpp/server/backends/cloud_descriptor.cpp
 create mode 100644 src/cpp/server/backends/cloud_factory.cpp
 create mode 100644 src/cpp/server/backends/fastflowlm_descriptor.cpp
 create mode 100644 src/cpp/server/backends/fastflowlm_factory.cpp
 create mode 100644 src/cpp/server/backends/kokoro_descriptor.cpp
 create mode 100644 src/cpp/server/backends/kokoro_factory.cpp
 create mode 100644 src/cpp/server/backends/llamacpp_descriptor.cpp
 create mode 100644 src/cpp/server/backends/llamacpp_factory.cpp
 create mode 100644 src/cpp/server/backends/moonshine_descriptor.cpp
 create mode 100644 src/cpp/server/backends/moonshine_factory.cpp
 create mode 100644 src/cpp/server/backends/ryzenai_descriptor.cpp
 create mode 100644 src/cpp/server/backends/ryzenai_factory.cpp
 create mode 100644 src/cpp/server/backends/sdcpp_descriptor.cpp
 create mode 100644 src/cpp/server/backends/sdcpp_factory.cpp
 create mode 100644 src/cpp/server/backends/vllm_descriptor.cpp
 create mode 100644 src/cpp/server/backends/vllm_factory.cpp
 create mode 100644 src/cpp/server/backends/whispercpp_descriptor.cpp
 create mode 100644 src/cpp/server/backends/whispercpp_factory.cpp

diff --git a/.github/workflows/docs_and_style.yml b/.github/workflows/docs_and_style.yml
index a64e4d7f2..35aa4cf50 100644
--- a/.github/workflows/docs_and_style.yml
+++ b/.github/workflows/docs_and_style.yml
@@ -24,6 +24,23 @@ jobs:
       - name: Run app regression tests
         run: node test/app/run-app-regression-tests.cjs
 
+  backend-docs-drift:
+    # The backend reference doc (docs/dev/backends-reference.md) is generated from
+    # the self-describing backend descriptors. Build lemond, regenerate, and fail
+    # if the committed doc is stale — the same guarantee a lint provides.
+    runs-on: ubuntu-latest
+    concurrency:
+      group: ${{ github.workflow }}-backend-docs-${{ github.ref }}
+      cancel-in-progress: true
+    steps:
+      - uses: actions/checkout@v5
+      - name: Configure and install build dependencies
+        run: ./setup.sh
+      - name: Build lemond
+        run: cmake --build --preset default --target lemond
+      - name: Check backend reference docs are up to date
+        run: python3 docs/tools/gen_backend_docs.py --check
+
   markdown-link-check:
     runs-on: ubuntu-latest
     concurrency:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7e125642f..3220e6c42 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -647,6 +647,75 @@ elseif(UNIX)
     list(APPEND SOURCES_CORE src/cpp/server/utils/platform/process_unix.cpp)
 endif()
 
+# ============================================================
+# Self-describing backends registry
+# ============================================================
+# The authoritative backend list. Each entry is "<recipe>|<stem>":
+#   recipe - the recipe string used in server_models.json (may contain dashes)
+#   stem   - identifier-safe name. The backend ships two files:
+#              src/cpp/server/backends/<stem>_descriptor.cpp  (plain data; CLI-safe)
+#              src/cpp/server/backends/<stem>_factory.cpp      (create(); server-only)
+#            declaring lemon::backends::<stem>_descriptor and <stem>_create.
+#
+# Adding a backend is one line here plus those two files. The foreach below reads
+# only <stem>: it queues both sources and builds the text configure_file() writes
+# into the registry headers, binding each descriptor to its create(). This list is
+# a tracked input, so editing it forces regeneration on the next build (a file(GLOB)
+# would silently miss a new backend). Descriptor DATA links into both the lemonade
+# CLI and lemond; only lemond links the factories (which pull in server classes).
+set(LEMON_BACKENDS
+    # "<recipe>|<stem>"
+    "llamacpp|llamacpp"
+    "whispercpp|whispercpp"
+    "moonshine|moonshine"
+    "kokoro|kokoro"
+    "sd-cpp|sdcpp"
+    "flm|fastflowlm"
+    "ryzenai-llm|ryzenai"
+    "vllm|vllm"
+    "cloud|cloud"
+)
+
+set(LEMON_DESCRIPTOR_INCLUDES "")
+set(LEMON_DESCRIPTOR_ENTRIES "")
+set(LEMON_FACTORY_INCLUDES "")
+set(LEMON_FACTORY_ENTRIES "")
+# Descriptor sources are CLI-safe (data only); factory sources are server-only.
+# Absolute paths so the CLI subdirectory can reuse LEMON_BACKEND_DESCRIPTOR_SOURCES.
+set(LEMON_BACKEND_DESCRIPTOR_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp)
+set(LEMON_BACKEND_FACTORY_SOURCES
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp)
+foreach(_backend_entry ${LEMON_BACKENDS})
+    string(REPLACE "|" ";" _backend_parts "${_backend_entry}")
+    list(GET _backend_parts 1 _backend_stem)
+    list(APPEND LEMON_BACKEND_DESCRIPTOR_SOURCES
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_descriptor.cpp)
+    list(APPEND LEMON_BACKEND_FACTORY_SOURCES
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_factory.cpp)
+    string(APPEND LEMON_DESCRIPTOR_INCLUDES
+        "#include \"lemon/backends/${_backend_stem}_descriptor.h\"\n")
+    string(APPEND LEMON_DESCRIPTOR_ENTRIES
+        "        &lemon::backends::${_backend_stem}_descriptor,\n")
+    string(APPEND LEMON_FACTORY_INCLUDES
+        "#include \"lemon/backends/${_backend_stem}_factory.h\"\n")
+    string(APPEND LEMON_FACTORY_ENTRIES
+        "        { &lemon::backends::${_backend_stem}_descriptor, &lemon::backends::${_backend_stem}_create },\n")
+endforeach()
+
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptors_generated.h.in
+    ${CMAKE_CURRENT_BINARY_DIR}/include/backend_descriptors_generated.h
+    @ONLY)
+configure_file(
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_factories_generated.h.in
+    ${CMAKE_CURRENT_BINARY_DIR}/include/backend_factories_generated.h
+    @ONLY)
+
+# lemond gets both descriptor data and factories; the CLI gets only the data
+# (see src/cpp/cli/CMakeLists.txt, which reuses LEMON_BACKEND_DESCRIPTOR_SOURCES).
+list(APPEND SOURCES_CORE ${LEMON_BACKEND_DESCRIPTOR_SOURCES} ${LEMON_BACKEND_FACTORY_SOURCES})
+
 # ============================================================
 # Server core OBJECT library (shared by lemond and Lemonade.exe)
 # ============================================================
diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md
new file mode 100644
index 000000000..512770e73
--- /dev/null
+++ b/docs/dev/adding-a-backend.md
@@ -0,0 +1,145 @@
+# Adding a backend
+
+Lemonade backends are **self-describing**. A backend declares *what it is* in a
+plain-data **descriptor** and implements *how it runs* in a **server class**. A
+registry collects every descriptor, and the router, the CLI, `/system-info`, and
+the generated docs all read it — so there are no scattered `if (recipe == "...")`
+sites to update.
+
+Adding a backend is **a handful of per-backend files plus three small appends**:
+
+| You edit | What goes there |
+|----------|-----------------|
+| `CMakeLists.txt` → `LEMON_BACKENDS` | **one line**: `"<recipe>\|<stem>"` |
+| `src/cpp/server/backends/<stem>_descriptor.cpp` + `src/cpp/include/lemon/backends/<stem>_descriptor.h` | the descriptor (plain data) |
+| `src/cpp/server/backends/<stem>_factory.cpp` + `src/cpp/include/lemon/backends/<stem>_factory.h` | `create()` + the `WrappedServer` subclass |
+| `src/cpp/resources/backend_versions.json` | version pin(s) — skip if there's no downloaded binary (e.g. cloud) |
+| `src/cpp/resources/server_models.json` | the models |
+
+No router edits, no CLI edits, no doc edits, no support-matrix edits.
+
+## The descriptor (plain data — CLI-safe)
+
+The descriptor is the single object every consumer reads. It links into **both**
+the `lemonade` CLI and `lemond`, so it must not reference server classes.
+
+`src/cpp/include/lemon/backends/<stem>_descriptor.h`:
+
+```cpp
+#pragma once
+#include "lemon/backends/backend_descriptor.h"
+namespace lemon { namespace backends {
+extern const BackendDescriptor <stem>_descriptor;
+} }
+```
+
+`src/cpp/server/backends/<stem>_descriptor.cpp`:
+
+```cpp
+#include "lemon/backends/<stem>_descriptor.h"
+namespace lemon { namespace backends {
+const BackendDescriptor <stem>_descriptor = {
+    /*recipe*/          "myrecipe",
+    /*display_name*/    "My Backend",
+    /*binary*/          "my-server",        // "" = no subprocess (e.g. cloud)
+    /*config_section*/  "myrecipe",         // defaults to recipe
+    /*default_device*/  DEVICE_GPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ false,           // true auto-exposes "<recipe>_backend" + "--<recipe>"
+    /*uses_ctx_size*/   true,               // opt in to the shared ctx_size option
+    /*dynamic_models*/  false,              // true = models discovered at runtime (cloud)
+    /*options*/ {                           // backend-specific knobs (common ones are automatic)
+        {"myrecipe_args", "--myrecipe-args", "", "ARGS", "Custom args to pass", "My Options"},
+    },
+    /*support*/ {                           // OS / device families ({} = no local gating)
+        {"myrecipe", "cpu", {"linux", "windows"}, {{"cpu", {"x86_64"}}}},
+    },
+    /*default_labels*/  {},                 // labels injected when a model omits them
+    /*required_checkpoints*/ {"main"},      // unconditional files; conditional ones checked in load()
+};
+} }
+```
+
+`SlotPolicy` controls accelerator sharing: `Standard` (counts toward LRU slots),
+`ExclusiveNpu` (evicts all NPU servers first), `CoexistByType` (one per model
+type), `Unmetered` (never counted, never auto-evicted — cloud).
+
+## The factory + server class (server-only)
+
+The factory builds the `WrappedServer` subclass. It is compiled into `lemond`
+only (it references server classes), which keeps the `lemonade` CLI link clean.
+
+`src/cpp/include/lemon/backends/<stem>_factory.h`:
+
+```cpp
+#pragma once
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+namespace lemon { namespace backends {
+std::unique_ptr<WrappedServer> <stem>_create(const BackendContext& ctx);
+} }
+```
+
+`src/cpp/server/backends/<stem>_factory.cpp`:
+
+```cpp
+#include "lemon/backends/<stem>_factory.h"
+#include "lemon/backends/<stem>_server.h"
+#include "lemon/wrapped_server.h"
+namespace lemon { namespace backends {
+std::unique_ptr<WrappedServer> <stem>_create(const BackendContext& ctx) {
+    return std::make_unique<MyServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+} }
+```
+
+The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`,
+and only the capability interfaces you actually serve (`ITranscriptionServer`,
+`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default
+"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend
+does not need to stub them.
+
+## Register it: one line
+
+```cmake
+set(LEMON_BACKENDS
+    ...
+    "myrecipe|myrecipe"   # "<recipe>|<stem>"
+)
+```
+
+The `foreach` in `CMakeLists.txt` adds your two sources to the build, and CMake
+regenerates the registry headers at configure time, binding the descriptor to its `create()`.
+
+## What you get for free
+
+- **Standard options:** `merge_args`, `auto_evict`, `evict_idle_timeout`,
+  `downsize_idle_timeout`, `evict_weight_factor`, `pinned`. `ctx_size` is opt-in
+  via `uses_ctx_size`.
+- **Generated CLI flags** for every descriptor option with a `cli_flag`, plus
+  `--<recipe>` when `selectable_backend = true`.
+- **Install/download** via the backend's `BackendSpec` (binary + install params).
+- **`/system-info`** `recipes` entry (display name, options schema, support matrix).
+- **Generated docs** — your backend appears in
+  [`backends-reference.md`](backends-reference.md) automatically.
+
+## Escape hatches
+
+| Need | Hook |
+|------|------|
+| Device depends on the chosen backend variant (whisper npu vs cpu) | override `WrappedServer::effective_device(opts)` |
+| Eviction rule depends on the variant | override `WrappedServer::effective_slot_policy(opts)` |
+| Availability decided at runtime (cloud creds) | override `WrappedServer::availability()` |
+| Conditional / grouped checkpoints (sd-cpp flux, whisper npu_cache) | validate in `load()`; list only unconditional files in `required_checkpoints` |
+| Custom per-model fields without editing `ModelInfo` | read `model_info.extra<T>("my_field", fallback)` (populated from unknown `server_models.json` keys) |
+| Models supplied at runtime, not from `server_models.json` | set `dynamic_models = true` and provide them in the class (see cloud's `discover_models()`) |
+| Per-create setup before load (ryzenai `set_model_path`) | do it in `create()` |
+
+## The simplest end-to-end example
+
+**Moonshine** is the minimal case: a single descriptor option, no backend
+selection, CPU-only, one capability interface. See
+`src/cpp/server/backends/moonshine_descriptor.cpp` and `moonshine_factory.cpp`.
+
+> Note: collections (`collection.omni`) are orchestrator-driven, not
+> `WrappedServer` subprocesses, and are the one explicit exception to this model.
diff --git a/docs/dev/backends-reference.md b/docs/dev/backends-reference.md
new file mode 100644
index 000000000..f5c8edebb
--- /dev/null
+++ b/docs/dev/backends-reference.md
@@ -0,0 +1,325 @@
+# Backend reference
+
+<!-- This file is generated by docs/tools/gen_backend_docs.py from the C++ backend
+descriptors. Do not edit the regions between the GENERATED markers by hand; run
+the generator instead. Prose outside the markers is preserved. -->
+
+## Backends
+
+<!-- BEGIN GENERATED: backends-overview -->
+| Recipe | Name | Selectable backend | Uses ctx_size | Backends |
+|--------|------|--------------------|---------------|----------|
+| `flm` | FastFlowLM NPU | no | yes | npu |
+| `kokoro` | Kokoro | no | no | cpu, metal |
+| `llamacpp` | Llama.cpp GPU | yes | yes | cpu, cuda, metal, rocm, system, vulkan |
+| `moonshine` | Moonshine | no | no | cpu |
+| `ryzenai-llm` | Ryzen AI LLM | no | yes | npu |
+| `sd-cpp` | StableDiffusion.cpp | yes | no | cpu, cuda, metal, rocm, vulkan |
+| `vllm` | vLLM ROCm (experimental) | yes | yes | rocm |
+| `whispercpp` | Whisper.cpp | yes | no | cpu, metal, npu, rocm, vulkan |
+<!-- END GENERATED: backends-overview -->
+
+## Support matrix
+
+<!-- BEGIN GENERATED: backends-matrix -->
+| Recipe | Backend | OS | Device families |
+|--------|---------|----|-----------------|
+| `flm` | npu | linux, windows | amd_npu (XDNA2) |
+| `kokoro` | cpu | linux, windows | cpu (x86_64) |
+| `kokoro` | metal | macos | metal |
+| `llamacpp` | system | linux | cpu (arm64, x86_64) |
+| `llamacpp` | metal | macos | metal |
+| `llamacpp` | cuda | linux, windows | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) |
+| `llamacpp` | vulkan | linux, windows | amd_gpu; cpu (arm64, x86_64) |
+| `llamacpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) |
+| `llamacpp` | cpu | linux, windows | cpu (arm64, x86_64) |
+| `moonshine` | cpu | windows | cpu (x86_64) |
+| `moonshine` | cpu | linux | cpu (arm64, x86_64) |
+| `moonshine` | cpu | macos | cpu (arm64) |
+| `ryzenai-llm` | npu | windows | amd_npu (XDNA2) |
+| `sd-cpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) |
+| `sd-cpp` | cuda | linux | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) |
+| `sd-cpp` | vulkan | linux, windows | amd_gpu; cpu (x86_64); nvidia_gpu |
+| `sd-cpp` | cpu | linux, windows | cpu (x86_64) |
+| `sd-cpp` | metal | macos | metal |
+| `vllm` | rocm | linux | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) |
+| `whispercpp` | npu | windows | amd_npu (XDNA2) |
+| `whispercpp` | rocm | linux, windows | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) |
+| `whispercpp` | vulkan | linux, windows | amd_gpu; cpu (x86_64) |
+| `whispercpp` | cpu | linux, windows | cpu (x86_64) |
+| `whispercpp` | metal | macos | metal |
+<!-- END GENERATED: backends-matrix -->
+
+## Recipe options
+
+<!-- BEGIN GENERATED: backend-options -->
+#### `llamacpp` — Llama.cpp GPU
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |
+| `llamacpp_backend` | `--llamacpp` | BACKEND | "" | LlamaCpp backend to use |
+| `llamacpp_device` | `--llamacpp-device` | DEVICES | "" | Comma-separated list of accelerator devices to use (e.g. Vulkan0) |
+| `llamacpp_args` | `--llamacpp-args` | ARGS | "" | Custom arguments to pass to llama-server |
+
+#### `moonshine` — Moonshine
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `moonshine_args` | `--moonshine-args` | ARGS | "" | Custom arguments to pass to moonshine-server |
+
+#### `sd-cpp` — StableDiffusion.cpp
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `sd-cpp_backend` | `--sdcpp` | BACKEND | "" | SD.cpp backend to use |
+| `sdcpp_args` | `--sdcpp-args` | ARGS | "" | Custom arguments to pass to sd-server (must not conflict with managed args) |
+| `steps` | — | SIZE | 20 | Number of diffusion steps |
+| `cfg_scale` | — | SIZE | 7.0 | Classifier-free guidance scale |
+| `width` | — | SIZE | 512 | Output image width |
+| `height` | — | SIZE | 512 | Output image height |
+| `sampling_method` | — | ARGS | "" | Sampling method |
+| `flow_shift` | — | SIZE | 0.0 | Flow shift |
+
+#### `vllm` — vLLM ROCm (experimental)
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |
+| `vllm_backend` | `--vllm` | BACKEND | "" | vLLM backend to use |
+| `vllm_args` | `--vllm-args` | ARGS | "" | Custom arguments to pass to vllm-server |
+
+#### `whispercpp` — Whisper.cpp
+
+| Option | CLI flag | Type | Default | Description |
+|--------|----------|------|---------|-------------|
+| `whispercpp_backend` | `--whispercpp` | BACKEND | "" | WhisperCpp backend to use |
+| `whispercpp_args` | `--whispercpp-args` | ARGS | "" | Custom arguments to pass to whisper-server |
+<!-- END GENERATED: backend-options -->
+
+## Models
+
+<!-- BEGIN GENERATED: backend-models -->
+#### `collection.omni` — collection.omni (4 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `LMX-Omni-5.5B-Lite` | 9.3 | — |
+| `LMX-Omni-52B-Halo` | 44.77 | — |
+| `Lite Collection` |  | — |
+| `Ultra Collection` |  | — |
+
+#### `kokoro` — Kokoro (1 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `kokoro-v1` | 0.354 | tts |
+
+#### `llamacpp` — Llama.cpp GPU (74 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Bonsai-1.7B-gguf` | 0.25 | llamacpp |
+| `Bonsai-4B-gguf` | 0.572 | llamacpp |
+| `Bonsai-8B-gguf` | 1.16 | llamacpp |
+| `Cogito-v2-llama-109B-MoE-GGUF` | 65.4 | vision |
+| `DeepSeek-Qwen3-8B-GGUF` | 5.25 | reasoning |
+| `Devstral-Small-2507-GGUF` | 14.3 | coding, tool-calling |
+| `GLM-4.5-Air-UD-Q4K-XL-GGUF` | 67.7 | reasoning |
+| `GLM-4.7-Flash-GGUF` | 17.5 | tool-calling |
+| `Gemma-3-4b-it-GGUF` | 3.34 | vision |
+| `Gemma-4-12B-it-GGUF` | 7.12 | tool-calling, llamacpp |
+| `Gemma-4-26B-A4B-it-GGUF` | 18.1 | hot, tool-calling, vision, llamacpp |
+| `Gemma-4-31B-it-GGUF` | 19.5 | hot, tool-calling, vision, llamacpp |
+| `Gemma-4-E2B-it-GGUF` | 4.09 | tool-calling, vision, llamacpp |
+| `Gemma-4-E4B-it-GGUF` | 5.97 | tool-calling, vision, llamacpp |
+| `Jan-nano-128k-GGUF` | 2.5 | — |
+| `Jan-v1-4B-GGUF` | 2.5 | — |
+| `LFM2-1.2B-GGUF` | 0.731 | — |
+| `LFM2-24B-A2B-GGUF` | 14.4 | — |
+| `LFM2-8B-A1B-GGUF` | 5.04 | — |
+| `LFM2.5-1.2B-Instruct-GGUF` | 0.731 | — |
+| `LFM2.5-8B-A1B` | 5.16 | — |
+| `Llama-3.2-1B-Instruct-GGUF` | 0.834 | — |
+| `Llama-3.2-3B-Instruct-GGUF` | 2.06 | — |
+| `Llama-4-Scout-17B-16E-Instruct-GGUF` | 63.2 | vision |
+| `Ministral-3-3B-Instruct-2512-GGUF` | 2.99 | vision |
+| `Nemotron-3-Nano-30B-A3B-GGUF` | 22.8 | — |
+| `Phi-4-mini-instruct-GGUF` | 2.49 | — |
+| `Playable1-GGUF` | 4.68 | coding |
+| `PromptBridge-0.6b-Alpha-GGUF` | 0.397 | — |
+| `Qwen2.5-Coder-32B-Instruct-GGUF` | 19.9 | coding |
+| `Qwen2.5-Omni-3B-GGUF` | 4.73 | vision, chat-transcription |
+| `Qwen2.5-Omni-7B-GGUF` | 7.33 | vision, chat-transcription |
+| `Qwen2.5-VL-3B-Instruct-GGUF` | 3.27 | vision |
+| `Qwen2.5-VL-7B-Instruct-GGUF` | 6.04 | vision |
+| `Qwen3-0.6B-GGUF` | 0.38 | reasoning |
+| `Qwen3-1.7B-GGUF` | 1.06 | reasoning |
+| `Qwen3-14B-GGUF` | 8.54 | reasoning |
+| `Qwen3-30B-A3B-GGUF` | 17.4 | reasoning |
+| `Qwen3-30B-A3B-Instruct-2507-GGUF` | 17.4 | tool-calling |
+| `Qwen3-4B-GGUF` | 2.38 | reasoning |
+| `Qwen3-4B-Instruct-2507-GGUF` | 2.5 | tool-calling |
+| `Qwen3-8B-GGUF` | 5.25 | reasoning |
+| `Qwen3-Coder-30B-A3B-Instruct-GGUF` | 18.6 | coding, tool-calling, hot |
+| `Qwen3-Coder-Next-GGUF` | 48.0 | coding, tool-calling, hot |
+| `Qwen3-Embedding-0.6B-GGUF` | 0.64 | embeddings |
+| `Qwen3-Embedding-4B-GGUF` | 4.28 | embeddings |
+| `Qwen3-Embedding-8B-GGUF` | 8.05 | embeddings |
+| `Qwen3-Next-80B-A3B-Instruct-GGUF` | 46.1 | tool-calling |
+| `Qwen3-VL-4B-Instruct-GGUF` | 3.33 | vision |
+| `Qwen3-VL-8B-Instruct-GGUF` | 6.19 | vision |
+| `Qwen3.5-0.8B-GGUF` | 0.764 | vision, tool-calling |
+| `Qwen3.5-122B-A10B-GGUF` | 77.9 | vision, tool-calling |
+| `Qwen3.5-122B-A10B-MTP-GGUF` | 79.6 | vision, tool-calling, mtp |
+| `Qwen3.5-27B-GGUF` | 18.5 | vision, tool-calling |
+| `Qwen3.5-2B-GGUF` | 2.01 | vision, tool-calling |
+| `Qwen3.5-35B-A3B-GGUF` | 23.1 | vision, tool-calling |
+| `Qwen3.5-4B-GGUF` | 3.58 | vision, tool-calling, hot |
+| `Qwen3.5-4B-MTP-GGUF` | 3.66 | vision, tool-calling, mtp |
+| `Qwen3.5-9B-GGUF` | 6.88 | vision, tool-calling |
+| `Qwen3.6-27B-GGUF` | 18.5 | vision, tool-calling |
+| `Qwen3.6-27B-MTP-GGUF` | 18.8 | vision, tool-calling, mtp, hot |
+| `Qwen3.6-35B-A3B-GGUF` | 23.3 | vision, tool-calling, hot |
+| `Qwen3.6-35B-A3B-MTP-GGUF` | 23.8 | vision, tool-calling, mtp |
+| `SmolLM3-3B-GGUF` | 1.94 | — |
+| `Tiny-Test-Model-GGUF` | 0.18 | — |
+| `bge-reranker-v2-m3-GGUF` | 0.636 | reranking |
+| `gpt-oss-120b-GGUF` | 62.8 | reasoning, tool-calling |
+| `gpt-oss-120b-mxfp-GGUF` | 63.4 | hot, reasoning, tool-calling |
+| `gpt-oss-20b-GGUF` | 11.6 | reasoning, tool-calling |
+| `gpt-oss-20b-mxfp4-GGUF` | 12.1 | hot, reasoning, tool-calling |
+| `granite-4.0-h-tiny-GGUF` | 4.25 | tool-calling |
+| `jina-reranker-v1-tiny-en-GGUF` | 0.0367 | reranking |
+| `nomic-embed-text-v1-GGUF` | 0.0781 | embeddings |
+| `nomic-embed-text-v2-moe-GGUF` | 0.51 | embeddings |
+
+#### `moonshine` — Moonshine (3 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Moonshine-Medium-Streaming` | 1.08 | transcription, realtime-transcription, hot |
+| `Moonshine-Small-Streaming` | 0.431 | transcription, realtime-transcription |
+| `Moonshine-Tiny-Streaming` | 0.202 | transcription, realtime-transcription |
+
+#### `ryzenai-llm` — Ryzen AI LLM (79 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `AMD-OLMo-1B-SFT-DPO-Hybrid` | 1.48 | — |
+| `CodeLlama-7b-Instruct-hf-Hybrid` | 7.24 | coding |
+| `CodeLlama-7b-Instruct-hf-NPU` | 7.54 | coding |
+| `DeepSeek-R1-Distill-Llama-8B-CPU` | 6.2 | reasoning |
+| `DeepSeek-R1-Distill-Llama-8B-Hybrid` | 9.09 | reasoning |
+| `DeepSeek-R1-Distill-Llama-8B-NPU` | 9.3 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-1.5B-Hybrid` | 2.19 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-1.5B-NPU` | 2.3 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-7B-CPU` | 6.2 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-7B-Hybrid` | 8.67 | reasoning |
+| `DeepSeek-R1-Distill-Qwen-7B-NPU` | 8.87 | reasoning |
+| `Gemma-3-4b-it-mm-NPU` | 6.68 | vision |
+| `Llama-2-7b-chat-hf-Hybrid` | 7.31 | — |
+| `Llama-2-7b-chat-hf-NPU` | 7.47 | — |
+| `Llama-2-7b-hf-Hybrid` | 7.31 | — |
+| `Llama-2-7b-hf-NPU` | 7.47 | — |
+| `Llama-3.1-8B-Hybrid` | 9.09 | — |
+| `Llama-3.1-8B-NPU` | 9.3 | — |
+| `Llama-3.2-1B-Hybrid` | 1.89 | — |
+| `Llama-3.2-1B-Instruct-CPU` | 1.76 | — |
+| `Llama-3.2-1B-Instruct-Hybrid` | 1.89 | — |
+| `Llama-3.2-1B-Instruct-NPU` | 1.96 | — |
+| `Llama-3.2-1B-NPU` | 1.96 | — |
+| `Llama-3.2-3B-Hybrid` | 4.28 | — |
+| `Llama-3.2-3B-Instruct-CPU` | 3.38 | — |
+| `Llama-3.2-3B-Instruct-Hybrid` | 4.28 | — |
+| `Meta-Llama-3-8B-Hybrid` | 9.06 | — |
+| `Meta-Llama-3-8B-NPU` | 9.23 | — |
+| `Meta-Llama-3.1-8B-Instruct-Hybrid` | 9.09 | — |
+| `Meta-Llama-3.1-8B-Instruct-NPU` | 9.3 | — |
+| `Mistral-7B-Instruct-v0.1-Hybrid` | 7.84 | — |
+| `Mistral-7B-Instruct-v0.1-NPU` | 8.01 | — |
+| `Mistral-7B-Instruct-v0.2-Hybrid` | 7.84 | — |
+| `Mistral-7B-Instruct-v0.2-NPU` | 8.01 | — |
+| `Mistral-7B-Instruct-v0.3-Hybrid` | 7.85 | — |
+| `Mistral-7B-Instruct-v0.3-NPU` | 8.09 | — |
+| `Mistral-7B-v0.3-Hybrid` | 7.85 | — |
+| `Mistral-7B-v0.3-NPU` | 8.09 | — |
+| `Phi-3-Mini-Instruct-CPU` | 2.39 | — |
+| `Phi-3-mini-128k-instruct-Hybrid` | 4.21 | — |
+| `Phi-3-mini-128k-instruct-NPU` | 4.35 | — |
+| `Phi-3-mini-4k-instruct-Hybrid` | 4.19 | — |
+| `Phi-3-mini-4k-instruct-NPU` | 4.3 | — |
+| `Phi-3.5-mini-instruct-Hybrid` | 4.21 | — |
+| `Phi-3.5-mini-instruct-NPU` | 4.35 | — |
+| `Phi-4-mini-instruct-Hybrid` | 5.47 | — |
+| `Phi-4-mini-instruct-NPU` | 5.59 | — |
+| `Phi-4-mini-reasoning-Hybrid` | 5.47 | reasoning |
+| `Qwen-1.5-7B-Chat-CPU` | 6.32 | — |
+| `Qwen-2.5-1.5B-Instruct-Hybrid` | 2.17 | — |
+| `Qwen-2.5-1.5B-Instruct-NPU` | 2.25 | — |
+| `Qwen1.5-7B-Chat-Hybrid` | 8.83 | — |
+| `Qwen1.5-7B-Chat-NPU` | 9.02 | — |
+| `Qwen2-1.5B-Hybrid` | 2.19 | — |
+| `Qwen2-1.5B-NPU` | 2.3 | — |
+| `Qwen2-7B-Hybrid` | 8.68 | — |
+| `Qwen2-7B-NPU` | 8.88 | — |
+| `Qwen2.5-0.5B-Instruct-CPU` | 0.834 | — |
+| `Qwen2.5-0.5B-Instruct-Hybrid` | 0.828 | — |
+| `Qwen2.5-14B-instruct-Hybrid` | 16.5 | — |
+| `Qwen2.5-3B-Instruct-Hybrid` | 3.97 | — |
+| `Qwen2.5-3B-Instruct-NPU` | 4.1 | — |
+| `Qwen2.5-7B-Instruct-Hybrid` | 8.65 | — |
+| `Qwen2.5-7B-Instruct-NPU` | 8.83 | — |
+| `Qwen2.5-Coder-0.5B-Instruct-Hybrid` | 0.828 | coding |
+| `Qwen2.5-Coder-1.5B-Instruct-Hybrid` | 2.17 | coding |
+| `Qwen2.5-Coder-1.5B-Instruct-NPU` | 2.25 | coding |
+| `Qwen2.5-Coder-7B-Instruct-Hybrid` | 8.65 | coding |
+| `Qwen2.5-Coder-7B-Instruct-NPU` | 8.83 | coding |
+| `Qwen3-1.7B-Hybrid` | 2.55 | reasoning |
+| `Qwen3-14B-Hybrid` | 16.5 | reasoning |
+| `Qwen3-4B-Hybrid` | 5.17 | reasoning |
+| `Qwen3-8B-Hybrid` | 9.42 | reasoning |
+| `SmolLM-135M-Instruct-Hybrid` | 0.232 | — |
+| `SmolLM2-135M-Instruct-Hybrid` | 0.233 | — |
+| `chatglm3-6b-Hybrid` | 6.9 | — |
+| `chatglm3-6b-NPU` | 7.04 | — |
+| `gemma-2-2b-Hybrid` | 4.04 | — |
+| `gpt-oss-20b-NPU` | 13.4 | — |
+
+#### `sd-cpp` — StableDiffusion.cpp (12 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Flux-2-Klein-4B` | 16.1 | image, edit |
+| `Flux-2-Klein-9B-GGUF` | 19.0 | image, edit |
+| `Qwen-Image-2512-GGUF` | 19.4 | image |
+| `Qwen-Image-GGUF` | 18.2 | image |
+| `RealESRGAN-x4plus` | 0.064 | upscaling, image |
+| `RealESRGAN-x4plus-anime` | 0.017 | upscaling, image |
+| `SD-1.5` | 7.7 | image |
+| `SD-Turbo` | 5.21 | image |
+| `SD-Turbo-GGUF` | 2.02 | image |
+| `SDXL-Base-1.0` | 6.94 | image |
+| `SDXL-Turbo` | 6.94 | image |
+| `Z-Image-Turbo` | 20.7 | image |
+
+#### `vllm` — vLLM ROCm (experimental) (4 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Qwen3.5-0.8B-FP16-vLLM` | 1.77 | reasoning |
+| `Qwen3.5-2B-FP16-vLLM` | 4.57 | reasoning, tool-calling |
+| `Qwen3.5-4B-FP16-vLLM` | 9.34 | reasoning, hot, tool-calling |
+| `Qwen3.5-9B-FP16-vLLM` | 19.3 | reasoning, tool-calling |
+
+#### `whispercpp` — Whisper.cpp (6 models)
+
+| Model | Size (GB) | Labels |
+|-------|-----------|--------|
+| `Whisper-Base` | 0.148 | transcription, realtime-transcription |
+| `Whisper-Large-v3` | 3.1 | transcription, realtime-transcription |
+| `Whisper-Large-v3-Turbo` | 1.62 | transcription, realtime-transcription, hot |
+| `Whisper-Medium` | 1.53 | transcription, realtime-transcription |
+| `Whisper-Small` | 0.488 | transcription, realtime-transcription |
+| `Whisper-Tiny` | 0.075 | transcription, realtime-transcription |
+<!-- END GENERATED: backend-models -->
diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md
index 36357658a..97b7ee4d6 100644
--- a/docs/dev/contribute.md
+++ b/docs/dev/contribute.md
@@ -23,6 +23,10 @@ Lemonade's roadmap is defined by a set of [working groups](./working-groups/READ
 
 Not sure what to work on? Come to the feature-requests and troubleshooting channels on the Discord and see what people need!
 
+### Adding a Backend
+
+Inference backends are self-describing: a backend is a descriptor (plain data) plus a server class, and everything else (router, CLI, `/system-info`, docs) is derived from it. See [Adding a backend](./adding-a-backend.md) for the full contract and a minimal example.
+
 ### Issues
 
 Issues are a great way to document a bug or feature request. However, Lemonade is a community-driven project and you still need to find someone to implement your issue. It is highly recommended that you bring your issue to the [Lemonade discord community](https://discord.gg/5xXzkMu8Zk) and connect with a contributor who wants to implement it.
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py
new file mode 100644
index 000000000..737715605
--- /dev/null
+++ b/docs/tools/gen_backend_docs.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+"""Generate backend reference docs from the self-describing backend descriptors.
+
+The C++ backend descriptors (src/cpp/server/backends/*_descriptor.cpp) are the
+single source of truth for what each backend is. This script boots a `lemond`
+server, reads the descriptor-generated ``/system-info`` ``recipes`` object and
+``server_models.json``, and rewrites the marker-delimited regions of the target
+doc(s). A CI step runs it with ``--check`` and fails if the committed docs drift.
+
+Usage:
+    python docs/tools/gen_backend_docs.py [--lemond PATH] [--check]
+
+``--check`` regenerates in memory and exits non-zero if the on-disk docs differ,
+without modifying them.
+
+Only the regions between::
+
+    <!-- BEGIN GENERATED: <id> -->
+    <!-- END GENERATED: <id> -->
+
+are rewritten; surrounding prose is left untouched.
+"""
+
+import argparse
+import json
+import re
+import socket
+import subprocess
+import sys
+import tempfile
+import time
+import urllib.request
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+SERVER_MODELS = REPO_ROOT / "src" / "cpp" / "resources" / "server_models.json"
+TARGET_DOC = REPO_ROOT / "docs" / "dev" / "backends-reference.md"
+
+
+def free_port() -> int:  # returns the port number the OS assigned to a throwaway socket
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("127.0.0.1", 0))  # port 0: the OS chooses an unused loopback port
+        return s.getsockname()[1]  # socket is closed on return; nothing reserves the port
+
+
+def find_lemond(explicit: str | None) -> Path:
+    """Return the lemond binary: ``explicit`` if given, else one found in build/."""
+    if explicit:
+        chosen = Path(explicit)
+        if chosen.exists():
+            return chosen
+        sys.exit(f"lemond not found at {chosen}")
+    # No explicit path: try the plain name first, then the ``.exe`` variant.
+    build_dir = REPO_ROOT / "build"
+    for name in ("lemond", "lemond.exe"):
+        if (build_dir / name).exists():
+            return build_dir / name
+    sys.exit("Could not find a built lemond (looked in build/). Pass --lemond PATH.")
+
+
+class Lemond:
+    """Boots a throwaway lemond on a free port with an isolated cache dir."""
+
+    def __init__(self, binary: Path):
+        self.binary = binary
+        self.port = free_port()
+        self._cache = tempfile.TemporaryDirectory(prefix="lemond-docs-")
+        self._proc: subprocess.Popen | None = None
+
+    def __enter__(self):  # start lemond, then poll /api/v1/health for up to 60s
+        cmd = [str(self.binary), self._cache.name, "--port", str(self.port)]
+        self._proc = subprocess.Popen(
+            cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
+        )
+        deadline = time.monotonic() + 60  # monotonic: immune to wall-clock changes
+        while time.monotonic() < deadline:
+            try:
+                self._get("/api/v1/health")
+                return self
+            except Exception:
+                if self._proc.poll() is not None:
+                    sys.exit("lemond exited before becoming ready")
+                time.sleep(0.5)
+        self.__exit__(None, None, None)  # not run automatically when __enter__ fails
+        sys.exit("lemond did not become ready within 60s")
+
+    def __exit__(self, *exc):  # stop lemond (if still running) and delete the cache dir
+        if self._proc and self._proc.poll() is None:
+            try:
+                self._get("/internal/shutdown", timeout=2)  # request shutdown over HTTP
+            except Exception:
+                pass  # best effort; the wait/kill below covers a failed request
+            try:
+                self._proc.wait(timeout=10)
+            except Exception:
+                self._proc.kill()
+                self._proc.wait()  # reap it so nothing still holds the cache dir open
+        self._cache.cleanup()
+
+    def _get(self, path: str, timeout: float = 5):  # GET path; returns the raw body bytes
+        url = f"http://127.0.0.1:{self.port}{path}"
+        with urllib.request.urlopen(url, timeout=timeout) as r:
+            return r.read()
+
+    def system_info(self) -> dict:  # parsed JSON from /api/v1/system-info
+        return json.loads(self._get("/api/v1/system-info", timeout=30))
+
+
+def md_escape(text: str) -> str:  # backslash-escape "|" so text cannot split a table cell
+    return str(text).replace("|", "\\|")
+
+
+def render_overview(recipes: dict) -> str:
+    """Render the one-row-per-recipe overview table as Markdown.
+
+    Recipes without a ``display_name`` are omitted. The Backends column lists the
+    backends named in ``support``; if there are none, the keys of ``backends``.
+    """
+    lines = [
+        "| Recipe | Name | Selectable backend | Uses ctx_size | Backends |",
+        "|--------|------|--------------------|---------------|----------|",
+    ]
+    for key, meta in sorted(recipes.items()):
+        if "display_name" not in meta:
+            continue  # not a descriptor-backed recipe on this run
+        names = sorted({row["backend"] for row in meta.get("support", [])})
+        if not names:
+            names = sorted(meta.get("backends", {}))
+        name = md_escape(meta.get("display_name", ""))
+        selectable = "yes" if meta.get("selectable_backend") else "no"
+        ctx = "yes" if meta.get("uses_ctx_size") else "no"
+        listed = ", ".join(names) if names else "—"
+        lines.append(f"| `{key}` | {name} | {selectable} | {ctx} | {listed} |")
+    return "\n".join(lines)
+
+
+def render_support_matrix(recipes: dict) -> str:
+    """Render one Markdown table row per support entry of each recipe.
+
+    Devices render as ``device (family, ...)``; an entry without devices gets "—".
+    """
+    lines = [
+        "| Recipe | Backend | OS | Device families |",
+        "|--------|---------|----|-----------------|",
+    ]
+    for key in sorted(recipes):
+        for entry in recipes[key].get("support", []):
+            cells = []
+            for dev in entry.get("devices", []):
+                fams = dev.get("families") or []
+                suffix = f" ({', '.join(fams)})" if fams else ""
+                cells.append(dev["device"] + suffix)
+            backend = entry.get("backend", "")
+            oses = ", ".join(sorted(entry.get("os", [])))
+            devices = md_escape("; ".join(cells)) if cells else "—"
+            lines.append(f"| `{key}` | {backend} | {oses} | {devices} |")
+    return "\n".join(lines)
+
+
+def render_options(recipes: dict) -> str:
+    """Render one Markdown option table per recipe that declares options.
+
+    String defaults are shown verbatim (``""`` when empty); any other default is
+    shown as JSON. Recipes with ``uses_ctx_size`` get a leading ``ctx_size`` row.
+    """
+    out = []
+    for key in sorted(recipes):
+        meta = recipes[key]
+        options = meta.get("options")
+        if not options:
+            continue
+        out.append(f"#### `{key}` — {meta.get('display_name', key)}\n")
+        out.append("| Option | CLI flag | Type | Default | Description |")
+        out.append("|--------|----------|------|---------|-------------|")
+        if meta.get("uses_ctx_size"):
+            out.append(
+                "| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |"
+            )
+        for opt in options:
+            raw = opt.get("default")
+            shown = (raw or '""') if isinstance(raw, str) else json.dumps(raw)
+            flag = f"`{opt['cli_flag']}`" if opt.get("cli_flag") else "—"
+            kind = opt.get("type_name", "")
+            desc = md_escape(opt.get("help", ""))
+            out.append(
+                f"| `{opt['name']}` | {flag} | {kind} | {md_escape(shown)} | {desc} |"
+            )
+        out.append("")
+    return "\n".join(out).rstrip()
+
+
+def render_models(recipes: dict) -> str:
+    """Render one Markdown model table per recipe found in server_models.json.
+
+    ``recipes`` only supplies display names; unknown recipes show their raw key.
+    """
+    # Decode explicitly as UTF-8 rather than the platform's locale encoding.
+    models = json.loads(SERVER_MODELS.read_text(encoding="utf-8"))
+    by_recipe: dict[str, list] = {}
+    for name, data in models.items():
+        if not isinstance(data, dict):
+            continue
+        recipe = data.get("recipe", "(unspecified)")
+        by_recipe.setdefault(recipe, []).append((name, data))
+    blocks = []
+    for recipe in sorted(by_recipe):
+        entries = sorted(by_recipe[recipe])
+        display = recipes.get(recipe, {}).get("display_name", recipe)
+        blocks.append(f"#### `{recipe}` — {display} ({len(entries)} models)\n")
+        blocks.append("| Model | Size (GB) | Labels |")
+        blocks.append("|-------|-----------|--------|")
+        for name, data in entries:
+            labels = md_escape(", ".join(data.get("labels", []))) or "—"
+            size = data.get("size", "")
+            blocks.append(f"| `{md_escape(name)}` | {size} | {labels} |")
+        blocks.append("")
+    return "\n".join(blocks).rstrip()
+
+
+DEFAULT_TEMPLATE = """# Backend reference
+
+<!-- This file is generated by docs/tools/gen_backend_docs.py from the C++ backend
+descriptors. Do not edit the regions between the GENERATED markers by hand; run
+the generator instead. Prose outside the markers is preserved. -->
+
+## Backends
+
+<!-- BEGIN GENERATED: backends-overview -->
+<!-- END GENERATED: backends-overview -->
+
+## Support matrix
+
+<!-- BEGIN GENERATED: backends-matrix -->
+<!-- END GENERATED: backends-matrix -->
+
+## Recipe options
+
+<!-- BEGIN GENERATED: backend-options -->
+<!-- END GENERATED: backend-options -->
+
+## Models
+
+<!-- BEGIN GENERATED: backend-models -->
+<!-- END GENERATED: backend-models -->
+"""
+
+
+def apply_sections(text: str, sections: dict[str, str]) -> str:
+    """Replace the body of each GENERATED marker region in ``text``.
+
+    ``sections`` maps a marker id to the new body, which is placed on its own
+    lines between the BEGIN and END markers; text outside the regions is kept.
+    Exits the process if a marker id in ``sections`` has no region in ``text``.
+    """
+    for marker_id, body in sections.items():
+        begin = re.escape(f"<!-- BEGIN GENERATED: {marker_id} -->")
+        end = re.escape(f"<!-- END GENERATED: {marker_id} -->")
+        region = re.compile(f"({begin}).*?({end})", re.DOTALL)
+        if not region.search(text):
+            sys.exit(f"Marker region '{marker_id}' not found in target doc")
+        # A callable replacement is inserted verbatim, so backslashes in the body
+        # need no escaping (unlike a template string passed to re.sub).
+        text = region.sub(lambda m: f"{m.group(1)}\n{body}\n{m.group(2)}", text)
+    return text
+
+
+def main() -> int:
+    """Regenerate the doc's GENERATED regions, or verify them with ``--check``."""
+    ap = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    ap.add_argument("--lemond", help="Path to the built lemond binary")
+    ap.add_argument(
+        "--check", action="store_true", help="Fail if docs are stale; do not write"
+    )
+    args = ap.parse_args()
+
+    binary = find_lemond(args.lemond)
+    with Lemond(binary) as server:
+        info = server.system_info()
+    recipes = info.get("recipes", {})
+    if not recipes:
+        sys.exit("/system-info returned no recipes")
+
+    sections = {
+        "backends-overview": render_overview(recipes),
+        "backends-matrix": render_support_matrix(recipes),
+        "backend-options": render_options(recipes),
+        "backend-models": render_models(recipes),
+    }
+
+    # The doc holds non-ASCII text (em dashes): use UTF-8, not the locale encoding.
+    on_disk = TARGET_DOC.read_text(encoding="utf-8") if TARGET_DOC.exists() else None
+    updated = apply_sections(DEFAULT_TEMPLATE if on_disk is None else on_disk, sections)
+    rel = TARGET_DOC.relative_to(REPO_ROOT)
+    if args.check:
+        if on_disk != updated:  # also true when the doc does not exist yet
+            sys.exit(f"{rel} is stale. Run: python docs/tools/gen_backend_docs.py")
+        print(f"{rel} is up to date.")
+        return 0
+
+    TARGET_DOC.parent.mkdir(parents=True, exist_ok=True)
+    TARGET_DOC.write_text(updated, encoding="utf-8")
+    print(f"Wrote {rel}")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts
index d654c635a..8f1fdbb1f 100644
--- a/src/app/src/renderer/utils/recipeNames.ts
+++ b/src/app/src/renderer/utils/recipeNames.ts
@@ -4,15 +4,28 @@ export const isCollectionRecipe = (recipe?: string): boolean => {
   return recipe === COLLECTION_OMNI_MODEL_RECIPE;
 };
 
+// Recipe display names. Hardware-backend names (llamacpp, whispercpp, sd-cpp, …)
+// are populated at runtime from /system-info's `recipes[].display_name`, which is
+// generated from the C++ backend descriptors — the single source of truth. Only
+// recipes NOT surfaced by /system-info's hardware support matrix are seeded here:
+// the collection orchestrator (not a backend) and cloud offload (a backend with
+// no local support rows).
 export const RECIPE_DISPLAY_NAMES: Record<string, string> = {
   [COLLECTION_OMNI_MODEL_RECIPE]: 'Lemonade',
-  'flm': 'FastFlowLM NPU',
-  'llamacpp': 'Llama.cpp GPU',
-  'ryzenai-llm': 'Ryzen AI LLM',
-  'whispercpp': 'Whisper.cpp',
-  'moonshine': 'Moonshine',
-  'sd-cpp': 'StableDiffusion.cpp',
-  'kokoro': 'Kokoro',
   'cloud': 'Cloud',
-  'vllm': 'vLLM ROCm (experimental)',
+};
+
+// Copy each non-empty string `display_name` from a /system-info `recipes` object into
+// RECIPE_DISPLAY_NAMES. Called whenever system info is (re)fetched; no-op if absent.
+export const updateRecipeDisplayNames = (
+  recipes?: Record<string, { display_name?: string }>
+): void => {
+  if (recipes) {
+    for (const key of Object.keys(recipes)) {
+      const label = recipes[key]?.display_name;
+      if (typeof label === 'string' && label.length > 0) {
+        RECIPE_DISPLAY_NAMES[key] = label;
+      }
+    }
+  }
 };
diff --git a/src/app/src/renderer/utils/systemData.ts b/src/app/src/renderer/utils/systemData.ts
index 63f1d9427..fcd3b8f92 100644
--- a/src/app/src/renderer/utils/systemData.ts
+++ b/src/app/src/renderer/utils/systemData.ts
@@ -39,8 +39,23 @@ export interface Recipes {
   [recipeName: string]: Recipe;
 }
 
+// Per-recipe option schema, generated from the C++ backend descriptor (BackendOption).
+export interface RecipeOptionSchema {
+  name: string; // option key, e.g. "vllm_args"
+  cli_flag: string; // CLI flag, e.g. "--vllm-args"; presumably "" when there is no flag
+  default: unknown; // default value when the option is unset (any JSON type)
+  type_name: string; // "ARGS" | "SIZE" | "BACKEND" | "BOOL" per the C++ descriptor
+  help: string; // CLI help text
+  group: string; // CLI help group, e.g. "General Options"
+}
+
 export interface Recipe {
   default_backend?: string;
+  // Descriptor metadata (generated from the C++ backend descriptors).
+  display_name?: string;
+  selectable_backend?: boolean;
+  uses_ctx_size?: boolean;
+  options?: RecipeOptionSchema[];
   backends: {
     [backendName: string]: BackendInfo;
   };
@@ -75,6 +90,11 @@ const fetchSystemInfoFromAPI = async (): Promise<SystemData> => {
     const data = await response.json();
     const systemInfo: SystemInfo = { ...data };
 
+    // Seed recipe display names from the descriptor-generated /system-info data
+    // so the UI doesn't hardcode per-recipe names.
+    const { updateRecipeDisplayNames } = await import('./recipeNames');
+    updateRecipeDisplayNames(systemInfo.recipes);
+
     return { info: systemInfo };
   } catch (error) {
     console.error('Failed to fetch supported inference data from API:', error);
diff --git a/src/cpp/cli/CMakeLists.txt b/src/cpp/cli/CMakeLists.txt
index bd58c60ba..b6a0f26d6 100644
--- a/src/cpp/cli/CMakeLists.txt
+++ b/src/cpp/cli/CMakeLists.txt
@@ -97,6 +97,10 @@ set(COMMON_SOURCES
     agent_config_file.cpp
     opencode_profile.cpp
     pi_profile.cpp
+    # Self-describing backend descriptors (plain data; CLI-safe). Lets the CLI
+    # read recipe options/flags from descriptors without linking server classes.
+    # The matching factories (create()) are server-only and NOT listed here.
+    ${LEMON_BACKEND_DESCRIPTOR_SOURCES}
 )
 
 # Add platform-specific sources
diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
new file mode 100644
index 000000000..fc6c50bc2
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -0,0 +1,58 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <nlohmann/json.hpp>
+#include "lemon/model_types.h"
+#include "lemon/recipe_backend_def.h"
+
+namespace lemon {
+
+// A single declarative configuration knob a backend exposes. The same list
+// drives config.json defaults, CLI flag registration, and load-time option
+// resolution, so they can never drift apart.
+struct BackendOption {
+    std::string name;                 // option key, e.g. "vllm_args"
+    std::string cli_flag;             // CLI flag, e.g. "--vllm-args" ("" = not a CLI flag)
+    nlohmann::json default_value;     // default value when the option is unset
+    std::string type_name;            // "ARGS" | "SIZE" | "BACKEND" | "BOOL"
+    std::string help;                 // CLI help text
+    std::string group;                // CLI help group, e.g. "General Options"
+};
+
+// How a backend shares the accelerator. Replaces the router's recipe-string
+// checks for NPU exclusivity and LRU slot accounting.
+enum class SlotPolicy {
+    Standard,      // counts toward the LRU slots, no device exclusivity (llamacpp, sd-cpp)
+    ExclusiveNpu,  // evict ALL npu servers before loading (ryzenai-llm, whispercpp-npu)
+    CoexistByType, // one per model type, evicts exclusive-npu peers (flm)
+    Unmetered      // never counts toward slots, never auto-evicted (cloud)
+};
+
+// Plain data declaring *what a backend is*. This is the single object the
+// registry, the CLI, /system-info, and the docs all read. Behavior lives in the
+// paired WrappedServer subclass (see backend_registry.h for how they bind).
+struct BackendDescriptor {
+    std::string recipe;             // recipe key, e.g. "vllm"
+    std::string display_name;       // human-readable name, e.g. "vLLM ROCm (experimental)"
+    std::string binary;             // subprocess to launch/install ("" = none, e.g. cloud)
+    std::string config_section;     // config.json section; "" falls back to recipe via effective_config_section() (sd-cpp -> "sdcpp")
+
+    // NOTE(review): effective_device() / effective_slot_policy() are not declared in this struct — presumably WrappedServer hooks; confirm.
+    DeviceType default_device = DEVICE_GPU;           // default; override effective_device() if variant-dependent
+    SlotPolicy slot_policy    = SlotPolicy::Standard; // default; override effective_slot_policy() if variant-dependent
+    bool selectable_backend   = false;  // auto-creates "<recipe>_backend" option + "--<recipe>" flag
+    bool uses_ctx_size        = false;  // opt in to the shared ctx_size option
+    bool dynamic_models       = false;  // true = class supplies models at runtime (cloud), not server_models.json
+    std::vector<BackendOption>    options;                       // backend-specific knobs (common ones are automatic)
+    std::vector<RecipeBackendDef> support;                       // which OS / GPU families it runs on ({} = no local gating)
+    std::vector<std::string>      default_labels;                // labels injected when a model omits them
+    std::vector<std::string>      required_checkpoints{"main"};  // unconditional files; conditional ones checked in load()
+
+    // The config.json section name for this backend, falling back to the recipe
+    // when config_section is empty. Returned by value (a copy of the chosen string).
+    std::string effective_config_section() const {
+        return config_section.empty() ? recipe : config_section;
+    }
+};
+
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_descriptor_registry.h b/src/cpp/include/lemon/backends/backend_descriptor_registry.h
new file mode 100644
index 000000000..e3be93cda
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_descriptor_registry.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// Read-only view over every backend descriptor (plain data). This API is
+// CLI-safe: it pulls in no server classes, so it links into both the lemonade
+// CLI and lemond. The factory side (create_server) lives in backend_registry.h
+// and is server-only.
+
+// All registered descriptors, in LEMON_BACKENDS order.
+const std::vector<const BackendDescriptor*>& all_descriptors();
+
+// Descriptor for a recipe, or nullptr if the recipe has no registered backend.
+const BackendDescriptor* descriptor_for(const std::string& recipe);
+
+// True if the recipe is backed by a registered descriptor.
+bool has_backend(const std::string& recipe);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
new file mode 100644
index 000000000..394f49145
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -0,0 +1,47 @@
+#pragma once
+
+#include <memory>
+#include <string>
+#include "lemon/backends/backend_descriptor.h"
+#include "lemon/backends/backend_descriptor_registry.h"
+
+namespace lemon {
+
+class WrappedServer;
+class ModelManager;
+class BackendManager;
+class CloudProviderRegistry;
+struct ModelInfo;
+
+namespace backends {
+
+// Everything a backend's create() needs to build an instance. Mirrors the
+// arguments the old router factory passed to each backend constructor.
+struct BackendContext {
+    std::string log_level;
+    ModelManager* model_manager = nullptr;  // NOTE(review): the raw pointers here look like non-owning borrows — confirm lifetimes
+    BackendManager* backend_manager = nullptr;
+    CloudProviderRegistry* cloud_registry = nullptr;
+    const ModelInfo* model_info = nullptr;  // for per-create setup (cloud provider, ryzenai model path)
+};
+
+using BackendCreateFn = std::unique_ptr<WrappedServer> (*)(const BackendContext&);
+
+// Binds a descriptor (what the backend is) to its server class's create() (how
+// it runs). The generated factory registry supplies one per backend. This API is
+// server-only: it references server classes via create(), so it is compiled into
+// lemond but not the CLI. The CLI reads descriptors through backend_descriptor_registry.h.
+struct BackendRegistration {
+    const BackendDescriptor* descriptor;
+    BackendCreateFn create;
+};
+
+// All registered (descriptor, create) pairs, in LEMON_BACKENDS order.
+const std::vector<BackendRegistration>& all_registrations();
+
+// Construct a backend instance for a recipe and associate its descriptor, or
+// nullptr if the recipe has no registered backend.
+std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/cloud_descriptor.h b/src/cpp/include/lemon/backends/cloud_descriptor.h
new file mode 100644
index 000000000..6e5f49bdb
--- /dev/null
+++ b/src/cpp/include/lemon/backends/cloud_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The cloud backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in cloud_descriptor.cpp.
+extern const BackendDescriptor cloud_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/cloud_factory.h b/src/cpp/include/lemon/backends/cloud_factory.h
new file mode 100644
index 000000000..889958bd1
--- /dev/null
+++ b/src/cpp/include/lemon/backends/cloud_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The cloud backend's factory (constructs the server class — lemond only).
+// Defined in cloud_factory.cpp.
+std::unique_ptr<WrappedServer> cloud_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm_descriptor.h b/src/cpp/include/lemon/backends/fastflowlm_descriptor.h
new file mode 100644
index 000000000..5e8f71467
--- /dev/null
+++ b/src/cpp/include/lemon/backends/fastflowlm_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The fastflowlm backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in fastflowlm_descriptor.cpp.
+extern const BackendDescriptor fastflowlm_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm_factory.h b/src/cpp/include/lemon/backends/fastflowlm_factory.h
new file mode 100644
index 000000000..8581dbdf7
--- /dev/null
+++ b/src/cpp/include/lemon/backends/fastflowlm_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The fastflowlm backend's factory (constructs the server class — lemond only).
+// Defined in fastflowlm_factory.cpp.
+std::unique_ptr<WrappedServer> fastflowlm_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro_descriptor.h b/src/cpp/include/lemon/backends/kokoro_descriptor.h
new file mode 100644
index 000000000..1d3542f0a
--- /dev/null
+++ b/src/cpp/include/lemon/backends/kokoro_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The kokoro backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in kokoro_descriptor.cpp.
+extern const BackendDescriptor kokoro_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro_factory.h b/src/cpp/include/lemon/backends/kokoro_factory.h
new file mode 100644
index 000000000..0df3ec37b
--- /dev/null
+++ b/src/cpp/include/lemon/backends/kokoro_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The kokoro backend's factory (constructs the server class — lemond only).
+// Defined in kokoro_factory.cpp.
+std::unique_ptr<WrappedServer> kokoro_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp_descriptor.h b/src/cpp/include/lemon/backends/llamacpp_descriptor.h
new file mode 100644
index 000000000..501e0854c
--- /dev/null
+++ b/src/cpp/include/lemon/backends/llamacpp_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The llamacpp backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in llamacpp_descriptor.cpp.
+extern const BackendDescriptor llamacpp_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp_factory.h b/src/cpp/include/lemon/backends/llamacpp_factory.h
new file mode 100644
index 000000000..853f5171b
--- /dev/null
+++ b/src/cpp/include/lemon/backends/llamacpp_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The llamacpp backend's factory (constructs the server class — lemond only).
+// Defined in llamacpp_factory.cpp.
+std::unique_ptr<WrappedServer> llamacpp_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine_descriptor.h b/src/cpp/include/lemon/backends/moonshine_descriptor.h
new file mode 100644
index 000000000..d70083244
--- /dev/null
+++ b/src/cpp/include/lemon/backends/moonshine_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The moonshine backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in moonshine_descriptor.cpp.
+extern const BackendDescriptor moonshine_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine_factory.h b/src/cpp/include/lemon/backends/moonshine_factory.h
new file mode 100644
index 000000000..67e6f7298
--- /dev/null
+++ b/src/cpp/include/lemon/backends/moonshine_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The moonshine backend's factory (constructs the server class — lemond only).
+// Defined in moonshine_factory.cpp.
+std::unique_ptr<WrappedServer> moonshine_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai_descriptor.h b/src/cpp/include/lemon/backends/ryzenai_descriptor.h
new file mode 100644
index 000000000..26aa0b21f
--- /dev/null
+++ b/src/cpp/include/lemon/backends/ryzenai_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The ryzenai backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in ryzenai_descriptor.cpp.
+extern const BackendDescriptor ryzenai_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai_factory.h b/src/cpp/include/lemon/backends/ryzenai_factory.h
new file mode 100644
index 000000000..9483d8d55
--- /dev/null
+++ b/src/cpp/include/lemon/backends/ryzenai_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The ryzenai backend's factory (constructs the server class — lemond only).
+// Defined in ryzenai_factory.cpp.
+std::unique_ptr<WrappedServer> ryzenai_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp_descriptor.h b/src/cpp/include/lemon/backends/sdcpp_descriptor.h
new file mode 100644
index 000000000..0bee2e552
--- /dev/null
+++ b/src/cpp/include/lemon/backends/sdcpp_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The sdcpp backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in sdcpp_descriptor.cpp.
+extern const BackendDescriptor sdcpp_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp_factory.h b/src/cpp/include/lemon/backends/sdcpp_factory.h
new file mode 100644
index 000000000..f7da955e2
--- /dev/null
+++ b/src/cpp/include/lemon/backends/sdcpp_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The sdcpp backend's factory (constructs the server class — lemond only).
+// Defined in sdcpp_factory.cpp.
+std::unique_ptr<WrappedServer> sdcpp_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_descriptor.h b/src/cpp/include/lemon/backends/vllm_descriptor.h
new file mode 100644
index 000000000..7119dff88
--- /dev/null
+++ b/src/cpp/include/lemon/backends/vllm_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// The vllm backend's descriptor (plain data — CLI-safe, links into both the
+// lemonade CLI and lemond). Defined in vllm_descriptor.cpp.
+extern const BackendDescriptor vllm_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_factory.h b/src/cpp/include/lemon/backends/vllm_factory.h
new file mode 100644
index 000000000..7bf398987
--- /dev/null
+++ b/src/cpp/include/lemon/backends/vllm_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// The vllm backend's factory (constructs the server class — lemond only).
+// Defined in vllm_factory.cpp.
+std::unique_ptr<WrappedServer> vllm_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp_descriptor.h b/src/cpp/include/lemon/backends/whispercpp_descriptor.h
new file mode 100644
index 000000000..2c3c87f19
--- /dev/null
+++ b/src/cpp/include/lemon/backends/whispercpp_descriptor.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+// Descriptor for the "whispercpp" recipe. Plain data with no server classes, so it is CLI-safe
+// and links into both the lemonade CLI and lemond. Defined in whispercpp_descriptor.cpp.
+extern const BackendDescriptor whispercpp_descriptor;
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp_factory.h b/src/cpp/include/lemon/backends/whispercpp_factory.h
new file mode 100644
index 000000000..d98c97b27
--- /dev/null
+++ b/src/cpp/include/lemon/backends/whispercpp_factory.h
@@ -0,0 +1,14 @@
+#pragma once
+
+#include <memory>
+#include "lemon/backends/backend_registry.h"
+
+namespace lemon {
+namespace backends {
+
+// Factory for the whispercpp backend: builds its WhisperServer and hands back ownership as a
+// WrappedServer. Server-side only (lemond, not the CLI). Defined in whispercpp_factory.cpp.
+std::unique_ptr<WrappedServer> whispercpp_create(const BackendContext& ctx);
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index cdcf844dc..be850b583 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -110,6 +110,19 @@ struct ModelInfo {
     // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING)
     int moonshine_arch = -1;
 
+    // Catch-all for backend-specific per-model settings: every server_models.json key
+    // that no typed field above consumes is kept here as raw JSON, keyed by name, so a
+    // new backend can read its own config in load() without editing this shared struct.
+    std::map<std::string, json> extras;
+
+    // Typed read of extras[key]: yields `fallback` when the key is absent, null, or not convertible to T.
+    template <typename T>
+    T extra(const std::string& key, const T& fallback) const {
+        const auto entry = extras.find(key);
+        try { if (entry != extras.end() && !entry->second.is_null()) return entry->second.get<T>(); } catch (...) {}
+        return fallback;
+    }
+
     // Utility
     std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? checkpoints.at(type) : ""; }
     std::string resolved_path(const std::string& type = "main") const { return resolved_paths.count(type) ? resolved_paths.at(type) : ""; }
diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h
new file mode 100644
index 000000000..1557db077
--- /dev/null
+++ b/src/cpp/include/lemon/recipe_backend_def.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <string>
+
+namespace lemon {
+
+// Device constraints: device_type -> allowed families, e.g. {"amd_npu", {"XDNA2"}} (empty set = all families)
+using DeviceConstraints = std::map<std::string, std::set<std::string>>;
+
+// One row of the recipe/backend support matrix: the OSes and device families on
+// which a given (recipe, backend) pair runs. The canonical matrix is assembled by
+// collecting these rows from every backend descriptor (see BackendDescriptor::support).
+//
+// IMPORTANT: For recipes with multiple backends (e.g. llamacpp), the order in
+// which these rows appear defines the preference order — first listed = most
+// preferred. Empty family set {} means "all families of that device type".
+struct RecipeBackendDef {
+    std::string recipe;                  // recipe name, e.g. "llamacpp"
+    std::string backend;                 // backend variant of that recipe, e.g. "vulkan"
+    std::set<std::string> supported_os;  // e.g. {"windows", "linux"}
+    DeviceConstraints devices;           // device_type -> allowed families ({} = all)
+};
+
+} // namespace lemon
diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h
index f3ec74da4..41e91595b 100644
--- a/src/cpp/include/lemon/wrapped_server.h
+++ b/src/cpp/include/lemon/wrapped_server.h
@@ -17,6 +17,7 @@
 #include "model_manager.h"
 #include "backend_manager.h"
 #include "recipe_options.h"
+#include "backends/backend_descriptor.h"
 
 namespace lemon {
 
@@ -307,10 +308,46 @@ class WrappedServer : public ICompletionServer {
         // No-op by default
     }
 
-    // ICompletionServer implementation - forward requests to the wrapped server
-    virtual json chat_completion(const json& request) override = 0;
-    virtual json completion(const json& request) override = 0;
-    virtual json responses(const json& request) = 0;
+    // ICompletionServer implementation. These base versions do not forward anything:
+    // they return the standard "unsupported" error, so non-chat backends (TTS, image,
+    // transcription) inherit a sensible response; chat-capable backends override them.
+    virtual json chat_completion(const json& /*request*/) override {
+        return unsupported_capability_error("chat completion");
+    }
+    virtual json completion(const json& /*request*/) override {
+        return unsupported_capability_error("text completion");
+    }
+    virtual json responses(const json& /*request*/) {
+        return unsupported_capability_error("responses");
+    }
+
+    // Link between this server and its backend descriptor; the backend registry attaches
+    // it at create() time. The effective_* hooks below fall back to the descriptor's
+    // declared values, and a backend whose device or eviction rule depends on the chosen
+    // variant overrides them (e.g. whisper on npu vs cpu, llamacpp on cpu vs gpu).
+    void set_descriptor(const BackendDescriptor* descriptor) { this->descriptor_ = descriptor; }
+    const BackendDescriptor* get_descriptor() const { return this->descriptor_; }
+
+    // Accelerator device for this load; the router asks after resolving "<recipe>_backend"
+    // and before eviction. Base version reports the descriptor default; variant backends override.
+    virtual DeviceType effective_device(const RecipeOptions& options) const {
+        (void)options;  // the base implementation does not look at the options
+        if (descriptor_ == nullptr) return device_type_;  // no descriptor attached
+        return descriptor_->default_device;
+    }
+
+    // Slot/eviction policy for this load; the router switches on it for NPU exclusivity and
+    // LRU slot accounting. Base version reports the descriptor value; variant backends override.
+    virtual SlotPolicy effective_slot_policy(const RecipeOptions& options) const {
+        (void)options;  // the base implementation does not look at the options
+        if (descriptor_ == nullptr) return SlotPolicy::Standard;  // no descriptor attached
+        return descriptor_->slot_policy;
+    }
+
+    // Runtime availability probe: an empty string means the backend can run on this
+    // system, anything else is a user-facing reason why it cannot. The base version always
+    // reports "available"; backends with runtime-dependent availability (cloud) override.
+    virtual std::string availability() const { return std::string(); }
 
     // Forward streaming requests to the wrapped server (public for Router access)
     // Virtual so backends can transform request (e.g., FLM needs checkpoint in model field)
@@ -373,6 +410,17 @@ class WrappedServer : public ICompletionServer {
         BackendRequestKind kind_;
     };
 
+    // Standard "this backend does not serve <what>" error payload, matching the
+    // shape backends return from unsupported capability methods.
+    json unsupported_capability_error(const std::string& what) const {
+        return json{{"error", {
+            {"message", server_name_ + " does not support " + what +
+                            ". Use the appropriate endpoint for this model type instead."},
+            {"type", "unsupported_operation"},
+            {"code", "model_not_applicable"}
+        }}};
+    }
+
     static bool has_process_handle(const ProcessHandle& handle);
     ProcessHandle get_process_handle_snapshot() const;
     void set_process_handle(ProcessHandle handle);
@@ -420,6 +468,7 @@ class WrappedServer : public ICompletionServer {
     std::string log_level_;
     ModelManager* model_manager_;  // Non-owning pointer to ModelManager
     BackendManager* backend_manager_;  // Non-owning pointer to BackendManager
+    const BackendDescriptor* descriptor_ = nullptr;  // Non-owning; nullptr until the backend registry calls set_descriptor() at create()
 
     // Multi-model support fields
     std::string model_name_;
diff --git a/src/cpp/server/backends/backend_descriptor_registry.cpp b/src/cpp/server/backends/backend_descriptor_registry.cpp
new file mode 100644
index 000000000..5fd217909
--- /dev/null
+++ b/src/cpp/server/backends/backend_descriptor_registry.cpp
@@ -0,0 +1,29 @@
+#include "lemon/backends/backend_descriptor_registry.h"
+
+// Generated from LEMON_BACKENDS at configure time. Defines
+// lemon::backends::all_generated_descriptors() (descriptor data only).
+#include "backend_descriptors_generated.h"
+
+namespace lemon {
+namespace backends {
+
+const std::vector<const BackendDescriptor*>& all_descriptors() {
+    static const auto descriptors = all_generated_descriptors();  // built once, on first call
+    return descriptors;
+}
+
+const BackendDescriptor* descriptor_for(const std::string& recipe) {
+    // Linear scan over the registered descriptors; the first recipe match wins.
+    const BackendDescriptor* match = nullptr;
+    for (const BackendDescriptor* candidate : all_descriptors()) {
+        if (candidate->recipe == recipe) { match = candidate; break; }
+    }
+    return match;  // nullptr when no descriptor carries this recipe
+}
+
+bool has_backend(const std::string& recipe) {  // true iff descriptor_for() finds a descriptor for `recipe`
+    return descriptor_for(recipe) != nullptr;
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_descriptors_generated.h.in b/src/cpp/server/backends/backend_descriptors_generated.h.in
new file mode 100644
index 000000000..3f6d7ec2a
--- /dev/null
+++ b/src/cpp/server/backends/backend_descriptors_generated.h.in
@@ -0,0 +1,19 @@
+#pragma once
+//
+// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt.
+// Do not edit by hand. Descriptor DATA only (CLI-safe; no server classes).
+//
+#include <vector>
+#include "lemon/backends/backend_descriptor.h"
+@LEMON_DESCRIPTOR_INCLUDES@
+namespace lemon {
+namespace backends {
+
+inline std::vector<const BackendDescriptor*> all_generated_descriptors() {
+    return {  // entries are substituted below at CMake configure time from LEMON_BACKENDS
+@LEMON_DESCRIPTOR_ENTRIES@
+    };
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_factories_generated.h.in b/src/cpp/server/backends/backend_factories_generated.h.in
new file mode 100644
index 000000000..d488ce014
--- /dev/null
+++ b/src/cpp/server/backends/backend_factories_generated.h.in
@@ -0,0 +1,21 @@
+#pragma once
+//
+// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt.
+// Do not edit by hand. Binds each descriptor to its server class's create()
+// (server-only: pulls in server classes, compiled into lemond not the CLI).
+//
+#include <vector>
+#include "lemon/backends/backend_registry.h"
+@LEMON_DESCRIPTOR_INCLUDES@
+@LEMON_FACTORY_INCLUDES@
+namespace lemon {
+namespace backends {
+
+inline std::vector<BackendRegistration> generated_registrations() {
+    return {  // entries are substituted below at CMake configure time from LEMON_BACKENDS
+@LEMON_FACTORY_ENTRIES@
+    };
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp
new file mode 100644
index 000000000..5e0de071f
--- /dev/null
+++ b/src/cpp/server/backends/backend_registry.cpp
@@ -0,0 +1,31 @@
+#include "lemon/backends/backend_registry.h"
+#include "lemon/wrapped_server.h"
+
+// Generated from LEMON_BACKENDS at configure time. Defines
+// lemon::backends::generated_registrations(), pairing each descriptor with its
+// server class's create().
+#include "backend_factories_generated.h"
+
+namespace lemon {
+namespace backends {
+
+const std::vector<BackendRegistration>& all_registrations() {
+    static const auto registrations = generated_registrations();  // built once, on first call
+    return registrations;
+}
+
+std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx) {
+    // Walk the registrations in order; only the first one whose descriptor names
+    // `recipe` is used.
+    for (const BackendRegistration& entry : all_registrations()) {
+        if (entry.descriptor->recipe != recipe) continue;
+        std::unique_ptr<WrappedServer> created = entry.create(ctx);
+        // A factory may hand back null; only a real server gets its descriptor attached.
+        if (created) created->set_descriptor(entry.descriptor);
+        return created;
+    }
+    return nullptr;  // no registration for this recipe
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/cloud_descriptor.cpp b/src/cpp/server/backends/cloud_descriptor.cpp
new file mode 100644
index 000000000..fe87a32a2
--- /dev/null
+++ b/src/cpp/server/backends/cloud_descriptor.cpp
@@ -0,0 +1,23 @@
+#include "lemon/backends/cloud_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor cloud_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "cloud",
+    /*display_name*/    "Cloud",
+    /*binary*/          "",  // no subprocess: runs on a remote provider
+    /*config_section*/  "cloud",
+    /*default_device*/  DEVICE_NONE,
+    /*slot_policy*/     SlotPolicy::Unmetered,  // never counts toward slots, never auto-evicted
+    /*selectable_backend*/ false,  // consistent with options below: no "cloud_backend" option is declared
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  true,   // models discovered at runtime from the provider
+    /*options*/ {},             // declares no recipe options
+    /*support*/ {},             // no local gating: install/support machinery skips cloud
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {},  // no downloaded files
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/cloud_factory.cpp b/src/cpp/server/backends/cloud_factory.cpp
new file mode 100644
index 000000000..cee2c4ab5
--- /dev/null
+++ b/src/cpp/server/backends/cloud_factory.cpp
@@ -0,0 +1,16 @@
+#include "lemon/backends/cloud_factory.h"
+#include "lemon/backends/cloud_server.h"
+#include "lemon/model_manager.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> cloud_create(const BackendContext& ctx) {
+    auto& provider = ctx.model_info->cloud_provider;  // provider comes from the model being loaded
+    return std::make_unique<CloudServer>(provider, ctx.log_level, ctx.model_manager,
+                                         ctx.backend_manager, ctx.cloud_registry);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm_descriptor.cpp b/src/cpp/server/backends/fastflowlm_descriptor.cpp
new file mode 100644
index 000000000..7b67b8d42
--- /dev/null
+++ b/src/cpp/server/backends/fastflowlm_descriptor.cpp
@@ -0,0 +1,29 @@
+#include "lemon/backends/fastflowlm_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor fastflowlm_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "flm",
+    /*display_name*/    "FastFlowLM NPU",
+#ifdef _WIN32
+    /*binary*/          "flm.exe",
+#else
+    /*binary*/          "flm",
+#endif
+    /*config_section*/  "flm",
+    /*default_device*/  DEVICE_NPU,
+    /*slot_policy*/     SlotPolicy::CoexistByType,  // NOTE(review): exact semantics are defined with SlotPolicy, not visible here
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {},  // declares no recipe options
+    /*support*/ {    // each row: {recipe, backend, supported OSes, device_type -> families}
+        {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm_factory.cpp b/src/cpp/server/backends/fastflowlm_factory.cpp
new file mode 100644
index 000000000..96eddd998
--- /dev/null
+++ b/src/cpp/server/backends/fastflowlm_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/fastflowlm_factory.h"
+#include "lemon/backends/fastflowlm_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> fastflowlm_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<FastFlowLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/kokoro_descriptor.cpp b/src/cpp/server/backends/kokoro_descriptor.cpp
new file mode 100644
index 000000000..281f0e0f1
--- /dev/null
+++ b/src/cpp/server/backends/kokoro_descriptor.cpp
@@ -0,0 +1,30 @@
+#include "lemon/backends/kokoro_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor kokoro_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "kokoro",
+    /*display_name*/    "Kokoro",
+#ifdef _WIN32
+    /*binary*/          "koko.exe",
+#else
+    /*binary*/          "koko",
+#endif
+    /*config_section*/  "kokoro",
+    /*default_device*/  DEVICE_CPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {},  // declares no recipe options
+    /*support*/ {    // each row: {recipe, backend, supported OSes, device_type -> families}
+        {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}},
+        {"kokoro", "metal", {"macos"}, {{"metal", {}}}},  // {} = all families of that device type
+    },
+    /*default_labels*/  {},  // kokoro models carry "tts" explicitly in server_models.json
+    /*required_checkpoints*/ {"main"},
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/kokoro_factory.cpp b/src/cpp/server/backends/kokoro_factory.cpp
new file mode 100644
index 000000000..a7d4f3be8
--- /dev/null
+++ b/src/cpp/server/backends/kokoro_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/kokoro_factory.h"
+#include "lemon/backends/kokoro_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> kokoro_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<KokoroServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp_descriptor.cpp b/src/cpp/server/backends/llamacpp_descriptor.cpp
new file mode 100644
index 000000000..f426e9f20
--- /dev/null
+++ b/src/cpp/server/backends/llamacpp_descriptor.cpp
@@ -0,0 +1,43 @@
+#include "lemon/backends/llamacpp_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor llamacpp_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "llamacpp",
+    /*display_name*/    "Llama.cpp GPU",
+#ifdef _WIN32
+    /*binary*/          "llama-server.exe",
+#else
+    /*binary*/          "llama-server",
+#endif
+    /*config_section*/  "llamacpp",
+    /*default_device*/  DEVICE_GPU,   // cpu/system variants resolve to CPU via effective_device()
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {  // NOTE(review): entry field order follows BackendOption (declared in backend_descriptor.h)
+        {"llamacpp_backend", "--llamacpp", "", "BACKEND",
+         "LlamaCpp backend to use", "Llama.cpp Backend Options"},
+        {"llamacpp_device", "--llamacpp-device", "", "DEVICES",
+         "Comma-separated list of accelerator devices to use (e.g. Vulkan0)", "Llama.cpp Backend Options"},
+        {"llamacpp_args", "--llamacpp-args", "", "ARGS",
+         "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"},
+    },
+    /*support*/ {  // row order is the backend preference order: first listed = most preferred
+        {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}},
+        {"llamacpp", "metal", {"macos"}, {{"metal", {}}}},
+        {"llamacpp", "cuda", {"windows", "linux"},
+         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}},
+        {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}},
+        {"llamacpp", "rocm", {"windows", "linux"},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}},
+        {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp_factory.cpp b/src/cpp/server/backends/llamacpp_factory.cpp
new file mode 100644
index 000000000..cd34fab5a
--- /dev/null
+++ b/src/cpp/server/backends/llamacpp_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/llamacpp_factory.h"
+#include "lemon/backends/llamacpp_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> llamacpp_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<LlamaCppServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/moonshine_descriptor.cpp b/src/cpp/server/backends/moonshine_descriptor.cpp
new file mode 100644
index 000000000..63277ad3c
--- /dev/null
+++ b/src/cpp/server/backends/moonshine_descriptor.cpp
@@ -0,0 +1,30 @@
+#include "lemon/backends/moonshine_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor moonshine_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "moonshine",
+    /*display_name*/    "Moonshine",
+    /*binary*/          "moonshine-server",  // NOTE(review): no ".exe" variant despite the windows row below, unlike sibling descriptors — confirm
+    /*config_section*/  "moonshine",
+    /*default_device*/  DEVICE_CPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {  // NOTE(review): entry field order follows BackendOption (declared in backend_descriptor.h)
+        {"moonshine_args", "--moonshine-args", "", "ARGS",
+         "Custom arguments to pass to moonshine-server", ""},
+    },
+    /*support*/ {  // one "cpu" row per OS because the allowed CPU families differ by OS
+        {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}},
+        {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}},
+        {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}},
+    },
+    /*default_labels*/  {"transcription", "realtime-transcription"},
+    /*required_checkpoints*/ {"main"},
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/moonshine_factory.cpp b/src/cpp/server/backends/moonshine_factory.cpp
new file mode 100644
index 000000000..859b37b30
--- /dev/null
+++ b/src/cpp/server/backends/moonshine_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/moonshine_factory.h"
+#include "lemon/backends/moonshine_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> moonshine_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<MoonshineServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai_descriptor.cpp b/src/cpp/server/backends/ryzenai_descriptor.cpp
new file mode 100644
index 000000000..23651ec94
--- /dev/null
+++ b/src/cpp/server/backends/ryzenai_descriptor.cpp
@@ -0,0 +1,29 @@
+#include "lemon/backends/ryzenai_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor ryzenai_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "ryzenai-llm",
+    /*display_name*/    "Ryzen AI LLM",
+#ifdef _WIN32
+    /*binary*/          "ryzenai-server.exe",
+#else
+    /*binary*/          "ryzenai-server",
+#endif
+    /*config_section*/  "ryzenai",  // note: differs from the recipe name "ryzenai-llm"
+    /*default_device*/  DEVICE_NPU,
+    /*slot_policy*/     SlotPolicy::ExclusiveNpu,
+    /*selectable_backend*/ false,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {},  // declares no recipe options
+    /*support*/ {    // each row: {recipe, backend, supported OSes, device_type -> families}
+        {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai_factory.cpp b/src/cpp/server/backends/ryzenai_factory.cpp
new file mode 100644
index 000000000..4e013a30c
--- /dev/null
+++ b/src/cpp/server/backends/ryzenai_factory.cpp
@@ -0,0 +1,20 @@
+#include "lemon/backends/ryzenai_factory.h"
+#include "lemon/backends/ryzenaiserver.h"
+#include "lemon/model_manager.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> ryzenai_create(const BackendContext& ctx) {
+    // Unlike the other factories, RyzenAI is handed the model name plus a boolean debug
+    // flag, and has its model path set (set_model_path) before the server is returned.
+    const bool debug = (ctx.log_level == "debug");
+    auto server = std::make_unique<::lemon::RyzenAIServer>(
+        ctx.model_info->model_name, debug, ctx.model_manager, ctx.backend_manager);
+    server->set_model_path(ctx.model_info->resolved_path());
+    return server;
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp_descriptor.cpp b/src/cpp/server/backends/sdcpp_descriptor.cpp
new file mode 100644
index 000000000..10ebfdd58
--- /dev/null
+++ b/src/cpp/server/backends/sdcpp_descriptor.cpp
@@ -0,0 +1,47 @@
+#include "lemon/backends/sdcpp_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor sdcpp_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "sd-cpp",
+    /*display_name*/    "StableDiffusion.cpp",
+#ifdef _WIN32
+    /*binary*/          "sd-server.exe",
+#else
+    /*binary*/          "sd-server",
+#endif
+    /*config_section*/  "sdcpp",  // note: no hyphen, unlike the recipe name "sd-cpp"
+    /*default_device*/  DEVICE_CPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {  // the backend key uses the recipe spelling ("sd-cpp_backend"); the args key does not ("sdcpp_args")
+        {"sd-cpp_backend", "--sdcpp", "", "BACKEND",
+         "SD.cpp backend to use", "Stable Diffusion Options"},
+        {"sdcpp_args", "--sdcpp-args", "", "ARGS",
+         "Custom arguments to pass to sd-server (must not conflict with managed args)", "Stable Diffusion Options"},
+        // Image generation defaults (recipe-level only, not CLI flags).
+        {"steps", "", 20, "SIZE", "Number of diffusion steps", "Stable Diffusion Options"},
+        {"cfg_scale", "", 7.0, "SIZE", "Classifier-free guidance scale", "Stable Diffusion Options"},
+        {"width", "", 512, "SIZE", "Output image width", "Stable Diffusion Options"},
+        {"height", "", 512, "SIZE", "Output image height", "Stable Diffusion Options"},
+        {"sampling_method", "", "", "ARGS", "Sampling method", "Stable Diffusion Options"},
+        {"flow_shift", "", 0.0, "SIZE", "Flow shift", "Stable Diffusion Options"},
+    },
+    /*support*/ {  // row order is the backend preference order: first listed = most preferred
+        {"sd-cpp", "rocm", {"windows", "linux"},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}},
+        {"sd-cpp", "cuda", {"linux"},
+         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}},
+        {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}},
+        {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}},
+        {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}},
+    },
+    /*default_labels*/  {"image"},
+    /*required_checkpoints*/ {"main"},  // flux text_encoder+vae validated together in load()
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp_factory.cpp b/src/cpp/server/backends/sdcpp_factory.cpp
new file mode 100644
index 000000000..009fffd43
--- /dev/null
+++ b/src/cpp/server/backends/sdcpp_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/sdcpp_factory.h"
+#include "lemon/backends/sd_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> sdcpp_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<SDServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/vllm_descriptor.cpp b/src/cpp/server/backends/vllm_descriptor.cpp
new file mode 100644
index 000000000..54451f365
--- /dev/null
+++ b/src/cpp/server/backends/vllm_descriptor.cpp
@@ -0,0 +1,30 @@
+#include "lemon/backends/vllm_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor vllm_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "vllm",
+    /*display_name*/    "vLLM ROCm (experimental)",
+    /*binary*/          "vllm-server",  // single name, no .exe variant: the only support row below is linux
+    /*config_section*/  "vllm",
+    /*default_device*/  DEVICE_GPU,
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   true,
+    /*dynamic_models*/  false,
+    /*options*/ {  // NOTE(review): entry field order follows BackendOption (declared in backend_descriptor.h)
+        {"vllm_backend", "--vllm", "", "BACKEND",
+         "vLLM backend to use", "vLLM Options"},
+        {"vllm_args", "--vllm-args", "", "ARGS",
+         "Custom arguments to pass to vllm-server", "vLLM Options"},
+    },
+    /*support*/ {  // each row: {recipe, backend, supported OSes, device_type -> families}
+        {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}},
+    },
+    /*default_labels*/  {},
+    /*required_checkpoints*/ {"main"},
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/vllm_factory.cpp b/src/cpp/server/backends/vllm_factory.cpp
new file mode 100644
index 000000000..20fd71851
--- /dev/null
+++ b/src/cpp/server/backends/vllm_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/vllm_factory.h"
+#include "lemon/backends/vllm_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> vllm_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<VLLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp_descriptor.cpp b/src/cpp/server/backends/whispercpp_descriptor.cpp
new file mode 100644
index 000000000..6124e779e
--- /dev/null
+++ b/src/cpp/server/backends/whispercpp_descriptor.cpp
@@ -0,0 +1,39 @@
+#include "lemon/backends/whispercpp_descriptor.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendDescriptor whispercpp_descriptor = {  // positional init: order must match BackendDescriptor's fields
+    /*recipe*/          "whispercpp",
+    /*display_name*/    "Whisper.cpp",
+#ifdef _WIN32
+    /*binary*/          "whisper-server.exe",
+#else
+    /*binary*/          "whisper-server",
+#endif
+    /*config_section*/  "whispercpp",
+    /*default_device*/  DEVICE_CPU,   // npu variant resolves to NPU + ExclusiveNpu via effective_*()
+    /*slot_policy*/     SlotPolicy::Standard,
+    /*selectable_backend*/ true,
+    /*uses_ctx_size*/   false,
+    /*dynamic_models*/  false,
+    /*options*/ {  // NOTE(review): entry field order follows BackendOption (declared in backend_descriptor.h)
+        {"whispercpp_backend", "--whispercpp", "", "BACKEND",
+         "WhisperCpp backend to use", "Whisper.cpp Options"},
+        {"whispercpp_args", "--whispercpp-args", "", "ARGS",
+         "Custom arguments to pass to whisper-server", "Whisper.cpp Options"},
+    },
+    /*support*/ {  // row order is the backend preference order, so npu is listed as most preferred
+        {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}},
+        {"whispercpp", "rocm", {"windows", "linux"},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}},
+        {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}},
+        {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}},
+        {"whispercpp", "metal", {"macos"}, {{"metal", {}}}},
+    },
+    /*default_labels*/  {"transcription", "realtime-transcription"},
+    /*required_checkpoints*/ {"main"},  // npu_cache validated in load() (npu variant only)
+};
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp_factory.cpp b/src/cpp/server/backends/whispercpp_factory.cpp
new file mode 100644
index 000000000..3223804aa
--- /dev/null
+++ b/src/cpp/server/backends/whispercpp_factory.cpp
@@ -0,0 +1,13 @@
+#include "lemon/backends/whispercpp_factory.h"
+#include "lemon/backends/whisper_server.h"
+#include "lemon/wrapped_server.h"
+
+namespace lemon {
+namespace backends {
+
+std::unique_ptr<WrappedServer> whispercpp_create(const BackendContext& ctx) {  // uses only ctx's log level and managers
+    return std::make_unique<WhisperServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 1036e6bb6..febdc4ec8 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -7,6 +7,7 @@
 #include <lemon/utils/process_manager.h>
 #include <lemon/utils/path_utils.h>
 #include <lemon/system_info.h>
+#include <lemon/backends/backend_descriptor_registry.h>
 #include <lemon/backends/backend_utils.h>
 #include <lemon/backends/cloud_server.h>
 #include <lemon/cloud_provider_registry.h>
@@ -618,6 +619,35 @@ static void parse_image_defaults(ModelInfo& info, const json& model_json) {
     }
 }
 
+// Copy every model-JSON key that is not listed in kKnownKeys into
+// ModelInfo::extras. This lets a new backend read custom per-model fields in
+// load() without editing the shared ModelInfo struct. A non-object model_json is
+// ignored. Keep kKnownKeys in sync with the keys read by the parse blocks in build_cache().
+static void parse_extras(ModelInfo& info, const json& model_json) {
+    static const std::set<std::string> kKnownKeys = {
+        "checkpoint", "checkpoints", "components", "mmproj", "recipe", "suggested",
+        "hf_load", "source", "size", "cloud_provider", "moonshine_arch",
+        "labels", "image_defaults", "recipe_options"
+    };
+    if (!model_json.is_object()) return;
+    for (auto& [key, value] : model_json.items()) {
+        if (kKnownKeys.count(key) == 0) {
+            info.extras[key] = value;  // replaces any existing extras entry for this key
+        }
+    }
+}
+
+// Default device for a recipe: desc->default_device when descriptor_for() finds a
+// registered backend; collection/unknown recipes fall back to the recipe map.
+// (A backend whose device depends on the chosen backend variant resolves the
+// final device at load time via WrappedServer::effective_device.)
+static DeviceType device_type_for_recipe(const std::string& recipe) {
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
+        return desc->default_device;
+    }
+    return get_device_type_from_recipe(recipe);  // no descriptor: legacy recipe lookup
+}
+
 // Build merged recipe options: image_defaults -> JSON recipe_options -> user-saved overrides.
 // json_recipe_options: pre-extracted recipe_options for this model (from build_cache's
 // two-phase pattern). Pass a null json if the model JSON should be read directly instead.
@@ -1276,7 +1306,7 @@ std::map<std::string, ModelInfo> ModelManager::discover_extra_models() const {
         info.downloaded = true;
         info.source = EXTRA_MODEL_SOURCE;
         info.labels.push_back("custom");
-        info.device = get_device_type_from_recipe(EXTRA_MODEL_RECIPE);
+        info.device = device_type_for_recipe(EXTRA_MODEL_RECIPE);
         return info;
     };
 
@@ -2045,6 +2075,7 @@ void ModelManager::build_cache() {
         }
 
         parse_image_defaults(info, value);
+        parse_extras(info, value);
 
         // Parse recipe_options if present (for per-model runtime config like sdcpp_args)
         if (value.contains("recipe_options") && value["recipe_options"].is_object()) {
@@ -2053,7 +2084,7 @@ void ModelManager::build_cache() {
 
         // Populate type and device fields (multi-model support)
         info.type = get_model_type_from_labels(info.labels);
-        info.device = get_device_type_from_recipe(info.recipe);
+        info.device = device_type_for_recipe(info.recipe);
 
         try {
             resolve_all_model_paths(info);
@@ -2098,6 +2129,7 @@ void ModelManager::build_cache() {
         }
 
         parse_image_defaults(info, value);
+        parse_extras(info, value);
 
         // Parse recipe_options if present (for per-model runtime config like sdcpp_args)
         if (value.contains("recipe_options") && value["recipe_options"].is_object()) {
@@ -2106,7 +2138,7 @@ void ModelManager::build_cache() {
 
         // Populate type and device fields (multi-model support)
         info.type = get_model_type_from_labels(info.labels);
-        info.device = get_device_type_from_recipe(info.recipe);
+        info.device = device_type_for_recipe(info.recipe);
 
         try {
             resolve_all_model_paths(info);
@@ -2287,7 +2319,7 @@ void ModelManager::add_model_to_cache(const std::string& model_name) {
 
     // Populate type and device fields (multi-model support)
     info.type = get_model_type_from_labels(info.labels);
-    info.device = get_device_type_from_recipe(info.recipe);
+    info.device = device_type_for_recipe(info.recipe);
 
     resolve_all_model_paths(info);
 
@@ -2862,16 +2894,12 @@ void ModelManager::register_user_model(const std::string& model_name,
     // loop above; this local is just for the label inference below.
     std::string recipe = model_data.value("recipe", "");
 
-    if (recipe == "sd-cpp") {
-        labels.insert("image");
-    }
-    if (recipe == "whispercpp") {
-        labels.insert("transcription");
-        labels.insert("realtime-transcription");
-    }
-    if (recipe == "moonshine") {
-        labels.insert("transcription");
-        labels.insert("realtime-transcription");
+    // Inject the backend's default labels for models that omit them (e.g. sd-cpp
+    // -> image, whispercpp/moonshine -> transcription). Sourced from the descriptor.
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
+        for (const auto& label : desc->default_labels) {
+            labels.insert(label);
+        }
     }
 
     model_entry["labels"] = labels;
@@ -3100,7 +3128,7 @@ std::vector<ModelInfo> ModelManager::get_flm_available_models() {
 
                     // Populate type and device fields (multi-model support)
                     info.type = get_model_type_from_labels(info.labels);
-                    info.device = get_device_type_from_recipe(info.recipe);
+                    info.device = device_type_for_recipe(info.recipe);
 
                     flm_models.push_back(info);
                 }
diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp
index 65d4bb676..70c188e34 100644
--- a/src/cpp/server/recipe_options.cpp
+++ b/src/cpp/server/recipe_options.cpp
@@ -1,4 +1,5 @@
 #include <lemon/recipe_options.h>
+#include <lemon/backends/backend_descriptor_registry.h>
 #include <lemon/utils/custom_args.h>
 #include <nlohmann/json.hpp>
 #include <map>
@@ -12,78 +13,68 @@ namespace lemon {
 
 using json = nlohmann::json;
 
-static const json DEFAULTS = {
-    {"ctx_size", -1},  // -1 triggers auto-resolution (memory + arch metadata)
-    {"merge_args", true},
-    {"llamacpp_device", ""},
-    {"llamacpp_backend", ""},  // Will be overridden dynamically
-    {"llamacpp_args", ""},
-    {"sd-cpp_backend", ""},   // "" means auto-detect (mapped from "auto" in config.json)
-    {"sdcpp_args", ""},
-    {"whispercpp_backend", ""},  // "" means auto-detect (mapped from "auto" in config.json)
-    {"whispercpp_args", ""},
-    {"moonshine_args", ""},      // Custom arguments to pass to moonshine-server
-    // Image generation defaults (for sd-cpp recipe)
-    // These are recipe-level defaults only, not CLI arguments — per reviewer guidance,
-    // there are too many image gen params for CLI flags, and no universal defaults.
-    {"steps", 20},
-    {"cfg_scale", 7.0},
-    {"width", 512},
-    {"height", 512},
-    {"sampling_method", ""},
-    {"flow_shift", 0.0},
-    // vLLM-specific options
-    {"vllm_backend", ""},  // "" means auto-detect
-    {"vllm_args", ""},     // Custom arguments to pass to vllm-server
-    // Cloud recipe has no backend variants (provider selection lives on the
-    // per-model cloud_provider field). The empty string satisfies Router's
-    // per-backend-args lookup; cloud reads no backend-specific config.
-    {"cloud_backend", ""},
-
-    // Auto-eviction options
-    {"auto_evict", nullptr},          // nullptr means fallback to global config
-    {"evict_idle_timeout", 300},      // Default hard idle timeout (5 mins)
-    {"downsize_idle_timeout", 60},    // Default soft idle timeout (1 min)
-    {"evict_weight_factor", 1.0},     // Eviction-protection weight (higher = more protected)
-    {"pinned", false}
-};
-
-
-// Mapping from flat option names to CLI flags (used by to_cli_options)
-// Note: Image generation params (steps, cfg_scale, width, height, sampling_method,
-// flow_shift) are recipe-level defaults only — not exposed as CLI arguments.
-// Runtime options (diffusion_fa, offload_to_cpu) go through --sdcpp-args.
-static const std::map<std::string, std::string> OPTION_TO_CLI_FLAG = {
-    {"ctx_size", "--ctx-size"},
-    {"merge_args", "--merge-args"},
-    {"llamacpp_backend", "--llamacpp"},
-    {"llamacpp_device", "--llamacpp-device"},
-    {"llamacpp_args", "--llamacpp-args"},
-    {"sd-cpp_backend", "--sdcpp"},
-    {"sdcpp_args", "--sdcpp-args"},
-    {"whispercpp_backend", "--whispercpp"},
-    {"whispercpp_args", "--whispercpp-args"},
-    {"moonshine_args", "--moonshine-args"},
-    {"vllm_backend", "--vllm"},
-    {"vllm_args", "--vllm-args"}
-};
+// Returns the defaults every backend shares (ctx_size, merge_args, auto-eviction).
+// Per-backend options (and ctx_size opt-in) come from each backend's descriptor.
+static const json& common_defaults() {
+    static const json d = {  // function-local static: initialized once, on first call
+        {"ctx_size", -1},  // -1 triggers auto-resolution (memory + arch metadata)
+        {"merge_args", true},
+        // Auto-eviction options (apply to every recipe)
+        {"auto_evict", nullptr},          // nullptr means fallback to global config
+        {"evict_idle_timeout", 300},      // Default hard idle timeout (5 mins)
+        {"downsize_idle_timeout", 60},    // Default soft idle timeout (1 min)
+        {"evict_weight_factor", 1.0},     // Eviction-protection weight (higher = more protected)
+        {"pinned", false},
+    };
+    return d;
+}
+
+// Defaults for every option: common_defaults() overlaid with each registered
+// descriptor's options (opt.name -> opt.default_value). Built once from the
+// registry so config defaults, CLI flags, and load-time resolution cannot drift.
+static const json& get_defaults() {
+    static const json defaults = [] {  // lambda runs once, on first call
+        json d = common_defaults();
+        for (const auto* desc : lemon::backends::all_descriptors()) {
+            for (const auto& opt : desc->options) {
+                d[opt.name] = opt.default_value;  // a repeated name overwrites the earlier value
+            }
+        }
+        return d;
+    }();
+    return defaults;
+}
+
+// Flat option name -> CLI flag, consumed by to_cli_options(). Seeded with the
+// common ctx_size/merge_args flags, then every descriptor option that declares a flag.
+static const std::map<std::string, std::string>& get_option_to_cli_flag() {
+    static const std::map<std::string, std::string> mapping = [] {  // computed once, on first call
+        std::map<std::string, std::string> m{
+            {"ctx_size", "--ctx-size"},
+            {"merge_args", "--merge-args"},
+        };
+        for (const auto* desc : lemon::backends::all_descriptors()) {
+            for (const auto& opt : desc->options) {
+                if (!opt.cli_flag.empty()) {  // options without a flag get no entry
+                    m[opt.name] = opt.cli_flag;
+                }
+            }
+        }
+        return m;
+    }();
+    return mapping;
+}
 
 static std::vector<std::string> get_keys_for_recipe(const std::string& recipe) {
     std::vector<std::string> keys;
-    if (recipe == "llamacpp") {
-        keys = {"ctx_size", "llamacpp_device", "llamacpp_backend", "llamacpp_args", "merge_args"};
-    } else if (recipe == "whispercpp") {
-        keys = {"whispercpp_backend", "whispercpp_args", "merge_args"};
-    } else if (recipe == "moonshine") {
-        keys = {"moonshine_args", "merge_args"};
-    } else if (recipe == "flm") {
-        return {"ctx_size", "merge_args"};
-    } else if (recipe == "ryzenai-llm") {
-        keys = {"ctx_size"};
-    } else if (recipe == "sd-cpp") {
-        keys = {"sd-cpp_backend", "sdcpp_args", "steps", "cfg_scale", "width", "height", "sampling_method", "flow_shift", "merge_args"};
-    } else if (recipe == "vllm") {
-        keys = {"ctx_size", "vllm_backend", "vllm_args", "merge_args"};
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
+        if (desc->uses_ctx_size) {
+            keys.push_back("ctx_size");
+        }
+        for (const auto& opt : desc->options) {
+            keys.push_back(opt.name);
+        }
+        keys.push_back("merge_args");
     }
 
     // Add auto-eviction options for all recipes
@@ -125,7 +116,7 @@ static bool try_get_backend_options(const std::string& opt_name, SystemInfo::Sup
 std::vector<std::string> RecipeOptions::to_cli_options(const json& raw_options) {
     std::vector<std::string> cli;
 
-    for (auto& [opt_name, cli_flag] : OPTION_TO_CLI_FLAG) {
+    for (auto& [opt_name, cli_flag] : get_option_to_cli_flag()) {
         if (raw_options.contains(opt_name)) {
             auto val = raw_options[opt_name];
             if (!val.is_null() && val != "") {
@@ -146,7 +137,7 @@ std::vector<std::string> RecipeOptions::to_cli_options(const json& raw_options)
 
 std::vector<std::string> RecipeOptions::known_keys() {
     std::vector<std::string> keys;
-    for (auto& [key, value] : DEFAULTS.items()) {
+    for (auto& [key, value] : get_defaults().items()) {
         keys.push_back(key);
     }
     return keys;
@@ -239,7 +230,7 @@ json RecipeOptions::get_option(const std::string& opt) const {
         }
     }
 #endif
-    return DEFAULTS.contains(opt) ? DEFAULTS[opt] : json();
+    return get_defaults().contains(opt) ? get_defaults()[opt] : json();
 }
 
 void RecipeOptions::set_option(const std::string& opt, const json& value) {
@@ -247,29 +238,38 @@ void RecipeOptions::set_option(const std::string& opt, const json& value) {
 }
 
 #ifdef LEMONADE_CLI
-// CLI_OPTIONS used only by the lemonade CLI client for add_cli_options
-static const json CLI_OPTIONS = {
-    {"--ctx-size", {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}}},
-    {"--merge-args", {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}}},
-    {"--llamacpp", {{"option_name", "llamacpp_backend"}, {"type_name", "BACKEND"}, {"help", "LlamaCpp backend to use"}, {"group", "Llama.cpp Backend Options"}}},
-    {"--llamacpp-device", {{"option_name", "llamacpp_device"}, {"type_name", "DEVICES"}, {"help", "Comma-separated list of accelerator devices to use (e.g. Vulkan0)"}, {"group", "Llama.cpp Backend Options"}}},
-    {"--llamacpp-args", {{"option_name", "llamacpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to llama-server"}, {"group", "Llama.cpp Backend Options"}}},
-    {"--sdcpp", {{"option_name", "sd-cpp_backend"}, {"type_name", "BACKEND"}, {"help", "SD.cpp backend to use"}, {"group", "Stable Diffusion Options"}}},
-    {"--sdcpp-args", {{"option_name", "sdcpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to sd-server (must not conflict with managed args)"}, {"group", "Stable Diffusion Options"}}},
-    {"--whispercpp", {{"option_name", "whispercpp_backend"}, {"type_name", "BACKEND"}, {"help", "WhisperCpp backend to use"}, {"group", "Whisper.cpp Options"}}},
-    {"--whispercpp-args", {{"option_name", "whispercpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to whisper-server"}, {"group", "Whisper.cpp Options"}}},
-    {"--moonshine-args", {{"option_name", "moonshine_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to moonshine-server"}}},
-    {"--vllm", {{"option_name", "vllm_backend"}, {"type_name", "BACKEND"}, {"help", "vLLM backend to use"}, {"group", "vLLM Options"}}},
-    {"--vllm-args", {{"option_name", "vllm_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to vllm-server"}, {"group", "vLLM Options"}}},
-    // Note: Image gen params (--steps, --cfg-scale, --width, --height) removed — recipe-level only.
-    // Runtime options (--diffusion-fa, --offload-to-cpu) go through --sdcpp-args.
-};
+// CLI option metadata, used only by the lemonade CLI client (add_cli_options).
+// Maps each CLI flag to {option_name, type_name, help[, group]}. ctx_size and
+// merge_args are the common flags; everything else is derived from descriptor
+// options that declare a CLI flag, so the CLI never needs editing when a backend
+// is added. Image-gen params have no cli_flag, so they stay recipe-level only.
+static const json& get_cli_options() {
+    static const json cli_options = [] {
+        json o = json::object();
+        o["--ctx-size"] = {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}};
+        o["--merge-args"] = {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}};
+        for (const auto* desc : lemon::backends::all_descriptors()) {
+            for (const auto& opt : desc->options) {
+                if (opt.cli_flag.empty()) {
+                    continue;  // no flag declared: not exposed on the CLI
+                }
+                json entry = {{"option_name", opt.name}, {"type_name", opt.type_name}, {"help", opt.help}};
+                if (!opt.group.empty()) {  // "group" is only emitted when the option names one
+                    entry["group"] = opt.group;
+                }
+                o[opt.cli_flag] = entry;  // keyed by flag; a repeated flag overwrites the earlier entry
+            }
+        }
+        return o;
+    }();
+    return cli_options;
+}
 
 void RecipeOptions::add_cli_options(CLI::App& app, json& storage) {
-    for (auto& [key, opt] : CLI_OPTIONS.items()) {
+    for (auto& [key, opt] : get_cli_options().items()) {
         const std::string opt_name = opt["option_name"];
         CLI::Option* o;
-        json defval = DEFAULTS[opt_name];
+        json defval = get_defaults()[opt_name];
 
         if (defval.is_number_float()) {
             o = app.add_option_function<double>(key, [opt_name, &storage = storage](double val) { storage[opt_name] = val; }, opt["help"]);
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index b3ec22c3b..a3c4bec74 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -1,5 +1,6 @@
 #include "lemon/router.h"
 #include "lemon/cloud_provider_registry.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/cloud_server.h"
 #include "lemon/backends/llamacpp_server.h"
 #include "lemon/backends/fastflowlm_server.h"
@@ -143,12 +144,26 @@ bool Router::reload_model_after_watchdog_reset(const std::string& requested_mode
     }
 }
 
+// Slot/eviction policy for a recipe: the descriptor's slot_policy, or Standard
+// when no descriptor is registered. Recipe-static; used for pre-load slot
+// decisions, mirroring the historical get_device_type_from_recipe use at load time.
+static SlotPolicy slot_policy_for_recipe(const std::string& recipe) {
+    if (const auto* desc = backends::descriptor_for(recipe)) {
+        return desc->slot_policy;
+    }
+    return SlotPolicy::Standard;  // recipes without a descriptor get no special handling
+}
+
+static bool is_unmetered_recipe(const std::string& recipe) {  // true iff the recipe's slot policy is Unmetered
+    return slot_policy_for_recipe(recipe) == SlotPolicy::Unmetered;
+}
+
 int Router::count_servers_by_type(ModelType type) const {
     int count = 0;
     for (const auto& server : loaded_servers_) {
-        // Cloud servers consume no local memory and stay loaded for free, so
-        // they are excluded from the slot accounting that drives LRU eviction.
-        if (server->get_recipe_options().get_recipe() == "cloud") {
+        // Unmetered backends (cloud) consume no local memory and stay loaded for
+        // free, so they are excluded from the slot accounting that drives LRU eviction.
+        if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) {
             continue;
         }
         if (server->is_backend_alive() && server->get_model_type() == type) {
@@ -162,10 +177,10 @@ WrappedServer* Router::find_lru_server_by_type(ModelType type) const {
     WrappedServer* lru = nullptr;
 
     for (const auto& server : loaded_servers_) {
-        // Cloud servers are not eviction candidates; they have no memory cost
-        // and reloading them is essentially free, but evicting them throws
-        // away the cached api key/upstream-id binding for no benefit.
-        if (server->get_recipe_options().get_recipe() == "cloud") {
+        // Unmetered backends (cloud) are not eviction candidates; they have no
+        // memory cost and reloading them is essentially free, but evicting them
+        // throws away the cached api key/upstream-id binding for no benefit.
+        if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) {
             continue;
         }
         if (server->is_backend_alive() && server->get_model_type() == type) {
@@ -299,49 +314,28 @@ void Router::simulate_vram_pressure(double pct) {
 }
 
 std::unique_ptr<WrappedServer> Router::create_backend_server(const ModelInfo& model_info) {
-    std::unique_ptr<WrappedServer> new_server;
     std::string log_level = config_->log_level();
 
-    if (model_info.recipe == "cloud") {
-        LOG(DEBUG, "Router") << "Creating CloudServer backend (provider: "
-                             << model_info.cloud_provider << ")" << std::endl;
-        new_server = std::make_unique<backends::CloudServer>(model_info.cloud_provider, log_level,
-                                                              model_manager_, backend_manager_,
-                                                              cloud_registry_);
-    } else if (model_info.recipe == "whispercpp") {
-        LOG(DEBUG, "Router") << "Creating WhisperServer backend" << std::endl;
-        new_server = std::make_unique<backends::WhisperServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "moonshine") {
-        LOG(DEBUG, "Router") << "Creating MoonshineServer backend" << std::endl;
-        new_server = std::make_unique<backends::MoonshineServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "kokoro") {
-        LOG(DEBUG, "Router") << "Creating Kokoro backend" << std::endl;
-        new_server = std::make_unique<backends::KokoroServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "sd-cpp") {
-        LOG(DEBUG, "Router") << "Creating SDServer backend" << std::endl;
-        new_server = std::make_unique<backends::SDServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "flm") {
-        LOG(DEBUG, "Router") << "Creating FastFlowLM backend" << std::endl;
-        new_server = std::make_unique<backends::FastFlowLMServer>(log_level, model_manager_, backend_manager_);
-    } else if (model_info.recipe == "ryzenai-llm") {
-        LOG(DEBUG, "Router") << "Creating RyzenAI-Server backend" << std::endl;
-
-        std::string model_path = model_info.resolved_path();
-        LOG(DEBUG, "Router") << "Using model path: " << model_path << std::endl;
-
-        auto* ryzenai_server = new RyzenAIServer(model_info.model_name,
-                                                  log_level == "debug", model_manager_, backend_manager_);
-        ryzenai_server->set_model_path(model_path);
-        new_server.reset(ryzenai_server);
-    } else if (model_info.recipe == "vllm") {
-        LOG(DEBUG, "Router") << "Creating vLLM backend" << std::endl;
-        new_server = std::make_unique<backends::VLLMServer>(log_level, model_manager_, backend_manager_);
-    } else {
-        LOG(DEBUG, "Router") << "Creating LlamaCpp backend" << std::endl;
-        new_server = std::make_unique<backends::LlamaCppServer>(log_level, model_manager_, backend_manager_);
+    backends::BackendContext ctx;  // bundle of router state handed to the recipe's factory
+    ctx.log_level = log_level;
+    ctx.model_manager = model_manager_;
+    ctx.backend_manager = backend_manager_;
+    ctx.cloud_registry = cloud_registry_;  // previously passed only to CloudServer
+    ctx.model_info = &model_info;  // non-owning pointer to the caller's ModelInfo
+
+    // The backend registry binds each recipe's descriptor to its create(). It is
+    // the single source of truth for backend construction (see LEMON_BACKENDS).
+    std::unique_ptr<WrappedServer> new_server = backends::create_server(model_info.recipe, ctx);
+    if (new_server) {  // a null result is treated as "no backend registered for this recipe"
+        LOG(DEBUG, "Router") << "Created backend for recipe '" << model_info.recipe
+                             << "' via registry" << std::endl;
+        return new_server;
     }
 
-    return new_server;
+    // Unknown recipe: fall back to llamacpp, preserving the historical default.
+    LOG(DEBUG, "Router") << "No registered backend for recipe '" << model_info.recipe
+                         << "', defaulting to LlamaCpp" << std::endl;
+    return std::make_unique<backends::LlamaCppServer>(log_level, model_manager_, backend_manager_);
 }
 
 void Router::load_model(const std::string& model_name,
@@ -427,28 +421,39 @@ void Router::load_model(const std::string& model_name,
         // Get max models for this type (same limit for all types)
         int max_models = config_->max_loaded_models();
 
-        // NPU EXCLUSIVITY CHECK (recipe-aware rules)
-        // FLM can run up to 3 concurrent NPU processes (1 LLM + 1 transcription + 1 embedding)
-        // RyzenAI and WhisperCpp lock the entire NPU exclusively
-        if (device_type & DEVICE_NPU) {
-            if (model_info.recipe == "ryzenai-llm" || model_info.recipe == "whispercpp") {
-                // Exclusive NPU recipes - evict ALL NPU servers
+        // NPU EXCLUSIVITY CHECK — driven by the backend's slot policy (descriptor).
+        //   ExclusiveNpu (ryzenai-llm, whisper-on-npu): lock the entire NPU,
+        //                evicting ALL NPU servers first.
+        //   CoexistByType (flm): coexist with other FLM types (max 1 per type),
+        //                but evict exclusive-NPU peers.
+        // Standard/Unmetered backends share no device exclusivity. NOTE(review): keyed on recipe only; the old device_type & DEVICE_NPU guard is gone — confirm.
+        switch (slot_policy_for_recipe(model_info.recipe)) {
+            case SlotPolicy::ExclusiveNpu: {
                 if (has_npu_server()) {
                     LOG(INFO, "Router") << model_info.recipe
                               << " requires exclusive NPU access, evicting all NPU servers..." << std::endl;
                     evict_all_npu_servers();
                 }
-            } else if (model_info.recipe == "flm") {
-                // FLM can coexist with other FLM types, but not with exclusive-NPU recipes
-                // 1. Evict any exclusive-NPU server (mutually exclusive)
-                for (const std::string& exclusive_recipe : {"ryzenai-llm", "whispercpp"}) {
-                    WrappedServer* exclusive_server = find_npu_server_by_recipe(exclusive_recipe);
-                    if (exclusive_server) {
-                        LOG(INFO, "Router") << "FLM cannot coexist with " << exclusive_recipe
-                                  << ", evicting: " << exclusive_server->get_model_name() << std::endl;
-                        evict_server(exclusive_server);
+                break;
+            }
+            case SlotPolicy::CoexistByType: {
+                // 1. Evict every NPU holder that is not itself a coexisting (FLM)
+                //    backend — i.e. exclusive-NPU peers like ryzenai-llm and
+                //    whisper-on-npu. Collect first; evict_server mutates loaded_servers_.
+                std::vector<WrappedServer*> exclusive_peers;
+                for (const auto& server : loaded_servers_) {
+                    if (server->is_backend_alive() && (server->get_device_type() & DEVICE_NPU) &&
+                        slot_policy_for_recipe(server->get_recipe_options().get_recipe()) !=
+                            SlotPolicy::CoexistByType) {
+                        exclusive_peers.push_back(server.get());
                     }
                 }
+                for (auto* peer : exclusive_peers) {
+                    LOG(INFO, "Router") << "FLM cannot coexist with "
+                              << peer->get_recipe_options().get_recipe()
+                              << ", evicting: " << peer->get_model_name() << std::endl;
+                    evict_server(peer);
+                }
                 // 2. Evict FLM of the SAME model type (max 1 per type: 1 LLM, 1 transcription, 1 embed)
                 WrappedServer* same_type_flm = find_flm_server_by_type(model_type);
                 if (same_type_flm) {
@@ -457,22 +462,20 @@ void Router::load_model(const std::string& model_name,
                               << ", evicting..." << std::endl;
                     evict_server(same_type_flm);
                 }
-            } else {
-                // Unknown NPU recipe - default to exclusive access
-                if (has_npu_server()) {
-                    LOG(INFO, "Router") << "Unknown NPU recipe, evicting all NPU servers..." << std::endl;
-                    evict_all_npu_servers();
-                }
+                break;
             }
+            case SlotPolicy::Standard:
+            case SlotPolicy::Unmetered:
+                break;
         }
 
         // LRU EVICTION CHECK (from spec: Least Recently Used Cache)
-        // Skip eviction if unlimited (-1). Cloud-recipe loads also skip the
+        // Skip eviction if unlimited (-1). Unmetered (cloud) loads also skip the
         // check entirely: they consume no local resources, so they have no
         // business kicking a warm local model out of memory.
-        bool is_cloud_load = (model_info.recipe == "cloud");
+        bool is_unmetered_load = is_unmetered_recipe(model_info.recipe);
         int current_count = count_servers_by_type(model_type);
-        if (!is_cloud_load && max_models != -1 && current_count >= max_models) {
+        if (!is_unmetered_load && max_models != -1 && current_count >= max_models) {
             WrappedServer* lru = find_lru_server_by_type(model_type);
             if (lru) {
                 LOG(INFO, "Router") << "Slot limit reached for type "
diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp
index 5aa62457d..08aa41dc6 100644
--- a/src/cpp/server/runtime_config.cpp
+++ b/src/cpp/server/runtime_config.cpp
@@ -1,4 +1,5 @@
 #include "lemon/runtime_config.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/system_info.h"
 #include "lemon/utils/aixlog.hpp"
 #include "lemon/utils/path_utils.h"
@@ -29,22 +30,26 @@ RuntimeConfig* RuntimeConfig::global() {
     return s_global_instance.load(std::memory_order_acquire);
 }
 
-static const std::vector<std::string> s_backend_names = {
-    "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro"
-};
-
+// True when `key` is the effective_config_section() of a descriptor that runs a
+// local subprocess (binary != ""). Cloud has no binary, so it is not a backend
+// section. Derived from descriptors — no hand-maintained list.
 static bool is_backend_name(const std::string& key) {
-    return std::find(s_backend_names.begin(), s_backend_names.end(), key) != s_backend_names.end();
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        if (!desc->binary.empty() && desc->effective_config_section() == key) {
+            return true;
+        }
+    }
+    return false;  // no binary-backed descriptor uses this section name
 }
 
-// Backends that have a selectable "backend" key
-static const std::vector<std::string> s_selectable_backends = {
-    "llamacpp", "whispercpp", "sdcpp", "vllm"
-};
-
+// True iff some descriptor with selectable_backend set maps to this config section.
 static bool has_backend_selection(const std::string& config_section) {
-    return std::find(s_selectable_backends.begin(), s_selectable_backends.end(),
-                     config_section) != s_selectable_backends.end();
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        if (desc->selectable_backend && desc->effective_config_section() == config_section) {
+            return true;
+        }
+    }
+    return false;  // no opted-in descriptor uses this section
 }
 
 static std::pair<json, std::string> normalize_config_set_changes(const json& changes) {
@@ -71,12 +76,18 @@ static std::pair<json, std::string> normalize_config_set_changes(const json& cha
 }
 
 std::string RuntimeConfig::config_section_to_recipe(const std::string& config_section) {
-    if (config_section == "sdcpp") return "sd-cpp";
+    for (const auto* desc : lemon::backends::all_descriptors()) {  // first matching descriptor wins
+        if (desc->effective_config_section() == config_section) {
+            return desc->recipe;  // replaces the old hard-coded "sdcpp" -> "sd-cpp" mapping
+        }
+    }
     return config_section;
 }
 
 std::string RuntimeConfig::recipe_to_config_section(const std::string& recipe) {
-    if (recipe == "sd-cpp") return "sdcpp";
+    if (const auto* desc = lemon::backends::descriptor_for(recipe)) {  // registered recipe: ask its descriptor
+        return desc->effective_config_section();  // replaces the old hard-coded "sd-cpp" -> "sdcpp" mapping
+    }
     return recipe;
 }
 
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index b95176957..d0fea0504 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -4329,9 +4329,8 @@ void Server::handle_bin_change(const std::string& section,
     std::string backend = bin_key.substr(0, bin_key.size() - 4);
 
     // The "server_bin" key (as in ryzenai.server_bin) is not consumed by the
-    // current install flow — find_external_backend_binary uses recipe-based
-    // section lookup and there is no recipe whose section equals "ryzenai".
-    // Skip the hot-swap rather than attempt an install that won't help.
+    // current install flow, so skip the hot-swap rather than attempt an install
+    // that won't help.
     if (backend == "server") {
         LOG(WARNING, "Server") << section << "." << bin_key
                                << " is not consumed by the install flow; "
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index d2d3f7d51..6a27a4fb2 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -7,6 +7,8 @@
 #include "lemon/utils/json_utils.h"
 #include "lemon/utils/process_manager.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/recipe_backend_def.h"
 #include <filesystem>
 #include <fstream>
 #include <sstream>
@@ -404,15 +406,8 @@ std::vector<GPUInfo> query_dxg_amd_gpus(const std::string& gpu_type) {
 // Recipe/Backend definition table - single source of truth for support matrix
 // ============================================================================
 
-// Device constraints: device_type -> set of allowed families (empty = all families)
-using DeviceConstraints = std::map<std::string, std::set<std::string>>;
-
-struct RecipeBackendDef {
-    std::string recipe;
-    std::string backend;
-    std::set<std::string> supported_os;
-    DeviceConstraints devices;
-};
+// RecipeBackendDef and DeviceConstraints are declared in lemon/recipe_backend_def.h
+// so backend descriptors can carry their own support rows.
 
 // Recipe definitions table - single source of truth for all recipe/backend support
 // Format: {recipe, backend, {supported_os}, {{device_type, {allowed_families}}}}
@@ -422,115 +417,22 @@ struct RecipeBackendDef {
 // Example: metal is listed before vulkan on macOS, vulkan before cpu elsewhere.
 //
 // Empty family set {} means "all families of that device type"
-static const std::vector<RecipeBackendDef> RECIPE_DEFS = {
-    // llamacpp with multiple backends (order = preference)
-    {"llamacpp", "system", {"linux"}, {
-        {"cpu", {"x86_64", "arm64"}}, // Placeholder, actual check is PATH-based
-    }},
-    {"llamacpp", "metal", {"macos"},
-    {
-        {"metal", {}},
-    }},
-    {"llamacpp", "cuda", {"windows", "linux"}, {
-        {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}},
-    }},
-    {"llamacpp", "vulkan", {"windows", "linux"}, {
-        {"cpu", {"x86_64", "arm64"}},
-        {"amd_gpu", {}},      // all AMD GPU families
-    }},
-    {"llamacpp", "rocm", {"windows", "linux"}, {
-        {"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}},  // STX iGPUs + RDNA2/3/4 dGPUs
-    }},
-    {"llamacpp", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64", "arm64"}},
-    }},
-
-    // whisper.cpp - NPU, ROCm GPU, Vulkan, CPU, Metal
-    {"whispercpp", "npu", {"windows"}, {
-        {"amd_npu", {"XDNA2"}},
-    }},
-    {"whispercpp", "rocm", {"windows", "linux"}, {
-        // gfx103X omitted: lemonade-sdk/whisper.cpp-rocm publishes no gfx103X
-        // ROCm whisper build, so advertising it would yield a 404 on install.
-        {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}},
-    }},
-    {"whispercpp", "vulkan", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-        {"amd_gpu", {}},
-    }},
-    {"whispercpp", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-    }},
-    {"whispercpp", "metal", {"macos"}, {
-        {"metal", {}},
-    }},
-
-    // kokoro - Windows/Linux x86_64; macOS arm64 (Metal)
-    {"kokoro", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-    }},
-    {"kokoro", "metal", {"macos"}, {
-        {"metal", {}},
-    }},
-
-    // stable-diffusion.cpp - ROCm backend for AMD GPUs
-    {"sd-cpp", "rocm", {"windows", "linux"}, {
-        {"amd_gpu", {
-            "gfx1150", "gfx1151", "gfx1152",
-            "gfx103X", "gfx110X", "gfx120X"
-        }},
-    }},
-
-    // stable-diffusion.cpp - CUDA backend for NVIDIA GPUs (Linux)
-    {"sd-cpp", "cuda", {"linux"}, {
-        {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}},
-    }},
-
-    // stable-diffusion.cpp - Vulkan backend (Windows/Linux x86_64)
-    {"sd-cpp", "vulkan", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-        {"amd_gpu", {}},
-        {"nvidia_gpu", {}},
-    }},
-
-    // stable-diffusion.cpp - CPU backend (Windows/Linux x86_64)
-    {"sd-cpp", "cpu", {"windows", "linux"}, {
-        {"cpu", {"x86_64"}},
-    }},
-
-    // stable-diffusion.cpp - Metal backend (macOS arm64)
-    {"sd-cpp", "metal", {"macos"}, {
-        {"metal", {}},
-    }},
-
-    // FLM - NPU (XDNA2)
-    {"flm", "npu", {"windows", "linux"}, {
-        {"amd_npu", {"XDNA2"}},
-    }},
-
-    // RyzenAI LLM - Windows NPU (XDNA2)
-    {"ryzenai-llm", "npu", {"windows"}, {
-        {"amd_npu", {"XDNA2"}},
-    }},
-
-    // vLLM - ROCm backend for AMD GPUs (Linux only)
-    {"vllm", "rocm", {"linux"}, {
-        {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}},
-    }},
-
-    // Moonshine - CPU-only streaming STT. Platforms match the published
-    // moonshine-server-rocm bundles (moonshine-voice wheels): Windows x64,
-    // Linux x64/arm64, macOS arm64. No Intel macOS or Windows-arm64 wheel.
-    {"moonshine", "cpu", {"windows"}, {
-        {"cpu", {"x86_64"}},
-    }},
-    {"moonshine", "cpu", {"linux"}, {
-        {"cpu", {"x86_64", "arm64"}},
-    }},
-    {"moonshine", "cpu", {"macos"}, {
-        {"cpu", {"arm64"}},
-    }},
-};
+// The recipe/backend support matrix is assembled from every backend descriptor's
+// `support` rows (see lemon/backends/*_descriptor.cpp). Concatenated in registry
+// order; within a recipe, row order is the backend preference order. This is the
+// single source of truth — there is no separate hand-maintained table.
+static const std::vector<RecipeBackendDef>& recipe_defs() {
+    static const std::vector<RecipeBackendDef> defs = [] {
+        std::vector<RecipeBackendDef> v;
+        for (const auto* desc : lemon::backends::all_descriptors()) {
+            for (const auto& row : desc->support) {
+                v.push_back(row);
+            }
+        }
+        return v;
+    }();
+    return defs;
+}
 
 // ============================================================================
 // Device family to human-readable name mapping
@@ -592,7 +494,7 @@ std::string SystemInfo::get_unsupported_backend_error(const std::string& recipe,
     std::string error;
 
     // Find the recipe/backend in RECIPE_DEFS
-    for (const auto& def : RECIPE_DEFS) {
+    for (const auto& def : recipe_defs()) {
         if (def.recipe == recipe && def.backend == backend) {
             // Collect all required family names
             std::vector<std::string> family_names;
@@ -1203,12 +1105,12 @@ json SystemInfo::build_recipes_info(const json& devices) {
     std::map<std::string, std::string> configured_default_backends;
     if (auto* cfg = RuntimeConfig::global()) {
         std::set<std::string> processed_recipes;
-        for (const auto& def : RECIPE_DEFS) {
+        for (const auto& def : recipe_defs()) {
             if (!processed_recipes.insert(def.recipe).second) continue;
             std::string section = RuntimeConfig::recipe_to_config_section(def.recipe);
             std::string backend = cfg->backend_string(section, "backend");
             if (backend.empty() || backend == "auto") continue;
-            bool known = std::any_of(RECIPE_DEFS.begin(), RECIPE_DEFS.end(),
+            bool known = std::any_of(recipe_defs().begin(), recipe_defs().end(),
                 [&](const RecipeBackendDef& d) {
                     return d.recipe == def.recipe && d.backend == backend;
                 });
@@ -1268,7 +1170,7 @@ json SystemInfo::build_recipes_info(const json& devices) {
     }
 
     // Build recipes from the definition table
-    for (const auto& def : RECIPE_DEFS) {
+    for (const auto& def : recipe_defs()) {
         // Skip if not supported on current OS
         if (def.supported_os.count(current_os) == 0) {
             // Helper to format OS name nicely
@@ -1599,6 +1501,50 @@ json SystemInfo::build_recipes_info(const json& devices) {
         }
     }
 
+    // Enrich each recipe entry with descriptor metadata so clients (the desktop
+    // app, the docs generator) can render display names and per-recipe option
+    // schemas without hardcoding them. This is the single source the frontend
+    // reads instead of its own per-recipe TypeScript tables.
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        auto it = recipes.find(desc->recipe);
+        if (it == recipes.end()) {
+            continue;  // recipe not surfaced on this system (e.g. cloud has no support rows)
+        }
+        json& entry = it.value();
+        entry["display_name"] = desc->display_name;
+        entry["selectable_backend"] = desc->selectable_backend;
+        entry["uses_ctx_size"] = desc->uses_ctx_size;
+        // Machine-independent support matrix (OS + device families per backend),
+        // straight from the descriptor — used by the docs generator.
+        json support = json::array();
+        for (const auto& row : desc->support) {
+            json devices = json::array();
+            for (const auto& [device, families] : row.devices) {
+                devices.push_back({{"device", device},
+                                   {"families", std::vector<std::string>(families.begin(), families.end())}});
+            }
+            support.push_back({
+                {"backend", row.backend},
+                {"os", std::vector<std::string>(row.supported_os.begin(), row.supported_os.end())},
+                {"devices", devices},
+            });
+        }
+        entry["support"] = support;
+        json options = json::array();
+        for (const auto& opt : desc->options) {
+            json o = {
+                {"name", opt.name},
+                {"cli_flag", opt.cli_flag},
+                {"default", opt.default_value},
+                {"type_name", opt.type_name},
+                {"help", opt.help},
+                {"group", opt.group},
+            };
+            options.push_back(o);
+        }
+        entry["options"] = options;
+    }
+
     return recipes;
 }
 
@@ -1631,7 +1577,7 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std
     }
 
     // Collect remaining supported backends and capture first error (in preference order from RECIPE_DEFS)
-    for (const auto& def : RECIPE_DEFS) {
+    for (const auto& def : recipe_defs()) {
         if (def.recipe == recipe) {
             // Skip the default_backend since we already added it
             if (def.backend == default_backend) {
@@ -1660,11 +1606,12 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std
 }
 
 std::string SystemInfo::check_recipe_supported(const std::string& recipe) {
-    // Cloud offload has no local hardware/OS requirements; availability is
-    // gated by the CloudProviderRegistry (config.json "cloud_providers") and
-    // a resolvable API key (env var or runtime auth), checked elsewhere in
-    // filter_models_by_backend / CloudServer::load.
-    if (recipe == "cloud") {
+    // A backend whose descriptor declares no support rows has no local
+    // hardware/OS gating (e.g. cloud offload): availability is determined at
+    // runtime (provider creds via the CloudProviderRegistry / API key), checked
+    // elsewhere in filter_models_by_backend / CloudServer::load.
+    const auto* desc = lemon::backends::descriptor_for(recipe);
+    if (desc && desc->support.empty()) {
         return "";
     }
     auto result = get_supported_backends(recipe);
@@ -1685,7 +1632,7 @@ std::vector<SystemInfo::RecipeStatus> SystemInfo::get_all_recipe_statuses() {
 
         if (recipe_info.contains("backends") && recipe_info["backends"].is_object()) {
             // Iterate in preference order (from RECIPE_DEFS table)
-            for (const auto& def : RECIPE_DEFS) {
+            for (const auto& def : recipe_defs()) {
                 if (def.recipe != recipe_name) continue;
 
                 if (!recipe_info["backends"].contains(def.backend)) continue;

From 2ef9379e1aa16a9296639d1129dad612a510a403 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 19 Jun 2026 16:25:18 -0400
Subject: [PATCH 02/39] refactor(backends): move each backend into its own
 folder (per spec)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restructure the self-describing backends to the layout the issue #2287 plan
specified — one folder per backend — instead of the flat file layout I used
before. This also folds the earlier _descriptor/_factory split into the spec's
cleaner shape: the descriptor is a header-only `inline const` and create() lives
with the server class.

Each backend now lives in its own <stem>/ folder (paths below are relative to
src/cpp/), in namespace lemon::backends::<stem>:
  include/lemon/backends/<stem>/<stem>.h         inline const descriptor (CLI-safe data)
  include/lemon/backends/<stem>/<stem>_server.h  WrappedServer subclass + create() decl
  server/backends/<stem>/<stem>_server.cpp       implementation + create() def

Shared registry/util files stay at the top of backends/. The CMake foreach over
LEMON_BACKENDS compiles each <stem>/<stem>_server.cpp and generates the registry
headers from the folder paths. Removes the per-backend *_descriptor.{h,cpp} and
*_factory.{h,cpp} files. Behavior is unchanged (same descriptors, same create()).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                                | 42 ++++-----
 docs/dev/adding-a-backend.md                  | 92 ++++++++++---------
 .../lemon/backends/cloud/cloud.h}             | 14 ++-
 .../lemon/backends/{ => cloud}/cloud_server.h | 15 ++-
 .../include/lemon/backends/cloud_descriptor.h | 13 ---
 .../include/lemon/backends/cloud_factory.h    | 14 ---
 .../lemon/backends/fastflowlm/fastflowlm.h}   | 14 ++-
 .../{ => fastflowlm}/fastflowlm_server.h      | 15 ++-
 .../lemon/backends/fastflowlm_descriptor.h    | 13 ---
 .../lemon/backends/fastflowlm_factory.h       | 14 ---
 .../lemon/backends/kokoro/kokoro.h}           | 14 ++-
 .../backends/{ => kokoro}/kokoro_server.h     | 17 +++-
 .../lemon/backends/kokoro_descriptor.h        | 13 ---
 .../include/lemon/backends/kokoro_factory.h   | 14 ---
 .../lemon/backends/llamacpp/llamacpp.h}       | 14 ++-
 .../backends/{ => llamacpp}/llamacpp_server.h | 15 ++-
 .../lemon/backends/llamacpp_descriptor.h      | 13 ---
 .../include/lemon/backends/llamacpp_factory.h | 14 ---
 .../lemon/backends/moonshine/moonshine.h}     | 14 ++-
 .../{ => moonshine}/moonshine_server.h        | 17 +++-
 .../lemon/backends/moonshine_descriptor.h     | 13 ---
 .../lemon/backends/moonshine_factory.h        | 14 ---
 .../lemon/backends/ryzenai/ryzenai.h}         | 14 ++-
 .../ryzenai_server.h}                         | 11 +++
 .../lemon/backends/ryzenai_descriptor.h       | 13 ---
 .../include/lemon/backends/ryzenai_factory.h  | 14 ---
 .../lemon/backends/sdcpp/sdcpp.h}             | 14 ++-
 .../{sd_server.h => sdcpp/sdcpp_server.h}     | 23 +++--
 .../include/lemon/backends/sdcpp_descriptor.h | 13 ---
 .../include/lemon/backends/sdcpp_factory.h    | 14 ---
 .../lemon/backends/vllm/vllm.h}               | 14 ++-
 .../lemon/backends/{ => vllm}/vllm_server.h   | 15 ++-
 .../include/lemon/backends/vllm_descriptor.h  | 13 ---
 src/cpp/include/lemon/backends/vllm_factory.h | 14 ---
 .../lemon/backends/whispercpp/whispercpp.h}   | 14 ++-
 .../whispercpp_server.h}                      | 17 +++-
 .../lemon/backends/whispercpp_descriptor.h    | 13 ---
 .../lemon/backends/whispercpp_factory.h       | 14 ---
 src/cpp/server/backends/backend_utils.cpp     | 16 ++--
 .../backends/{ => cloud}/cloud_server.cpp     | 18 +++-
 src/cpp/server/backends/cloud_factory.cpp     | 16 ----
 .../{ => fastflowlm}/fastflowlm_server.cpp    | 15 ++-
 .../server/backends/fastflowlm_factory.cpp    | 13 ---
 .../backends/{ => kokoro}/kokoro_server.cpp   | 15 ++-
 src/cpp/server/backends/kokoro_factory.cpp    | 13 ---
 .../{ => llamacpp}/llamacpp_server.cpp        | 15 ++-
 src/cpp/server/backends/llamacpp_factory.cpp  | 13 ---
 .../{ => moonshine}/moonshine_server.cpp      | 15 ++-
 src/cpp/server/backends/moonshine_factory.cpp | 13 ---
 .../ryzenai_server.cpp}                       | 22 ++++-
 src/cpp/server/backends/ryzenai_factory.cpp   | 20 ----
 .../{sd_server.cpp => sdcpp/sdcpp_server.cpp} | 15 ++-
 src/cpp/server/backends/sdcpp_factory.cpp     | 13 ---
 .../backends/{ => vllm}/vllm_server.cpp       | 15 ++-
 src/cpp/server/backends/vllm_factory.cpp      | 13 ---
 .../whispercpp_server.cpp}                    | 15 ++-
 .../server/backends/whispercpp_factory.cpp    | 13 ---
 src/cpp/server/model_manager.cpp              |  4 +-
 src/cpp/server/router.cpp                     | 18 ++--
 src/cpp/server/server.cpp                     |  4 +-
 60 files changed, 433 insertions(+), 529 deletions(-)
 rename src/cpp/{server/backends/cloud_descriptor.cpp => include/lemon/backends/cloud/cloud.h} (66%)
 rename src/cpp/include/lemon/backends/{ => cloud}/cloud_server.h (92%)
 delete mode 100644 src/cpp/include/lemon/backends/cloud_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/cloud_factory.h
 rename src/cpp/{server/backends/fastflowlm_descriptor.cpp => include/lemon/backends/fastflowlm/fastflowlm.h} (62%)
 rename src/cpp/include/lemon/backends/{ => fastflowlm}/fastflowlm_server.h (84%)
 delete mode 100644 src/cpp/include/lemon/backends/fastflowlm_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/fastflowlm_factory.h
 rename src/cpp/{server/backends/kokoro_descriptor.cpp => include/lemon/backends/kokoro/kokoro.h} (67%)
 rename src/cpp/include/lemon/backends/{ => kokoro}/kokoro_server.h (74%)
 delete mode 100644 src/cpp/include/lemon/backends/kokoro_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/kokoro_factory.h
 rename src/cpp/{server/backends/llamacpp_descriptor.cpp => include/lemon/backends/llamacpp/llamacpp.h} (82%)
 rename src/cpp/include/lemon/backends/{ => llamacpp}/llamacpp_server.h (80%)
 delete mode 100644 src/cpp/include/lemon/backends/llamacpp_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/llamacpp_factory.h
 rename src/cpp/{server/backends/moonshine_descriptor.cpp => include/lemon/backends/moonshine/moonshine.h} (70%)
 rename src/cpp/include/lemon/backends/{ => moonshine}/moonshine_server.h (79%)
 delete mode 100644 src/cpp/include/lemon/backends/moonshine_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/moonshine_factory.h
 rename src/cpp/{server/backends/ryzenai_descriptor.cpp => include/lemon/backends/ryzenai/ryzenai.h} (64%)
 rename src/cpp/include/lemon/backends/{ryzenaiserver.h => ryzenai/ryzenai_server.h} (82%)
 delete mode 100644 src/cpp/include/lemon/backends/ryzenai_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/ryzenai_factory.h
 rename src/cpp/{server/backends/sdcpp_descriptor.cpp => include/lemon/backends/sdcpp/sdcpp.h} (85%)
 rename src/cpp/include/lemon/backends/{sd_server.h => sdcpp/sdcpp_server.h} (86%)
 delete mode 100644 src/cpp/include/lemon/backends/sdcpp_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/sdcpp_factory.h
 rename src/cpp/{server/backends/vllm_descriptor.cpp => include/lemon/backends/vllm/vllm.h} (69%)
 rename src/cpp/include/lemon/backends/{ => vllm}/vllm_server.h (78%)
 delete mode 100644 src/cpp/include/lemon/backends/vllm_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/vllm_factory.h
 rename src/cpp/{server/backends/whispercpp_descriptor.cpp => include/lemon/backends/whispercpp/whispercpp.h} (79%)
 rename src/cpp/include/lemon/backends/{whisper_server.h => whispercpp/whispercpp_server.h} (85%)
 delete mode 100644 src/cpp/include/lemon/backends/whispercpp_descriptor.h
 delete mode 100644 src/cpp/include/lemon/backends/whispercpp_factory.h
 rename src/cpp/server/backends/{ => cloud}/cloud_server.cpp (98%)
 delete mode 100644 src/cpp/server/backends/cloud_factory.cpp
 rename src/cpp/server/backends/{ => fastflowlm}/fastflowlm_server.cpp (97%)
 delete mode 100644 src/cpp/server/backends/fastflowlm_factory.cpp
 rename src/cpp/server/backends/{ => kokoro}/kokoro_server.cpp (94%)
 delete mode 100644 src/cpp/server/backends/kokoro_factory.cpp
 rename src/cpp/server/backends/{ => llamacpp}/llamacpp_server.cpp (98%)
 delete mode 100644 src/cpp/server/backends/llamacpp_factory.cpp
 rename src/cpp/server/backends/{ => moonshine}/moonshine_server.cpp (96%)
 delete mode 100644 src/cpp/server/backends/moonshine_factory.cpp
 rename src/cpp/server/backends/{ryzenaiserver.cpp => ryzenai/ryzenai_server.cpp} (87%)
 delete mode 100644 src/cpp/server/backends/ryzenai_factory.cpp
 rename src/cpp/server/backends/{sd_server.cpp => sdcpp/sdcpp_server.cpp} (98%)
 delete mode 100644 src/cpp/server/backends/sdcpp_factory.cpp
 rename src/cpp/server/backends/{ => vllm}/vllm_server.cpp (97%)
 delete mode 100644 src/cpp/server/backends/vllm_factory.cpp
 rename src/cpp/server/backends/{whisper_server.cpp => whispercpp/whispercpp_server.cpp} (98%)
 delete mode 100644 src/cpp/server/backends/whispercpp_factory.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3220e6c42..0b59e883f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -607,15 +607,6 @@ set(SOURCES_CORE
     src/cpp/server/utils/wmi_helper.cpp
     src/cpp/server/utils/network_beacon.cpp
     src/cpp/server/utils/tcp_jsonl_client.cpp
-    src/cpp/server/backends/cloud_server.cpp
-    src/cpp/server/backends/llamacpp_server.cpp
-    src/cpp/server/backends/fastflowlm_server.cpp
-    src/cpp/server/backends/ryzenaiserver.cpp
-    src/cpp/server/backends/whisper_server.cpp
-    src/cpp/server/backends/moonshine_server.cpp
-    src/cpp/server/backends/kokoro_server.cpp
-    src/cpp/server/backends/sd_server.cpp
-    src/cpp/server/backends/vllm_server.cpp
     src/cpp/server/backends/backend_utils.cpp
     src/cpp/server/backend_manager.cpp
     src/cpp/server/ollama_api.cpp
@@ -652,17 +643,18 @@ endif()
 # ============================================================
 # The authoritative backend list. Each entry is "<recipe>|<stem>":
 #   recipe - the recipe string used in server_models.json (may contain dashes)
-#   stem   - identifier-safe name. The backend ships two files:
-#              src/cpp/server/backends/<stem>_descriptor.cpp  (plain data; CLI-safe)
-#              src/cpp/server/backends/<stem>_factory.cpp      (create(); server-only)
-#            declaring lemon::backends::<stem>_descriptor and <stem>_create.
+#   stem   - identifier-safe name and folder. Each backend lives in its own
+#            folder, shipping (in namespace lemon::backends::<stem>):
+#              include/lemon/backends/<stem>/<stem>.h         inline const descriptor (CLI-safe data)
+#              include/lemon/backends/<stem>/<stem>_server.h  WrappedServer subclass + create() decl
+#              server/backends/<stem>/<stem>_server.cpp       implementation + create() def
 #
-# Adding a backend is one line here plus those two files. The foreach below
-# compiles the sources and regenerates the registry headers, which bind each
+# Adding a backend is one line here plus that folder. The foreach below compiles
+# the server source and regenerates the registry headers, which bind each
 # descriptor to its create(). Because this list is a tracked input, editing it
 # forces regeneration on the next build (a file(GLOB) would silently miss a
-# newly added backend). Descriptor DATA links into both the lemonade CLI and
-# lemond; only lemond links the factories (which pull in server classes).
+# newly added backend). The descriptor is a header-only inline const, so it links
+# into both the lemonade CLI and lemond; only lemond links the server sources.
 set(LEMON_BACKENDS
     # "<recipe>|<stem>"
     "llamacpp|llamacpp"
@@ -680,7 +672,8 @@ set(LEMON_DESCRIPTOR_INCLUDES "")
 set(LEMON_DESCRIPTOR_ENTRIES "")
 set(LEMON_FACTORY_INCLUDES "")
 set(LEMON_FACTORY_ENTRIES "")
-# Descriptor sources are CLI-safe (data only); factory sources are server-only.
+# The data registry (descriptors, header-only) links into both binaries; the
+# factory registry + per-backend server sources are server-only.
 # Absolute paths so the CLI subdirectory can reuse LEMON_BACKEND_DESCRIPTOR_SOURCES.
 set(LEMON_BACKEND_DESCRIPTOR_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp)
@@ -689,18 +682,17 @@ set(LEMON_BACKEND_FACTORY_SOURCES
 foreach(_backend_entry ${LEMON_BACKENDS})
     string(REPLACE "|" ";" _backend_parts "${_backend_entry}")
     list(GET _backend_parts 1 _backend_stem)
-    list(APPEND LEMON_BACKEND_DESCRIPTOR_SOURCES
-        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_descriptor.cpp)
+    # The descriptor is header-only (no source); only the server source compiles.
     list(APPEND LEMON_BACKEND_FACTORY_SOURCES
-        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_factory.cpp)
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/${_backend_stem}_server.cpp)
     string(APPEND LEMON_DESCRIPTOR_INCLUDES
-        "#include \"lemon/backends/${_backend_stem}_descriptor.h\"\n")
+        "#include \"lemon/backends/${_backend_stem}/${_backend_stem}.h\"\n")
     string(APPEND LEMON_DESCRIPTOR_ENTRIES
-        "        &lemon::backends::${_backend_stem}_descriptor,\n")
+        "        &lemon::backends::${_backend_stem}::descriptor,\n")
     string(APPEND LEMON_FACTORY_INCLUDES
-        "#include \"lemon/backends/${_backend_stem}_factory.h\"\n")
+        "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n")
     string(APPEND LEMON_FACTORY_ENTRIES
-        "        { &lemon::backends::${_backend_stem}_descriptor, &lemon::backends::${_backend_stem}_create },\n")
+        "        { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create },\n")
 endforeach()
 
 configure_file(
diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md
index 512770e73..ae5006a4f 100644
--- a/docs/dev/adding-a-backend.md
+++ b/docs/dev/adding-a-backend.md
@@ -1,44 +1,39 @@
 # Adding a backend
 
 Lemonade backends are **self-describing**. A backend declares *what it is* in a
-plain-data **descriptor** and implements *how it runs* in a **server class**. A
-registry collects every descriptor, and the router, the CLI, `/system-info`, and
-the generated docs all read it — so there are no scattered `if (recipe == "...")`
-sites to update.
+plain-data **descriptor** and implements *how it runs* in a **server class**, and
+both live together in the backend's own folder. A registry collects every
+descriptor, and the router, the CLI, `/system-info`, and the generated docs all
+read it — so there are no scattered `if (recipe == "...")` sites to update.
 
-Adding a backend is **one folder's worth of files plus three small appends**:
+Adding a backend is **one folder plus three small appends**:
 
 | You edit | What goes there |
 |----------|-----------------|
 | `CMakeLists.txt` → `LEMON_BACKENDS` | **one line**: `"<recipe>\|<stem>"` |
-| `src/cpp/server/backends/<stem>_descriptor.cpp` + `.h` | the descriptor (plain data) |
-| `src/cpp/server/backends/<stem>_factory.cpp` + `.h` | `create()` + the `WrappedServer` subclass |
+| `src/cpp/include/lemon/backends/<stem>/<stem>.h` | the descriptor (header-only `inline const`) |
+| `src/cpp/include/lemon/backends/<stem>/<stem>_server.h` | the `WrappedServer` subclass + `create()` declaration |
+| `src/cpp/server/backends/<stem>/<stem>_server.cpp` | the implementation + `create()` definition |
 | `src/cpp/resources/backend_versions.json` | version pin(s) — skip if there's no downloaded binary (e.g. cloud) |
 | `src/cpp/resources/server_models.json` | the models |
 
 No router edits, no CLI edits, no doc edits, no support-matrix edits.
 
-## The descriptor (plain data — CLI-safe)
+Everything for one backend lives in `lemon::backends::<stem>`. The descriptor is
+header-only so it links into **both** the `lemonade` CLI and `lemond`; the server
+class and `create()` are server-only (compiled into `lemond`).
 
-The descriptor is the single object every consumer reads. It links into **both**
-the `lemonade` CLI and `lemond`, so it must not reference server classes.
+## The descriptor — `<stem>/<stem>.h`
 
-`src/cpp/include/lemon/backends/<stem>_descriptor.h`:
+Plain data. The single object the registry, CLI, `/system-info`, and docs all read.
 
 ```cpp
 #pragma once
 #include "lemon/backends/backend_descriptor.h"
-namespace lemon { namespace backends {
-extern const BackendDescriptor <stem>_descriptor;
-} }
-```
 
-`src/cpp/server/backends/<stem>_descriptor.cpp`:
+namespace lemon { namespace backends { namespace myrecipe {
 
-```cpp
-#include "lemon/backends/<stem>_descriptor.h"
-namespace lemon { namespace backends {
-const BackendDescriptor <stem>_descriptor = {
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "myrecipe",
     /*display_name*/    "My Backend",
     /*binary*/          "my-server",        // "" = no subprocess (e.g. cloud)
@@ -57,48 +52,55 @@ const BackendDescriptor <stem>_descriptor = {
     /*default_labels*/  {},                 // labels injected when a model omits them
     /*required_checkpoints*/ {"main"},      // unconditional files; conditional ones checked in load()
 };
-} }
+
+}}}  // namespace lemon::backends::myrecipe
 ```
 
 `SlotPolicy` controls accelerator sharing: `Standard` (counts toward LRU slots),
 `ExclusiveNpu` (evicts all NPU servers first), `CoexistByType` (one per model
 type), `Unmetered` (never counted, never auto-evicted — cloud).
 
-## The factory + server class (server-only)
+## The server class + factory — `<stem>/<stem>_server.{h,cpp}`
 
-The factory builds the `WrappedServer` subclass. It is compiled into `lemond`
-only (it references server classes), which keeps the `lemonade` CLI link clean.
+The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`,
+and only the capability interfaces you serve (`ITranscriptionServer`,
+`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default
+"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend
+does not stub them. Alongside it, a free `create()` builds the instance.
 
-`src/cpp/include/lemon/backends/<stem>_factory.h`:
+`<stem>_server.h`:
 
 ```cpp
 #pragma once
-#include <memory>
-#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_registry.h"   // BackendContext
+#include "lemon/wrapped_server.h"
+
 namespace lemon { namespace backends {
-std::unique_ptr<WrappedServer> <stem>_create(const BackendContext& ctx);
-} }
+
+class MyServer : public WrappedServer, public ICompletionServer {
+    // load(), unload(), the capability methods you serve …
+};
+
+namespace myrecipe {
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);  // server-only
+}
+
+}}  // namespace lemon::backends
 ```
 
-`src/cpp/server/backends/<stem>_factory.cpp`:
+`<stem>_server.cpp`:
 
 ```cpp
-#include "lemon/backends/<stem>_factory.h"
-#include "lemon/backends/<stem>_server.h"
-#include "lemon/wrapped_server.h"
-namespace lemon { namespace backends {
-std::unique_ptr<WrappedServer> <stem>_create(const BackendContext& ctx) {
+#include "lemon/backends/myrecipe/myrecipe_server.h"
+// … MyServer method definitions …
+
+namespace lemon { namespace backends { namespace myrecipe {
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<MyServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
-} }
+}}}  // namespace lemon::backends::myrecipe
 ```
 
-The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`,
-and only the capability interfaces you actually serve (`ITranscriptionServer`,
-`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default
-"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend
-does not stub them.
-
 ## Register it: one line
 
 ```cmake
@@ -108,8 +110,8 @@ set(LEMON_BACKENDS
 )
 ```
 
-The `foreach` in `CMakeLists.txt` compiles your two sources and regenerates the
-registry headers, binding the descriptor to its `create()`.
+The `foreach` in `CMakeLists.txt` compiles `<stem>/<stem>_server.cpp` and
+regenerates the registry headers, binding `<stem>::descriptor` to `<stem>::create`.
 
 ## What you get for free
 
@@ -139,7 +141,7 @@ registry headers, binding the descriptor to its `create()`.
 
 **Moonshine** is the minimal case: a single descriptor option, no backend
 selection, CPU-only, one capability interface. See
-`src/cpp/server/backends/moonshine_descriptor.cpp` and `moonshine_factory.cpp`.
+`src/cpp/server/backends/moonshine/` and `src/cpp/include/lemon/backends/moonshine/`.
 
 > Note: collections (`collection.omni`) are orchestrator-driven, not
 > `WrappedServer` subprocesses, and are the one explicit exception to this model.
diff --git a/src/cpp/server/backends/cloud_descriptor.cpp b/src/cpp/include/lemon/backends/cloud/cloud.h
similarity index 66%
rename from src/cpp/server/backends/cloud_descriptor.cpp
rename to src/cpp/include/lemon/backends/cloud/cloud.h
index fe87a32a2..2ad4f3186 100644
--- a/src/cpp/server/backends/cloud_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/cloud/cloud.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/cloud_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace cloud {
 
-const BackendDescriptor cloud_descriptor = {
+// The cloud backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "cloud",
     /*display_name*/    "Cloud",
     /*binary*/          "",  // no subprocess: runs on a remote provider
@@ -19,5 +24,6 @@ const BackendDescriptor cloud_descriptor = {
     /*required_checkpoints*/ {},  // no downloaded files
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace cloud
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h
similarity index 92%
rename from src/cpp/include/lemon/backends/cloud_server.h
rename to src/cpp/include/lemon/backends/cloud/cloud_server.h
index efc765728..21e28512a 100644
--- a/src/cpp/include/lemon/backends/cloud_server.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../model_manager.h"
-#include "../wrapped_server.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/model_manager.h"
+#include "lemon/wrapped_server.h"
 #include <string>
 #include <vector>
 
@@ -108,3 +110,12 @@ class CloudServer : public WrappedServer {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace cloud {
+// Factory for the cloud backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace cloud
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/cloud_descriptor.h b/src/cpp/include/lemon/backends/cloud_descriptor.h
deleted file mode 100644
index 6e5f49bdb..000000000
--- a/src/cpp/include/lemon/backends/cloud_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The cloud backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in cloud_descriptor.cpp.
-extern const BackendDescriptor cloud_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/cloud_factory.h b/src/cpp/include/lemon/backends/cloud_factory.h
deleted file mode 100644
index 889958bd1..000000000
--- a/src/cpp/include/lemon/backends/cloud_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The cloud backend's factory (constructs the server class — lemond only).
-// Defined in cloud_factory.cpp.
-std::unique_ptr<WrappedServer> cloud_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm_descriptor.cpp b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
similarity index 62%
rename from src/cpp/server/backends/fastflowlm_descriptor.cpp
rename to src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index 7b67b8d42..648d84c57 100644
--- a/src/cpp/server/backends/fastflowlm_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/fastflowlm_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace fastflowlm {
 
-const BackendDescriptor fastflowlm_descriptor = {
+// The fastflowlm backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "flm",
     /*display_name*/    "FastFlowLM NPU",
 #ifdef _WIN32
@@ -25,5 +30,6 @@ const BackendDescriptor fastflowlm_descriptor = {
     /*required_checkpoints*/ {"main"},
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace fastflowlm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
similarity index 84%
rename from src/cpp/include/lemon/backends/fastflowlm_server.h
rename to src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
index bd9c554ac..58b99f1ba 100644
--- a/src/cpp/include/lemon/backends/fastflowlm_server.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -72,3 +74,12 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace fastflowlm {
+// Factory for the fastflowlm backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace fastflowlm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm_descriptor.h b/src/cpp/include/lemon/backends/fastflowlm_descriptor.h
deleted file mode 100644
index 5e8f71467..000000000
--- a/src/cpp/include/lemon/backends/fastflowlm_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The fastflowlm backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in fastflowlm_descriptor.cpp.
-extern const BackendDescriptor fastflowlm_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm_factory.h b/src/cpp/include/lemon/backends/fastflowlm_factory.h
deleted file mode 100644
index 8581dbdf7..000000000
--- a/src/cpp/include/lemon/backends/fastflowlm_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The fastflowlm backend's factory (constructs the server class — lemond only).
-// Defined in fastflowlm_factory.cpp.
-std::unique_ptr<WrappedServer> fastflowlm_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/kokoro_descriptor.cpp b/src/cpp/include/lemon/backends/kokoro/kokoro.h
similarity index 67%
rename from src/cpp/server/backends/kokoro_descriptor.cpp
rename to src/cpp/include/lemon/backends/kokoro/kokoro.h
index 281f0e0f1..f0492576f 100644
--- a/src/cpp/server/backends/kokoro_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/kokoro_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace kokoro {
 
-const BackendDescriptor kokoro_descriptor = {
+// The kokoro backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "kokoro",
     /*display_name*/    "Kokoro",
 #ifdef _WIN32
@@ -26,5 +31,6 @@ const BackendDescriptor kokoro_descriptor = {
     /*required_checkpoints*/ {"main"},
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace kokoro
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
similarity index 74%
rename from src/cpp/include/lemon/backends/kokoro_server.h
rename to src/cpp/include/lemon/backends/kokoro/kokoro_server.h
index 0b99bcb96..c1f170ca7 100644
--- a/src/cpp/include/lemon/backends/kokoro_server.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
@@ -1,8 +1,10 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 #include <filesystem>
 
@@ -47,3 +49,12 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace kokoro {
+// Factory for the kokoro backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace kokoro
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro_descriptor.h b/src/cpp/include/lemon/backends/kokoro_descriptor.h
deleted file mode 100644
index 1d3542f0a..000000000
--- a/src/cpp/include/lemon/backends/kokoro_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The kokoro backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in kokoro_descriptor.cpp.
-extern const BackendDescriptor kokoro_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro_factory.h b/src/cpp/include/lemon/backends/kokoro_factory.h
deleted file mode 100644
index 0df3ec37b..000000000
--- a/src/cpp/include/lemon/backends/kokoro_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The kokoro backend's factory (constructs the server class — lemond only).
-// Defined in kokoro_factory.cpp.
-std::unique_ptr<WrappedServer> kokoro_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp_descriptor.cpp b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
similarity index 82%
rename from src/cpp/server/backends/llamacpp_descriptor.cpp
rename to src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index f426e9f20..8348f877e 100644
--- a/src/cpp/server/backends/llamacpp_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/llamacpp_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace llamacpp {
 
-const BackendDescriptor llamacpp_descriptor = {
+// The llamacpp backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "llamacpp",
     /*display_name*/    "Llama.cpp GPU",
 #ifdef _WIN32
@@ -39,5 +44,6 @@ const BackendDescriptor llamacpp_descriptor = {
     /*required_checkpoints*/ {"main"},
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace llamacpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
similarity index 80%
rename from src/cpp/include/lemon/backends/llamacpp_server.h
rename to src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
index c9356f6b8..7ef4bb44b 100644
--- a/src/cpp/include/lemon/backends/llamacpp_server.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -58,3 +60,12 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+// Factory for the llamacpp backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace llamacpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp_descriptor.h b/src/cpp/include/lemon/backends/llamacpp_descriptor.h
deleted file mode 100644
index 501e0854c..000000000
--- a/src/cpp/include/lemon/backends/llamacpp_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The llamacpp backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in llamacpp_descriptor.cpp.
-extern const BackendDescriptor llamacpp_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp_factory.h b/src/cpp/include/lemon/backends/llamacpp_factory.h
deleted file mode 100644
index 853f5171b..000000000
--- a/src/cpp/include/lemon/backends/llamacpp_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The llamacpp backend's factory (constructs the server class — lemond only).
-// Defined in llamacpp_factory.cpp.
-std::unique_ptr<WrappedServer> llamacpp_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/moonshine_descriptor.cpp b/src/cpp/include/lemon/backends/moonshine/moonshine.h
similarity index 70%
rename from src/cpp/server/backends/moonshine_descriptor.cpp
rename to src/cpp/include/lemon/backends/moonshine/moonshine.h
index 63277ad3c..28b3e3e58 100644
--- a/src/cpp/server/backends/moonshine_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/moonshine_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace moonshine {
 
-const BackendDescriptor moonshine_descriptor = {
+// The moonshine backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "moonshine",
     /*display_name*/    "Moonshine",
     /*binary*/          "moonshine-server",
@@ -26,5 +31,6 @@ const BackendDescriptor moonshine_descriptor = {
     /*required_checkpoints*/ {"main"},
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace moonshine
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
similarity index 79%
rename from src/cpp/include/lemon/backends/moonshine_server.h
rename to src/cpp/include/lemon/backends/moonshine/moonshine_server.h
index 6f13f216b..b98e52806 100644
--- a/src/cpp/include/lemon/backends/moonshine_server.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
@@ -1,8 +1,10 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -53,3 +55,12 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace moonshine {
+// Factory for the moonshine backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace moonshine
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine_descriptor.h b/src/cpp/include/lemon/backends/moonshine_descriptor.h
deleted file mode 100644
index d70083244..000000000
--- a/src/cpp/include/lemon/backends/moonshine_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The moonshine backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in moonshine_descriptor.cpp.
-extern const BackendDescriptor moonshine_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine_factory.h b/src/cpp/include/lemon/backends/moonshine_factory.h
deleted file mode 100644
index 67e6f7298..000000000
--- a/src/cpp/include/lemon/backends/moonshine_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The moonshine backend's factory (constructs the server class — lemond only).
-// Defined in moonshine_factory.cpp.
-std::unique_ptr<WrappedServer> moonshine_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai_descriptor.cpp b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
similarity index 64%
rename from src/cpp/server/backends/ryzenai_descriptor.cpp
rename to src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index 23651ec94..c1896ee7e 100644
--- a/src/cpp/server/backends/ryzenai_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/ryzenai_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace ryzenai {
 
-const BackendDescriptor ryzenai_descriptor = {
+// The ryzenai backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "ryzenai-llm",
     /*display_name*/    "Ryzen AI LLM",
 #ifdef _WIN32
@@ -25,5 +30,6 @@ const BackendDescriptor ryzenai_descriptor = {
     /*required_checkpoints*/ {"main"},
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace ryzenai
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenaiserver.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
similarity index 82%
rename from src/cpp/include/lemon/backends/ryzenaiserver.h
rename to src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
index 36e1ba98d..1420efae5 100644
--- a/src/cpp/include/lemon/backends/ryzenaiserver.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "lemon/backends/backend_registry.h"
+
 #include "lemon/wrapped_server.h"
 #include "lemon/server_capabilities.h"
 #include "lemon/backends/backend_utils.h"
@@ -54,3 +56,12 @@ class RyzenAIServer : public WrappedServer {
 };
 
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace ryzenai {
+// Factory for the ryzenai backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace ryzenai
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai_descriptor.h b/src/cpp/include/lemon/backends/ryzenai_descriptor.h
deleted file mode 100644
index 26aa0b21f..000000000
--- a/src/cpp/include/lemon/backends/ryzenai_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The ryzenai backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in ryzenai_descriptor.cpp.
-extern const BackendDescriptor ryzenai_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai_factory.h b/src/cpp/include/lemon/backends/ryzenai_factory.h
deleted file mode 100644
index 9483d8d55..000000000
--- a/src/cpp/include/lemon/backends/ryzenai_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The ryzenai backend's factory (constructs the server class — lemond only).
-// Defined in ryzenai_factory.cpp.
-std::unique_ptr<WrappedServer> ryzenai_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp_descriptor.cpp b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
similarity index 85%
rename from src/cpp/server/backends/sdcpp_descriptor.cpp
rename to src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 10ebfdd58..323ec11bc 100644
--- a/src/cpp/server/backends/sdcpp_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/sdcpp_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace sdcpp {
 
-const BackendDescriptor sdcpp_descriptor = {
+// The sdcpp backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "sd-cpp",
     /*display_name*/    "StableDiffusion.cpp",
 #ifdef _WIN32
@@ -43,5 +48,6 @@ const BackendDescriptor sdcpp_descriptor = {
     /*required_checkpoints*/ {"main"},  // flux text_encoder+vae validated together in load()
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace sdcpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sd_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
similarity index 86%
rename from src/cpp/include/lemon/backends/sd_server.h
rename to src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index 857374951..999a1de72 100644
--- a/src/cpp/include/lemon/backends/sd_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -1,11 +1,13 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "../model_manager.h"
-#include "../recipe_options.h"
-#include "../utils/process_manager.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/model_manager.h"
+#include "lemon/recipe_options.h"
+#include "lemon/utils/process_manager.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 #include <filesystem>
 
@@ -95,3 +97,12 @@ class SDServer : public WrappedServer, public IImageServer {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace sdcpp {
+// Factory for the sdcpp backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace sdcpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp_descriptor.h b/src/cpp/include/lemon/backends/sdcpp_descriptor.h
deleted file mode 100644
index 0bee2e552..000000000
--- a/src/cpp/include/lemon/backends/sdcpp_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The sdcpp backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in sdcpp_descriptor.cpp.
-extern const BackendDescriptor sdcpp_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp_factory.h b/src/cpp/include/lemon/backends/sdcpp_factory.h
deleted file mode 100644
index f7da955e2..000000000
--- a/src/cpp/include/lemon/backends/sdcpp_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The sdcpp backend's factory (constructs the server class — lemond only).
-// Defined in sdcpp_factory.cpp.
-std::unique_ptr<WrappedServer> sdcpp_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/vllm_descriptor.cpp b/src/cpp/include/lemon/backends/vllm/vllm.h
similarity index 69%
rename from src/cpp/server/backends/vllm_descriptor.cpp
rename to src/cpp/include/lemon/backends/vllm/vllm.h
index 54451f365..5d0210a37 100644
--- a/src/cpp/server/backends/vllm_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/vllm_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace vllm {
 
-const BackendDescriptor vllm_descriptor = {
+// The vllm backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "vllm",
     /*display_name*/    "vLLM ROCm (experimental)",
     /*binary*/          "vllm-server",
@@ -26,5 +31,6 @@ const BackendDescriptor vllm_descriptor = {
     /*required_checkpoints*/ {"main"},
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace vllm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h
similarity index 78%
rename from src/cpp/include/lemon/backends/vllm_server.h
rename to src/cpp/include/lemon/backends/vllm/vllm_server.h
index 62ec94af2..0eaf4e7d8 100644
--- a/src/cpp/include/lemon/backends/vllm_server.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h
@@ -1,7 +1,9 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 
 namespace lemon {
@@ -47,3 +49,12 @@ class VLLMServer : public WrappedServer {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace vllm {
+// Factory for the vllm backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace vllm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_descriptor.h b/src/cpp/include/lemon/backends/vllm_descriptor.h
deleted file mode 100644
index 7119dff88..000000000
--- a/src/cpp/include/lemon/backends/vllm_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The vllm backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in vllm_descriptor.cpp.
-extern const BackendDescriptor vllm_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm_factory.h b/src/cpp/include/lemon/backends/vllm_factory.h
deleted file mode 100644
index 7bf398987..000000000
--- a/src/cpp/include/lemon/backends/vllm_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The vllm backend's factory (constructs the server class — lemond only).
-// Defined in vllm_factory.cpp.
-std::unique_ptr<WrappedServer> vllm_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp_descriptor.cpp b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
similarity index 79%
rename from src/cpp/server/backends/whispercpp_descriptor.cpp
rename to src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index 6124e779e..f49fca08f 100644
--- a/src/cpp/server/backends/whispercpp_descriptor.cpp
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -1,9 +1,14 @@
-#include "lemon/backends/whispercpp_descriptor.h"
+#pragma once
+
+#include "lemon/backends/backend_descriptor.h"
 
 namespace lemon {
 namespace backends {
+namespace whispercpp {
 
-const BackendDescriptor whispercpp_descriptor = {
+// The whispercpp backend descriptor (plain data). Header-only `inline const` so it
+// links into both the lemonade CLI and lemond without a separate source file.
+inline const BackendDescriptor descriptor = {
     /*recipe*/          "whispercpp",
     /*display_name*/    "Whisper.cpp",
 #ifdef _WIN32
@@ -35,5 +40,6 @@ const BackendDescriptor whispercpp_descriptor = {
     /*required_checkpoints*/ {"main"},  // npu_cache validated in load() (npu variant only)
 };
 
-} // namespace backends
-} // namespace lemon
+}  // namespace whispercpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whisper_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
similarity index 85%
rename from src/cpp/include/lemon/backends/whisper_server.h
rename to src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
index 55a1734d9..90744875f 100644
--- a/src/cpp/include/lemon/backends/whisper_server.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
@@ -1,8 +1,10 @@
 #pragma once
 
-#include "../wrapped_server.h"
-#include "../server_capabilities.h"
-#include "backend_utils.h"
+#include "lemon/backends/backend_registry.h"
+
+#include "lemon/wrapped_server.h"
+#include "lemon/server_capabilities.h"
+#include "lemon/backends/backend_utils.h"
 #include <string>
 #include <filesystem>
 
@@ -76,3 +78,12 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace whispercpp {
+// Factory for the whispercpp backend (constructs the server class — lemond only).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+}  // namespace whispercpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp_descriptor.h b/src/cpp/include/lemon/backends/whispercpp_descriptor.h
deleted file mode 100644
index 2c3c87f19..000000000
--- a/src/cpp/include/lemon/backends/whispercpp_descriptor.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-#include "lemon/backends/backend_descriptor.h"
-
-namespace lemon {
-namespace backends {
-
-// The whispercpp backend's descriptor (plain data — CLI-safe, links into both the
-// lemonade CLI and lemond). Defined in whispercpp_descriptor.cpp.
-extern const BackendDescriptor whispercpp_descriptor;
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp_factory.h b/src/cpp/include/lemon/backends/whispercpp_factory.h
deleted file mode 100644
index d98c97b27..000000000
--- a/src/cpp/include/lemon/backends/whispercpp_factory.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include <memory>
-#include "lemon/backends/backend_registry.h"
-
-namespace lemon {
-namespace backends {
-
-// The whispercpp backend's factory (constructs the server class — lemond only).
-// Defined in whispercpp_factory.cpp.
-std::unique_ptr<WrappedServer> whispercpp_create(const BackendContext& ctx);
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
index 28feccaab..b5b6680fb 100644
--- a/src/cpp/server/backends/backend_utils.cpp
+++ b/src/cpp/server/backends/backend_utils.cpp
@@ -1,14 +1,14 @@
 #include "lemon/backends/backend_utils.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
-#include "lemon/backends/llamacpp_server.h"
-#include "lemon/backends/whisper_server.h"
-#include "lemon/backends/sd_server.h"
-#include "lemon/backends/kokoro_server.h"
-#include "lemon/backends/ryzenaiserver.h"
-#include "lemon/backends/vllm_server.h"
-#include "lemon/backends/fastflowlm_server.h"
-#include "lemon/backends/moonshine_server.h"
+#include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/whispercpp/whispercpp_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
+#include "lemon/backends/kokoro/kokoro_server.h"
+#include "lemon/backends/ryzenai/ryzenai_server.h"
+#include "lemon/backends/vllm/vllm_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/moonshine/moonshine_server.h"
 #include "lemon/model_manager.h"  // For DownloadProgress, DownloadProgressCallback
 
 #include "lemon/utils/path_utils.h"
diff --git a/src/cpp/server/backends/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
similarity index 98%
rename from src/cpp/server/backends/cloud_server.cpp
rename to src/cpp/server/backends/cloud/cloud_server.cpp
index 96bdcf4a3..64a940e4f 100644
--- a/src/cpp/server/backends/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -1,4 +1,6 @@
-#include "lemon/backends/cloud_server.h"
+#include "lemon/backends/cloud/cloud_server.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/model_manager.h"
 #include "lemon/cloud_provider_registry.h"
 #include "lemon/error_types.h"
 #include "lemon/runtime_config.h"
@@ -792,3 +794,17 @@ std::vector<ModelInfo> CloudServer::discover_models(const std::string& provider,
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace cloud {
+// Cloud factory: builds a CloudServer for ctx.model_info->cloud_provider (ctx.model_info must be set).
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<CloudServer>(
+        ctx.model_info->cloud_provider, ctx.log_level,
+        ctx.model_manager, ctx.backend_manager, ctx.cloud_registry);
+}
+
+}  // namespace cloud
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/cloud_factory.cpp b/src/cpp/server/backends/cloud_factory.cpp
deleted file mode 100644
index cee2c4ab5..000000000
--- a/src/cpp/server/backends/cloud_factory.cpp
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "lemon/backends/cloud_factory.h"
-#include "lemon/backends/cloud_server.h"
-#include "lemon/model_manager.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> cloud_create(const BackendContext& ctx) {
-    return std::make_unique<CloudServer>(
-        ctx.model_info->cloud_provider, ctx.log_level,
-        ctx.model_manager, ctx.backend_manager, ctx.cloud_registry);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
similarity index 97%
rename from src/cpp/server/backends/fastflowlm_server.cpp
rename to src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index dc38928e3..81e40ba60 100644
--- a/src/cpp/server/backends/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/fastflowlm_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/system_info.h"
 #include "lemon/error_types.h"
@@ -465,3 +466,15 @@ std::string FastFlowLMServer::get_flm_path() {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace fastflowlm {
+// FastFlowLM factory: builds a FastFlowLMServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<FastFlowLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace fastflowlm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm_factory.cpp b/src/cpp/server/backends/fastflowlm_factory.cpp
deleted file mode 100644
index 96eddd998..000000000
--- a/src/cpp/server/backends/fastflowlm_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/fastflowlm_factory.h"
-#include "lemon/backends/fastflowlm_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> fastflowlm_create(const BackendContext& ctx) {
-    return std::make_unique<FastFlowLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
similarity index 94%
rename from src/cpp/server/backends/kokoro_server.cpp
rename to src/cpp/server/backends/kokoro/kokoro_server.cpp
index 7a707cd7e..e0a2f7ada 100644
--- a/src/cpp/server/backends/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/kokoro_server.h"
+#include "lemon/backends/kokoro/kokoro_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/utils/process_manager.h"
@@ -203,3 +204,15 @@ void KokoroServer::audio_speech(const json& request, httplib::DataSink& sink) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace kokoro {
+// Kokoro factory: builds a KokoroServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<KokoroServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace kokoro
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/kokoro_factory.cpp b/src/cpp/server/backends/kokoro_factory.cpp
deleted file mode 100644
index a7d4f3be8..000000000
--- a/src/cpp/server/backends/kokoro_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/kokoro_factory.h"
-#include "lemon/backends/kokoro_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> kokoro_create(const BackendContext& ctx) {
-    return std::make_unique<KokoroServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
similarity index 98%
rename from src/cpp/server/backends/llamacpp_server.cpp
rename to src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 7f50022af..2c828f1c4 100644
--- a/src/cpp/server/backends/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/llamacpp_server.h"
+#include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/auto_tune.h"
 #include "lemon/backend_manager.h"
@@ -644,3 +645,15 @@ json LlamaCppServer::responses(const json& request) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+// llamacpp factory: builds a LlamaCppServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<LlamaCppServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace llamacpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp_factory.cpp b/src/cpp/server/backends/llamacpp_factory.cpp
deleted file mode 100644
index cd34fab5a..000000000
--- a/src/cpp/server/backends/llamacpp_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/llamacpp_factory.h"
-#include "lemon/backends/llamacpp_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> llamacpp_create(const BackendContext& ctx) {
-    return std::make_unique<LlamaCppServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
similarity index 96%
rename from src/cpp/server/backends/moonshine_server.cpp
rename to src/cpp/server/backends/moonshine/moonshine_server.cpp
index 3257c05ba..7cb338286 100644
--- a/src/cpp/server/backends/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/moonshine_server.h"
+#include "lemon/backends/moonshine/moonshine_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -330,3 +331,15 @@ json MoonshineServer::audio_transcriptions(const json& request) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace moonshine {
+// Moonshine factory: builds a MoonshineServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<MoonshineServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace moonshine
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/moonshine_factory.cpp b/src/cpp/server/backends/moonshine_factory.cpp
deleted file mode 100644
index 859b37b30..000000000
--- a/src/cpp/server/backends/moonshine_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/moonshine_factory.h"
-#include "lemon/backends/moonshine_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> moonshine_create(const BackendContext& ctx) {
-    return std::make_unique<MoonshineServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/ryzenaiserver.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
similarity index 87%
rename from src/cpp/server/backends/ryzenaiserver.cpp
rename to src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index 6e250fa35..925fece3f 100644
--- a/src/cpp/server/backends/ryzenaiserver.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -1,4 +1,6 @@
-#include "lemon/backends/ryzenaiserver.h"
+#include "lemon/backends/ryzenai/ryzenai_server.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/model_manager.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/utils/process_manager.h"
@@ -167,3 +169,21 @@ json RyzenAIServer::responses(const json& request) {
 }
 
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace ryzenai {
+// RyzenAI factory: builds a RyzenAIServer (debug flag = log_level == "debug"); ctx.model_info must be set.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    // RyzenAI resolves its model path before load (set_model_path), matching the
+    // original router factory's special-casing.
+    auto server = std::make_unique<::lemon::RyzenAIServer>(
+        ctx.model_info->model_name, ctx.log_level == "debug",
+        ctx.model_manager, ctx.backend_manager);
+    server->set_model_path(ctx.model_info->resolved_path());
+    return server;
+}
+
+}  // namespace ryzenai
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai_factory.cpp b/src/cpp/server/backends/ryzenai_factory.cpp
deleted file mode 100644
index 4e013a30c..000000000
--- a/src/cpp/server/backends/ryzenai_factory.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-#include "lemon/backends/ryzenai_factory.h"
-#include "lemon/backends/ryzenaiserver.h"
-#include "lemon/model_manager.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> ryzenai_create(const BackendContext& ctx) {
-    // RyzenAI resolves its model path before load (set_model_path), matching the
-    // original router factory's special-casing.
-    auto server = std::make_unique<::lemon::RyzenAIServer>(
-        ctx.model_info->model_name, ctx.log_level == "debug",
-        ctx.model_manager, ctx.backend_manager);
-    server->set_model_path(ctx.model_info->resolved_path());
-    return server;
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/sd_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
similarity index 98%
rename from src/cpp/server/backends/sd_server.cpp
rename to src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index 734454c36..b561906bb 100644
--- a/src/cpp/server/backends/sd_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/sd_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -746,3 +747,15 @@ std::string SDServer::upscale_via_cli(
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace sdcpp {
+// sdcpp factory: builds an SDServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<SDServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace sdcpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp_factory.cpp b/src/cpp/server/backends/sdcpp_factory.cpp
deleted file mode 100644
index 009fffd43..000000000
--- a/src/cpp/server/backends/sdcpp_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/sdcpp_factory.h"
-#include "lemon/backends/sd_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> sdcpp_create(const BackendContext& ctx) {
-    return std::make_unique<SDServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
similarity index 97%
rename from src/cpp/server/backends/vllm_server.cpp
rename to src/cpp/server/backends/vllm/vllm_server.cpp
index 7584d56d9..dae6fb883 100644
--- a/src/cpp/server/backends/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/vllm_server.h"
+#include "lemon/backends/vllm/vllm_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/model_manager.h"
 #include "lemon/runtime_config.h"
@@ -311,3 +312,15 @@ void VLLMServer::forward_streaming_request(const std::string& endpoint,
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace vllm {
+// vLLM factory: builds a VLLMServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<VLLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace vllm
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/vllm_factory.cpp b/src/cpp/server/backends/vllm_factory.cpp
deleted file mode 100644
index 20fd71851..000000000
--- a/src/cpp/server/backends/vllm_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/vllm_factory.h"
-#include "lemon/backends/vllm_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> vllm_create(const BackendContext& ctx) {
-    return std::make_unique<VLLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/backends/whisper_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
similarity index 98%
rename from src/cpp/server/backends/whisper_server.cpp
rename to src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index cc37be36d..3c574f27a 100644
--- a/src/cpp/server/backends/whisper_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -1,4 +1,5 @@
-#include "lemon/backends/whisper_server.h"
+#include "lemon/backends/whispercpp/whispercpp_server.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -688,3 +689,15 @@ json WhisperServer::audio_transcriptions(const json& request) {
 
 } // namespace backends
 } // namespace lemon
+
+namespace lemon {
+namespace backends {
+namespace whispercpp {
+// whispercpp factory: builds a WhisperServer from ctx's log level and model/backend managers.
+std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
+    return std::make_unique<WhisperServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
+}  // namespace whispercpp
+}  // namespace backends
+}  // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp_factory.cpp b/src/cpp/server/backends/whispercpp_factory.cpp
deleted file mode 100644
index 3223804aa..000000000
--- a/src/cpp/server/backends/whispercpp_factory.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lemon/backends/whispercpp_factory.h"
-#include "lemon/backends/whisper_server.h"
-#include "lemon/wrapped_server.h"
-
-namespace lemon {
-namespace backends {
-
-std::unique_ptr<WrappedServer> whispercpp_create(const BackendContext& ctx) {
-    return std::make_unique<WhisperServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
-}
-
-} // namespace backends
-} // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index febdc4ec8..6695fbfc7 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -9,9 +9,9 @@
 #include <lemon/system_info.h>
 #include <lemon/backends/backend_descriptor_registry.h>
 #include <lemon/backends/backend_utils.h>
-#include <lemon/backends/cloud_server.h>
+#include <lemon/backends/cloud/cloud_server.h>
 #include <lemon/cloud_provider_registry.h>
-#include <lemon/backends/fastflowlm_server.h>
+#include <lemon/backends/fastflowlm/fastflowlm_server.h>
 #include <filesystem>
 #include <iostream>
 #include <fstream>
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index a3c4bec74..307c51294 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -1,15 +1,15 @@
 #include "lemon/router.h"
 #include "lemon/cloud_provider_registry.h"
 #include "lemon/backends/backend_registry.h"
-#include "lemon/backends/cloud_server.h"
-#include "lemon/backends/llamacpp_server.h"
-#include "lemon/backends/fastflowlm_server.h"
-#include "lemon/backends/ryzenaiserver.h"
-#include "lemon/backends/whisper_server.h"
-#include "lemon/backends/moonshine_server.h"
-#include "lemon/backends/kokoro_server.h"
-#include "lemon/backends/sd_server.h"
-#include "lemon/backends/vllm_server.h"
+#include "lemon/backends/cloud/cloud_server.h"
+#include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/ryzenai/ryzenai_server.h"
+#include "lemon/backends/whispercpp/whispercpp_server.h"
+#include "lemon/backends/moonshine/moonshine_server.h"
+#include "lemon/backends/kokoro/kokoro_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
+#include "lemon/backends/vllm/vllm_server.h"
 #include "lemon/server_capabilities.h"
 #include "lemon/error_types.h"
 #include "lemon/recipe_options.h"
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index d0fea0504..384412753 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -5,8 +5,8 @@
 #include "lemon/config_file.h"
 #include "lemon/mcp_server.h"
 #include "lemon/ollama_api.h"
-#include "lemon/backends/cloud_server.h"
-#include "lemon/backends/sd_server.h"
+#include "lemon/backends/cloud/cloud_server.h"
+#include "lemon/backends/sdcpp/sdcpp_server.h"
 #include "lemon/backends/backend_utils.h"
 #include <cstring>
 #include "lemon/utils/json_utils.h"

From 33b437b634b8982943dc2ae9cd1b27763117554b Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 19 Jun 2026 20:51:16 -0400
Subject: [PATCH 03/39] docs(backends): mechanize the README support matrix
 from descriptors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the existing curated docs generate from the backend descriptors instead of
just shipping a separate reference file — closing appendix rows 14 and 22.

- Expand the descriptor with the editorial fields the curated docs need:
  `modality`, `experimental`, `web_display_name`, and a per-support-row
  `device_summary` (RecipeBackendDef). These keep the descriptor the single
  source of truth.
- /system-info exposes them plus a registry `order` index and `slot_policy`.
- gen_backend_docs.py now targets multiple docs and renders:
    * README.md "Supported Configurations" HTML matrix (grouped by modality,
      merged rows, rowspans, experimental tag) — wrapped in GENERATED markers;
    * docs/guide/configuration/multi-model.md NPU-exclusivity list.
  The backend-docs-drift CI job's --check now covers all three docs.

The generated README matrix is also more complete than the hand-written one
(it now includes whispercpp rocm/metal, kokoro metal, sd-cpp metal). Footnotes
and prose outside the markers are preserved.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md                                     |  82 +++++---
 docs/dev/adding-a-backend.md                  |   8 +-
 docs/guide/configuration/multi-model.md       |   4 +-
 docs/tools/gen_backend_docs.py                | 177 ++++++++++++++++--
 .../lemon/backends/backend_descriptor.h       |  15 ++
 src/cpp/include/lemon/backends/cloud/cloud.h  |   3 +
 .../lemon/backends/fastflowlm/fastflowlm.h    |   5 +-
 .../include/lemon/backends/kokoro/kokoro.h    |   7 +-
 .../lemon/backends/llamacpp/llamacpp.h        |  15 +-
 .../lemon/backends/moonshine/moonshine.h      |   9 +-
 .../include/lemon/backends/ryzenai/ryzenai.h  |   5 +-
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h  |  13 +-
 src/cpp/include/lemon/backends/vllm/vllm.h    |   5 +-
 .../lemon/backends/whispercpp/whispercpp.h    |  13 +-
 src/cpp/include/lemon/recipe_backend_def.h    |   3 +
 src/cpp/server/system_info.cpp                |  13 +-
 16 files changed, 301 insertions(+), 76 deletions(-)

diff --git a/README.md b/README.md
index 38d9db6fe..2175b846e 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,7 @@ Use `lemonade pull` or the built-in **Model Manager** to download models. You ca
 
 Lemonade supports multiple inference engines for LLM, speech, TTS, and image generation, and each has its own backend and hardware requirements.
 
+<!-- BEGIN GENERATED: backends-matrix -->
 <table>
   <thead>
     <tr>
@@ -137,14 +138,14 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
     <tr>
       <td rowspan="9"><strong>Text generation</strong></td>
       <td rowspan="6"><code>llamacpp</code></td>
-      <td><code>vulkan</code></td>
-      <td><code>x86_64</code> CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)</td>
-      <td>Windows, Linux</td>
+      <td><code>system</code></td>
+      <td><code>x86_64</code>/ARM64 CPU, GPU</td>
+      <td>Linux</td>
     </tr>
     <tr>
-      <td><code>rocm</code></td>
-      <td>Supported AMD ROCm iGPU/dGPU families*</td>
-      <td>Windows, Linux</td>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
     </tr>
     <tr>
       <td><code>cuda</code></td>
@@ -152,49 +153,54 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>cpu</code></td>
-      <td><code>x86_64</code> CPU; ARM64 CPU (Linux)</td>
+      <td><code>vulkan</code></td>
+      <td><code>x86_64</code> CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>metal</code></td>
-      <td>Apple Silicon GPU</td>
-      <td>macOS</td>
+      <td><code>rocm</code></td>
+      <td>Supported AMD ROCm iGPU/dGPU families*</td>
+      <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>system</code></td>
-      <td><code>x86_64</code>/ARM64 CPU, GPU</td>
-      <td>Linux</td>
+      <td><code>cpu</code></td>
+      <td><code>x86_64</code> CPU; ARM64 CPU (Linux)</td>
+      <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>flm</code></td>
+      <td rowspan="1"><code>flm</code></td>
       <td><code>npu</code></td>
       <td>XDNA2 NPU</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>ryzenai-llm</code></td>
+      <td rowspan="1"><code>ryzenai-llm</code></td>
       <td><code>npu</code></td>
       <td>XDNA2 NPU</td>
       <td>Windows</td>
     </tr>
     <tr>
-      <td><code>vllm</code> (experimental)</td>
+      <td rowspan="1"><code>vllm</code> (experimental)</td>
       <td><code>rocm</code></td>
       <td>Strix Halo iGPU (gfx1151)</td>
       <td>Linux</td>
     </tr>
     <tr>
-      <td rowspan="4"><strong>Speech-to-text</strong></td>
-      <td rowspan="3"><code>whispercpp</code></td>
+      <td rowspan="6"><strong>Speech-to-text</strong></td>
+      <td rowspan="5"><code>whispercpp</code></td>
       <td><code>npu</code></td>
       <td>XDNA2 NPU</td>
       <td>Windows</td>
     </tr>
+    <tr>
+      <td><code>rocm</code></td>
+      <td>Supported AMD ROCm iGPU/dGPU families*</td>
+      <td>Windows, Linux</td>
+    </tr>
     <tr>
       <td><code>vulkan</code></td>
       <td><code>x86_64</code> CPU</td>
-      <td>Linux</td>
+      <td>Windows, Linux</td>
     </tr>
     <tr>
       <td><code>cpu</code></td>
@@ -202,28 +208,33 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td><code>moonshine</code></td>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
+    </tr>
+    <tr>
+      <td rowspan="1"><code>moonshine</code></td>
       <td><code>cpu</code></td>
       <td><code>x86_64</code>/<code>arm64</code> CPU</td>
       <td>Windows, Linux, macOS</td>
     </tr>
     <tr>
-      <td><strong>Text-to-speech</strong></td>
-      <td><code>kokoro</code></td>
+      <td rowspan="2"><strong>Text-to-speech</strong></td>
+      <td rowspan="2"><code>kokoro</code></td>
       <td><code>cpu</code></td>
       <td><code>x86_64</code> CPU</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
-      <td rowspan="4"><strong>Image generation</strong></td>
-      <td rowspan="4"><code>sd-cpp</code></td>
-      <td><code>rocm</code></td>
-      <td>Supported AMD ROCm iGPU/dGPU families*</td>
-      <td>Windows, Linux</td>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
     </tr>
     <tr>
-      <td><code>vulkan</code></td>
-      <td>Vulkan-capable GPUs</td>
+      <td rowspan="5"><strong>Image generation</strong></td>
+      <td rowspan="5"><code>sd-cpp</code></td>
+      <td><code>rocm</code></td>
+      <td>Supported AMD ROCm iGPU/dGPU families*</td>
       <td>Windows, Linux</td>
     </tr>
     <tr>
@@ -231,13 +242,24 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen
       <td>NVIDIA GPUs (Turing or newer)**</td>
       <td>Linux</td>
     </tr>
+    <tr>
+      <td><code>vulkan</code></td>
+      <td>Vulkan-capable GPUs</td>
+      <td>Windows, Linux</td>
+    </tr>
     <tr>
       <td><code>cpu</code></td>
       <td><code>x86_64</code> CPU</td>
       <td>Windows, Linux</td>
     </tr>
+    <tr>
+      <td><code>metal</code></td>
+      <td>Apple Silicon GPU</td>
+      <td>macOS</td>
+    </tr>
   </tbody>
 </table>
+<!-- END GENERATED: backends-matrix -->
 
 To check exactly which recipes/backends are supported on your own machine, run:
 
diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md
index ae5006a4f..7699f97e6 100644
--- a/docs/dev/adding-a-backend.md
+++ b/docs/dev/adding-a-backend.md
@@ -122,8 +122,12 @@ regenerates the registry headers, binding `<stem>::descriptor` to `<stem>::creat
   `--<recipe>` when `selectable_backend = true`.
 - **Install/download** via the backend's `BackendSpec` (binary + install params).
 - **`/system-info`** `recipes` entry (display name, options schema, support matrix).
-- **Generated docs** — your backend appears in
-  [`backends-reference.md`](backends-reference.md) automatically.
+- **Generated docs** — your backend appears automatically in
+  [`backends-reference.md`](backends-reference.md), the README "Supported
+  Configurations" matrix, and the multi-model NPU-exclusivity list. A CI job
+  (`backend-docs-drift`) fails if the committed docs are stale. The descriptor's
+  `modality`, `experimental`, `web_display_name`, and each support row's
+  `device_summary` supply the editorial bits the matrix needs.
 
 ## Escape hatches
 
diff --git a/docs/guide/configuration/multi-model.md b/docs/guide/configuration/multi-model.md
index 30ed840d5..db9944ff9 100644
--- a/docs/guide/configuration/multi-model.md
+++ b/docs/guide/configuration/multi-model.md
@@ -22,7 +22,9 @@ Each type has its own independent LRU cache, all sharing the same slot limit set
 
 ## Device Constraints
 
-- **NPU Exclusivity:** `flm`, `ryzenai-llm`, and `whispercpp` are mutually exclusive on the NPU.
+<!-- BEGIN GENERATED: npu-exclusivity -->
+- **NPU Exclusivity:** `whispercpp`, `flm`, and `ryzenai-llm` are mutually exclusive on the NPU.
+<!-- END GENERATED: npu-exclusivity -->
     - Loading a model from one of these backends will automatically evict all NPU models from the other backends.
     - `flm` supports loading 1 ASR model, 1 LLM, and 1 embedding model on the NPU at the same time.
     - `ryzenai-llm` supports loading exactly 1 LLM, which uses the entire NPU.
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py
index 737715605..895c0a318 100644
--- a/docs/tools/gen_backend_docs.py
+++ b/docs/tools/gen_backend_docs.py
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 """Generate backend reference docs from the self-describing backend descriptors.
 
-The C++ backend descriptors (src/cpp/server/backends/*_descriptor.cpp) are the
-single source of truth for what each backend is. This script boots a `lemond`
+The C++ backend descriptors (src/cpp/include/lemon/backends/<stem>/<stem>.h) are
+the single source of truth for what each backend is. This script boots a `lemond`
 server, reads the descriptor-generated ``/system-info`` ``recipes`` object and
 ``server_models.json``, and rewrites the marker-delimited regions of the target
 doc(s). A CI step runs it with ``--check`` and fails if the committed docs drift.
@@ -110,6 +110,121 @@ def md_escape(text: str) -> str:
     return str(text).replace("|", "\\|")
 
 
+MODALITY_ORDER = [
+    "Text generation",
+    "Speech-to-text",
+    "Text-to-speech",
+    "Image generation",
+]
+OS_LABEL = {"windows": "Windows", "linux": "Linux", "macos": "macOS"}
+OS_ORDER = ["windows", "linux", "macos"]
+
+
+def _fmt_os(os_set) -> str:
+    return ", ".join(OS_LABEL.get(o, o) for o in OS_ORDER if o in os_set)
+
+
+def _code_devices(summary: str) -> str:
+    # Light formatting: render bare arch tokens as <code>, matching the README style.
+    summary = re.sub(r"\bx86_64\b", "<code>x86_64</code>", summary)
+    summary = re.sub(r"\barm64\b", "<code>arm64</code>", summary)
+    return summary
+
+
+def _ordered(recipes: dict) -> list:
+    # Recipes in descriptor registry order (stable, deterministic doc rendering).
+    return sorted(recipes.items(), key=lambda kv: kv[1].get("order", 999))
+
+
+def render_readme_matrix(recipes: dict) -> str:
+    # Group descriptor-backed recipes by modality, in descriptor registry order.
+    by_mod: dict[str, list] = {m: [] for m in MODALITY_ORDER}
+    for recipe, info in _ordered(recipes):
+        mod = info.get("modality")
+        if not mod or mod not in by_mod:
+            continue
+        # Merge support rows sharing a (backend, device summary); union their OS.
+        merged: list[dict] = []
+        seen: dict[tuple, dict] = {}
+        for row in info.get("support", []):
+            key = (row["backend"], row.get("device_summary", ""))
+            if key in seen:
+                seen[key]["os"] |= set(row.get("os", []))
+            else:
+                d = {
+                    "backend": row["backend"],
+                    "summary": row.get("device_summary", ""),
+                    "os": set(row.get("os", [])),
+                }
+                seen[key] = d
+                merged.append(d)
+        if merged:
+            by_mod[mod].append((recipe, info, merged))
+
+    out = [
+        "<table>",
+        "  <thead>",
+        "    <tr>",
+        "      <th>Modality</th>",
+        "      <th>Engine</th>",
+        "      <th>Backend</th>",
+        "      <th>Device</th>",
+        "      <th>OS</th>",
+        "    </tr>",
+        "  </thead>",
+        "  <tbody>",
+    ]
+    for mod in MODALITY_ORDER:
+        recipes_in = by_mod[mod]
+        if not recipes_in:
+            continue
+        mod_span = sum(len(m) for _, _, m in recipes_in)
+        first_mod = True
+        for recipe, info, merged in recipes_in:
+            engine = f"<code>{recipe}</code>" + (
+                " (experimental)" if info.get("experimental") else ""
+            )
+            first_recipe = True
+            for d in merged:
+                out.append("    <tr>")
+                if first_mod:
+                    out.append(
+                        f'      <td rowspan="{mod_span}"><strong>{mod}</strong></td>'
+                    )
+                    first_mod = False
+                if first_recipe:
+                    out.append(f'      <td rowspan="{len(merged)}">{engine}</td>')
+                    first_recipe = False
+                out.append(f'      <td><code>{d["backend"]}</code></td>')
+                out.append(f"      <td>{_code_devices(d['summary'])}</td>")
+                out.append(f"      <td>{_fmt_os(d['os'])}</td>")
+                out.append("    </tr>")
+    out += ["  </tbody>", "</table>"]
+    return "\n".join(out)
+
+
+def _oxford(items: list) -> str:
+    items = [f"`{i}`" for i in items]
+    if len(items) <= 1:
+        return "".join(items)
+    if len(items) == 2:
+        return f"{items[0]} and {items[1]}"
+    return ", ".join(items[:-1]) + f", and {items[-1]}"
+
+
+def render_npu_exclusivity(recipes: dict) -> str:
+    npu = [
+        r
+        for r, info in _ordered(recipes)
+        if any(
+            row.get("backend") == "npu"
+            or any(d.get("device") == "amd_npu" for d in row.get("devices", []))
+            for row in info.get("support", [])
+        )
+    ]
+    return f"- **NPU Exclusivity:** {_oxford(npu)} are mutually exclusive on the NPU."
+
+
 def render_overview(recipes: dict) -> str:
     rows = [
         "| Recipe | Name | Selectable backend | Uses ctx_size | Backends |",
@@ -281,27 +396,55 @@ def main() -> int:
     if not recipes:
         sys.exit("/system-info returned no recipes")
 
-    sections = {
-        "backends-overview": render_overview(recipes),
-        "backends-matrix": render_support_matrix(recipes),
-        "backend-options": render_options(recipes),
-        "backend-models": render_models(recipes),
+    # Each target doc maps marker IDs -> generated content. backends-reference.md
+    # is created from a template if missing; the others must already contain their
+    # markers (the regions were added to the curated docs by hand once).
+    targets: dict = {
+        TARGET_DOC: {
+            "sections": {
+                "backends-overview": render_overview(recipes),
+                "backends-matrix": render_support_matrix(recipes),
+                "backend-options": render_options(recipes),
+                "backend-models": render_models(recipes),
+            },
+            "template": DEFAULT_TEMPLATE,
+        },
+        REPO_ROOT
+        / "README.md": {
+            "sections": {"backends-matrix": render_readme_matrix(recipes)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "configuration"
+        / "multi-model.md": {
+            "sections": {"npu-exclusivity": render_npu_exclusivity(recipes)},
+        },
     }
 
-    current = TARGET_DOC.read_text() if TARGET_DOC.exists() else DEFAULT_TEMPLATE
-    updated = apply_sections(current, sections)
+    stale = []
+    for path, spec in targets.items():
+        rel = path.relative_to(REPO_ROOT)
+        current = path.read_text() if path.exists() else spec.get("template", "")
+        if not current:
+            sys.exit(f"{rel} is missing and has no template")
+        updated = apply_sections(current, spec["sections"])
+        if args.check:
+            if not path.exists() or path.read_text() != updated:
+                stale.append(str(rel))
+        else:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(updated)
+            print(f"Wrote {rel}")
 
     if args.check:
-        if not TARGET_DOC.exists() or TARGET_DOC.read_text() != updated:
+        if stale:
             sys.exit(
-                f"{TARGET_DOC.relative_to(REPO_ROOT)} is stale. Run: python docs/tools/gen_backend_docs.py"
+                "Stale generated docs: "
+                + ", ".join(stale)
+                + "\nRun: python docs/tools/gen_backend_docs.py"
             )
-        print(f"{TARGET_DOC.relative_to(REPO_ROOT)} is up to date.")
-        return 0
-
-    TARGET_DOC.parent.mkdir(parents=True, exist_ok=True)
-    TARGET_DOC.write_text(updated)
-    print(f"Wrote {TARGET_DOC.relative_to(REPO_ROOT)}")
+        print("All generated docs are up to date.")
     return 0
 
 
diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index fc6c50bc2..4b26246b6 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -29,6 +29,16 @@ enum class SlotPolicy {
     Unmetered      // never counts toward slots, never auto-evicted (cloud)
 };
 
+inline const char* slot_policy_to_string(SlotPolicy p) {  // snake_case name for a SlotPolicy value
+    switch (p) {
+        case SlotPolicy::Standard:      return "standard";
+        case SlotPolicy::ExclusiveNpu:  return "exclusive_npu";
+        case SlotPolicy::CoexistByType: return "coexist_by_type";
+        case SlotPolicy::Unmetered:     return "unmetered";
+    }
+    return "standard";  // reached only when p matches none of the cases above
+}
+
 // Plain data declaring *what a backend is*. This is the single object the
 // registry, the CLI, /system-info, and the docs all read. Behavior lives in the
 // paired WrappedServer subclass (see backend_registry.h for how they bind).
@@ -49,6 +59,11 @@ struct BackendDescriptor {
     std::vector<std::string>      default_labels;                // labels injected when a model omits them
     std::vector<std::string>      required_checkpoints{"main"};  // unconditional files; conditional ones checked in load()
 
+    // Editorial metadata for the generated docs (README support matrix, website).
+    std::string modality;           // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation"
+    bool        experimental = false; // true renders "(experimental)" next to the recipe in generated docs
+    std::string web_display_name;   // name used on the docs website ("" = fall back to display_name)
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
diff --git a/src/cpp/include/lemon/backends/cloud/cloud.h b/src/cpp/include/lemon/backends/cloud/cloud.h
index 2ad4f3186..9d4f5559b 100644
--- a/src/cpp/include/lemon/backends/cloud/cloud.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud.h
@@ -22,6 +22,9 @@ inline const BackendDescriptor descriptor = {
     /*support*/ {},             // no local gating: install/support machinery skips cloud
     /*default_labels*/  {},
     /*required_checkpoints*/ {},  // no downloaded files
+    /*modality*/        "",
+    /*experimental*/    false,
+    /*web_display_name*/ "",
 };
 
 }  // namespace cloud
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index 648d84c57..b5b04b853 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -24,10 +24,13 @@ inline const BackendDescriptor descriptor = {
     /*dynamic_models*/  false,
     /*options*/ {},
     /*support*/ {
-        {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}},
+        {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "FastFlowLM NPU",
 };
 
 }  // namespace fastflowlm
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
index f0492576f..69cb17dc2 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -24,11 +24,14 @@ inline const BackendDescriptor descriptor = {
     /*dynamic_models*/  false,
     /*options*/ {},
     /*support*/ {
-        {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}},
-        {"kokoro", "metal", {"macos"}, {{"metal", {}}}},
+        {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"kokoro", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
     },
     /*default_labels*/  {},  // kokoro models carry "tts" explicitly in server_models.json
     /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text-to-speech",
+    /*experimental*/    false,
+    /*web_display_name*/ "",
 };
 
 }  // namespace kokoro
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index 8348f877e..ec101dd3c 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -31,17 +31,20 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"},
     },
     /*support*/ {
-        {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}},
-        {"llamacpp", "metal", {"macos"}, {{"metal", {}}}},
+        {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"},
+        {"llamacpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
         {"llamacpp", "cuda", {"windows", "linux"},
-         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}},
-        {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}},
+         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"},
+        {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"},
         {"llamacpp", "rocm", {"windows", "linux"},
-         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}},
-        {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
+        {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "llama.cpp GPU",
 };
 
 }  // namespace llamacpp
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h
index 28b3e3e58..81f45dc25 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -23,12 +23,15 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to moonshine-server", ""},
     },
     /*support*/ {
-        {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}},
-        {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}},
-        {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}},
+        {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"},
+        {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"},
+        {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"},
     },
     /*default_labels*/  {"transcription", "realtime-transcription"},
     /*required_checkpoints*/ {"main"},
+    /*modality*/        "Speech-to-text",
+    /*experimental*/    false,
+    /*web_display_name*/ "",
 };
 
 }  // namespace moonshine
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index c1896ee7e..2df87cc2e 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -24,10 +24,13 @@ inline const BackendDescriptor descriptor = {
     /*dynamic_models*/  false,
     /*options*/ {},
     /*support*/ {
-        {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}},
+        {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "Ryzen AI SW NPU",
 };
 
 }  // namespace ryzenai
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 323ec11bc..3b8f78e85 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -37,15 +37,18 @@ inline const BackendDescriptor descriptor = {
     },
     /*support*/ {
         {"sd-cpp", "rocm", {"windows", "linux"},
-         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
         {"sd-cpp", "cuda", {"linux"},
-         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}},
-        {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}},
-        {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}},
-        {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}},
+         {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"},
+        {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"},
+        {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
     },
     /*default_labels*/  {"image"},
     /*required_checkpoints*/ {"main"},  // flux text_encoder+vae validated together in load()
+    /*modality*/        "Image generation",
+    /*experimental*/    false,
+    /*web_display_name*/ "stable-diffusion.cpp",
 };
 
 }  // namespace sdcpp
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
index 5d0210a37..6f468a1ed 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -25,10 +25,13 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to vllm-server", "vLLM Options"},
     },
     /*support*/ {
-        {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}},
+        {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
+    /*modality*/        "Text generation",
+    /*experimental*/    true,
+    /*web_display_name*/ "",
 };
 
 }  // namespace vllm
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index f49fca08f..1a031b6e3 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -29,15 +29,18 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to whisper-server", "Whisper.cpp Options"},
     },
     /*support*/ {
-        {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}},
+        {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
         {"whispercpp", "rocm", {"windows", "linux"},
-         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}},
-        {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}},
-        {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}},
-        {"whispercpp", "metal", {"macos"}, {{"metal", {}}}},
+         {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
+        {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"},
+        {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"whispercpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
     },
     /*default_labels*/  {"transcription", "realtime-transcription"},
     /*required_checkpoints*/ {"main"},  // npu_cache validated in load() (npu variant only)
+    /*modality*/        "Speech-to-text",
+    /*experimental*/    false,
+    /*web_display_name*/ "whisper.cpp",
 };
 
 }  // namespace whispercpp
diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h
index 1557db077..829ff0f78 100644
--- a/src/cpp/include/lemon/recipe_backend_def.h
+++ b/src/cpp/include/lemon/recipe_backend_def.h
@@ -21,6 +21,9 @@ struct RecipeBackendDef {
     std::string backend;
     std::set<std::string> supported_os;
     DeviceConstraints devices;
+    // Human-friendly device description for the generated support matrix (README).
+    // May contain footnote markers (e.g. "*") whose text lives as prose in the doc.
+    std::string device_summary = "";
 };
 
 } // namespace lemon
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 6a27a4fb2..730d2c985 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -1505,17 +1505,25 @@ json SystemInfo::build_recipes_info(const json& devices) {
     // app, the docs generator) can render display names and per-recipe option
     // schemas without hardcoding them. This is the single source the frontend
     // reads instead of its own per-recipe TypeScript tables.
+    int recipe_order = 0;
     for (const auto* desc : lemon::backends::all_descriptors()) {
         auto it = recipes.find(desc->recipe);
         if (it == recipes.end()) {
+            ++recipe_order;
             continue;  // recipe not surfaced on this system (e.g. cloud has no support rows)
         }
         json& entry = it.value();
+        entry["order"] = recipe_order++;  // descriptor registry order, for deterministic doc rendering
         entry["display_name"] = desc->display_name;
         entry["selectable_backend"] = desc->selectable_backend;
         entry["uses_ctx_size"] = desc->uses_ctx_size;
-        // Machine-independent support matrix (OS + device families per backend),
-        // straight from the descriptor — used by the docs generator.
+        entry["modality"] = desc->modality;
+        entry["experimental"] = desc->experimental;
+        entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name;
+        entry["slot_policy"] = slot_policy_to_string(desc->slot_policy);
+        // Machine-independent support matrix (OS + device families + friendly
+        // device summary per backend), straight from the descriptor — used by the
+        // docs generator to render the README support matrix etc.
         json support = json::array();
         for (const auto& row : desc->support) {
             json devices = json::array();
@@ -1527,6 +1535,7 @@ json SystemInfo::build_recipes_info(const json& devices) {
                 {"backend", row.backend},
                 {"os", std::vector<std::string>(row.supported_os.begin(), row.supported_os.end())},
                 {"devices", devices},
+                {"device_summary", row.device_summary},
             });
         }
         entry["support"] = support;

From 84616c42f73092be4db66fd677218bff4404b12f Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 19 Jun 2026 20:54:04 -0400
Subject: [PATCH 04/39] docs(cli): mechanize the per-recipe load-options tables
 from descriptors

Wrap cli.md's "Recipe-Specific Options" tables in GENERATED markers and render
them from the descriptor options. This also fixes pre-existing drift: the section
documented `--steps`/`--cfg-scale`/`--width`/`--height` flags that the CLI no
longer registers, and omitted the moonshine and vllm recipes. Now covered by the
backend-docs-drift CI check.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/guide/cli.md              | 44 +++++++++++++++++++++------------
 docs/tools/gen_backend_docs.py | 45 ++++++++++++++++++++++++++++++++++
 2 files changed, 73 insertions(+), 16 deletions(-)

diff --git a/docs/guide/cli.md b/docs/guide/cli.md
index 8749b1661..cad252dcb 100644
--- a/docs/guide/cli.md
+++ b/docs/guide/cli.md
@@ -325,44 +325,56 @@ The following options apply to all model loads:
 
 The following options are available depending on the recipe being used:
 
-#### Llama.cpp (`llamacpp` recipe)
+<!-- BEGIN GENERATED: cli-recipe-options -->
+#### Llama.cpp GPU (`llamacpp` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--ctx-size SIZE` | Context size for the model | `4096` |
+| `--ctx-size SIZE` | Context size for the model | auto |
 | `--llamacpp BACKEND` | LlamaCpp backend to use | Auto-detected |
-| `--llamacpp-device DEVICE` | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | (empty) |
-| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server (must not conflict with managed args) | `""` |
+| `--llamacpp-device DEVICES` | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | `""` |
+| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server | `""` |
 
-#### FLM (`flm` recipe)
+#### Whisper.cpp (`whispercpp` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--ctx-size SIZE` | Context size for the model | `4096` |
+| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected |
+| `--whispercpp-args ARGS` | Custom arguments to pass to whisper-server | `""` |
 
-#### RyzenAI LLM (`ryzenai-llm` recipe)
+#### Moonshine (`moonshine` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--ctx-size SIZE` | Context size for the model | `4096` |
+| `--moonshine-args ARGS` | Custom arguments to pass to moonshine-server | `""` |
 
-#### SD.cpp (`sd-cpp` recipe)
+#### StableDiffusion.cpp (`sd-cpp` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--sdcpp BACKEND` | SD.cpp backend to use (`cpu` for CPU, `rocm` for AMD GPU) | Auto-detected |
+| `--sdcpp BACKEND` | SD.cpp backend to use | Auto-detected |
 | `--sdcpp-args ARGS` | Custom arguments to pass to sd-server (must not conflict with managed args) | `""` |
-| `--steps N` | Number of inference steps for image generation | `20` |
-| `--cfg-scale SCALE` | Classifier-free guidance scale for image generation | `7.0` |
-| `--width PX` | Image width in pixels | `512` |
-| `--height PX` | Image height in pixels | `512` |
 
-#### Whisper.cpp (`whispercpp` recipe)
+#### FastFlowLM NPU (`flm` recipe)
 
 | Option | Description | Default |
 |--------|-------------|---------|
-| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected |
+| `--ctx-size SIZE` | Context size for the model | auto |
+
+#### Ryzen AI LLM (`ryzenai-llm` recipe)
 
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--ctx-size SIZE` | Context size for the model | auto |
+
+#### vLLM ROCm (experimental) (`vllm` recipe)
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `--ctx-size SIZE` | Context size for the model | auto |
+| `--vllm BACKEND` | vLLM backend to use | Auto-detected |
+| `--vllm-args ARGS` | Custom arguments to pass to vllm-server | `""` |
+<!-- END GENERATED: cli-recipe-options -->
 **Notes:**
 - Unspecified options will use the backend's default values
 - Backend options (`--llamacpp`, `--sdcpp`, `--whispercpp`) are auto-detected based on system capabilities
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py
index 895c0a318..49f9db2ba 100644
--- a/docs/tools/gen_backend_docs.py
+++ b/docs/tools/gen_backend_docs.py
@@ -203,6 +203,45 @@ def render_readme_matrix(recipes: dict) -> str:
     return "\n".join(out)
 
 
+def _cli_default(opt: dict) -> str:  # Markdown text describing an option's default value
+    d = opt.get("default")
+    if opt.get("type_name") == "BACKEND" and d == "":
+        return "Auto-detected"  # BACKEND-typed option whose default is the empty string
+    if isinstance(d, str):
+        return '`""`' if d == "" else f"`{d}`"  # empty string is shown as a literal ""
+    if isinstance(d, bool):
+        return f"`{str(d).lower()}`"  # lowercase true/false rather than Python's True/False
+    if d == -1:
+        return "auto"  # -1 is rendered as "auto"
+    return f"`{d}`"
+
+
+def render_cli_recipe_options(recipes: dict) -> str:
+    # Per-recipe load options, exactly as the CLI registers them from descriptors.
+    # Recipes with no CLI options (kokoro, cloud) are omitted.
+    blocks: list[str] = []
+    for recipe, info in _ordered(recipes):
+        cli_opts = [o for o in info.get("options", []) if o.get("cli_flag")]
+        if not info.get("uses_ctx_size") and not cli_opts:
+            continue
+        blocks.append(f"#### {info.get('display_name', recipe)} (`{recipe}` recipe)\n")
+        blocks.append("| Option | Description | Default |")
+        blocks.append("|--------|-------------|---------|")
+        if info.get("uses_ctx_size"):
+            blocks.append("| `--ctx-size SIZE` | Context size for the model | auto |")
+        for o in cli_opts:
+            blocks.append(
+                "| `{usage}` | {h} | {d} |".format(
+                    # A flag with no value placeholder renders bare, not "`--flag `".
+                    usage=f"{o['cli_flag']} {o.get('type_name', '')}".rstrip(),
+                    h=md_escape(o.get("help", "")),
+                    d=_cli_default(o),
+                )
+            )
+        blocks.append("")
+    return "\n".join(blocks).rstrip()
+
+
 def _oxford(items: list) -> str:
     items = [f"`{i}`" for i in items]
     if len(items) <= 1:
@@ -420,6 +459,12 @@ def main() -> int:
         / "multi-model.md": {
             "sections": {"npu-exclusivity": render_npu_exclusivity(recipes)},
         },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "cli.md": {
+            "sections": {"cli-recipe-options": render_cli_recipe_options(recipes)},
+        },
     }
 
     stale = []

From 2d1fd36b14265ec8fc1b1e090b03300eb65f1a04 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 19 Jun 2026 20:56:08 -0400
Subject: [PATCH 05/39] docs(custom-models): mechanize the --recipe value list
 from descriptors

Add inline-marker support to the generator and wrap the `--recipe` "Common
values" list in custom-models.md so it renders from the descriptor recipe set
(plus collection.omni). Now covered by the backend-docs-drift CI check.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/guide/configuration/custom-models.md |  2 +-
 docs/tools/gen_backend_docs.py            | 22 ++++++++++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/docs/guide/configuration/custom-models.md b/docs/guide/configuration/custom-models.md
index c3e770442..5a7dbd878 100644
--- a/docs/guide/configuration/custom-models.md
+++ b/docs/guide/configuration/custom-models.md
@@ -71,7 +71,7 @@ Supported registration flags:
 | Flag | Description |
 |------|-------------|
 | `--checkpoint TYPE CHECKPOINT` | Add a checkpoint entry. Repeat for multi-file models such as `main` + `mmproj` or `main` + `vae`. |
-| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: `llamacpp`, `flm`, `ryzenai-llm`, `vllm`, `whispercpp`, `moonshine`, `sd-cpp`, `kokoro`, `collection.omni`. |
+| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: <!-- BEGIN GENERATED: recipe-values -->`llamacpp`, `whispercpp`, `moonshine`, `kokoro`, `sd-cpp`, `flm`, `ryzenai-llm`, `vllm`, `collection.omni`<!-- END GENERATED: recipe-values -->. |
 | `--label LABEL` | Add a label to the new model. Repeatable. Valid labels include `coding`, `embeddings`, `hot`, `mtp`, `reasoning`, `reranking`, `tool-calling`, `vision`. |
 | `--components MODEL [MODEL ...]` | Components for an omni collection (see below). Use with `--recipe collection.omni`. |
 
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py
index 49f9db2ba..357d58aa6 100644
--- a/docs/tools/gen_backend_docs.py
+++ b/docs/tools/gen_backend_docs.py
@@ -251,6 +251,12 @@ def _oxford(items: list) -> str:
     return ", ".join(items[:-1]) + f", and {items[-1]}"
 
 
+def render_recipe_values(recipes: dict) -> str:
+    # Inline list of recipe values for `--recipe`, plus the collection orchestrator.
+    rs = [r for r, _ in _ordered(recipes)] + ["collection.omni"]
+    return ", ".join(f"`{r}`" for r in rs)
+
+
 def render_npu_exclusivity(recipes: dict) -> str:
     npu = [
         r
@@ -409,11 +415,16 @@ def apply_sections(text: str, sections: dict[str, str]) -> str:
             + r" -->)",
             re.DOTALL,
         )
-        if not pattern.search(text):
+        m = pattern.search(text)
+        if not m:
             sys.exit(f"Marker region '{marker_id}' not found in target doc")
+        # Inline regions (markers mid-line, e.g. inside a table cell) get no
+        # surrounding newlines; block regions are wrapped on their own lines.
+        inline = m.start() > 0 and text[m.start() - 1] != "\n"
         # Escape backslashes and group-ref markers in the body for re.sub.
         safe_body = body.replace("\\", "\\\\")
-        replacement = r"\1" + "\n" + safe_body + "\n" + r"\2"
+        sep = "" if inline else "\n"
+        replacement = r"\1" + sep + safe_body + sep + r"\2"
         text = pattern.sub(replacement, text)
     return text
 
@@ -465,6 +476,13 @@ def main() -> int:
         / "cli.md": {
             "sections": {"cli-recipe-options": render_cli_recipe_options(recipes)},
         },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "configuration"
+        / "custom-models.md": {
+            "sections": {"recipe-values": render_recipe_values(recipes)},
+        },
     }
 
     stale = []

From 9b8383cbc87aaa5db768684e120cd4c020686f74 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 13:44:42 -0400
Subject: [PATCH 06/39] docs: mechanize config.json example and models.js
 recipe metadata
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Close the last two cleanly-derivable doc touchpoints (appendix rows 16 and 21).

- configuration/README.md "Example config.json": generated from a fresh lemond's
  GET /internal/config (the real canonical config). This also fixes pre-existing
  drift — the hand-written block had `config_version: 1` (now 2), `prefer_system:
  false` (now true), a stray `device` key, and an invalid trailing comma. `port`
  is normalized to the documented default 13305.
- docs/assets/models.js RECIPE_PRIORITY + RECIPE_DISPLAY_NAMES: generated from
  descriptors. A new `web_priority` editorial field preserves the curated website
  ordering (so the order is descriptor-sourced, not a silent reorder); legacy
  `oga-*` recipes are dropped as agreed. Adds the correct `vllm` display name.

The generator now drives 7 docs and supports both `<!-- -->` (Markdown) and
`/* */` (JS) GENERATED markers. backend-docs-drift --check covers all of them.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/assets/models.js                         | 12 +--
 docs/guide/configuration/README.md            | 89 +++++++++++--------
 docs/tools/gen_backend_docs.py                | 89 +++++++++++++++++--
 .../lemon/backends/backend_descriptor.h       |  1 +
 .../lemon/backends/fastflowlm/fastflowlm.h    |  1 +
 .../include/lemon/backends/kokoro/kokoro.h    |  1 +
 .../lemon/backends/llamacpp/llamacpp.h        |  1 +
 .../include/lemon/backends/ryzenai/ryzenai.h  |  1 +
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h  |  1 +
 .../lemon/backends/whispercpp/whispercpp.h    |  1 +
 src/cpp/server/system_info.cpp                |  1 +
 11 files changed, 148 insertions(+), 50 deletions(-)

diff --git a/docs/assets/models.js b/docs/assets/models.js
index 5bb604006..d9814cccb 100644
--- a/docs/assets/models.js
+++ b/docs/assets/models.js
@@ -2,25 +2,25 @@ const GITHUB_REPO = 'lemonade-sdk/lemonade';
 const TAGS_URL = `https://api.github.com/repos/${GITHUB_REPO}/tags?per_page=100`;
 const RAW_BASE = 'https://raw.githubusercontent.com/lemonade-sdk/lemonade';
 
+/* BEGIN GENERATED: models-js-recipes */
 const RECIPE_PRIORITY = [
   'llamacpp',
   'ryzenai-llm',
   'flm',
   'whispercpp',
   'sd-cpp',
-  'oga-hybrid',
-  'oga-npu',
-  'oga-cpu',
   'kokoro'
 ];
 
 const RECIPE_DISPLAY_NAMES = {
   llamacpp: 'llama.cpp GPU',
-  'ryzenai-llm': 'Ryzen AI SW NPU',
-  flm: 'FastFlowLM NPU',
   whispercpp: 'whisper.cpp',
-  'sd-cpp': 'stable-diffusion.cpp'
+  'sd-cpp': 'stable-diffusion.cpp',
+  flm: 'FastFlowLM NPU',
+  'ryzenai-llm': 'Ryzen AI SW NPU',
+  vllm: 'vLLM ROCm (experimental)'
 };
+/* END GENERATED: models-js-recipes */
 
 const state = {
   tag: null,
diff --git a/docs/guide/configuration/README.md b/docs/guide/configuration/README.md
index 93977148c..2a388dc8f 100644
--- a/docs/guide/configuration/README.md
+++ b/docs/guide/configuration/README.md
@@ -31,68 +31,81 @@ Values set in the user's `config.json` always take precedence over these seeded
 
 ### Example config.json
 
+<!-- BEGIN GENERATED: config-example -->
 ```json
 {
-  "config_version": 1,
-  "port": 13305,
-  "host": "localhost",
-  "log_level": "info",
-  "global_timeout": 600,
-  "max_loaded_models": 1,
-  "no_broadcast": false,
-  "extra_models_dir": "",
-  "models_dir": "auto",
+  "cloud_providers": [],
+  "config_version": 2,
   "ctx_size": -1,
-  "offline": false,
-  "no_fetch_executables": false,
   "disable_model_filtering": false,
   "enable_dgpu_gtt": false,
-  "rocm_channel": "stable",
+  "extra_models_dir": "",
+  "flm": {
+    "args": ""
+  },
+  "global_timeout": 600,
+  "host": "localhost",
+  "kokoro": {
+    "cpu_bin": "builtin"
+  },
   "llamacpp": {
-    "backend": "auto",
     "args": "",
-    "vulkan_args": "",
-    "rocm_args": "",
+    "backend": "auto",
     "cpu_args": "",
-	"device": "",
-    "prefer_system": false,
+    "cpu_bin": "builtin",
+    "cuda_bin": "builtin",
+    "prefer_system": true,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin",
-    "cpu_bin": "builtin"
+    "vulkan_args": "",
+    "vulkan_bin": "builtin"
   },
-  "whispercpp": {
-    "backend": "auto",
+  "log_level": "info",
+  "max_loaded_models": 1,
+  "models_dir": "auto",
+  "moonshine": {
     "args": "",
     "cpu_args": "",
-    "npu_args": "",
-    "cpu_bin": "builtin",
-    "npu_bin": "builtin"
+    "cpu_bin": "builtin"
+  },
+  "no_broadcast": false,
+  "no_fetch_executables": false,
+  "offline": false,
+  "port": 13305,
+  "rocm_channel": "stable",
+  "ryzenai": {
+    "server_bin": "builtin"
   },
   "sdcpp": {
-    "backend": "auto",
     "args": "",
-    "cpu_args": "",
-    "rocm_args": "",
-    "vulkan_args": "",
-    "steps": 20,
+    "backend": "auto",
     "cfg_scale": 7.0,
-    "width": 512,
-    "height": 512,
+    "cpu_args": "",
     "cpu_bin": "builtin",
+    "height": 512,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin"
+    "steps": 20,
+    "vulkan_args": "",
+    "vulkan_bin": "builtin",
+    "width": 512
   },
-  "flm": {
+  "vllm": {
     "args": "",
+    "backend": "auto"
   },
-  "ryzenai": {
-    "server_bin": "builtin"
-  },
-  "kokoro": {
-    "cpu_bin": "builtin"
+  "websocket_port": "auto",
+  "whispercpp": {
+    "args": "",
+    "backend": "auto",
+    "cpu_args": "",
+    "cpu_bin": "builtin",
+    "npu_args": "",
+    "npu_bin": "builtin"
   }
 }
 ```
+<!-- END GENERATED: config-example -->
 
 ### Settings Reference
 
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py
index 357d58aa6..8e5bf3133 100644
--- a/docs/tools/gen_backend_docs.py
+++ b/docs/tools/gen_backend_docs.py
@@ -105,6 +105,9 @@ def _get(self, path: str, timeout: float = 5):
     def system_info(self) -> dict:
         return json.loads(self._get("/api/v1/system-info", timeout=30))
 
+    def config(self) -> dict:
+        return json.loads(self._get("/internal/config", timeout=10))
+
 
 def md_escape(text: str) -> str:
     return str(text).replace("|", "\\|")
@@ -251,6 +254,53 @@ def _oxford(items: list) -> str:
     return ", ".join(items[:-1]) + f", and {items[-1]}"
 
 
+def _js_to_title(recipe: str) -> str:
+    # Mirror models.js toTitle(): the website's fallback for unlisted display names.
+    return re.sub(
+        r"\b\w",
+        lambda m: m.group(0).upper(),
+        recipe.replace("_", " ").replace("-", " "),
+    )
+
+
+def _js_key(recipe: str) -> str:
+    # Bare identifier if it's a valid JS key, else quoted (matches models.js style).
+    return recipe if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", recipe) else f"'{recipe}'"
+
+
+def render_models_js(recipes: dict) -> str:
+    # RECIPE_PRIORITY: recipes with web_priority > 0, ascending by priority
+    # (legacy oga-* recipes have no descriptor and are intentionally dropped).
+    prioritized = sorted(
+        (r for r, i in recipes.items() if i.get("web_priority", 0) > 0),
+        key=lambda r: recipes[r]["web_priority"],
+    )
+    pri_lines = ",\n".join(f"  '{r}'" for r in prioritized)
+
+    # RECIPE_DISPLAY_NAMES: only recipes whose name differs from the JS toTitle()
+    # fallback (matching the curated map, which omits redundant entries).
+    name_lines = []
+    for r, info in _ordered(recipes):
+        name = info.get("web_display_name") or info.get("display_name", r)
+        if name and name != _js_to_title(r):
+            name_lines.append(f"  {_js_key(r)}: '{name}'")
+    names = ",\n".join(name_lines)
+
+    return (
+        f"const RECIPE_PRIORITY = [\n{pri_lines}\n];\n\n"
+        f"const RECIPE_DISPLAY_NAMES = {{\n{names}\n}};"
+    )
+
+
+def render_config_example(config: dict) -> str:
+    # The canonical config.json, straight from a fresh lemond's /internal/config.
+    # `port` is the only environment-dependent field (it reflects the launch port);
+    # normalize it to the documented default.
+    cfg = dict(config)
+    cfg["port"] = 13305
+    return "```json\n" + json.dumps(cfg, indent=2) + "\n```"
+
+
 def render_recipe_values(recipes: dict) -> str:
     # Inline list of recipe values for `--recipe`, plus the collection orchestrator.
     rs = [r for r, _ in _ordered(recipes)] + ["collection.omni"]
@@ -407,17 +457,28 @@ def render_models(recipes: dict) -> str:
 
 def apply_sections(text: str, sections: dict[str, str]) -> str:
     for marker_id, body in sections.items():
-        pattern = re.compile(
+        # Accept HTML (`<!-- ... -->`) markers for Markdown and block (`/* ... */`)
+        # markers for code files like .js, so the same generator drives both.
+        mid = re.escape(marker_id)
+        begin = (
             r"(<!-- BEGIN GENERATED: "
-            + re.escape(marker_id)
-            + r" -->).*?(<!-- END GENERATED: "
-            + re.escape(marker_id)
-            + r" -->)",
-            re.DOTALL,
+            + mid
+            + r" -->|/\* BEGIN GENERATED: "
+            + mid
+            + r" \*/)"
         )
+        end = (
+            r"(<!-- END GENERATED: "
+            + mid
+            + r" -->|/\* END GENERATED: "
+            + mid
+            + r" \*/)"
+        )
+        pattern = re.compile(begin + r".*?" + end, re.DOTALL)
         m = pattern.search(text)
         if not m:
             sys.exit(f"Marker region '{marker_id}' not found in target doc")
+
         # Inline regions (markers mid-line, e.g. inside a table cell) get no
         # surrounding newlines; block regions are wrapped on their own lines.
         inline = m.start() > 0 and text[m.start() - 1] != "\n"
@@ -442,9 +503,12 @@ def main() -> int:
     binary = find_lemond(args.lemond)
     with Lemond(binary) as server:
         info = server.system_info()
+        config = server.config()
     recipes = info.get("recipes", {})
     if not recipes:
         sys.exit("/system-info returned no recipes")
+    if not config:
+        sys.exit("/internal/config returned nothing")
 
     # Each target doc maps marker IDs -> generated content. backends-reference.md
     # is created from a template if missing; the others must already contain their
@@ -483,6 +547,19 @@ def main() -> int:
         / "custom-models.md": {
             "sections": {"recipe-values": render_recipe_values(recipes)},
         },
+        REPO_ROOT
+        / "docs"
+        / "guide"
+        / "configuration"
+        / "README.md": {
+            "sections": {"config-example": render_config_example(config)},
+        },
+        REPO_ROOT
+        / "docs"
+        / "assets"
+        / "models.js": {
+            "sections": {"models-js-recipes": render_models_js(recipes)},
+        },
     }
 
     stale = []
diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index 4b26246b6..3b9cdb2fb 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -63,6 +63,7 @@ struct BackendDescriptor {
     std::string modality;           // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation"
     bool        experimental = false; // true renders "(experimental)" next to the recipe in generated docs
     std::string web_display_name;   // name used on the docs website ("" = fall back to display_name)
+    int         web_priority = 0;    // model-grouping order on the docs website (lower value is listed first; 0 = unlisted)
 
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index b5b04b853..b9efb610b 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -31,6 +31,7 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    false,
     /*web_display_name*/ "FastFlowLM NPU",
+    /*web_priority*/    3,
 };
 
 }  // namespace fastflowlm
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
index 69cb17dc2..4663d3ad3 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -32,6 +32,7 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text-to-speech",
     /*experimental*/    false,
     /*web_display_name*/ "",
+    /*web_priority*/    6,
 };
 
 }  // namespace kokoro
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index ec101dd3c..19d63c370 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -45,6 +45,7 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    false,
     /*web_display_name*/ "llama.cpp GPU",
+    /*web_priority*/    1,
 };
 
 }  // namespace llamacpp
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index 2df87cc2e..4171dbe93 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -31,6 +31,7 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    false,
     /*web_display_name*/ "Ryzen AI SW NPU",
+    /*web_priority*/    2,
 };
 
 }  // namespace ryzenai
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 3b8f78e85..2e12af119 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -49,6 +49,7 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Image generation",
     /*experimental*/    false,
     /*web_display_name*/ "stable-diffusion.cpp",
+    /*web_priority*/    5,
 };
 
 }  // namespace sdcpp
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index 1a031b6e3..ce2014dec 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -41,6 +41,7 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Speech-to-text",
     /*experimental*/    false,
     /*web_display_name*/ "whisper.cpp",
+    /*web_priority*/    4,
 };
 
 }  // namespace whispercpp
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 730d2c985..cdf089843 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -1520,6 +1520,7 @@ json SystemInfo::build_recipes_info(const json& devices) {
         entry["modality"] = desc->modality;
         entry["experimental"] = desc->experimental;
         entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name;
+        entry["web_priority"] = desc->web_priority;
         entry["slot_policy"] = slot_policy_to_string(desc->slot_policy);
         // Machine-independent support matrix (OS + device families + friendly
         // device summary per backend), straight from the descriptor — used by the

From 566ea83875d7eab50f754524185d64674cccb19e Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 14:14:47 -0400
Subject: [PATCH 07/39] refactor(backends): finish agreed touchpoints rows 4 &
 5 (registry-drive spec; drop device map)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two agreed plan touchpoints were left incomplete; this finishes them.

Row 4 — try_get_spec_for_recipe was still a hand-written 8-branch if-ladder in
backend_utils.cpp, which also forced it to #include all 8 server headers. Each
backend now exposes a uniform `spec()` accessor (alongside create()); the
generated factory registry binds it. `backends::spec_for(recipe)` scans the
registry; try_get_spec_for_recipe delegates to it. backend_utils.cpp now includes
ZERO server headers. Also reroute the two leaking `Server::SPEC` references
(model_manager find_flm_binary) through the registry.

Row 5 — get_device_type_from_recipe still carried the full recipe->device map,
redundant with BackendDescriptor::default_device. Reduced to a DEVICE_NONE
fallback for non-descriptor recipes (collections/unknown); the descriptor is the
single source via ModelManager::device_type_for_recipe.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                                |  2 +-
 .../include/lemon/backends/backend_registry.h |  8 +++++-
 .../lemon/backends/cloud/cloud_server.h       |  1 +
 .../backends/fastflowlm/fastflowlm_server.h   |  1 +
 .../lemon/backends/kokoro/kokoro_server.h     |  1 +
 .../lemon/backends/llamacpp/llamacpp_server.h |  1 +
 .../backends/moonshine/moonshine_server.h     |  1 +
 .../lemon/backends/ryzenai/ryzenai_server.h   |  1 +
 .../lemon/backends/sdcpp/sdcpp_server.h       |  1 +
 .../include/lemon/backends/vllm/vllm_server.h |  1 +
 .../backends/whispercpp/whispercpp_server.h   |  1 +
 src/cpp/include/lemon/model_types.h           | 28 +++++--------------
 src/cpp/server/backends/backend_registry.cpp  |  9 ++++++
 src/cpp/server/backends/backend_utils.cpp     | 21 +++-----------
 .../server/backends/cloud/cloud_server.cpp    |  2 ++
 .../backends/fastflowlm/fastflowlm_server.cpp |  2 ++
 .../server/backends/kokoro/kokoro_server.cpp  |  2 ++
 .../backends/llamacpp/llamacpp_server.cpp     |  2 ++
 .../backends/moonshine/moonshine_server.cpp   |  2 ++
 .../backends/ryzenai/ryzenai_server.cpp       |  2 ++
 .../server/backends/sdcpp/sdcpp_server.cpp    |  4 ++-
 src/cpp/server/backends/vllm/vllm_server.cpp  |  2 ++
 .../backends/whispercpp/whispercpp_server.cpp |  4 ++-
 src/cpp/server/model_manager.cpp              |  8 ++++--
 24 files changed, 62 insertions(+), 45 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0b59e883f..ca95c586a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -692,7 +692,7 @@ foreach(_backend_entry ${LEMON_BACKENDS})
     string(APPEND LEMON_FACTORY_INCLUDES
         "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n")
     string(APPEND LEMON_FACTORY_ENTRIES
-        "        { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create },\n")
+        "        { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec() },\n")
 endforeach()
 
 configure_file(
diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
index 394f49145..a5a116f14 100644
--- a/src/cpp/include/lemon/backends/backend_registry.h
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -15,6 +15,8 @@ struct ModelInfo;
 
 namespace backends {
 
+struct BackendSpec;  // install/download spec, defined in backend_utils.h
+
 // Everything a backend's create() needs to build an instance. Mirrors the
 // arguments the old router factory passed to each backend constructor.
 struct BackendContext {
@@ -34,11 +36,15 @@ using BackendCreateFn = std::unique_ptr<WrappedServer> (*)(const BackendContext&
 struct BackendRegistration {
     const BackendDescriptor* descriptor;
     BackendCreateFn create;
+    const BackendSpec* spec;  // install/download spec, or nullptr (e.g. cloud has none)
 };
 
-// All registered (descriptor, create) pairs, in LEMON_BACKENDS order.
+// All registered (descriptor, create, spec) entries, in LEMON_BACKENDS order.
 const std::vector<BackendRegistration>& all_registrations();
 
+// Install/download spec for a recipe, or nullptr if the recipe has none.
+const BackendSpec* spec_for(const std::string& recipe);
+
 // Construct a backend instance for a recipe and associate its descriptor, or
 // nullptr if the recipe has no registered backend.
 std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/cloud/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h
index 21e28512a..afddef3a7 100644
--- a/src/cpp/include/lemon/backends/cloud/cloud_server.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h
@@ -116,6 +116,7 @@ namespace backends {
 namespace cloud {
 // Factory for the cloud backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace cloud
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
index 58b99f1ba..cb6b7d73a 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
@@ -80,6 +80,7 @@ namespace backends {
 namespace fastflowlm {
 // Factory for the fastflowlm backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace fastflowlm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
index c1f170ca7..f2fd1746a 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
@@ -55,6 +55,7 @@ namespace backends {
 namespace kokoro {
 // Factory for the kokoro backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace kokoro
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
index 7ef4bb44b..a4086ac83 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
@@ -66,6 +66,7 @@ namespace backends {
 namespace llamacpp {
 // Factory for the llamacpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace llamacpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
index b98e52806..70a71bf2a 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
@@ -61,6 +61,7 @@ namespace backends {
 namespace moonshine {
 // Factory for the moonshine backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace moonshine
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
index 1420efae5..38152e478 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
@@ -62,6 +62,7 @@ namespace backends {
 namespace ryzenai {
 // Factory for the ryzenai backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace ryzenai
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index 999a1de72..f86b322ec 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -103,6 +103,7 @@ namespace backends {
 namespace sdcpp {
 // Factory for the sdcpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace sdcpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h
index 0eaf4e7d8..1ce866118 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm_server.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h
@@ -55,6 +55,7 @@ namespace backends {
 namespace vllm {
 // Factory for the vllm backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace vllm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
index 90744875f..21d0d3ad4 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
@@ -84,6 +84,7 @@ namespace backends {
 namespace whispercpp {
 // Factory for the whispercpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
+const BackendSpec* spec();
 }  // namespace whispercpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h
index eb5d4e0b4..c92bedb37 100644
--- a/src/cpp/include/lemon/model_types.h
+++ b/src/cpp/include/lemon/model_types.h
@@ -139,28 +139,14 @@ inline ModelType get_model_type_from_labels(const std::vector<std::string>& labe
     return ModelType::LLM;
 }
 
-// Determine device type from recipe
-// Default device from recipe — individual backends override based on their config
+// Fallback device type for recipes with no registered backend descriptor
+// (collections and unknown recipes). The authoritative per-backend default lives
+// in BackendDescriptor::default_device; ModelManager::device_type_for_recipe
+// consults the descriptor registry first and only falls back here. Kept in this
+// low-level header (which must not depend on the backend registry) for that
+// fallback alone — it intentionally carries no per-backend knowledge.
 inline DeviceType get_device_type_from_recipe(const std::string& recipe) {
-    if (recipe == "llamacpp") {
-        return DEVICE_GPU;
-    } else if (recipe == "ryzenai-llm") {
-        return DEVICE_NPU;
-    } else if (recipe == "flm") {
-        return DEVICE_NPU;
-    } else if (recipe == "whispercpp") {
-        return DEVICE_CPU;
-    } else if (recipe == "moonshine") {
-        return DEVICE_CPU;
-    } else if (recipe == "sd-cpp") {
-        return DEVICE_CPU;
-    } else if (recipe == "kokoro") {
-        return DEVICE_CPU;
-    } else if (is_collection_recipe(recipe)) {
-        return DEVICE_NONE;
-    } else if (recipe == "cloud") {
-        return DEVICE_NONE;  // Cloud-offloaded models execute on a remote provider
-    }
+    (void)recipe;
     return DEVICE_NONE;
 }
 
diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp
index 5e0de071f..a7db3921a 100644
--- a/src/cpp/server/backends/backend_registry.cpp
+++ b/src/cpp/server/backends/backend_registry.cpp
@@ -14,6 +14,15 @@ const std::vector<BackendRegistration>& all_registrations() {
     return kRegistrations;
 }
 
+const BackendSpec* spec_for(const std::string& recipe) {
+    for (const auto& reg : all_registrations()) {
+        if (reg.descriptor->recipe == recipe) {
+            return reg.spec;
+        }
+    }
+    return nullptr;
+}
+
 std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx) {
     for (const auto& reg : all_registrations()) {
         if (reg.descriptor->recipe == recipe) {
diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
index b5b6680fb..9a57a28db 100644
--- a/src/cpp/server/backends/backend_utils.cpp
+++ b/src/cpp/server/backends/backend_utils.cpp
@@ -1,14 +1,7 @@
 #include "lemon/backends/backend_utils.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
-#include "lemon/backends/llamacpp/llamacpp_server.h"
-#include "lemon/backends/whispercpp/whispercpp_server.h"
-#include "lemon/backends/sdcpp/sdcpp_server.h"
-#include "lemon/backends/kokoro/kokoro_server.h"
-#include "lemon/backends/ryzenai/ryzenai_server.h"
-#include "lemon/backends/vllm/vllm_server.h"
-#include "lemon/backends/fastflowlm/fastflowlm_server.h"
-#include "lemon/backends/moonshine/moonshine_server.h"
+#include "lemon/backends/backend_registry.h"  // spec_for(): recipe -> install spec lookup, no server includes
 #include "lemon/model_manager.h"  // For DownloadProgress, DownloadProgressCallback
 
 #include "lemon/utils/path_utils.h"
@@ -39,15 +32,9 @@ using json = nlohmann::json;
 namespace lemon::backends {
 
     const BackendSpec* try_get_spec_for_recipe(const std::string& recipe) {
-        if (recipe == "llamacpp") return &LlamaCppServer::SPEC;
-        if (recipe == "whispercpp") return &WhisperServer::SPEC;
-        if (recipe == "sd-cpp") return &SDServer::SPEC;
-        if (recipe == "kokoro") return &KokoroServer::SPEC;
-        if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC;
-        if (recipe == "vllm") return &VLLMServer::SPEC;
-        if (recipe == "flm") return &FastFlowLMServer::SPEC;
-        if (recipe == "moonshine") return &MoonshineServer::SPEC;
-        return nullptr;
+        // Each backend exposes its install/download spec through the registry
+        // (see <stem>::spec()); no per-recipe branches or server includes here.
+        return spec_for(recipe);
     }
 
     static std::string hash_string_from_json(const json& node) {
diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
index 64a940e4f..3d06a3f90 100644
--- a/src/cpp/server/backends/cloud/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -805,6 +805,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
         ctx.model_manager, ctx.backend_manager, ctx.cloud_registry);
 }
 
+
+const BackendSpec* spec() { return nullptr; }
 }  // namespace cloud
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 81e40ba60..f2b6885e6 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -475,6 +475,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<FastFlowLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &FastFlowLMServer::SPEC; }
 }  // namespace fastflowlm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index e0a2f7ada..534225965 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -213,6 +213,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<KokoroServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &KokoroServer::SPEC; }
 }  // namespace kokoro
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 2c828f1c4..1c7980024 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -654,6 +654,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<LlamaCppServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &LlamaCppServer::SPEC; }
 }  // namespace llamacpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index 7cb338286..e03b9ac2b 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -340,6 +340,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<MoonshineServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &MoonshineServer::SPEC; }
 }  // namespace moonshine
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index 925fece3f..70bedb84b 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -184,6 +184,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return server;
 }
 
+
+const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; }
 }  // namespace ryzenai
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index b561906bb..4749f0f0d 100644
--- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -203,7 +203,7 @@ void SDServer::load(const std::string& model_name,
     RuntimeConfig::validate_backend_choice("sdcpp", backend);
 
     // Update device type based on the actual backend selected.
-    // get_device_type_from_recipe() defaults sd-cpp to CPU, but rocm/vulkan/metal/cuda are GPU backends.
+    // The descriptor defaults sd-cpp to CPU; rocm/vulkan/metal/cuda variants are GPU backends.
     if (backend == "rocm" || backend == "vulkan" || backend == "metal" || backend == "cuda") {
         device_type_ = DEVICE_GPU;
     } else {
@@ -756,6 +756,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<SDServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &SDServer::SPEC; }
 }  // namespace sdcpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
index dae6fb883..171b4cc0f 100644
--- a/src/cpp/server/backends/vllm/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -321,6 +321,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<VLLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &VLLMServer::SPEC; }
 }  // namespace vllm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index 3c574f27a..8fb454f09 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -230,7 +230,7 @@ void WhisperServer::load(const std::string& model_name,
     RuntimeConfig::validate_backend_choice("whispercpp", whispercpp_backend);
 
     // Update device type based on the actual backend selected.
-    // get_device_type_from_recipe() defaults whispercpp to CPU, but npu/vulkan use different devices.
+    // The descriptor defaults whispercpp to CPU; npu/vulkan/metal variants use different devices.
     if (whispercpp_backend == "npu") {
         device_type_ = DEVICE_NPU;
     } else if (whispercpp_backend == "vulkan" || whispercpp_backend == "metal") {
@@ -698,6 +698,8 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<WhisperServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+
+const BackendSpec* spec() { return &WhisperServer::SPEC; }
 }  // namespace whispercpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 6695fbfc7..5253a16ad 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -11,7 +11,6 @@
 #include <lemon/backends/backend_utils.h>
 #include <lemon/backends/cloud/cloud_server.h>
 #include <lemon/cloud_provider_registry.h>
-#include <lemon/backends/fastflowlm/fastflowlm_server.h>
 #include <filesystem>
 #include <iostream>
 #include <fstream>
@@ -2964,8 +2963,11 @@ void ModelManager::unregister_user_model(const std::string& model_name) {
 // Returns empty string if not found.
 static std::string find_flm_binary() {
     try {
-        return backends::BackendUtils::get_backend_binary_path(
-            backends::FastFlowLMServer::SPEC, "npu");
+        const backends::BackendSpec* spec = backends::try_get_spec_for_recipe("flm");
+        if (!spec) {
+            return "";
+        }
+        return backends::BackendUtils::get_backend_binary_path(*spec, "npu");
     } catch (...) {
 #ifndef _WIN32
         return utils::find_flm_executable();

From cfb6e3d47b48f03c63442c1fedf6953d502311e5 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 14:37:18 -0400
Subject: [PATCH 08/39] refactor(backends): add BackendOps infrastructure
 (Tier-2 foundation)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduce a stateless per-backend behavior interface for model management that
happens WITHOUT a running subprocess (checkpoint-path resolution, download,
dynamic discovery, per-model metadata, version detection, availability) — the
home for the recipe switchboards currently scattered through model_manager and
system_info.

- BackendOps base class (lemon/backends/backend_ops.h): shared default behavior;
  backends override only the policy points they need (inherit shared logic, don't
  copy it). Methods are added incrementally as switchboards migrate; each has a
  default so adding one never forces edits to backends that don't override it.
- Each backend folder exposes a uniform ops() singleton (alongside create()/spec()),
  bound into BackendRegistration; backends::ops_for(recipe) returns it.
- Purely additive: every backend uses the default base ops for now, so there is
  no behavior change yet. Migrations follow in subsequent commits.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                                |  5 +-
 src/cpp/include/lemon/backends/backend_ops.h  | 46 +++++++++++++++++++
 .../include/lemon/backends/backend_registry.h |  8 +++-
 .../lemon/backends/cloud/cloud_server.h       |  1 +
 .../backends/fastflowlm/fastflowlm_server.h   |  1 +
 .../lemon/backends/kokoro/kokoro_server.h     |  1 +
 .../lemon/backends/llamacpp/llamacpp_server.h |  1 +
 .../backends/moonshine/moonshine_server.h     |  1 +
 .../lemon/backends/ryzenai/ryzenai_server.h   |  1 +
 .../lemon/backends/sdcpp/sdcpp_server.h       |  1 +
 .../include/lemon/backends/vllm/vllm_server.h |  1 +
 .../backends/whispercpp/whispercpp_server.h   |  1 +
 src/cpp/server/backends/backend_ops.cpp       | 12 +++++
 src/cpp/server/backends/backend_registry.cpp  |  9 ++++
 .../server/backends/cloud/cloud_server.cpp    |  1 +
 .../backends/fastflowlm/fastflowlm_server.cpp |  1 +
 .../server/backends/kokoro/kokoro_server.cpp  |  1 +
 .../backends/llamacpp/llamacpp_server.cpp     |  1 +
 .../backends/moonshine/moonshine_server.cpp   |  1 +
 .../backends/ryzenai/ryzenai_server.cpp       |  1 +
 .../server/backends/sdcpp/sdcpp_server.cpp    |  1 +
 src/cpp/server/backends/vllm/vllm_server.cpp  |  1 +
 .../backends/whispercpp/whispercpp_server.cpp |  1 +
 23 files changed, 95 insertions(+), 3 deletions(-)
 create mode 100644 src/cpp/include/lemon/backends/backend_ops.h
 create mode 100644 src/cpp/server/backends/backend_ops.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ca95c586a..6d6d980e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -678,7 +678,8 @@ set(LEMON_FACTORY_ENTRIES "")
 set(LEMON_BACKEND_DESCRIPTOR_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp)
 set(LEMON_BACKEND_FACTORY_SOURCES
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp)
 foreach(_backend_entry ${LEMON_BACKENDS})
     string(REPLACE "|" ";" _backend_parts "${_backend_entry}")
     list(GET _backend_parts 1 _backend_stem)
@@ -692,7 +693,7 @@ foreach(_backend_entry ${LEMON_BACKENDS})
     string(APPEND LEMON_FACTORY_INCLUDES
         "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n")
     string(APPEND LEMON_FACTORY_ENTRIES
-        "        { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec() },\n")
+        "        { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec(), lemon::backends::${_backend_stem}::ops() },\n")
 endforeach()
 
 configure_file(
diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
new file mode 100644
index 000000000..53b046e84
--- /dev/null
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace lemon {
+
+struct ModelInfo;
+class ModelManager;
+
+namespace backends {
+
+// Context handed to BackendOps methods — the bits of server state model
+// management needs without a running subprocess. Grows as migrations require.
+struct BackendOpsContext {
+    ModelManager* model_manager = nullptr;  // raw pointer, not owned here; null unless the caller sets it
+};
+
+// Stateless per-backend behavior for model management that happens WITHOUT a
+// running subprocess: checkpoint-path resolution, download, dynamic discovery,
+// per-model metadata, version detection, availability. One singleton per
+// backend, exposed via lemon::backends::<stem>::ops() and bound in the registry
+// (see BackendRegistration::ops).
+//
+// The base class is the shared default behavior (the common HF-backed case);
+// each backend folder overrides ONLY the policy points it needs, so shared
+// logic is inherited rather than copied. Methods are added here incrementally as
+// switchboards in model_manager / system_info are migrated; every method has a
+// default so adding one never forces edits to backends that don't override it.
+class BackendOps {
+public:
+    virtual ~BackendOps() = default;  // polymorphic base: used through const BackendOps* (see ops_for)
+
+    // Populate model-specific metadata (context window, capability labels, …)
+    // for a downloaded model. Default: nothing (both arguments are ignored).
+    virtual void populate_metadata(ModelInfo& info, const BackendOpsContext& ctx) const {
+        (void)info;
+        (void)ctx;
+    }
+};
+
+// Shared default ops instance for backends that override nothing.
+const BackendOps* default_backend_ops();
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
index a5a116f14..75709781d 100644
--- a/src/cpp/include/lemon/backends/backend_registry.h
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -4,6 +4,7 @@
 #include <string>
 #include "lemon/backends/backend_descriptor.h"
 #include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/backends/backend_ops.h"
 
 namespace lemon {
 
@@ -37,14 +38,19 @@ struct BackendRegistration {
     const BackendDescriptor* descriptor;
     BackendCreateFn create;
     const BackendSpec* spec;  // install/download spec, or nullptr (e.g. cloud has none)
+    const BackendOps* ops;    // stateless model-management behavior (never null)
 };
 
-// All registered (descriptor, create, spec) entries, in LEMON_BACKENDS order.
+// All registered (descriptor, create, spec, ops) entries, in LEMON_BACKENDS order.
 const std::vector<BackendRegistration>& all_registrations();
 
 // Install/download spec for a recipe, or nullptr if the recipe has none.
 const BackendSpec* spec_for(const std::string& recipe);
 
+// Stateless model-management ops for a recipe. Falls back to the shared default
+// ops (base behavior) for recipes with no registered backend.
+const BackendOps* ops_for(const std::string& recipe);
+
 // Construct a backend instance for a recipe and associate its descriptor, or
 // nullptr if the recipe has no registered backend.
 std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/cloud/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h
index afddef3a7..774c44300 100644
--- a/src/cpp/include/lemon/backends/cloud/cloud_server.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h
@@ -117,6 +117,7 @@ namespace cloud {
 // Factory for the cloud backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace cloud
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
index cb6b7d73a..c422f4a4d 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
@@ -81,6 +81,7 @@ namespace fastflowlm {
 // Factory for the fastflowlm backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace fastflowlm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
index f2fd1746a..9c628c076 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
@@ -56,6 +56,7 @@ namespace kokoro {
 // Factory for the kokoro backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace kokoro
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
index a4086ac83..8b28296c4 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
@@ -67,6 +67,7 @@ namespace llamacpp {
 // Factory for the llamacpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace llamacpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
index 70a71bf2a..611bfe51c 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
@@ -62,6 +62,7 @@ namespace moonshine {
 // Factory for the moonshine backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace moonshine
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
index 38152e478..f824cfde3 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
@@ -63,6 +63,7 @@ namespace ryzenai {
 // Factory for the ryzenai backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace ryzenai
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index f86b322ec..99be9e62c 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -104,6 +104,7 @@ namespace sdcpp {
 // Factory for the sdcpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace sdcpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h
index 1ce866118..700296b97 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm_server.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h
@@ -56,6 +56,7 @@ namespace vllm {
 // Factory for the vllm backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace vllm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
index 21d0d3ad4..8dc88bbb4 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
@@ -85,6 +85,7 @@ namespace whispercpp {
 // Factory for the whispercpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
 const BackendSpec* spec();
+const BackendOps* ops();
 }  // namespace whispercpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp
new file mode 100644
index 000000000..773e39494
--- /dev/null
+++ b/src/cpp/server/backends/backend_ops.cpp
@@ -0,0 +1,12 @@
+#include "lemon/backends/backend_ops.h"
+
+namespace lemon {
+namespace backends {
+
+const BackendOps* default_backend_ops() {
+    static const BackendOps shared_base_ops{};  // function-local static: one shared instance of the base behavior
+    return &shared_base_ops;
+}
+
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp
index a7db3921a..abbeaf998 100644
--- a/src/cpp/server/backends/backend_registry.cpp
+++ b/src/cpp/server/backends/backend_registry.cpp
@@ -23,6 +23,15 @@ const BackendSpec* spec_for(const std::string& recipe) {
     return nullptr;
 }
 
+const BackendOps* ops_for(const std::string& recipe) {
+    // Return the ops of the registration whose descriptor recipe matches. A null ops
+    // pointer is treated as "no override", so the result is always safe to dereference.
+    for (const auto& reg : all_registrations()) {
+        if (reg.descriptor->recipe == recipe && reg.ops) return reg.ops;
+    }
+    return default_backend_ops();
+}
+
 std::unique_ptr<WrappedServer> create_server(const std::string& recipe, const BackendContext& ctx) {
     for (const auto& reg : all_registrations()) {
         if (reg.descriptor->recipe == recipe) {
diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
index 3d06a3f90..29dede2b0 100644
--- a/src/cpp/server/backends/cloud/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -807,6 +807,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return nullptr; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace cloud
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index f2b6885e6..424ea2e2c 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -477,6 +477,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &FastFlowLMServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace fastflowlm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index 534225965..13f1a3ffe 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -215,6 +215,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &KokoroServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace kokoro
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 1c7980024..05cd10a4d 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -656,6 +656,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &LlamaCppServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace llamacpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index e03b9ac2b..ed709990b 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -342,6 +342,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &MoonshineServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace moonshine
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index 70bedb84b..e965ea3b9 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -186,6 +186,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace ryzenai
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index 4749f0f0d..718855d8f 100644
--- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -758,6 +758,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &SDServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace sdcpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
index 171b4cc0f..1ab4e22fc 100644
--- a/src/cpp/server/backends/vllm/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -323,6 +323,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &VLLMServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace vllm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index 8fb454f09..c77d10669 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -700,6 +700,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 
 
 const BackendSpec* spec() { return &WhisperServer::SPEC; }
+const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace whispercpp
 }  // namespace backends
 }  // namespace lemon

From 5a1d5349e5e3acc53d02f234c71ea657221e0076 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 14:47:19 -0400
Subject: [PATCH 09/39] refactor(backends): migrate per-model metadata to ops;
 move GGUF/FLM readers into folders

Replace the populate_model_metadata recipe switchboard with
ops_for(recipe)->populate_metadata(). The backend-specific readers move into
their folders:

- GGUF metadata reader (read_gguf_metadata + byte parsers) -> backends/llamacpp/
  llamacpp_gguf.{h,cpp}; LlamaCppOps::populate_metadata reads arch + capability
  labels there.
- FLM model-file helpers (config.json ctx window, model-dir discovery) ->
  backends/fastflowlm/fastflowlm_models.{h,cpp}; FlmOps::populate_metadata uses it.

model_manager no longer knows how either backend stores or introspects models.
CMake now globs each backend folder's *.cpp (CONFIGURE_DEPENDS) so backend-private
helper files need no CMake edit; the backend LIST stays explicit.

Verified: GGUF context windows still populate (131072/128000/32768 for sample
models) and test_gguf_capabilities passes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                                |  11 +-
 .../backends/fastflowlm/fastflowlm_models.h   |  29 ++
 .../lemon/backends/llamacpp/llamacpp_gguf.h   |  30 ++
 .../backends/fastflowlm/fastflowlm_models.cpp | 119 ++++++
 .../backends/fastflowlm/fastflowlm_server.cpp |  17 +-
 .../backends/llamacpp/llamacpp_gguf.cpp       | 253 +++++++++++++
 .../backends/llamacpp/llamacpp_server.cpp     |  48 ++-
 src/cpp/server/model_manager.cpp              | 347 +-----------------
 8 files changed, 510 insertions(+), 344 deletions(-)
 create mode 100644 src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
 create mode 100644 src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
 create mode 100644 src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
 create mode 100644 src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d6d980e3..758108b69 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -683,9 +683,14 @@ set(LEMON_BACKEND_FACTORY_SOURCES
 foreach(_backend_entry ${LEMON_BACKENDS})
     string(REPLACE "|" ";" _backend_parts "${_backend_entry}")
     list(GET _backend_parts 1 _backend_stem)
-    # The descriptor is header-only (no source); only the server source compiles.
-    list(APPEND LEMON_BACKEND_FACTORY_SOURCES
-        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/${_backend_stem}_server.cpp)
+    # The descriptor is header-only (no source). Compile every .cpp in the
+    # backend's folder (server class + any backend-private helpers like GGUF
+    # parsing) — CONFIGURE_DEPENDS re-globs when a file is added/removed so a new
+    # helper in a folder needs no CMake edit. (The backend LIST is still explicit
+    # above so a whole new backend is never silently missed.)
+    file(GLOB _backend_srcs CONFIGURE_DEPENDS
+        ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/*.cpp)
+    list(APPEND LEMON_BACKEND_FACTORY_SOURCES ${_backend_srcs})
     string(APPEND LEMON_DESCRIPTOR_INCLUDES
         "#include \"lemon/backends/${_backend_stem}/${_backend_stem}.h\"\n")
     string(APPEND LEMON_DESCRIPTOR_ENTRIES
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
new file mode 100644
index 000000000..3e99e3003
--- /dev/null
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -0,0 +1,29 @@
+#pragma once
+
+#include <cstdint>
+#include <filesystem>
+#include <string>
+
+namespace lemon {
+
+struct ModelInfo;
+
+namespace backends {
+namespace fastflowlm {
+
+// FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH /
+// platform-default roots and describes them with a config.json; this knowledge
+// lives in the fastflowlm backend folder rather than in the shared model manager.
+
+// Derive the on-disk repo directory name from an FLM model URL.
+std::string repo_dir_from_url(const std::string& url);
+
+// Locate config.json for an FLM repo dir across the candidate model roots.
+std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo_dir);
+
+// Read the model's max context window from its FLM config.json (0 if unknown).
+int64_t read_flm_max_context_window(const ModelInfo& info);
+
+} // namespace fastflowlm
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
new file mode 100644
index 000000000..2e431478b
--- /dev/null
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include "lemon/gguf_capabilities.h"
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+
+// GGUF metadata extracted in a single pass over the KV header. This is
+// llama.cpp-specific model introspection; it lives in the llamacpp backend
+// folder rather than in the shared model manager.
+struct GgufMetadata {
+    std::string architecture;      // empty by default — presumably the GGUF architecture name; confirm in llamacpp_gguf.cpp
+    int64_t context_length = 0;    // numeric fields start at 0; presumably 0 means "not present in the file" — TODO confirm
+    int64_t block_count = 0;
+    int64_t embedding_length = 0;
+    int64_t head_count_kv = 0;
+    int64_t key_length = 0;
+    GgufCapabilities caps;         // declared in lemon/gguf_capabilities.h
+};
+
+// Read GGUF metadata from a .gguf file. Returns false if the file is missing or
+// not a valid GGUF container.
+bool read_gguf_metadata(GgufMetadata& out, const std::string& path);
+
+} // namespace llamacpp
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
new file mode 100644
index 000000000..0ac7f8caf
--- /dev/null
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -0,0 +1,119 @@
+#include "lemon/backends/fastflowlm/fastflowlm_models.h"
+
+#include <cstdlib>
+#include <vector>
+#include <nlohmann/json.hpp>
+#include "lemon/model_manager.h"
+#include "lemon/utils/aixlog.hpp"
+#include "lemon/utils/json_utils.h"
+#include "lemon/utils/path_utils.h"
+
+namespace fs = std::filesystem;
+using json = nlohmann::json;
+
+namespace lemon {
+namespace backends {
+namespace fastflowlm {
+namespace {
+
+using lemon::utils::path_from_utf8;
+
+bool safe_exists(const fs::path& p) {
+    std::error_code ec;  // selects the non-throwing overload of fs::exists
+    return fs::exists(p, ec);
+}
+
+// Candidate roots that FLM may use to store models. FLM resolves its model
+// directory from the FLM_MODEL_PATH env var (set by the installer) and falls
+// back to platform-default locations. Roots are returned in probe order.
+std::vector<fs::path> get_flm_models_dir_candidates() {
+    std::vector<fs::path> roots;
+
+    const char* flm_model_path = std::getenv("FLM_MODEL_PATH");
+    if (flm_model_path && *flm_model_path) {  // set and non-empty
+        roots.push_back(path_from_utf8(flm_model_path) / "models");  // explicit override goes first
+    }
+
+#ifdef _WIN32
+    const char* userprofile = std::getenv("USERPROFILE");
+    if (userprofile && *userprofile) {
+        fs::path home = path_from_utf8(userprofile);
+        roots.push_back(home / ".flm" / "models");              // current installer default
+        roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default
+        roots.push_back(home / "flm" / "models");
+    }
+#else
+    const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME");
+    if (xdg_config_home && *xdg_config_home) {
+        roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models");
+    }
+    const char* home = std::getenv("HOME");
+    if (home && *home) {
+        fs::path home_path = path_from_utf8(home);
+        roots.push_back(home_path / ".flm" / "models");
+        roots.push_back(home_path / ".config" / "flm" / "models");
+    }
+#endif
+
+    return roots;  // empty when none of the environment variables are set
+}
+
+} // namespace
+
+fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) {  // empty path when nothing is found
+    if (repo_dir.empty()) return fs::path();  // an empty name would match the root's own config.json
+
+    for (const auto& root : get_flm_models_dir_candidates()) {
+        fs::path candidate = root / repo_dir / "config.json";
+        if (safe_exists(candidate)) return candidate;  // first existing candidate wins
+    }
+    return fs::path();
+}
+
+// Returns the last path segment of an FLM model URL. The query/fragment is cut before
+// slashes are trimmed, so ".../repo/?x=1" and ".../repo//tree/main" both give "repo".
+std::string repo_dir_from_url(const std::string& url) {
+    std::string clean = url.substr(0, url.find_first_of("?#"));
+    const auto trim_slashes = [&clean] { while (!clean.empty() && clean.back() == '/') clean.pop_back(); };
+    trim_slashes();
+    for (const char* marker : {"/tree/", "/resolve/"}) {
+        const size_t marker_pos = clean.find(marker);
+        if (marker_pos != std::string::npos) {
+            clean.erase(marker_pos);
+            break;
+        }
+    }
+    trim_slashes();  // the marker cut can expose another trailing slash
+    const size_t slash = clean.find_last_of('/');
+    return slash == std::string::npos ? clean : clean.substr(slash + 1);
+}
+
+int64_t read_flm_max_context_window(const ModelInfo& info) {
+    if (info.type != ModelType::LLM) return 0;  // only LLM entries are inspected
+
+    std::string config_path = info.resolved_path("config");
+    if (config_path.empty()) return 0;  // nothing resolved under the "config" key
+
+    try {
+        json config = lemon::utils::JsonUtils::load_from_file(config_path);
+        if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) {  // top level first
+            int64_t value = config["max_position_embeddings"].get<int64_t>();
+            return value > 0 ? value : 0;  // non-positive values are reported as 0
+        }
+        if (config.contains("text_config") && config["text_config"].is_object()) {  // otherwise look under "text_config"
+            const auto& text_config = config["text_config"];
+            if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) {
+                int64_t value = text_config["max_position_embeddings"].get<int64_t>();
+                return value > 0 ? value : 0;
+            }
+        }
+    } catch (const std::exception& e) {  // failures are logged at DEBUG and fall through to 0
+        LOG(DEBUG, "FastFlowLM") << "Could not read FLM config metadata for "
+                                 << info.model_name << ": " << e.what() << std::endl;
+    }
+    return 0;
+}
+
+} // namespace fastflowlm
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 424ea2e2c..0b5e15934 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -1,6 +1,9 @@
 #include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm_models.h"
 #include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/model_manager.h"
 #include "lemon/system_info.h"
 #include "lemon/error_types.h"
 #include "lemon/utils/process_manager.h"
@@ -475,9 +478,21 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<FastFlowLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+namespace {
+// FLM model-management behavior: max context window from the model's config.json.
+class FlmOps : public BackendOps {
+public:
+    void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override {  // context argument is unused
+        info.max_context_window = read_flm_max_context_window(info);  // 0 when no usable value is found
+    }
+};
+}  // namespace
 
 const BackendSpec* spec() { return &FastFlowLMServer::SPEC; }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const FlmOps kOps;  // one shared instance, created on first call
+    return &kOps;
+}
 }  // namespace fastflowlm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
new file mode 100644
index 000000000..1e099d064
--- /dev/null
+++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
@@ -0,0 +1,253 @@
+#include "lemon/backends/llamacpp/llamacpp_gguf.h"
+
+#include <algorithm>
+#include <cctype>
+#include <cstring>
+#include <fstream>
+#include <istream>
+#include <limits>
+#include "lemon/utils/path_utils.h"
+
+namespace lemon {
+namespace backends {
+namespace llamacpp {
+namespace {
+
+using lemon::utils::path_from_utf8;
+
+// Local copies of the tiny case-insensitive string helpers (kept out of a shared
+// util to keep this GGUF reader self-contained). True if `str` ends with `suffix`.
+bool ends_with_ignore_case(const std::string& str, const std::string& suffix) {
+    if (suffix.size() > str.size()) return false;
+    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin(),  // unsigned char: tolower(negative) is UB
+                      [](unsigned char a, unsigned char b) { return std::tolower(a) == std::tolower(b); });
+}
+
+bool contains_ignore_case(const std::string& str, const std::string& substr) {
+    // Case-insensitive substring search; bytes compared as unsigned char because tolower(negative) is UB.
+    const auto same = [](unsigned char a, unsigned char b) { return std::tolower(a) == std::tolower(b); };
+    return std::search(str.begin(), str.end(), substr.begin(), substr.end(), same) != str.end();
+}
+
+template <typename T>
+static bool read_le(std::istream& in, T& value) {  // false if the stream could not supply sizeof(T) bytes
+    in.read(reinterpret_cast<char*>(&value), sizeof(T));  // raw byte copy: no byte-order conversion is done here
+    return static_cast<bool>(in);
+}
+
+static bool read_gguf_string(std::istream& in, std::string& value) {  // uint64 length, then that many bytes
+    uint64_t len = 0;
+    if (!read_le(in, len)) return false;
+    if (len > 1024 * 1024) return false;  // cap at 1 MiB; the payload is NOT consumed on this path
+    value.assign(static_cast<size_t>(len), '\0');
+    if (len == 0) return true;  // empty string: nothing further to read
+    in.read(&value[0], static_cast<std::streamsize>(len));
+    return static_cast<bool>(in);
+}
+
+static bool skip_bytes(std::istream& in, uint64_t bytes) {  // moves the read position forward by `bytes`
+    if (bytes > static_cast<uint64_t>(std::numeric_limits<std::streamoff>::max())) return false;  // not representable as an offset
+    in.seekg(static_cast<std::streamoff>(bytes), std::ios::cur);
+    return static_cast<bool>(in);
+}
+
+static uint64_t gguf_scalar_size(uint32_t type) {  // byte width of a fixed-size GGUF value type, else 0
+    switch (type) {
+        case 0:  // UINT8
+        case 1:  // INT8
+        case 7:  // BOOL
+            return 1;
+        case 2:  // UINT16
+        case 3:  // INT16
+            return 2;
+        case 4:  // UINT32
+        case 5:  // INT32
+        case 6:  // FLOAT32
+            return 4;
+        case 10: // UINT64
+        case 11: // INT64
+        case 12: // FLOAT64
+            return 8;
+        default:  // STRING (8), ARRAY (9) and unknown ids have no fixed width
+            return 0;
+    }
+}
+
+static bool skip_gguf_value(std::istream& in, uint32_t type);
+
+static bool read_gguf_integer_value(std::istream& in, uint32_t type, int64_t& value) {  // true only for integer types
+    switch (type) {
+        case 0: { uint8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        case 1: { int8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        case 2: { uint16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        case 3: { int16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        case 4: { uint32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        case 5: { int32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        case 10: {
+            uint64_t v = 0;
+            if (!read_le(in, v)) return false;
+            if (v > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) return false;  // does not fit in int64_t
+            value = static_cast<int64_t>(v);
+            return true;
+        }
+        case 11: { int64_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
+        default:
+            return skip_gguf_value(in, type) && false;  // not an integer: step over the value, then report failure
+    }
+}
+
+static bool skip_gguf_value(std::istream& in, uint32_t type) {  // steps over one value of the given GGUF type
+    if (type == 8) {  // STRING
+        std::string ignored;
+        return read_gguf_string(in, ignored);
+    }
+
+    if (type == 9) {  // ARRAY
+        uint32_t elem_type = 0;
+        uint64_t count = 0;
+        if (!read_le(in, elem_type) || !read_le(in, count)) return false;
+
+        if (elem_type == 8) {  // strings are length-prefixed, so each one is read in turn
+            for (uint64_t i = 0; i < count; ++i) {
+                std::string ignored;
+                if (!read_gguf_string(in, ignored)) return false;
+            }
+            return true;
+        }
+
+        if (elem_type == 9) return false;  // nested arrays are rejected
+        uint64_t elem_size = gguf_scalar_size(elem_type);
+        if (elem_size == 0) return false;  // unknown element type
+        if (count > std::numeric_limits<uint64_t>::max() / elem_size) return false;  // count * elem_size would overflow
+        return skip_bytes(in, count * elem_size);
+    }
+
+    uint64_t size = gguf_scalar_size(type);
+    return size > 0 && skip_bytes(in, size);  // size 0 means an unknown type
+}
+
+} // namespace
+
+// Reads the GGUF KV header at `path` in a single pass and fills `out`. Returns false if
+// the file cannot be opened, does not start with "GGUF", or the header cannot be walked
+// to its end; `out` may already be partially filled in that case.
+bool read_gguf_metadata(GgufMetadata& out, const std::string& path) {
+    std::ifstream in(path_from_utf8(path), std::ios::binary);
+    if (!in) return false;
+
+    char magic[4] = {};
+    in.read(magic, sizeof(magic));
+    if (!in || std::memcmp(magic, "GGUF", 4) != 0) return false;
+
+    uint32_t version = 0;
+    uint64_t tensor_count = 0;
+    uint64_t kv_count = 0;
+    if (!read_le(in, version) || !read_le(in, tensor_count) || !read_le(in, kv_count)) return false;
+    (void)version;
+    (void)tensor_count;
+
+    int64_t pending_context_length = 0;
+
+    for (uint64_t i = 0; i < kv_count; ++i) {
+        std::string key;
+        uint32_t type = 0;
+        if (!read_gguf_string(in, key) || !read_le(in, type)) return false;
+
+        // Read architecture
+        if (key == "general.architecture" && type == 8) {
+            if (!read_gguf_string(in, out.architecture)) return false;
+            if (pending_context_length > 0) {
+                out.context_length = pending_context_length;
+            }
+            continue;
+        }
+
+        // Context length
+        const bool context_key = !out.architecture.empty() && key == out.architecture + ".context_length";
+        const bool possible_context_key = out.architecture.empty() && key.size() > std::strlen(".context_length") &&
+                                          ends_with_ignore_case(key, ".context_length");
+        if (context_key || possible_context_key) {
+            int64_t value = 0;
+            if (read_gguf_integer_value(in, type, value) && value > 0) {
+                if (context_key) {
+                    out.context_length = value;
+                } else {
+                    pending_context_length = value;
+                }
+            }
+            continue;
+        }
+
+        // Architecture fields for KV cache estimation
+        if (!out.architecture.empty()) {
+            if (key == out.architecture + ".block_count") {
+                int64_t value = 0;
+                if (read_gguf_integer_value(in, type, value) && value > 0)
+                    out.block_count = value;
+                continue;
+            }
+            if (key == out.architecture + ".embedding_length") {
+                int64_t value = 0;
+                if (read_gguf_integer_value(in, type, value) && value > 0)
+                    out.embedding_length = value;
+                continue;
+            }
+            if (key == out.architecture + ".attention.head_count_kv") {
+                int64_t value = 0;
+                if (read_gguf_integer_value(in, type, value) && value > 0)
+                    out.head_count_kv = value;
+                continue;
+            }
+            if (key == out.architecture + ".attention.key_length") {
+                int64_t value = 0;
+                if (read_gguf_integer_value(in, type, value) && value > 0)
+                    out.key_length = value;
+                continue;
+            }
+        }
+
+        // Capability detection (vision, tool-calling, MTP)
+        if (type == 4) {
+            uint32_t val = 0;
+            if (read_le(in, val)) {
+                if (contains_ignore_case(key, "nextn_predict_layers") && val > 0)
+                    out.caps.mtp = true;
+            }
+        } else if (type == 8) {
+            // A failed read (e.g. an oversized string) leaves the stream inside the
+            // value, so stop rather than decode the leftover bytes as the next key.
+            std::string value;
+            if (!read_gguf_string(in, value)) return false;
+            inspect_gguf_string(key, value, out.caps);
+        } else if (type == 9) {
+            // Array — check string elements for capability hints
+            uint32_t elem_type = 0;
+            uint64_t count = 0;
+            if (!read_le(in, elem_type) || !read_le(in, count)) return false;
+            if (elem_type == 9) return false;  // nested arrays are rejected
+            if (elem_type == 8) {
+                for (uint64_t j = 0; j < count; ++j) {
+                    std::string value;
+                    if (!read_gguf_string(in, value)) return false;
+                    inspect_gguf_string(key, value, out.caps);
+                }
+            } else {
+                const uint64_t elem_size = gguf_scalar_size(elem_type);
+                if (elem_size == 0) return false;
+                // count comes from the file: reject a byte length that would wrap uint64_t.
+                if (count > std::numeric_limits<uint64_t>::max() / elem_size) return false;
+                if (!skip_bytes(in, count * elem_size)) return false;
+            }
+        } else {
+            if (!skip_gguf_value(in, type)) return false;
+        }
+    }
+
+    if (out.context_length == 0 && pending_context_length > 0) out.context_length = pending_context_length;
+    return true;
+}
+
+
+} // namespace llamacpp
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 05cd10a4d..e295a835a 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -1,6 +1,13 @@
 #include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/llamacpp/llamacpp_gguf.h"
 #include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/gguf_capabilities.h"
+#include "lemon/model_manager.h"
+#include <algorithm>
+#include <filesystem>
+#include <system_error>
 #include "lemon/auto_tune.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
@@ -654,9 +661,48 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
     return std::make_unique<LlamaCppServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+namespace {
+// llamacpp model-management behavior: GGUF metadata + capability labels.
+class LlamaCppOps : public BackendOps {
+public:
+    // Fills the context window and GGUF architecture fields from the model's
+    // .gguf file. Leaves `info` untouched when the resolved path does not end in
+    // ".gguf" (any case), does not exist, or cannot be parsed as GGUF.
+    void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override {
+        const std::string gguf_path = info.resolved_path();
+        if (gguf_path.size() < 5) {
+            return;
+        }
+        std::string ext = gguf_path.substr(gguf_path.size() - 5);
+        // Lower-case through unsigned char: tolower() is undefined for the negative
+        // values that non-ASCII path bytes take when char is signed.
+        std::transform(ext.begin(), ext.end(), ext.begin(),
+                       [](unsigned char c) { return static_cast<char>(::tolower(c)); });
+        if (ext != ".gguf") return;
+        std::error_code ec;
+        if (!std::filesystem::exists(lemon::utils::path_from_utf8(gguf_path), ec)) return;
+        GgufMetadata meta;
+        if (!read_gguf_metadata(meta, gguf_path)) return;
+        info.max_context_window = meta.context_length;
+        info.gguf_block_count = meta.block_count;
+        info.gguf_embedding_length = meta.embedding_length;
+        info.gguf_head_count_kv = meta.head_count_kv;
+        info.gguf_key_length = meta.key_length;
+        // GGUF vision/tool metadata are LLM capabilities. Don't apply them to
+        // embedding/reranking models, or labels like tool-calling would
+        // reclassify the model away from its endpoint type.
+        if (info.type == ModelType::LLM) {
+            apply_gguf_capability_labels(info.labels, meta.caps);
+        }
+    }
+};
+}  // namespace
 
 const BackendSpec* spec() { return &LlamaCppServer::SPEC; }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const LlamaCppOps kOps;  // one shared instance, created on first call
+    return &kOps;
+}
 }  // namespace llamacpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 5253a16ad..81d0200b2 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1,15 +1,16 @@
 #include <lemon/model_manager.h>
 #include <lemon/runtime_config.h>
 #include <lemon/hf_variants.h>
-#include <lemon/gguf_capabilities.h>
 #include <lemon/utils/json_utils.h>
 #include <lemon/utils/http_client.h>
 #include <lemon/utils/process_manager.h>
 #include <lemon/utils/path_utils.h>
 #include <lemon/system_info.h>
 #include <lemon/backends/backend_descriptor_registry.h>
+#include <lemon/backends/backend_registry.h>
 #include <lemon/backends/backend_utils.h>
 #include <lemon/backends/cloud/cloud_server.h>
+#include <lemon/backends/fastflowlm/fastflowlm_models.h>
 #include <lemon/cloud_provider_registry.h>
 #include <filesystem>
 #include <iostream>
@@ -153,354 +154,21 @@ static std::string cache_key_to_canonical_id(const std::string& cache_key) {
     return canonical_id(ModelSource::Builtin, cache_key);
 }
 
-template <typename T>
-static bool read_le(std::istream& in, T& value) {
-    in.read(reinterpret_cast<char*>(&value), sizeof(T));
-    return static_cast<bool>(in);
-}
-
-static bool read_gguf_string(std::istream& in, std::string& value) {
-    uint64_t len = 0;
-    if (!read_le(in, len)) return false;
-    if (len > 1024 * 1024) return false;
-    value.assign(static_cast<size_t>(len), '\0');
-    if (len == 0) return true;
-    in.read(&value[0], static_cast<std::streamsize>(len));
-    return static_cast<bool>(in);
-}
-
-static bool skip_bytes(std::istream& in, uint64_t bytes) {
-    if (bytes > static_cast<uint64_t>(std::numeric_limits<std::streamoff>::max())) return false;
-    in.seekg(static_cast<std::streamoff>(bytes), std::ios::cur);
-    return static_cast<bool>(in);
-}
-
-static uint64_t gguf_scalar_size(uint32_t type) {
-    switch (type) {
-        case 0:  // UINT8
-        case 1:  // INT8
-        case 7:  // BOOL
-            return 1;
-        case 2:  // UINT16
-        case 3:  // INT16
-            return 2;
-        case 4:  // UINT32
-        case 5:  // INT32
-        case 6:  // FLOAT32
-            return 4;
-        case 10: // UINT64
-        case 11: // INT64
-        case 12: // FLOAT64
-            return 8;
-        default:
-            return 0;
-    }
-}
-
-static bool skip_gguf_value(std::istream& in, uint32_t type);
-
-static bool read_gguf_integer_value(std::istream& in, uint32_t type, int64_t& value) {
-    switch (type) {
-        case 0: { uint8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        case 1: { int8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        case 2: { uint16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        case 3: { int16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        case 4: { uint32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        case 5: { int32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        case 10: {
-            uint64_t v = 0;
-            if (!read_le(in, v)) return false;
-            if (v > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) return false;
-            value = static_cast<int64_t>(v);
-            return true;
-        }
-        case 11: { int64_t v = 0; if (!read_le(in, v)) return false; value = v; return true; }
-        default:
-            return skip_gguf_value(in, type) && false;
-    }
-}
-
-static bool skip_gguf_value(std::istream& in, uint32_t type) {
-    if (type == 8) {  // STRING
-        std::string ignored;
-        return read_gguf_string(in, ignored);
-    }
-
-    if (type == 9) {  // ARRAY
-        uint32_t elem_type = 0;
-        uint64_t count = 0;
-        if (!read_le(in, elem_type) || !read_le(in, count)) return false;
-
-        if (elem_type == 8) {
-            for (uint64_t i = 0; i < count; ++i) {
-                std::string ignored;
-                if (!read_gguf_string(in, ignored)) return false;
-            }
-            return true;
-        }
-
-        if (elem_type == 9) return false;
-        uint64_t elem_size = gguf_scalar_size(elem_type);
-        if (elem_size == 0) return false;
-        if (count > std::numeric_limits<uint64_t>::max() / elem_size) return false;
-        return skip_bytes(in, count * elem_size);
-    }
-
-    uint64_t size = gguf_scalar_size(type);
-    return size > 0 && skip_bytes(in, size);
-}
-
-// All GGUF metadata extracted in a single pass over the KV header.
-// Replaces the previous three separate readers (context_length, arch_info, capabilities)
-// that each opened the file independently.
-struct GgufMetadata {
-    std::string architecture;
-    int64_t context_length = 0;
-    int64_t block_count = 0;
-    int64_t embedding_length = 0;
-    int64_t head_count_kv = 0;
-    int64_t key_length = 0;
-    GgufCapabilities caps;
-};
-
-static bool read_gguf_metadata(GgufMetadata& out, const std::string& path) {
-    std::ifstream in(path_from_utf8(path), std::ios::binary);
-    if (!in) return false;
-
-    char magic[4] = {};
-    in.read(magic, sizeof(magic));
-    if (!in || std::memcmp(magic, "GGUF", 4) != 0) return false;
-
-    uint32_t version = 0;
-    uint64_t tensor_count = 0;
-    uint64_t kv_count = 0;
-    if (!read_le(in, version) || !read_le(in, tensor_count) || !read_le(in, kv_count)) return false;
-    (void)version;
-    (void)tensor_count;
-
-    int64_t pending_context_length = 0;
-
-    for (uint64_t i = 0; i < kv_count; ++i) {
-        std::string key;
-        uint32_t type = 0;
-        if (!read_gguf_string(in, key) || !read_le(in, type)) return false;
-
-        // Read architecture
-        if (key == "general.architecture" && type == 8) {
-            if (!read_gguf_string(in, out.architecture)) return false;
-            if (pending_context_length > 0) {
-                out.context_length = pending_context_length;
-            }
-            continue;
-        }
-
-        // Context length
-        const bool context_key = !out.architecture.empty() && key == out.architecture + ".context_length";
-        const bool possible_context_key = out.architecture.empty() && key.size() > std::strlen(".context_length") &&
-                                          ends_with_ignore_case(key, ".context_length");
-        if (context_key || possible_context_key) {
-            int64_t value = 0;
-            if (read_gguf_integer_value(in, type, value) && value > 0) {
-                if (context_key) {
-                    out.context_length = value;
-                } else {
-                    pending_context_length = value;
-                }
-            }
-            continue;
-        }
-
-        // Architecture fields for KV cache estimation
-        if (!out.architecture.empty()) {
-            if (key == out.architecture + ".block_count") {
-                int64_t value = 0;
-                if (read_gguf_integer_value(in, type, value) && value > 0)
-                    out.block_count = value;
-                continue;
-            }
-            if (key == out.architecture + ".embedding_length") {
-                int64_t value = 0;
-                if (read_gguf_integer_value(in, type, value) && value > 0)
-                    out.embedding_length = value;
-                continue;
-            }
-            if (key == out.architecture + ".attention.head_count_kv") {
-                int64_t value = 0;
-                if (read_gguf_integer_value(in, type, value) && value > 0)
-                    out.head_count_kv = value;
-                continue;
-            }
-            if (key == out.architecture + ".attention.key_length") {
-                int64_t value = 0;
-                if (read_gguf_integer_value(in, type, value) && value > 0)
-                    out.key_length = value;
-                continue;
-            }
-        }
-
-        // Capability detection (vision, tool-calling, MTP)
-        if (type == 4) {
-            uint32_t val = 0;
-            if (read_le(in, val)) {
-                if (contains_ignore_case(key, "nextn_predict_layers") && val > 0)
-                    out.caps.mtp = true;
-            }
-        } else if (type == 8) {
-            std::string value;
-            if (read_gguf_string(in, value)) {
-                inspect_gguf_string(key, value, out.caps);
-            }
-        } else if (type == 9) {
-            // Array — check string elements for capability hints
-            uint32_t elem_type = 0;
-            uint64_t count = 0;
-            if (read_le(in, elem_type) && read_le(in, count)) {
-                if (elem_type == 8) {
-                    for (uint64_t j = 0; j < count; ++j) {
-                        std::string value;
-                        if (!read_gguf_string(in, value)) return false;
-                        inspect_gguf_string(key, value, out.caps);
-                    }
-                } else if (elem_type != 9) {
-                    uint64_t elem_size = gguf_scalar_size(elem_type);
-                    if (elem_size == 0) return false;
-                    if (!skip_bytes(in, count * elem_size)) return false;
-                } else {
-                    return false;
-                }
-            } else {
-                return false;
-            }
-        } else {
-            if (!skip_gguf_value(in, type)) return false;
-        }
-    }
-
-    if (out.context_length == 0 && pending_context_length > 0) {
-        out.context_length = pending_context_length;
-    }
-    return true;
-}
-
 // Candidate roots that FLM may use to store models. FLM resolves its model
 // directory from the FLM_MODEL_PATH env var (set by the installer) and falls
 // back to a built-in default that has changed across releases. lemond is often
 // launched from a parent process that predates the FLM install and therefore
 // doesn't see FLM_MODEL_PATH, so we also probe every documented default.
 // Order is most-specific to most-historical.
-static std::vector<fs::path> get_flm_models_dir_candidates() {
-    std::vector<fs::path> roots;
-
-    const char* flm_model_path = std::getenv("FLM_MODEL_PATH");
-    if (flm_model_path && *flm_model_path) {
-        roots.push_back(path_from_utf8(flm_model_path) / "models");
-    }
-
-#ifdef _WIN32
-    const char* userprofile = std::getenv("USERPROFILE");
-    if (userprofile && *userprofile) {
-        fs::path home = path_from_utf8(userprofile);
-        roots.push_back(home / ".flm" / "models");          // current installer default
-        roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default
-        roots.push_back(home / "flm" / "models");
-    }
-#else
-    const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME");
-    if (xdg_config_home && *xdg_config_home) {
-        roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models");
-    }
-    const char* home = std::getenv("HOME");
-    if (home && *home) {
-        fs::path home_path = path_from_utf8(home);
-        roots.push_back(home_path / ".flm" / "models");
-        roots.push_back(home_path / ".config" / "flm" / "models");
-    }
-#endif
-
-    return roots;
-}
-
-static fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) {
-    if (repo_dir.empty()) return fs::path();
-
-    for (const auto& root : get_flm_models_dir_candidates()) {
-        fs::path candidate = root / repo_dir / "config.json";
-        if (safe_exists(candidate)) return candidate;
-    }
-    return fs::path();
-}
-
-static std::string repo_dir_from_url(const std::string& url) {
-    std::string clean = url;
-    while (!clean.empty() && clean.back() == '/') clean.pop_back();
-    size_t query_pos = clean.find_first_of("?#");
-    if (query_pos != std::string::npos) clean = clean.substr(0, query_pos);
-
-    for (const std::string marker : {"/tree/", "/resolve/"}) {
-        size_t marker_pos = clean.find(marker);
-        if (marker_pos != std::string::npos) {
-            clean = clean.substr(0, marker_pos);
-            break;
-        }
-    }
-
-    size_t slash = clean.find_last_of('/');
-    return slash == std::string::npos ? clean : clean.substr(slash + 1);
-}
-
-static int64_t read_flm_max_context_window(const ModelInfo& info) {
-    if (info.type != ModelType::LLM) return 0;
-
-    std::string config_path = info.resolved_path("config");
-    if (config_path.empty()) return 0;
-
-    try {
-        json config = JsonUtils::load_from_file(config_path);
-        if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) {
-            int64_t value = config["max_position_embeddings"].get<int64_t>();
-            return value > 0 ? value : 0;
-        }
-        if (config.contains("text_config") && config["text_config"].is_object()) {
-            const auto& text_config = config["text_config"];
-            if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) {
-                int64_t value = text_config["max_position_embeddings"].get<int64_t>();
-                return value > 0 ? value : 0;
-            }
-        }
-    } catch (const std::exception& e) {
-        LOG(DEBUG, "ModelManager") << "Could not read FLM config metadata for "
-                                   << info.model_name << ": " << e.what() << std::endl;
-    }
-    return 0;
-}
 
 static void populate_model_metadata(ModelInfo& info) {
     info.max_context_window = 0;
     if (!info.downloaded) return;
 
-    if (info.recipe == "llamacpp") {
-        std::string gguf_path = info.resolved_path();
-        if (!gguf_path.empty() && ends_with_ignore_case(gguf_path, ".gguf") && safe_exists(path_from_utf8(gguf_path))) {
-            GgufMetadata meta;
-            if (read_gguf_metadata(meta, gguf_path)) {
-                info.max_context_window = meta.context_length;
-                info.gguf_block_count = meta.block_count;
-                info.gguf_embedding_length = meta.embedding_length;
-                info.gguf_head_count_kv = meta.head_count_kv;
-                info.gguf_key_length = meta.key_length;
-
-                // GGUF vision/tool metadata are LLM capabilities. Do not apply
-                // them to embedding/reranking models, otherwise labels such as
-                // tool-calling would reclassify the model away from its endpoint
-                // type and break /embeddings or /rerank.
-                if (info.type == ModelType::LLM) {
-                    apply_gguf_capability_labels(info.labels, meta.caps);
-                }
-            }
-        }
-    } else if (info.recipe == "flm") {
-        info.max_context_window = read_flm_max_context_window(info);
-    }
+    // Per-backend metadata (GGUF arch/labels for llamacpp, config.json ctx for
+    // flm, …) is read by the backend's ops, not a recipe switchboard here.
+    backends::BackendOpsContext ctx;
+    backends::ops_for(info.recipe)->populate_metadata(info, ctx);
 }
 
 static bool is_user_model_name(const std::string& model_name) {
@@ -3108,7 +2776,8 @@ std::vector<ModelInfo> ModelManager::get_flm_available_models() {
                     info.suggested = true; // All official FLM models are suggested
 
                     if (JsonUtils::get_or_default<bool>(m, "installed", false) && m.contains("url") && m["url"].is_string()) {
-                        fs::path config_path = find_flm_config_path_from_repo_dir(repo_dir_from_url(m["url"].get<std::string>()));
+                        fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir(
+                            backends::fastflowlm::repo_dir_from_url(m["url"].get<std::string>()));
                         if (!config_path.empty()) {
                             info.resolved_paths["config"] = path_to_utf8(config_path);
                         }

From 7933852f4009426ecfa7b955f63a34c157ed95a8 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 14:53:40 -0400
Subject: [PATCH 10/39] refactor(backends): descriptor-drive ROCm channels
 (kill duplicated (llamacpp||sd-cpp)&&rocm)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a `rocm_channels` descriptor field (llamacpp {"stable","nightly"}, sd-cpp
{"stable"}) and a recipe_has_rocm_channels() registry helper. Replace the
hardcoded `(recipe=="llamacpp"||recipe=="sd-cpp") && rocm` predicate — copied
across backend_utils.cpp (4×), backend_manager.cpp (2×), and system_info.cpp —
with the descriptor check. rocm_channel_for_recipe() now clamps a requested
channel to one the backend publishes (so sd-cpp's missing "nightly" -> "stable"
falls out of the data instead of a per-recipe special case).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_descriptor.h |  6 ++++++
 .../lemon/backends/backend_descriptor_registry.h    |  5 +++++
 src/cpp/include/lemon/backends/llamacpp/llamacpp.h  |  1 +
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h        |  1 +
 src/cpp/server/backend_manager.cpp                  |  5 +++--
 .../server/backends/backend_descriptor_registry.cpp |  5 +++++
 src/cpp/server/backends/backend_utils.cpp           | 10 +++++-----
 src/cpp/server/runtime_config.cpp                   | 13 ++++++++++---
 src/cpp/server/system_info.cpp                      |  2 +-
 9 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index 3b9cdb2fb..29ea2e0ea 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -65,6 +65,12 @@ struct BackendDescriptor {
     std::string web_display_name;   // name used on the docs website ("" = fall back to display_name)
     int         web_priority = 0;    // model-grouping order on the docs website (lower = higher; 0 = unlisted)
 
+    // ROCm release channels this backend publishes (e.g. {"stable","nightly"}).
+    // Empty = the backend has no ROCm channels (its "rocm" build is a single
+    // artifact). Drives the rocm-stable/rocm-nightly bin-key collapse and the
+    // channel clamp (a requested channel not listed here falls back to the first).
+    std::vector<std::string> rocm_channels;
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
diff --git a/src/cpp/include/lemon/backends/backend_descriptor_registry.h b/src/cpp/include/lemon/backends/backend_descriptor_registry.h
index e3be93cda..44ec7e15d 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor_registry.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor_registry.h
@@ -21,5 +21,10 @@ const BackendDescriptor* descriptor_for(const std::string& recipe);
 // True if the recipe is backed by a registered descriptor.
 bool has_backend(const std::string& recipe);
 
+// True if the recipe publishes ROCm release channels (stable/nightly) — i.e. its
+// "rocm" backend resolves to a channel-specific artifact. False for recipes whose
+// rocm build is a single artifact (or that have no rocm build at all).
+bool recipe_has_rocm_channels(const std::string& recipe);
+
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index 19d63c370..fc43c4515 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -46,6 +46,7 @@ inline const BackendDescriptor descriptor = {
     /*experimental*/    false,
     /*web_display_name*/ "llama.cpp GPU",
     /*web_priority*/    1,
+    /*rocm_channels*/   {"stable", "nightly"},
 };
 
 }  // namespace llamacpp
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 2e12af119..b65fe4fd6 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -50,6 +50,7 @@ inline const BackendDescriptor descriptor = {
     /*experimental*/    false,
     /*web_display_name*/ "stable-diffusion.cpp",
     /*web_priority*/    5,
+    /*rocm_channels*/   {"stable"},
 };
 
 }  // namespace sdcpp
diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp
index 1b61f3407..2983d49ca 100644
--- a/src/cpp/server/backend_manager.cpp
+++ b/src/cpp/server/backend_manager.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backend_manager.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
@@ -35,7 +36,7 @@ std::string get_current_os() {
 }
 
 std::string normalize_backend_name(const std::string& recipe, const std::string& backend) {
-    if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") {
+    if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") {
         // Map "rocm" to the appropriate channel based on config
         std::string channel = "stable";  // default to stable for now
         if (auto* cfg = RuntimeConfig::global()) {
@@ -467,7 +468,7 @@ void BackendManager::install_backend(const std::string& recipe, const std::strin
     // Do that here before inflating the install to a multi-file UX flow.
     const std::string os = get_current_os();
     const bool is_rocm_stable_backend =
-        (recipe == "llamacpp" || recipe == "sd-cpp") &&
+        backends::recipe_has_rocm_channels(recipe) &&
         resolved_backend == "rocm-stable";
     const bool therock_applicable =
         is_rocm_stable_backend && will_install_therock(os, backend_versions_);
diff --git a/src/cpp/server/backends/backend_descriptor_registry.cpp b/src/cpp/server/backends/backend_descriptor_registry.cpp
index 5fd217909..6d1741d87 100644
--- a/src/cpp/server/backends/backend_descriptor_registry.cpp
+++ b/src/cpp/server/backends/backend_descriptor_registry.cpp
@@ -25,5 +25,10 @@ bool has_backend(const std::string& recipe) {
     return descriptor_for(recipe) != nullptr;
 }
 
+bool recipe_has_rocm_channels(const std::string& recipe) {
+    const BackendDescriptor* d = descriptor_for(recipe);
+    return d != nullptr && !d->rocm_channels.empty();
+}
+
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
index 9a57a28db..4c3e505d2 100644
--- a/src/cpp/server/backends/backend_utils.cpp
+++ b/src/cpp/server/backends/backend_utils.cpp
@@ -207,8 +207,8 @@ namespace lemon::backends {
                                               std::string& out_section,
                                               std::string& out_bin_key) {
         std::string config_backend = backend;
-        if ((recipe == "llamacpp" || recipe == "sd-cpp") &&
-            (backend == "rocm-stable" || backend == "rocm-nightly")) {
+        if ((recipe_has_rocm_channels(recipe) &&
+            (backend == "rocm-stable" || backend == "rocm-nightly"))) {
             config_backend = "rocm";
         }
         out_section = RuntimeConfig::recipe_to_config_section(recipe);
@@ -279,7 +279,7 @@ namespace lemon::backends {
 
         // Resolve "rocm" to actual channel for backends that support ROCm channels
         std::string resolved_backend = backend;
-        if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") {
+        if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") {
             std::string channel = "stable";  // default to stable
             if (auto* cfg = RuntimeConfig::global()) {
                 channel = cfg->rocm_channel_for_recipe(spec.recipe);
@@ -319,7 +319,7 @@ namespace lemon::backends {
         // directory or ROCm backends remain stuck in update_required after a
         // successful install.
         std::string resolved_backend = backend;
-        if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") {
+        if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") {
             std::string channel = "stable";
             if (auto* cfg = RuntimeConfig::global()) {
                 channel = cfg->rocm_channel_for_recipe(spec.recipe);
@@ -333,7 +333,7 @@ namespace lemon::backends {
 
     std::string BackendUtils::get_backend_version(const std::string& recipe, const std::string& backend) {
         std::string resolved_backend = backend;
-        if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") {
+        if (recipe_has_rocm_channels(recipe) && backend == "rocm") {
             // Map "rocm" to the appropriate channel based on config
             std::string channel = "stable";  // default to stable for now
             if (auto* cfg = RuntimeConfig::global()) {
diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp
index 08aa41dc6..0a14f006b 100644
--- a/src/cpp/server/runtime_config.cpp
+++ b/src/cpp/server/runtime_config.cpp
@@ -254,9 +254,16 @@ std::string RuntimeConfig::rocm_channel() const {
 
 std::string RuntimeConfig::rocm_channel_for_recipe(const std::string& recipe) const {
     std::string channel = rocm_channel();
-    // sd-cpp currently has no nightly artifacts; use stable builds.
-    if (recipe == "sd-cpp" && channel == "nightly") {
-        return "stable";
+    // Clamp to a channel the backend actually publishes. A backend that lists
+    // only {"stable"} (e.g. sd-cpp, which has no nightly artifacts) falls back to
+    // its first channel when "nightly" is requested. Driven by the descriptor's
+    // rocm_channels, so no per-recipe special case lives here.
+    const auto* desc = lemon::backends::descriptor_for(recipe);
+    if (desc && !desc->rocm_channels.empty()) {
+        const auto& channels = desc->rocm_channels;
+        if (std::find(channels.begin(), channels.end(), channel) == channels.end()) {
+            return channels.front();
+        }
     }
     return channel;
 }
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index cdf089843..45335912d 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -718,7 +718,7 @@ static std::string get_expected_backend_version(const std::string& recipe, const
     // version pins ("rocm-stable", "rocm-nightly") in backend_versions.json.
     // Mirror the resolution done by BackendUtils::get_backend_version().
     std::string resolved_backend = backend;
-    if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") {
+    if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") {
         std::string channel = "stable";
         if (auto* cfg = RuntimeConfig::global()) {
             channel = cfg->rocm_channel_for_recipe(recipe);

From 2cc963e499d8da1725fd3dd7620183476766a166 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 15:25:10 -0400
Subject: [PATCH 11/39] refactor(backends): migrate resolve_model_path
 switchboard to ops (worst leak)

Replace the ~290-line recipe switchboard in ModelManager::resolve_model_path
with ops_for(recipe)->resolve_checkpoint_path(). The model manager now only does
the generic prefix (collections, local_path/local_upload, HF cache-dir
computation) and hands off to the backend.

- New BackendOps::resolve_checkpoint_path; base = the shared HF behavior
  (active-snapshot variant/aux resolution, main-repo fallback, directory
  fallback). Backends override only their artifact layout:
    * llamacpp -> GGUF resolver (sharding/folder/quant-token), moved into
      backends/llamacpp/llamacpp_gguf (resolve_gguf_path).
    * ryzenai -> genai_config.json directory; kokoro -> index.json;
      whispercpp -> first .bin; cloud -> ""; flm -> checkpoint passthrough.
- New shared backends/hf_cache_util (exists/dir_options/active_snapshot_path/
  repo_id_to_cache_dir_name) so ops reuse the same HF-cache mechanics.

model_manager.cpp -362 lines; resolve_model_path 365 -> 34. Verified all recipes
still resolve as downloaded (llamacpp variants, whisper .bin, kokoro index,
sd-cpp, ryzenai, flm) via /models.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                                |   3 +-
 src/cpp/include/lemon/backends/backend_ops.h  |  20 +
 .../include/lemon/backends/hf_cache_util.h    |  30 ++
 .../lemon/backends/llamacpp/llamacpp_gguf.h   |   6 +
 src/cpp/server/backends/backend_ops.cpp       |  99 +++++
 .../server/backends/cloud/cloud_server.cpp    |  17 +-
 .../backends/fastflowlm/fastflowlm_server.cpp |   6 +
 src/cpp/server/backends/hf_cache_util.cpp     |  72 ++++
 .../server/backends/kokoro/kokoro_server.cpp  |  30 +-
 .../backends/llamacpp/llamacpp_gguf.cpp       | 204 ++++++++++
 .../backends/llamacpp/llamacpp_server.cpp     |  10 +
 .../backends/ryzenai/ryzenai_server.cpp       |  28 +-
 .../backends/whispercpp/whispercpp_server.cpp |  39 +-
 src/cpp/server/model_manager.cpp              | 362 +-----------------
 14 files changed, 574 insertions(+), 352 deletions(-)
 create mode 100644 src/cpp/include/lemon/backends/hf_cache_util.h
 create mode 100644 src/cpp/server/backends/hf_cache_util.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 758108b69..8c19edd87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -679,7 +679,8 @@ set(LEMON_BACKEND_DESCRIPTOR_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp)
 set(LEMON_BACKEND_FACTORY_SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp
-    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp)
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/hf_cache_util.cpp)
 foreach(_backend_entry ${LEMON_BACKENDS})
     string(REPLACE "|" ";" _backend_parts "${_backend_entry}")
     list(GET _backend_parts 1 _backend_stem)
diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index 53b046e84..c973a7de7 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -16,6 +16,19 @@ struct BackendOpsContext {
     ModelManager* model_manager = nullptr;
 };
 
+// Inputs for resolving a checkpoint's on-disk path. The model manager computes
+// the HF-cache locations generically; each backend's ops decide how to find its
+// artifact within (a .gguf file, a genai_config.json directory, a .bin, …).
+struct CheckpointResolveContext {
+    std::string hf_cache;          // HF cache root dir
+    std::string model_cache_path;  // hf_cache/<checkpoint repo cache dir>
+    std::string repo_id;           // checkpoint's repo id
+    std::string main_repo_id;      // the model's "main" checkpoint repo id (fallback)
+    std::string variant;           // checkpoint variant after ':' ("" if none)
+    std::string type;              // checkpoint type ("main", "mmproj", "npu_cache", …)
+    std::string checkpoint;        // the raw checkpoint string
+};
+
 // Stateless per-backend behavior for model management that happens WITHOUT a
 // running subprocess: checkpoint-path resolution, download, dynamic discovery,
 // per-model metadata, version detection, availability. One singleton per
@@ -37,6 +50,13 @@ class BackendOps {
         (void)info;
         (void)ctx;
     }
+
+    // Resolve a checkpoint to its absolute on-disk path (file or directory).
+    // Default: the shared HF behavior — locate the variant/aux file in the active
+    // snapshot, else fall back to the model cache directory. Backends with a
+    // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override.
+    virtual std::string resolve_checkpoint_path(const ModelInfo& info,
+                                                const CheckpointResolveContext& ctx) const;
 };
 
 // Shared default ops instance for backends that override nothing.
diff --git a/src/cpp/include/lemon/backends/hf_cache_util.h b/src/cpp/include/lemon/backends/hf_cache_util.h
new file mode 100644
index 000000000..91c64278e
--- /dev/null
+++ b/src/cpp/include/lemon/backends/hf_cache_util.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <filesystem>
+#include <string>
+
+namespace lemon {
+namespace backends {
+namespace hf_cache {
+
+// Shared Hugging Face cache mechanics used by backend ops to locate model
+// artifacts on disk (the same logic model_manager uses for its own cache work).
+
+// Exists check that tolerates the symlinks HF uses for dedup (uses the Win32 API
+// on Windows, where MSVC's std::filesystem refuses untrusted reparse points).
+bool exists(const std::filesystem::path& p);
+
+// Directory-iteration options for walking the HF cache: skip permission-denied
+// entries on Windows instead of throwing; the standard defaults elsewhere.
+std::filesystem::directory_options dir_options();
+
+// The active HF snapshot directory (snapshots/<refs/main>) for a model cache
+// dir, or an empty path if there is no recorded ref / it doesn't exist.
+std::filesystem::path active_snapshot_path(const std::filesystem::path& model_cache_path);
+
+// HF cache directory name for a repo id ("org/repo" -> "models--org--repo").
+std::string repo_id_to_cache_dir_name(const std::string& repo_id);
+
+} // namespace hf_cache
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
index 2e431478b..ccf79ae57 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h
@@ -25,6 +25,12 @@ struct GgufMetadata {
 // not a valid GGUF container.
 bool read_gguf_metadata(GgufMetadata& out, const std::string& path);
 
+// Resolve the on-disk path of the GGUF file for a model cache directory and
+// variant (handles sharding, folder variants, and quant-token fallback). Returns
+// the cache directory if no GGUF is present, or "" if the requested variant
+// can't be resolved.
+std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant);
+
 } // namespace llamacpp
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp
index 773e39494..af21249c5 100644
--- a/src/cpp/server/backends/backend_ops.cpp
+++ b/src/cpp/server/backends/backend_ops.cpp
@@ -1,8 +1,107 @@
 #include "lemon/backends/backend_ops.h"
 
+#include <algorithm>
+#include <filesystem>
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/utils/path_utils.h"
+
+namespace fs = std::filesystem;
+
 namespace lemon {
 namespace backends {
 
+using lemon::utils::path_from_utf8;
+using lemon::utils::path_to_utf8;
+
+// Default checkpoint resolution: the shared Hugging Face behavior. Locate the
+// requested variant (or auxiliary file like mmproj) within the active snapshot,
+// falling back to the main repo and finally the model cache directory. Backends
+// with bespoke layouts override resolve_checkpoint_path().
+std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info,
+                                                const CheckpointResolveContext& ctx) const {
+    (void)info;
+
+    // NPU side-cache checkpoints have no resolvable local file here (the backend
+    // that uses them resolves them itself at load time).
+    if (ctx.type == "npu_cache") {
+        return "";
+    }
+
+    fs::path model_cache_path_fs = path_from_utf8(ctx.model_cache_path);
+
+    if (!ctx.variant.empty()) {
+        // Prefer refs/main for auxiliary checkpoints too (e.g. mmproj) so
+        // companion files stay on the same active snapshot as the main model.
+        fs::path active_snapshot = hf_cache::active_snapshot_path(model_cache_path_fs);
+        if (!active_snapshot.empty()) {
+            fs::path direct_variant_path = active_snapshot / path_from_utf8(ctx.variant);
+            if (hf_cache::exists(direct_variant_path)) {
+                return path_to_utf8(direct_variant_path);
+            }
+            std::error_code ec;
+            for (const auto& entry :
+                 fs::recursive_directory_iterator(active_snapshot, hf_cache::dir_options(), ec)) {
+                if (ec) break;
+                if (entry.is_regular_file(ec)) {
+                    if (entry.path().filename().string() == ctx.variant) {
+                        return path_to_utf8(entry.path());
+                    }
+                } else if (entry.is_directory(ec)) {
+                    fs::path variant_path = entry.path() / path_from_utf8(ctx.variant);
+                    if (hf_cache::exists(variant_path)) {
+                        return path_to_utf8(variant_path);
+                    }
+                }
+                ec.clear();
+            }
+        }
+
+        // Try to find the exact variant in the cache directory's subtree.
+        if (hf_cache::exists(model_cache_path_fs)) {
+            for (const auto& entry :
+                 fs::recursive_directory_iterator(model_cache_path_fs, hf_cache::dir_options())) {
+                if (entry.is_regular_file()) {
+                    if (entry.path().filename().string() == ctx.variant) {
+                        return path_to_utf8(entry.path());
+                    }
+                } else if (entry.is_directory()) {
+                    fs::path variant_path = entry.path() / path_from_utf8(ctx.variant);
+                    if (hf_cache::exists(variant_path)) {
+                        return path_to_utf8(variant_path);
+                    }
+                }
+            }
+        }
+
+        // Backward-compat: older downloads placed all files in the main repo dir.
+        if (ctx.repo_id != ctx.main_repo_id) {
+            std::string main_cache_path =
+                ctx.hf_cache + "/" + hf_cache::repo_id_to_cache_dir_name(ctx.main_repo_id);
+            fs::path main_cache_path_fs = path_from_utf8(main_cache_path);
+            if (fs::exists(main_cache_path_fs)) {
+                for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) {
+                    if (entry.is_regular_file()) {
+                        if (entry.path().filename().string() == ctx.variant) {
+                            return path_to_utf8(entry.path());
+                        }
+                    } else if (entry.is_directory()) {
+                        fs::path variant_path = entry.path() / path_from_utf8(ctx.variant);
+                        if (fs::exists(variant_path)) {
+                            return path_to_utf8(variant_path);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Variant not found — signal not downloaded.
+        return "";
+    }
+
+    // No variant: return the cache directory.
+    return ctx.model_cache_path;
+}
+
 const BackendOps* default_backend_ops() {
     static const BackendOps kDefault;
     return &kDefault;
diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
index 29dede2b0..b20aae05c 100644
--- a/src/cpp/server/backends/cloud/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -806,8 +806,23 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
+namespace {
+class CloudOps : public BackendOps {
+public:
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext&) const override {
+        // Cloud-offloaded models have no local artifacts; the checkpoint is the
+        // upstream provider's model id, used directly when forwarding requests.
+        return "";
+    }
+};
+}  // namespace
+
 const BackendSpec* spec() { return nullptr; }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const CloudOps kOps;
+    return &kOps;
+}
 }  // namespace cloud
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 0b5e15934..648cd9ff5 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -485,6 +485,12 @@ class FlmOps : public BackendOps {
     void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override {
         info.max_context_window = read_flm_max_context_window(info);
     }
+
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext& ctx) const override {
+        // FLM uses the checkpoint string as-is (e.g. "gemma3:4b"); no local file.
+        return ctx.checkpoint;
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/backends/hf_cache_util.cpp b/src/cpp/server/backends/hf_cache_util.cpp
new file mode 100644
index 000000000..028b25ee4
--- /dev/null
+++ b/src/cpp/server/backends/hf_cache_util.cpp
@@ -0,0 +1,72 @@
+#include "lemon/backends/hf_cache_util.h"
+
+#include <fstream>
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+namespace fs = std::filesystem;
+
+namespace lemon {
+namespace backends {
+namespace hf_cache {
+
+bool exists(const fs::path& p) {
+#ifdef _WIN32
+    // The HF cache uses symlinks for dedup; MSVC's std::filesystem refuses
+    // "untrusted" reparse points when the token lacks symlink privilege, so use
+    // the Win32 API which has no such restriction.
+    return GetFileAttributesW(p.c_str()) != INVALID_FILE_ATTRIBUTES;
+#else
+    std::error_code ec;
+    return fs::exists(p, ec);
+#endif
+}
+
+fs::directory_options dir_options() {
+#ifdef _WIN32
+    return fs::directory_options::skip_permission_denied;
+#else
+    return fs::directory_options::none;
+#endif
+}
+
+namespace {
+std::string read_ref_main(const fs::path& model_cache_path) {
+    std::ifstream refs_file(model_cache_path / "refs" / "main");
+    if (!refs_file.is_open()) {
+        return "";
+    }
+    std::string ref;
+    std::getline(refs_file, ref);
+    ref.erase(0, ref.find_first_not_of(" \t\r\n"));
+    size_t last = ref.find_last_not_of(" \t\r\n");
+    if (last == std::string::npos) {
+        return "";
+    }
+    ref.erase(last + 1);
+    return ref;
+}
+} // namespace
+
+fs::path active_snapshot_path(const fs::path& model_cache_path) {
+    std::string ref = read_ref_main(model_cache_path);
+    if (ref.empty()) {
+        return fs::path();
+    }
+    fs::path snapshot_path = model_cache_path / "snapshots" / ref;
+    return lemon::backends::hf_cache::exists(snapshot_path) ? snapshot_path : fs::path();
+}
+
+std::string repo_id_to_cache_dir_name(const std::string& repo_id) {
+    std::string cache_dir_name = "models--";
+    for (char c : repo_id) {
+        cache_dir_name += (c == '/') ? "--" : std::string(1, c);
+    }
+    return cache_dir_name;
+}
+
+} // namespace hf_cache
+} // namespace backends
+} // namespace lemon
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index 13f1a3ffe..80d502ead 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -1,6 +1,11 @@
 #include "lemon/backends/kokoro/kokoro_server.h"
 #include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/model_manager.h"
+#include "lemon/utils/path_utils.h"
+#include <filesystem>
 #include "lemon/backend_manager.h"
 #include "lemon/utils/process_manager.h"
 #include "lemon/utils/json_utils.h"
@@ -214,8 +219,31 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
+namespace {
+class KokoroOps : public BackendOps {
+public:
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext& ctx) const override {
+        // Kokoro models are a directory; resolve to the index.json file inside.
+        std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path);
+        if (hf_cache::exists(dir)) {
+            for (const auto& entry :
+                 std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) {
+                if (entry.is_regular_file() && entry.path().filename() == "index.json") {
+                    return lemon::utils::path_to_utf8(entry.path());
+                }
+            }
+        }
+        return ctx.model_cache_path;  // directory even if index not found
+    }
+};
+}  // namespace
+
 const BackendSpec* spec() { return &KokoroServer::SPEC; }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const KokoroOps kOps;
+    return &kOps;
+}
 }  // namespace kokoro
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
index 1e099d064..e23e3c2a4 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
@@ -3,17 +3,31 @@
 #include <algorithm>
 #include <cctype>
 #include <cstring>
+#include <filesystem>
 #include <fstream>
 #include <istream>
 #include <limits>
+#include <map>
+#include <vector>
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/hf_variants.h"
+#include "lemon/utils/aixlog.hpp"
 #include "lemon/utils/path_utils.h"
 
+namespace fs = std::filesystem;
+
 namespace lemon {
 namespace backends {
 namespace llamacpp {
 namespace {
 
 using lemon::utils::path_from_utf8;
+using lemon::utils::path_to_utf8;
+
+std::string to_lower(std::string s) {
+    std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
+    return s;
+}
 
 // Local copies of the tiny case-insensitive string helpers (kept out of a shared
 // util to keep this GGUF reader self-contained).
@@ -248,6 +262,196 @@ bool read_gguf_metadata(GgufMetadata& out, const std::string& path) {
 }
 
 
+// Resolve the on-disk GGUF file for a llama.cpp main checkpoint.
+//
+// model_cache_path: UTF-8 path of the model's directory in the HF cache.
+// variant: "" or "*" (first file), a file name containing ".gguf" (exact
+//          match), or a token matched as a "<variant>.gguf" suffix, as a
+//          "<variant>/" folder, or via enumerate_gguf_variants(), in that order.
+//
+// Returns the matching file path; model_cache_path itself when the directory
+// is missing or holds no GGUF file; "" when a specific variant is requested
+// but no (or no unambiguous) local match exists.
+std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant) {
+    fs::path model_cache_path_fs = path_from_utf8(model_cache_path);
+    if (!hf_cache::exists(model_cache_path_fs)) {
+        return model_cache_path;  // Return directory path even if not found
+    }
+
+    // Gather every file under search_root whose name contains ".gguf"
+    // (case-sensitive) and does not contain "mmproj" (case-insensitive).
+    auto collect_gguf_files = [](const fs::path& search_root) {
+        std::vector<std::string> files;
+        if (search_root.empty() || !hf_cache::exists(search_root)) {
+            return files;
+        }
+
+        std::error_code ec;
+        for (const auto& entry : fs::recursive_directory_iterator(search_root, hf_cache::dir_options(), ec)) {
+            if (ec) break;
+            if (!entry.is_regular_file(ec)) {
+                ec.clear();
+                continue;
+            }
+
+            const std::string filename = entry.path().filename().string();
+            // to_lower() goes through unsigned char; raw ::tolower is undefined
+            // for the negative char values that non-ASCII file names contain.
+            if (filename.find(".gguf") != std::string::npos &&
+                to_lower(filename).find("mmproj") == std::string::npos) {
+                files.push_back(path_to_utf8(entry.path()));
+            }
+        }
+        return files;
+    };
+
+    // Prefer the active HF snapshot recorded in refs/main. This lets
+    // Lemonade keep using the previous snapshot when upstream only changed
+    // README/metadata and the requested model artifacts are unchanged.
+    std::vector<std::string> all_gguf_files =
+        collect_gguf_files(hf_cache::active_snapshot_path(model_cache_path_fs));
+    if (all_gguf_files.empty()) {
+        // Backward-compatible fallback for caches without refs/main and for
+        // partially migrated/manual HF cache layouts.
+        all_gguf_files = collect_gguf_files(model_cache_path_fs);
+    }
+
+    if (all_gguf_files.empty()) {
+        return model_cache_path;  // Return directory if no GGUF found
+    }
+
+    // Sort files for consistent ordering (important for sharded models)
+    std::sort(all_gguf_files.begin(), all_gguf_files.end());
+
+    // Case 0/1: wildcard (*) or empty variant - return the first file
+    // (for "*", llama-server will auto-load the remaining shards).
+    if (variant == "*" || variant.empty()) {
+        return all_gguf_files.front();
+    }
+
+    // Case 2: Exact filename match (variant contains ".gguf")
+    if (variant.find(".gguf") != std::string::npos) {
+        for (const auto& filepath : all_gguf_files) {
+            if (path_from_utf8(filepath).filename().string() == variant) {
+                return filepath;
+            }
+        }
+        return "";  // Exact variant not found — signal not downloaded
+    }
+
+    // Case 3: Files ending with {variant}.gguf (case insensitive)
+    const std::string variant_lower = to_lower(variant);
+    const std::string suffix = variant_lower + ".gguf";
+
+    for (const auto& filepath : all_gguf_files) {
+        const std::string filename_lower = to_lower(path_from_utf8(filepath).filename().string());
+        // The list is sorted, so the first hit is the lexicographically first match.
+        if (filename_lower.size() >= suffix.size() &&
+            filename_lower.compare(filename_lower.size() - suffix.size(), suffix.size(), suffix) == 0) {
+            return filepath;
+        }
+    }
+
+    // Case 4: Folder-based sharding (files in variant/ folder)
+    // A match anywhere in the relative path counts, not only at its start.
+    const std::string folder_prefix_lower = variant_lower + "/";
+
+    for (const auto& filepath : all_gguf_files) {
+        // Get relative path from model cache path
+        const std::string relative_path = path_to_utf8(
+            path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
+        // Normalize path separators and case so folder-variant matching works cross-platform.
+        std::string relative_lower = to_lower(relative_path);
+        std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/');
+
+        if (relative_lower.find(folder_prefix_lower) != std::string::npos) {
+            return filepath;
+        }
+    }
+
+    // Case 5: Local quant-token fallback.
+    //
+    // Keep the existing resolver cases above as the primary logic: exact
+    // filenames, suffix matches, and folder-based sharding are more
+    // specific and preserve the CHECKPOINT:VARIANT contract.
+    //
+    // Some GGUF repositories name files with the quant token in the middle,
+    // for example:
+    //   Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf
+    // for variant:
+    //   IQ4_XS
+    // That file does not end with IQ4_XS.gguf, so mirror the downloader's
+    // GGUF variant enumeration over the files that are already present in
+    // the local HF cache before declaring the model missing.
+    //
+    // HF cache paths have an extra snapshots/<revision>/ prefix that is not
+    // part of the repository-relative filename. Strip it before calling
+    // enumerate_gguf_variants(); otherwise the enumerator treats
+    // "snapshots" as a top-level sharded-folder variant and never extracts
+    // the quant token from the actual GGUF filename.
+    std::vector<std::string> relative_gguf_files;
+    std::map<std::string, std::string> absolute_by_relative;
+    auto repo_relative_from_cache_relative = [](std::string rel) {
+        std::replace(rel.begin(), rel.end(), '\\', '/');
+
+        static const std::string snapshots_prefix = "snapshots/";
+        if (rel.rfind(snapshots_prefix, 0) == 0) {
+            size_t revision_end = rel.find('/', snapshots_prefix.size());
+            if (revision_end != std::string::npos && revision_end + 1 < rel.size()) {
+                rel = rel.substr(revision_end + 1);
+            }
+        }
+
+        return rel;
+    };
+
+    for (const auto& filepath : all_gguf_files) {
+        std::string relative_path = path_to_utf8(
+            path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
+        relative_path = repo_relative_from_cache_relative(relative_path);
+
+        // Multiple HF snapshots can contain the same repo-relative file.
+        // Keep the first absolute path from the sorted all_gguf_files list
+        // so duplicates do not create false ambiguity.
+        if (absolute_by_relative.emplace(relative_path, filepath).second) {
+            relative_gguf_files.push_back(relative_path);
+        }
+    }
+
+    std::vector<std::string> enumerated_matches;
+    auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files);
+    for (const auto& local_variant : local_variants.variants) {
+        if (to_lower(local_variant.name) != variant_lower) {
+            continue;
+        }
+
+        auto it = absolute_by_relative.find(local_variant.primary_file);
+        if (it != absolute_by_relative.end()) {
+            enumerated_matches.push_back(it->second);
+        }
+    }
+
+    if (enumerated_matches.size() == 1) {
+        LOG(INFO, "ModelManager")
+            << "Resolved local GGUF variant '" << variant
+            << "' via quant-token fallback: " << enumerated_matches[0] << std::endl;
+        return enumerated_matches[0];
+    }
+
+    if (enumerated_matches.size() > 1) {
+        LOG(WARNING, "ModelManager")
+            << "Multiple local GGUF files matched variant '" << variant
+            << "' via quant-token fallback; refusing to guess" << std::endl;
+        return "";
+    }
+
+    // No match found for the requested GGUF variant. Do not fall back to
+    // another quantization in the same Hugging Face repo; otherwise a
+    // custom download with a different quant can make a built-in model
+    // appear downloaded and allow deleting the wrong file.
+    return "";
+}
+
 } // namespace llamacpp
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index e295a835a..a9af9359f 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -695,6 +695,16 @@ class LlamaCppOps : public BackendOps {
             apply_gguf_capability_labels(info.labels, meta.caps);
         }
     }
+
+    std::string resolve_checkpoint_path(const ModelInfo& info,
+                                        const CheckpointResolveContext& ctx) const override {
+        // Only the main checkpoint is a GGUF file needing sharding/variant
+        // resolution; auxiliary checkpoints (mmproj, …) take the shared default.
+        if (ctx.type != "main") {
+            return BackendOps::resolve_checkpoint_path(info, ctx);
+        }
+        return resolve_gguf_path(ctx.model_cache_path, ctx.variant);
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index e965ea3b9..f6ba8f457 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -1,6 +1,9 @@
 #include "lemon/backends/ryzenai/ryzenai_server.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/model_manager.h"
+#include "lemon/backends/backend_ops.h"
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/utils/path_utils.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
 #include "lemon/utils/process_manager.h"
@@ -185,8 +188,31 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
+namespace {
+class RyzenAiOps : public BackendOps {
+public:
+    // Resolves a RyzenAI checkpoint to the directory that holds genai_config.json.
+    std::string resolve_checkpoint_path(const ModelInfo&,
+                                        const CheckpointResolveContext& ctx) const override {
+        const std::filesystem::path cache_root = lemon::utils::path_from_utf8(ctx.model_cache_path);
+        if (!hf_cache::exists(cache_root)) {
+            return ctx.model_cache_path;  // nothing cached: hand back the directory path
+        }
+        for (const auto& item :
+             std::filesystem::recursive_directory_iterator(cache_root, hf_cache::dir_options())) {
+            const bool is_config = item.is_regular_file() && item.path().filename() == "genai_config.json";
+            if (is_config) return lemon::utils::path_to_utf8(item.path().parent_path());
+        }
+        return ctx.model_cache_path;  // directory even if genai_config not found
+    }
+};
+}  // namespace
+
 const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const RyzenAiOps ryzenai_ops;  // function-local static: built once, then reused
+    return &ryzenai_ops;
+}
 }  // namespace ryzenai
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index c77d10669..dfa0ebea9 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -1,6 +1,9 @@
 #include "lemon/backends/whispercpp/whispercpp_server.h"
 #include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
+#include "lemon/backends/hf_cache_util.h"
+#include "lemon/model_manager.h"
 #include "lemon/backend_manager.h"
 #include "lemon/runtime_config.h"
 #include "lemon/system_info.h"
@@ -699,8 +702,42 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
+namespace {
+class WhisperOps : public BackendOps {
+public:
+    std::string resolve_checkpoint_path(const ModelInfo& info,
+                                        const CheckpointResolveContext& ctx) const override {
+        // A checkpoint that names a variant takes the shared default resolution
+        // (variant/aux lookup); only the variant-less case is handled here.
+        if (!ctx.variant.empty()) {
+            return BackendOps::resolve_checkpoint_path(info, ctx);
+        }
+
+        const std::filesystem::path cache_root = lemon::utils::path_from_utf8(ctx.model_cache_path);
+        if (!hf_cache::exists(cache_root)) {
+            return ctx.model_cache_path;
+        }
+        // Collect every regular file whose name contains ".bin".
+        std::vector<std::string> candidates;
+        for (const auto& item :
+             std::filesystem::recursive_directory_iterator(cache_root, hf_cache::dir_options())) {
+            const bool is_bin = item.is_regular_file() &&
+                                item.path().filename().string().find(".bin") != std::string::npos;
+            if (is_bin) candidates.push_back(lemon::utils::path_to_utf8(item.path()));
+        }
+
+        // No match: return the cache directory; otherwise the lexicographically first path.
+        std::sort(candidates.begin(), candidates.end());
+        return candidates.empty() ? ctx.model_cache_path : candidates.front();
+    }
+};
+}  // namespace
+
 const BackendSpec* spec() { return &WhisperServer::SPEC; }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const WhisperOps whisper_ops;  // function-local static: built once, then reused
+    return &whisper_ops;
+}
 }  // namespace whispercpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 81d0200b2..d1295ff92 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1091,370 +1091,38 @@ std::map<std::string, ModelInfo> ModelManager::discover_extra_models() const {
 }
 
 std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::string& type, const std::string& checkpoint) const {
-    // Collections are virtual entries with no direct checkpoint to resolve
+    // Collections are virtual entries with no direct checkpoint to resolve.
     if (is_collection_recipe(info.recipe)) {
         return "";
     }
 
-    // Cloud-offloaded models have no local artifacts; checkpoint is the
-    // upstream provider's model id, used directly when forwarding requests.
-    if (info.recipe == "cloud") {
-        return "";
-    }
-
-    // FLM models use checkpoint as-is (e.g., "gemma3:4b")
-    if (info.recipe == "flm") {
-        return checkpoint;
-    }
-
-    // Local path models use checkpoint as-is (absolute path to file)
+    // Local-path models use the checkpoint as-is (absolute path to a file).
     if (info.source == "local_path") {
         return checkpoint;
     }
 
     std::string hf_cache = get_hf_cache_dir();
 
-    // Local uploads: checkpoint is relative path from HF cache
+    // Local uploads: checkpoint is a relative path from the HF cache.
     if (info.source == "local_upload") {
         std::string normalized = checkpoint;
         std::replace(normalized.begin(), normalized.end(), '\\', '/');
         return hf_cache + "/" + normalized;
     }
 
-    // For now, NPU cache is handled directly in whisper.cpp
-    if (type == "npu_cache") {
-        return "";
-    }
-
-    // HuggingFace models: need to find the GGUF file in cache
-    // Parse checkpoint to get repo_id and variant
-    // Use the checkpoint's own repo, falling back to main repo for backward compatibility
-    std::string checkpoint_repo_id = checkpoint_to_repo_id(checkpoint);
-    std::string main_repo_id = checkpoint_to_repo_id(info.checkpoint("main"));
-    std::string repo_id = checkpoint_repo_id;
-    std::string variant = checkpoint_to_variant(checkpoint);
-
-    std::string model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(repo_id);
-    fs::path model_cache_path_fs = path_from_utf8(model_cache_path);
-
-    // For RyzenAI LLM models, look for genai_config.json directory
-    if (info.recipe == "ryzenai-llm") {
-        if (safe_exists(model_cache_path_fs)) {
-            for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-                if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") {
-                    return path_to_utf8(entry.path().parent_path());
-                }
-            }
-        }
-        return model_cache_path;  // Return directory even if genai_config not found
-    }
-
-    // For kokoro models, look for index.json directory
-    if (info.recipe == "kokoro") {
-        if (safe_exists(model_cache_path_fs)) {
-            for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-                if (entry.is_regular_file() && entry.path().filename() == "index.json") {
-                    return path_to_utf8(entry.path());
-                }
-            }
-        }
-
-        return model_cache_path;  // Return directory even if index not found
-    }
-
-    // For whispercpp, find the .bin model file
-    if (info.recipe == "whispercpp" && variant.empty()) {
-        // No variant specified - use fallback logic to find any .bin file
-        if (!safe_exists(model_cache_path_fs)) {
-            return model_cache_path;  // Return directory path even if not found
-        }
-
-        // Collect all .bin files
-        std::vector<std::string> all_bin_files;
-        for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-            if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                if (filename.find(".bin") != std::string::npos) {
-                    all_bin_files.push_back(path_to_utf8(entry.path()));
-                }
-            }
-        }
-
-        if (all_bin_files.empty()) {
-            return model_cache_path;  // Return directory if no .bin found
-        }
-
-        // Sort files for consistent ordering
-        std::sort(all_bin_files.begin(), all_bin_files.end());
-
-        // Return first .bin file as fallback (only when no variant specified)
-        return all_bin_files[0];
-    }
-
-    // For llamacpp, find the GGUF file with advanced sharded model support
-    if (info.recipe == "llamacpp" && type == "main") {
-        if (!safe_exists(model_cache_path_fs)) {
-            return model_cache_path;  // Return directory path even if not found
-        }
-
-        // Prefer the active HF snapshot recorded in refs/main. This lets
-        // Lemonade keep using the previous snapshot when upstream only changed
-        // README/metadata and the requested model artifacts are unchanged.
-        auto collect_gguf_files = [](const fs::path& search_root) {
-            std::vector<std::string> files;
-            if (search_root.empty() || !safe_exists(search_root)) {
-                return files;
-            }
-
-            std::error_code ec;
-            for (const auto& entry : fs::recursive_directory_iterator(search_root, safe_dir_options, ec)) {
-                if (ec) break;
-                if (!entry.is_regular_file(ec)) {
-                    ec.clear();
-                    continue;
-                }
-
-                std::string filename = entry.path().filename().string();
-                std::string filename_lower = filename;
-                std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
-
-                if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) {
-                    files.push_back(path_to_utf8(entry.path()));
-                }
-            }
-            return files;
-        };
-
-        std::vector<std::string> all_gguf_files = collect_gguf_files(active_hf_snapshot_path(model_cache_path_fs));
-        if (all_gguf_files.empty()) {
-            // Backward-compatible fallback for caches without refs/main and for
-            // partially migrated/manual HF cache layouts.
-            all_gguf_files = collect_gguf_files(model_cache_path_fs);
-        }
-
-        if (all_gguf_files.empty()) {
-            return model_cache_path;  // Return directory if no GGUF found
-        }
-
-        // Sort files for consistent ordering (important for sharded models)
-        std::sort(all_gguf_files.begin(), all_gguf_files.end());
-
-        // Case 0: Wildcard (*) - return first file (llama-server will auto-load shards)
-        if (variant == "*") {
-            return all_gguf_files[0];
-        }
-
-        // Case 1: Empty variant - return first file
-        if (variant.empty()) {
-            return all_gguf_files[0];
-        }
-
-        // Case 2: Exact filename match (variant ends with .gguf)
-        if (variant.find(".gguf") != std::string::npos) {
-            for (const auto& filepath : all_gguf_files) {
-                std::string filename = path_from_utf8(filepath).filename().string();
-                if (filename == variant) {
-                    return filepath;
-                }
-            }
-            return "";  // Exact variant not found — signal not downloaded
-        }
-
-        // Case 3: Files ending with {variant}.gguf (case insensitive)
-        std::string variant_lower = variant;
-        std::transform(variant_lower.begin(), variant_lower.end(), variant_lower.begin(), ::tolower);
-        std::string suffix = variant_lower + ".gguf";
-
-        std::vector<std::string> matching_files;
-        for (const auto& filepath : all_gguf_files) {
-            std::string filename = path_from_utf8(filepath).filename().string();
-            std::string filename_lower = filename;
-            std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
-
-            if (filename_lower.size() >= suffix.size() &&
-                filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) {
-                matching_files.push_back(filepath);
-            }
-        }
-
-        if (!matching_files.empty()) {
-            return matching_files[0];
-        }
-
-        // Case 4: Folder-based sharding (files in variant/ folder)
-        std::string folder_prefix_lower = variant_lower + "/";
-
-        for (const auto& filepath : all_gguf_files) {
-            // Get relative path from model cache path
-            std::string relative_path = path_to_utf8(
-                path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
-            std::string relative_lower = relative_path;
-            // Normalize path separators and case so folder-variant matching works cross-platform.
-            std::transform(relative_lower.begin(), relative_lower.end(), relative_lower.begin(), ::tolower);
-            std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/');
-
-            if (relative_lower.find(folder_prefix_lower) != std::string::npos) {
-                return filepath;
-            }
-        }
-
-        // Case 5: Local quant-token fallback.
-        //
-        // Keep the existing resolver cases above as the primary logic: exact
-        // filenames, suffix matches, and folder-based sharding are more
-        // specific and preserve the CHECKPOINT:VARIANT contract.
-        //
-        // Some GGUF repositories name files with the quant token in the middle,
-        // for example:
-        //   Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf
-        // for variant:
-        //   IQ4_XS
-        // That file does not end with IQ4_XS.gguf, so mirror the downloader's
-        // GGUF variant enumeration over the files that are already present in
-        // the local HF cache before declaring the model missing.
-        //
-        // HF cache paths have an extra snapshots/<revision>/ prefix that is not
-        // part of the repository-relative filename. Strip it before calling
-        // enumerate_gguf_variants(); otherwise the enumerator treats
-        // "snapshots" as a top-level sharded-folder variant and never extracts
-        // the quant token from the actual GGUF filename.
-        std::vector<std::string> relative_gguf_files;
-        std::map<std::string, std::string> absolute_by_relative;
-        auto repo_relative_from_cache_relative = [](std::string rel) {
-            std::replace(rel.begin(), rel.end(), '\\', '/');
-
-            static const std::string snapshots_prefix = "snapshots/";
-            if (rel.rfind(snapshots_prefix, 0) == 0) {
-                size_t revision_end = rel.find('/', snapshots_prefix.size());
-                if (revision_end != std::string::npos && revision_end + 1 < rel.size()) {
-                    rel = rel.substr(revision_end + 1);
-                }
-            }
-
-            return rel;
-        };
-
-        for (const auto& filepath : all_gguf_files) {
-            std::string relative_path = path_to_utf8(
-                path_from_utf8(filepath).lexically_relative(model_cache_path_fs));
-            relative_path = repo_relative_from_cache_relative(relative_path);
-
-            // Multiple HF snapshots can contain the same repo-relative file.
-            // Keep the first absolute path from the sorted all_gguf_files list
-            // so duplicates do not create false ambiguity.
-            if (absolute_by_relative.emplace(relative_path, filepath).second) {
-                relative_gguf_files.push_back(relative_path);
-            }
-        }
-
-        std::vector<std::string> enumerated_matches;
-        auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files);
-        for (const auto& local_variant : local_variants.variants) {
-            if (to_lower(local_variant.name) != variant_lower) {
-                continue;
-            }
-
-            auto it = absolute_by_relative.find(local_variant.primary_file);
-            if (it != absolute_by_relative.end()) {
-                enumerated_matches.push_back(it->second);
-            }
-        }
-
-        if (enumerated_matches.size() == 1) {
-            LOG(INFO, "ModelManager")
-                << "Resolved local GGUF variant '" << variant
-                << "' via quant-token fallback: " << enumerated_matches[0] << std::endl;
-            return enumerated_matches[0];
-        }
-
-        if (enumerated_matches.size() > 1) {
-            LOG(WARNING, "ModelManager")
-                << "Multiple local GGUF files matched variant '" << variant
-                << "' via quant-token fallback; refusing to guess" << std::endl;
-            return "";
-        }
-
-        // No match found for the requested GGUF variant. Do not fall back to
-        // another quantization in the same Hugging Face repo; otherwise a
-        // custom download with a different quant can make a built-in model
-        // appear downloaded and allow deleting the wrong file.
-        return "";
-    }
-
-    // Everything else
-    if (!variant.empty()) {
-        // Prefer refs/main for auxiliary checkpoints too (for example mmproj),
-        // so companion files stay on the same active snapshot as the main model
-        // when unchanged artifacts are reused across README-only commits.
-        fs::path active_snapshot = active_hf_snapshot_path(model_cache_path_fs);
-        if (!active_snapshot.empty()) {
-            fs::path direct_variant_path = active_snapshot / path_from_utf8(variant);
-            if (safe_exists(direct_variant_path)) {
-                return path_to_utf8(direct_variant_path);
-            }
-
-            std::error_code ec;
-            for (const auto& entry : fs::recursive_directory_iterator(active_snapshot, safe_dir_options, ec)) {
-                if (ec) break;
-                if (entry.is_regular_file(ec)) {
-                    std::string filename = entry.path().filename().string();
-                    if (filename == variant) {
-                        return path_to_utf8(entry.path());
-                    }
-                } else if (entry.is_directory(ec)) {
-                    fs::path variant_path = entry.path() / path_from_utf8(variant);
-                    if (safe_exists(variant_path)) {
-                        return path_to_utf8(variant_path);
-                    }
-                }
-                ec.clear();
-            }
-        }
-
-        // Try to find the exact variant in snapshots subdirectories
-        if (safe_exists(model_cache_path_fs)) {
-            for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) {
-                if (entry.is_regular_file()) {
-                    std::string filename = entry.path().filename().string();
-                    if (filename == variant) {
-                        return path_to_utf8(entry.path());
-                    }
-                } else if (entry.is_directory()) {
-                    fs::path variant_path = entry.path() / path_from_utf8(variant);
-                    if (safe_exists(variant_path)) {
-                        return path_to_utf8(variant_path);
-                    }
-                }
-            }
-        }
-        // Variant not found in checkpoint's own repo - try main repo as fallback
-        // (backward compat: older downloads placed all files in the main repo dir)
-        if (checkpoint_repo_id != main_repo_id) {
-            std::string main_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(main_repo_id);
-            fs::path main_cache_path_fs = path_from_utf8(main_cache_path);
-            if (fs::exists(main_cache_path_fs)) {
-                for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) {
-                    if (entry.is_regular_file()) {
-                        std::string filename = entry.path().filename().string();
-                        if (filename == variant) {
-                            return path_to_utf8(entry.path());
-                        }
-                    } else if (entry.is_directory()) {
-                        fs::path variant_path = entry.path() / path_from_utf8(variant);
-                        if (fs::exists(variant_path)) {
-                            return path_to_utf8(variant_path);
-                        }
-                    }
-                }
-            }
-        }
-
-        // Variant not found - return empty string to indicate model not downloaded
-        return "";
-    }
+    // Compute the HF cache location for this checkpoint's repo, then let the
+    // backend's ops find its artifact within (a .gguf file, a genai_config.json
+    // directory, a .bin, …) — no per-recipe switchboard here.
+    backends::CheckpointResolveContext ctx;
+    ctx.hf_cache = hf_cache;
+    ctx.repo_id = checkpoint_to_repo_id(checkpoint);
+    ctx.main_repo_id = checkpoint_to_repo_id(info.checkpoint("main"));
+    ctx.variant = checkpoint_to_variant(checkpoint);
+    ctx.model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(ctx.repo_id);
+    ctx.type = type;
+    ctx.checkpoint = checkpoint;
 
-    // Fallback: return directory path
-    return model_cache_path;
+    return backends::ops_for(info.recipe)->resolve_checkpoint_path(info, ctx);
 }
 
 void ModelManager::resolve_all_model_paths(ModelInfo& info) {

From 2feae8471d7ca229925e664dc2012df64683c9ac Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:00:52 -0400
Subject: [PATCH 12/39] =?UTF-8?q?refactor(backends):=20migrate=20download/?=
 =?UTF-8?q?discovery/is=5Fdownloaded=20to=20ops;=20FLM=20cluster=20?=
 =?UTF-8?q?=E2=86=92=20folder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Dynamic discovery, download status, and downloading now flow through BackendOps
instead of recipe switchboards in model_manager:

- discover_models: build_cache loops descriptors with dynamic_models=true and
  merges ops->discover_models(). FLM (`flm list`) and cloud (per-provider) both
  implement it — the two bespoke discovery blocks collapse to one generic loop.
- is_downloaded: base = shared HF completeness (ModelManager::checkpoints_complete);
  CloudOps → true; FlmOps → installed-set membership. Replaces the flm_set/cloud/
  else branches in build_cache and add_model_to_cache.
- validate_checkpoint_file: LlamaCppOps does the GGUF-magic check (was an inline
  llamacpp branch in are_required_checkpoints_complete).
- download_model: base = shared HF engine (download_from_huggingface_engine);
  FlmOps → flm pull; CloudOps → no-op. download_registered_model just dispatches.
  invalidates_cache_after_download() replaces the recipe=="flm" cache-reset.

The whole FLM cluster (find_flm_binary, flm_installed_checkpoints, flm_discover_models,
flm_download) moves into backends/fastflowlm/fastflowlm_models. model_manager keeps
only the generic HF engine.

Verified: server_endpoints 69 pass; download status correct for every recipe.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  |  35 +-
 .../backends/fastflowlm/fastflowlm_models.h   |  18 +-
 src/cpp/include/lemon/model_manager.h         |  17 +-
 src/cpp/server/backends/backend_ops.cpp       |  14 +
 .../server/backends/cloud/cloud_server.cpp    |  41 ++
 .../backends/fastflowlm/fastflowlm_models.cpp | 416 ++++++++++++++
 .../backends/fastflowlm/fastflowlm_server.cpp |  16 +
 .../backends/llamacpp/llamacpp_server.cpp     |  21 +
 src/cpp/server/model_manager.cpp              | 544 ++----------------
 9 files changed, 613 insertions(+), 509 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index c973a7de7..35f434df3 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -2,11 +2,11 @@
 
 #include <string>
 #include <vector>
+#include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback (server-side only)
 
 namespace lemon {
 
-struct ModelInfo;
-class ModelManager;
+class CloudProviderRegistry;
 
 namespace backends {
 
@@ -14,6 +14,7 @@ namespace backends {
 // management needs without a running subprocess. Grows as migrations require.
 struct BackendOpsContext {
     ModelManager* model_manager = nullptr;
+    CloudProviderRegistry* cloud_registry = nullptr;  // for dynamic cloud discovery
 };
 
 // Inputs for resolving a checkpoint's on-disk path. The model manager computes
@@ -57,6 +58,36 @@ class BackendOps {
     // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override.
     virtual std::string resolve_checkpoint_path(const ModelInfo& info,
                                                 const CheckpointResolveContext& ctx) const;
+
+    // Models supplied at runtime rather than from server_models.json (descriptor
+    // dynamic_models = true). Default: none. cloud/flm override.
+    virtual std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const {
+        (void)ctx;
+        return {};
+    }
+
+    // Whether a model's local artifacts are present. Default: the shared HF
+    // checkpoint-completeness check (ModelManager::checkpoints_complete). cloud
+    // (always true) and flm (installed-set membership) override.
+    virtual bool is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const;
+
+    // Validate a resolved checkpoint file for the cache. Returns "" if valid, or
+    // a reason it should be treated as not-downloaded. Default: always valid;
+    // llamacpp checks GGUF magic.
+    virtual std::string validate_checkpoint_file(const std::string& resolved_path) const {
+        (void)resolved_path;
+        return "";
+    }
+
+    // Download a model's artifacts. Default: the shared Hugging Face download
+    // (do_not_upgrade is ignored). cloud (no-op) and flm (flm pull) override.
+    virtual void download_model(const ModelInfo& info, bool do_not_upgrade,
+                                DownloadProgressCallback progress,
+                                const BackendOpsContext& ctx) const;
+
+    // Whether the model cache must be rebuilt after this backend downloads a
+    // model (e.g. flm, whose model list changes). Default: false.
+    virtual bool invalidates_cache_after_download() const { return false; }
 };
 
 // Shared default ops instance for backends that override nothing.
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
index 3e99e3003..f5d0f269d 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -3,14 +3,24 @@
 #include <cstdint>
 #include <filesystem>
 #include <string>
+#include <vector>
+#include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback
 
 namespace lemon {
 
-struct ModelInfo;
-
 namespace backends {
 namespace fastflowlm {
 
+// Locate the FLM executable (install dir on Windows, system PATH on Linux).
+std::string find_flm_binary();
+
+// Installed FLM model checkpoints (from `flm list --filter installed`).
+std::vector<std::string> flm_installed_checkpoints();
+
+// Discover all available FLM models (from `flm list --json`), each with its
+// downloaded status set. Returns empty if FLM is not ready.
+std::vector<ModelInfo> flm_discover_models();
+
 // FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH /
 // platform-default roots and describes them with a config.json; this knowledge
 // lives in the fastflowlm backend folder rather than in the shared model manager.
@@ -24,6 +34,10 @@ std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo
 // Read the model's max context window from its FLM config.json (0 if unknown).
 int64_t read_flm_max_context_window(const ModelInfo& info);
 
+// Download (pull) an FLM model by checkpoint via the `flm` CLI.
+void flm_download(const std::string& checkpoint, bool do_not_upgrade,
+                  DownloadProgressCallback progress_callback);
+
 } // namespace fastflowlm
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index be850b583..967d6d044 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -224,11 +224,15 @@ class ModelManager {
     // Check if model is downloaded
     bool is_model_downloaded(const std::string& model_name);
 
-    // Get list of installed FLM models (for caching)
-    std::vector<std::string> get_flm_installed_models();
+    // Shared Hugging Face completeness check: true if all required checkpoints
+    // are present and complete (per-backend file validation runs via ops). The
+    // default BackendOps::is_downloaded delegates here for HF-backed backends.
+    bool checkpoints_complete(const ModelInfo& info) const;
 
-    // Get list of all available FLM models from 'flm list --json'
-    std::vector<ModelInfo> get_flm_available_models();
+    // Shared Hugging Face download engine. The default BackendOps::download_model
+    // delegates here; flm/cloud override with their own download.
+    void download_from_huggingface_engine(const ModelInfo& info,
+                                          DownloadProgressCallback progress_callback = nullptr);
 
     // Get HuggingFace cache directory (respects HF_HUB_CACHE, HF_HOME, and platform defaults)
     std::string get_hf_cache_dir() const;
@@ -310,11 +314,6 @@ class ModelManager {
     void download_from_huggingface(const ModelInfo& info,
                                    DownloadProgressCallback progress_callback = nullptr);
 
-    // Download from FLM
-    void download_from_flm(const std::string& checkpoint,
-                          bool do_not_upgrade = true,
-                          DownloadProgressCallback progress_callback = nullptr);
-
     // Discover GGUF models from extra_models_dir
     std::map<std::string, ModelInfo> discover_extra_models() const;
 
diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp
index af21249c5..2f4cdf48c 100644
--- a/src/cpp/server/backends/backend_ops.cpp
+++ b/src/cpp/server/backends/backend_ops.cpp
@@ -102,6 +102,20 @@ std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info,
     return ctx.model_cache_path;
 }
 
+bool BackendOps::is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const {
+    // Default: the shared HF checkpoint-completeness check.
+    return ctx.model_manager != nullptr && ctx.model_manager->checkpoints_complete(info);
+}
+
+void BackendOps::download_model(const ModelInfo& info, bool do_not_upgrade,
+                                DownloadProgressCallback progress, const BackendOpsContext& ctx) const {
+    // Default: the shared Hugging Face download engine.
+    (void)do_not_upgrade;
+    if (ctx.model_manager != nullptr) {
+        ctx.model_manager->download_from_huggingface_engine(info, progress);
+    }
+}
+
 const BackendOps* default_backend_ops() {
     static const BackendOps kDefault;
     return &kDefault;
diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
index b20aae05c..03e5c794e 100644
--- a/src/cpp/server/backends/cloud/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -815,6 +815,47 @@ class CloudOps : public BackendOps {
         // upstream provider's model id, used directly when forwarding requests.
         return "";
     }
+
+    // Cloud models have no local artifacts — always "downloaded".
+    bool is_downloaded(const ModelInfo&, const BackendOpsContext&) const override {
+        return true;
+    }
+
+    // "Downloading" a cloud model is a no-op.
+    void download_model(const ModelInfo&, bool, DownloadProgressCallback,
+                        const BackendOpsContext&) const override {}
+
+    // Discover models from each installed cloud provider with a resolvable
+    // credential. Per AGENTS.md invariant #11 the registry persists only
+    // {provider, base_url}; keys come from env vars / process memory. Failures
+    // are logged, never propagated, so one offline provider can't block discovery.
+    std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const override {
+        std::vector<ModelInfo> out;
+        if (ctx.cloud_registry == nullptr) {
+            return out;
+        }
+        for (const auto& rec : ctx.cloud_registry->list_installed()) {
+            const std::string api_key = ctx.cloud_registry->resolve_key(rec.name);
+            if (api_key.empty() || rec.base_url.empty()) {
+                LOG(INFO, "CloudOps") << "Skipping cloud discovery for '" << rec.name
+                                      << "': no API key resolvable (set "
+                                      << CloudProviderRegistry::env_var_name(rec.name)
+                                      << " or POST /v1/cloud/auth)" << std::endl;
+                continue;
+            }
+            try {
+                for (auto& m : CloudServer::discover_models(rec.name, api_key, rec.base_url)) {
+                    if (m.recipe == "cloud" && !m.model_name.empty()) {
+                        out.push_back(std::move(m));
+                    }
+                }
+            } catch (const std::exception& e) {
+                LOG(WARNING, "CloudOps") << "Cloud discovery threw for '" << rec.name
+                                         << "': " << e.what() << std::endl;
+            }
+        }
+        return out;
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
index 0ac7f8caf..2f2bb36b2 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -7,6 +7,12 @@
 #include "lemon/utils/aixlog.hpp"
 #include "lemon/utils/json_utils.h"
 #include "lemon/utils/path_utils.h"
+#include <sstream>
+#include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/backends/backend_registry.h"
+#include "lemon/backends/backend_utils.h"
+#include "lemon/system_info.h"
+#include "lemon/utils/process_manager.h"
 
 namespace fs = std::filesystem;
 using json = nlohmann::json;
@@ -17,6 +23,7 @@ namespace fastflowlm {
 namespace {
 
 using lemon::utils::path_from_utf8;
+using lemon::utils::path_to_utf8;
 
 bool safe_exists(const fs::path& p) {
     std::error_code ec;
@@ -114,6 +121,415 @@ int64_t read_flm_max_context_window(const ModelInfo& info) {
     return 0;
 }
 
+std::string find_flm_binary() {
+    try {
+        const backends::BackendSpec* spec = try_get_spec_for_recipe("flm");
+        if (!spec) {
+            return "";
+        }
+        return BackendUtils::get_backend_binary_path(*spec, "npu");
+    } catch (...) {
+#ifndef _WIN32
+        return lemon::utils::find_flm_executable();
+#else
+        return "";
+#endif
+    }
+}
+
+std::vector<std::string> flm_installed_checkpoints() {
+    std::vector<std::string> installed_models;
+
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) return installed_models;
+
+    // Run 'flm list --filter installed --quiet --json' to get only installed models
+    std::string output;
+#ifdef _WIN32
+    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);
+#else
+    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null";
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        return installed_models;
+    }
+
+    char buffer[256];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+
+    pclose(pipe);
+#endif
+
+    // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] }
+    try {
+        json j = lemon::utils::JsonUtils::parse(output);
+        if (j.contains("models") && j["models"].is_array()) {
+            for (const auto& model : j["models"]) {
+                if (model.contains("name") && model["name"].is_string()) {
+                    installed_models.push_back(model["name"].get<std::string>());
+                }
+            }
+            return installed_models;
+        }
+    } catch (...) {
+        // Fallback to legacy parsing if JSON parsing fails
+    }
+
+    // Legacy fallback: parse the plain-text list format (no JSON, no emojis).
+    // Expected format:
+    //   Models:
+    //     - modelname:tag
+    //     - another:model
+    std::istringstream stream(output);
+    std::string line;
+    while (std::getline(stream, line)) {
+        // Trim whitespace
+        line.erase(0, line.find_first_not_of(" \t\r\n"));
+        line.erase(line.find_last_not_of(" \t\r\n") + 1);
+
+        // Skip the "Models:" header line or empty lines
+        if (line == "Models:" || line.empty()) {
+            continue;
+        }
+
+        // Parse model checkpoint (line is already trimmed, so it starts with "- ")
+        if (line.find("- ") == 0) {
+            std::string checkpoint = line.substr(2);
+            // Trim any remaining whitespace
+            checkpoint.erase(0, checkpoint.find_first_not_of(" \t"));
+            checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1);
+            if (!checkpoint.empty()) {
+                installed_models.push_back(checkpoint);
+            }
+        }
+    }
+
+    return installed_models;
+}
+
+std::vector<ModelInfo> flm_discover_models() {
+    std::vector<ModelInfo> flm_models;
+    if (!SystemInfoCache::get_flm_status().is_ready()) {
+        return flm_models;
+    }
+
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) return flm_models;
+
+    LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl;
+
+    // Run 'flm list --json' to get all available models
+    std::string output;
+#ifdef _WIN32
+    std::string command = "\"" + flm_path + "\" list --json";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);
+    LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc
+              << ", output length: " << output.size() << std::endl;
+    if (rc != 0 || output.empty()) {
+        LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. "
+                  << "Output: " << output.substr(0, 200) << std::endl;
+    }
+#else
+    std::string command = "\"" + flm_path + "\" list --json 2>/dev/null";
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        return flm_models;
+    }
+
+    char buffer[256];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+
+    pclose(pipe);
+#endif
+
+    // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] }
+    try {
+        json j = lemon::utils::JsonUtils::parse(output);
+        if (j.contains("models") && j["models"].is_array()) {
+            for (const auto& m : j["models"]) {
+                if (m.contains("name") && m["name"].is_string()) {
+                    std::string checkpoint = m["name"].get<std::string>();
+
+                    // Format display name: replace : with -, append -FLM
+                    // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM"
+                    std::string display_name = checkpoint;
+                    // Replace : with -
+                    std::replace(display_name.begin(), display_name.end(), ':', '-');
+
+                    std::string model_name = display_name + "-FLM";
+
+                    ModelInfo info;
+                    info.model_name = model_name;
+                    info.checkpoints["main"] = checkpoint;
+                    info.recipe = "flm";
+                    info.suggested = true; // All official FLM models are suggested
+                    info.downloaded = lemon::utils::JsonUtils::get_or_default<bool>(m, "installed", false);
+
+                    if (lemon::utils::JsonUtils::get_or_default<bool>(m, "installed", false) && m.contains("url") && m["url"].is_string()) {
+                        fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir(
+                            backends::fastflowlm::repo_dir_from_url(m["url"].get<std::string>()));
+                        if (!config_path.empty()) {
+                            info.resolved_paths["config"] = path_to_utf8(config_path);
+                        }
+                    }
+
+                    // Size in GB (footprint field contains disk size in GB)
+                    if (m.contains("footprint") && m["footprint"].is_number()) {
+                        info.size = m["footprint"].get<double>();
+                    }
+
+                    // Labels from FLM metadata
+                    if (m.contains("label") && m["label"].is_array()) {
+                        for (const auto& l : m["label"]) {
+                            if (l.is_string()) {
+                                info.labels.push_back(l.get<std::string>());
+                            }
+                        }
+                    }
+
+                    // Populate type and device fields (multi-model support)
+                    info.type = get_model_type_from_labels(info.labels);
+                    const BackendDescriptor* flm_desc = descriptor_for("flm");
+                    info.device = flm_desc ? flm_desc->default_device : DEVICE_NPU;
+
+                    flm_models.push_back(info);
+                }
+            }
+        }
+    } catch (const std::exception& e) {
+        LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl;
+    } catch (...) {
+        LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl;
+    }
+
+    return flm_models;
+}
+
+
+void flm_download(const std::string& checkpoint, bool do_not_upgrade,
+                  DownloadProgressCallback progress_callback) {
+    LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl;
+
+    // Ensure FLM is ready (single source of truth)
+    auto status = SystemInfoCache::get_flm_status();
+    if (!status.is_ready()) {
+        throw std::runtime_error(status.error_string());
+    }
+
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) {
+        throw std::runtime_error("FLM executable not found");
+    }
+
+    // Prepare arguments
+    std::vector<std::string> args = {"pull", checkpoint};
+    if (!do_not_upgrade) {
+        args.push_back("--force");
+    }
+
+    LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\"";
+    for (const auto& arg : args) {
+        LOG(INFO, "ProcessManager") << " \"" << arg << "\"";
+    }
+    LOG(INFO, "ProcessManager") << std::endl;
+
+    // State for parsing FLM output
+    int total_files = 0;
+    int current_file_index = 0;
+    std::string current_filename;
+    bool cancelled = false;
+
+    // Run flm pull command and parse output
+    int exit_code = lemon::utils::ProcessManager::run_process_with_output(
+        flm_path, args,
+        [&](const std::string& line) -> bool {
+            // Always print the line to console
+            LOG(INFO, "FLM") << line << std::endl;
+
+            // Parse FLM output to extract progress information
+            // Pattern: "[FLM]  Downloading X/Y: filename"
+            if (line.find("[FLM]  Downloading ") != std::string::npos &&
+                line.find("/") != std::string::npos &&
+                line.find(":") != std::string::npos) {
+
+                // Extract "X/Y: filename" from "[FLM]  Downloading X/Y: filename"
+                size_t start = line.find("Downloading ") + 12;
+                size_t slash = line.find("/", start);
+                size_t colon = line.find(":", slash);
+
+                if (slash != std::string::npos && colon != std::string::npos) {
+                    try {
+                        current_file_index = std::stoi(line.substr(start, slash - start));
+                        total_files = std::stoi(line.substr(slash + 1, colon - slash - 1));
+                        current_filename = line.substr(colon + 2);  // Skip ": "
+
+                        // Send progress update
+                        if (progress_callback) {
+                            DownloadProgress progress;
+                            progress.file = current_filename;
+                            progress.file_index = current_file_index;
+                            progress.total_files = total_files;
+                            progress.bytes_downloaded = 0;
+                            progress.bytes_total = 0;
+                            progress.percent = (total_files > 0) ?
+                                ((current_file_index - 1) * 100 / total_files) : 0;
+
+                            if (!progress_callback(progress)) {
+                                cancelled = true;
+                                return false;  // Kill the process
+                            }
+                        }
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+            // Pattern: "[FLM]  Downloading: XX.X% (XXX.XMB / XXX.XMB)"
+            else if (line.find("[FLM]  Downloading: ") != std::string::npos &&
+                     line.find("%") != std::string::npos) {
+
+                // Extract percentage and bytes
+                size_t start = line.find("Downloading: ") + 13;
+                size_t pct_end = line.find("%", start);
+
+                if (pct_end != std::string::npos) {
+                    try {
+                        std::string pct_str = line.substr(start, pct_end - start);
+                        double file_percent = std::stod(pct_str);
+
+                        // Try to extract bytes (XXX.XMB / XXX.XMB)
+                        size_t open_paren = line.find("(", pct_end);
+                        size_t slash = line.find("/", open_paren);
+                        size_t close_paren = line.find(")", slash);
+
+                        size_t bytes_downloaded = 0;
+                        size_t bytes_total = 0;
+
+                        if (open_paren != std::string::npos && slash != std::string::npos) {
+                            std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1);
+                            std::string total_str = line.substr(slash + 1, close_paren - slash - 1);
+
+                            // Parse a size like "XXX.XMB" (MB, GB, or KB suffix) into bytes
+                            auto parse_size = [](const std::string& s) -> size_t {
+                                double val = 0;
+                                size_t mb_pos = s.find("MB");
+                                size_t gb_pos = s.find("GB");
+                                size_t kb_pos = s.find("KB");
+
+                                if (mb_pos != std::string::npos) {
+                                    val = std::stod(s.substr(0, mb_pos));
+                                    return static_cast<size_t>(val * 1024 * 1024);
+                                } else if (gb_pos != std::string::npos) {
+                                    val = std::stod(s.substr(0, gb_pos));
+                                    return static_cast<size_t>(val * 1024 * 1024 * 1024);
+                                } else if (kb_pos != std::string::npos) {
+                                    val = std::stod(s.substr(0, kb_pos));
+                                    return static_cast<size_t>(val * 1024);
+                                }
+                                return 0;
+                            };
+
+                            bytes_downloaded = parse_size(downloaded_str);
+                            bytes_total = parse_size(total_str);
+                        }
+
+                        // Send progress update with byte-level info
+                        if (progress_callback) {
+                            DownloadProgress progress;
+                            progress.file = current_filename;
+                            progress.file_index = current_file_index;
+                            progress.total_files = total_files;
+                            progress.bytes_downloaded = bytes_downloaded;
+                            progress.bytes_total = bytes_total;
+                            // Use intra-file percent when we have byte-level progress
+                            progress.percent = static_cast<int>(file_percent);
+
+                            if (!progress_callback(progress)) {
+                                cancelled = true;
+                                return false;  // Kill the process
+                            }
+                        }
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+            // Pattern: "[FLM]  Overall progress: XX.X% (X/Y files)"
+            else if (line.find("[FLM]  Overall progress: ") != std::string::npos) {
+                size_t start = line.find("progress: ") + 10;
+                size_t pct_end = line.find("%", start);
+
+                if (pct_end != std::string::npos) {
+                    try {
+                        int overall_percent = static_cast<int>(std::stod(line.substr(start, pct_end - start)));
+
+                        if (progress_callback) {
+                            DownloadProgress progress;
+                            progress.file = current_filename;
+                            progress.file_index = current_file_index;
+                            progress.total_files = total_files;
+                            progress.bytes_downloaded = 0;  // Not available for overall progress
+                            progress.bytes_total = 0;
+                            progress.percent = overall_percent;
+
+                            if (!progress_callback(progress)) {
+                                cancelled = true;
+                                return false;  // Kill the process
+                            }
+                        }
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+            // Pattern: "[FLM]  Missing files (N):"
+            else if (line.find("[FLM]  Missing files (") != std::string::npos) {
+                size_t start = line.find("(") + 1;
+                size_t end = line.find(")", start);
+                if (end != std::string::npos) {
+                    try {
+                        total_files = std::stoi(line.substr(start, end - start));
+                    } catch (...) {
+                        // Ignore parse errors
+                    }
+                }
+            }
+
+            return true;  // Continue
+        },
+        "",  // Working directory
+        3600  // 1 hour timeout for large model downloads
+    );
+
+    if (cancelled) {
+        LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl;
+        throw std::runtime_error("Download cancelled");
+    }
+
+    if (exit_code != 0) {
+        LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl;
+        throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code));
+    }
+
+    // Send completion event
+    if (progress_callback) {
+        DownloadProgress progress;
+        progress.complete = true;
+        progress.file_index = total_files;
+        progress.total_files = total_files;
+        progress.percent = 100;
+        (void)progress_callback(progress);  // Ignore return - download already complete
+    }
+
+    LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl;
+}
+
+
 } // namespace fastflowlm
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 648cd9ff5..256fe339b 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -491,6 +491,22 @@ class FlmOps : public BackendOps {
         // FLM uses the checkpoint string as-is (e.g. "gemma3:4b"); no local file.
         return ctx.checkpoint;
     }
+
+    std::vector<ModelInfo> discover_models(const BackendOpsContext&) const override {
+        return flm_discover_models();
+    }
+
+    bool is_downloaded(const ModelInfo& info, const BackendOpsContext&) const override {
+        const auto installed = flm_installed_checkpoints();
+        return std::find(installed.begin(), installed.end(), info.checkpoint()) != installed.end();
+    }
+
+    void download_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress,
+                        const BackendOpsContext&) const override {
+        flm_download(info.checkpoint(), do_not_upgrade, progress);
+    }
+
+    bool invalidates_cache_after_download() const override { return true; }
 };
 }  // namespace
 
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index a9af9359f..f1b265fb3 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -705,6 +705,27 @@ class LlamaCppOps : public BackendOps {
         }
         return BackendOps::resolve_checkpoint_path(info, ctx);
     }
+
+    std::string validate_checkpoint_file(const std::string& resolved_path) const override {
+        // A .gguf file in the cache must start with the GGUF magic, else it's a
+        // truncated/corrupt download and the model is not really present.
+        std::error_code ec;
+        std::filesystem::path p = lemon::utils::path_from_utf8(resolved_path);
+        if (std::filesystem::is_directory(p, ec)) {
+            return "";
+        }
+        std::string ext = resolved_path.size() >= 5 ? resolved_path.substr(resolved_path.size() - 5) : "";
+        std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower);
+        if (ext != ".gguf") {
+            return "";
+        }
+        std::ifstream in(p, std::ios::binary);
+        char magic[4] = {};
+        in.read(magic, sizeof(magic));
+        bool ok = in.gcount() == static_cast<std::streamsize>(sizeof(magic)) &&
+                  magic[0] == 'G' && magic[1] == 'G' && magic[2] == 'U' && magic[3] == 'F';
+        return ok ? "" : "Invalid GGUF cache file";
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index d1295ff92..76a4a57a8 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -1297,21 +1297,6 @@ static bool has_partial_files(const fs::path& dir) {
     return false;
 }
 
-static bool is_valid_gguf_file_for_cache(const std::string& path) {
-    std::ifstream in(path_from_utf8(path), std::ios::binary);
-    if (!in.is_open()) {
-        return false;
-    }
-
-    char magic[4] = {};
-    in.read(magic, sizeof(magic));
-    return in.gcount() == static_cast<std::streamsize>(sizeof(magic)) &&
-           magic[0] == 'G' &&
-           magic[1] == 'G' &&
-           magic[2] == 'U' &&
-           magic[3] == 'F';
-}
-
 static bool is_checkpoint_path_complete(const std::string& path_str) {
     if (path_str.empty()) return false;
 
@@ -1347,20 +1332,26 @@ static bool are_required_checkpoints_complete(const ModelInfo& info) {
             return false;
         }
 
-        fs::path resolved = path_from_utf8(resolved_path);
-        if (info.recipe == "llamacpp" &&
-            !safe_is_directory(resolved) &&
-            ends_with_ignore_case(resolved_path, ".gguf") &&
-            !is_valid_gguf_file_for_cache(resolved_path)) {
+        // Per-backend file validation (e.g. llamacpp checks GGUF magic).
+        std::string invalid = backends::ops_for(info.recipe)->validate_checkpoint_file(resolved_path);
+        if (!invalid.empty()) {
             LOG(WARNING, "ModelManager")
-                << "Invalid GGUF cache file; marking model as not downloaded: "
-                << resolved_path << std::endl;
+                << invalid << "; marking model as not downloaded: " << resolved_path << std::endl;
             return false;
         }
     }
     return true;
 }
 
+bool ModelManager::checkpoints_complete(const ModelInfo& info) const {
+    return are_required_checkpoints_complete(info);
+}
+
+void ModelManager::download_from_huggingface_engine(const ModelInfo& info,
+                                                    DownloadProgressCallback progress_callback) {
+    download_from_huggingface(info, progress_callback);
+}
+
 void ModelManager::build_cache() {
     std::lock_guard<std::mutex> lock(models_cache_mutex_);
 
@@ -1498,48 +1489,20 @@ void ModelManager::build_cache() {
         all_models[name] = info;
     }
 
-    // Step 1.6: Discover FLM models from 'flm list --json'
-    // Only discover FLM models if FLM is fully installed
-    // Precedence: server_models.json > user_models.json > extra_models > flm_list
-    auto flm_status = SystemInfoCache::get_flm_status();
-    if (flm_status.is_ready()) {
-        auto flm_available = get_flm_available_models();
-        for (const auto& info : flm_available) {
-            // Use emplace to only add if key doesn't exist (respect precedence)
-            all_models.emplace(info.model_name, info);
-        }
-    }
-
-    // Cloud-offload discovery is server-side and automatic. For each
-    // installed cloud provider with a resolvable credential (env var or
-    // runtime-auth POST), call discover_models and merge the results into
-    // all_models. Per AGENTS.md invariant #11, the registry persists only
-    // {provider, base_url} pairs — API keys live in env vars or process
-    // memory, never on disk. Failures are logged, never propagated, so a
-    // single offline provider can't block the rest of cache build.
-    if (cloud_registry_ != nullptr) {
-        auto installed = cloud_registry_->list_installed();
-        for (const auto& rec : installed) {
-            const std::string api_key = cloud_registry_->resolve_key(rec.name);
-            if (api_key.empty() || rec.base_url.empty()) {
-                LOG(INFO, "ModelManager") << "Skipping cloud discovery for '"
-                                           << rec.name << "': no API key resolvable"
-                                           << " (set " << CloudProviderRegistry::env_var_name(rec.name)
-                                           << " or POST /v1/cloud/auth)" << std::endl;
-                continue;
-            }
-            std::vector<ModelInfo> discovered;
-            try {
-                discovered = backends::CloudServer::discover_models(rec.name, api_key, rec.base_url);
-            } catch (const std::exception& e) {
-                LOG(WARNING, "ModelManager") << "Cloud discovery threw for '"
-                                              << rec.name << "': " << e.what()
-                                              << std::endl;
+    // Step 1.6: Dynamic discovery. Backends whose models are supplied at runtime
+    // (descriptor dynamic_models = true — flm from `flm list`, cloud from each
+    // provider) contribute their models via ops->discover_models(). Each carries
+    // its own downloaded status. Precedence: server/user/extra models win, so we
+    // emplace (don't overwrite). Failures are handled inside each backend's ops.
+    {
+        backends::BackendOpsContext octx;
+        octx.model_manager = this;
+        octx.cloud_registry = cloud_registry_;
+        for (const auto* desc : backends::all_descriptors()) {
+            if (!desc->dynamic_models) {
                 continue;
             }
-            for (auto& m : discovered) {
-                if (m.recipe != "cloud" || m.model_name.empty()) continue;
-                // Same merge precedence as FLM: emplace, don't overwrite.
+            for (auto& m : backends::ops_for(desc->recipe)->discover_models(octx)) {
                 all_models.emplace(m.model_name, std::move(m));
             }
         }
@@ -1556,21 +1519,21 @@ void ModelManager::build_cache() {
     // Step 2: Filter by backend availability
     all_models = filter_models_by_backend(all_models);
 
-    // Step 3: Check download status ONCE for all models
-    auto flm_models = get_flm_installed_models();
-    std::unordered_set<std::string> flm_set(flm_models.begin(), flm_models.end());
+    // Step 3: Check download status. Dynamic-discovery recipes (flm, cloud) keep the flag
+    // set by discover_models(); others ask backend ops (default = shared HF completeness check).
+    // NOTE(review): dynamic-recipe models that came from server/user/extra models are skipped too; confirm.
+    backends::BackendOpsContext status_ctx;
+    status_ctx.model_manager = this;
 
     int downloaded_count = 0;
     // First pass: determine download status for non-collection models
     for (auto& [name, info] : all_models) {
         if (is_collection_recipe(info.recipe)) {
             continue;  // Handled in second pass after components are resolved
-        } else if (info.recipe == "flm") {
-            info.downloaded = flm_set.count(info.checkpoint()) > 0;
-        } else if (info.recipe == "cloud") {
-            info.downloaded = true;  // Cloud-offloaded models have no local artifacts
-        } else {
-            info.downloaded = are_required_checkpoints_complete(info);
+        }
+        const auto* desc = backends::descriptor_for(info.recipe);
+        if (!(desc && desc->dynamic_models)) {
+            info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, status_ctx);
         }
 
         if (info.downloaded) {
@@ -1667,16 +1630,14 @@ void ModelManager::add_model_to_cache(const std::string& model_name) {
         return; // Backend not available, don't add to cache
     }
 
-    // Check download status
+    // Check download status (collections aggregate their components; everyone
+    // else asks its backend ops).
     if (is_collection_recipe(info.recipe)) {
         info.downloaded = check_component_downloaded(info, models_cache_);
-    } else if (info.recipe == "flm") {
-        auto flm_models = get_flm_installed_models();
-        info.downloaded = std::find(flm_models.begin(), flm_models.end(), info.checkpoint()) != flm_models.end();
-    } else if (info.recipe == "cloud") {
-        info.downloaded = true;  // Cloud-offloaded models have no local artifacts
     } else {
-        info.downloaded = are_required_checkpoints_complete(info);
+        backends::BackendOpsContext octx;
+        octx.model_manager = this;
+        info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, octx);
     }
 
     populate_model_metadata(info);
@@ -1715,10 +1676,10 @@ void ModelManager::update_model_in_cache(const std::string& model_name, bool dow
         // The path changes now that files exist on disk
         if (downloaded) {
             resolve_all_model_paths(it->second);
-            if (it->second.recipe == "flm") {
+            if (backends::ops_for(it->second.recipe)->invalidates_cache_after_download()) {
                 cache_valid_ = false;
-                LOG(INFO, "ModelManager") << "Invalidated model cache after FLM download for '"
-                          << model_name << "'" << std::endl;
+                LOG(INFO, "ModelManager") << "Invalidated model cache after download for '"
+                          << model_name << "' (backend rebuilds its model list)" << std::endl;
                 return;
             }
             populate_model_metadata(it->second);
@@ -2295,192 +2256,8 @@ void ModelManager::unregister_user_model(const std::string& model_name) {
     cache_valid_ = false;
 }
 
-// Find the FLM executable: install dir on Windows, system PATH on Linux.
-// Returns empty string if not found.
-static std::string find_flm_binary() {
-    try {
-        const backends::BackendSpec* spec = backends::try_get_spec_for_recipe("flm");
-        if (!spec) {
-            return "";
-        }
-        return backends::BackendUtils::get_backend_binary_path(*spec, "npu");
-    } catch (...) {
-#ifndef _WIN32
-        return utils::find_flm_executable();
-#else
-        return "";
-#endif
-    }
-}
-
-// Helper function to get FLM installed models by calling 'flm list --filter installed --quiet'
-std::vector<std::string> ModelManager::get_flm_installed_models() {
-    std::vector<std::string> installed_models;
-
-    std::string flm_path = find_flm_binary();
-    if (flm_path.empty()) return installed_models;
-
-    // Run 'flm list --filter installed --quiet --json' to get only installed models
-    std::string output;
-#ifdef _WIN32
-    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-#else
-    std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null";
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        return installed_models;
-    }
-
-    char buffer[256];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    pclose(pipe);
-#endif
-
-    // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] }
-    try {
-        json j = JsonUtils::parse(output);
-        if (j.contains("models") && j["models"].is_array()) {
-            for (const auto& model : j["models"]) {
-                if (model.contains("name") && model["name"].is_string()) {
-                    installed_models.push_back(model["name"].get<std::string>());
-                }
-            }
-            return installed_models;
-        }
-    } catch (...) {
-        // Fallback to legacy parsing if JSON parsing fails
-    }
-
-    // Legacy parsing - cleaner format without emojis
-    // Expected format:
-    //   Models:
-    //     - modelname:tag
-    //     - another:model
-    std::istringstream stream(output);
-    std::string line;
-    while (std::getline(stream, line)) {
-        // Trim whitespace
-        line.erase(0, line.find_first_not_of(" \t\r\n"));
-        line.erase(line.find_last_not_of(" \t\r\n") + 1);
-
-        // Skip the "Models:" header line or empty lines
-        if (line == "Models:" || line.empty()) {
-            continue;
-        }
-
-        // Parse model checkpoint (format: "  - modelname:tag")
-        if (line.find("- ") == 0) {
-            std::string checkpoint = line.substr(2);
-            // Trim any remaining whitespace
-            checkpoint.erase(0, checkpoint.find_first_not_of(" \t"));
-            checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1);
-            if (!checkpoint.empty()) {
-                installed_models.push_back(checkpoint);
-            }
-        }
-    }
-
-    return installed_models;
-}
-
-std::vector<ModelInfo> ModelManager::get_flm_available_models() {
-    std::vector<ModelInfo> flm_models;
-
-    std::string flm_path = find_flm_binary();
-    if (flm_path.empty()) return flm_models;
-
-    LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl;
-
-    // Run 'flm list --json' to get all available models
-    std::string output;
-#ifdef _WIN32
-    std::string command = "\"" + flm_path + "\" list --json";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-    LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc
-              << ", output length: " << output.size() << std::endl;
-    if (rc != 0 || output.empty()) {
-        LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. "
-                  << "Output: " << output.substr(0, 200) << std::endl;
-    }
-#else
-    std::string command = "\"" + flm_path + "\" list --json 2>/dev/null";
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        return flm_models;
-    }
-
-    char buffer[256];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    pclose(pipe);
-#endif
-
-    // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] }
-    try {
-        json j = JsonUtils::parse(output);
-        if (j.contains("models") && j["models"].is_array()) {
-            for (const auto& m : j["models"]) {
-                if (m.contains("name") && m["name"].is_string()) {
-                    std::string checkpoint = m["name"].get<std::string>();
-
-                    // Format display name: replace : with -, append -FLM
-                    // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM"
-                    std::string display_name = checkpoint;
-                    // Replace : with -
-                    std::replace(display_name.begin(), display_name.end(), ':', '-');
-
-                    std::string model_name = display_name + "-FLM";
-
-                    ModelInfo info;
-                    info.model_name = model_name;
-                    info.checkpoints["main"] = checkpoint;
-                    info.recipe = "flm";
-                    info.suggested = true; // All official FLM models are suggested
-
-                    if (JsonUtils::get_or_default<bool>(m, "installed", false) && m.contains("url") && m["url"].is_string()) {
-                        fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir(
-                            backends::fastflowlm::repo_dir_from_url(m["url"].get<std::string>()));
-                        if (!config_path.empty()) {
-                            info.resolved_paths["config"] = path_to_utf8(config_path);
-                        }
-                    }
-
-                    // Size in GB (footprint field contains disk size in GB)
-                    if (m.contains("footprint") && m["footprint"].is_number()) {
-                        info.size = m["footprint"].get<double>();
-                    }
-
-                    // Labels from FLM metadata
-                    if (m.contains("label") && m["label"].is_array()) {
-                        for (const auto& l : m["label"]) {
-                            if (l.is_string()) {
-                                info.labels.push_back(l.get<std::string>());
-                            }
-                        }
-                    }
-
-                    // Populate type and device fields (multi-model support)
-                    info.type = get_model_type_from_labels(info.labels);
-                    info.device = device_type_for_recipe(info.recipe);
 
-                    flm_models.push_back(info);
-                }
-            }
-        }
-    } catch (const std::exception& e) {
-        LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl;
-    } catch (...) {
-        LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl;
-    }
 
-    return flm_models;
-}
 
 bool ModelManager::is_model_downloaded(const std::string& model_name) {
     // Build cache if needed
@@ -2505,18 +2282,11 @@ bool ModelManager::is_model_downloaded(const std::string& model_name) {
 }
 
 void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) {
-    // Cloud models have no local artifacts; "downloading" is a no-op.
-    if (info.recipe == "cloud") {
-        update_model_in_cache(info.model_name, true);
-        return;
-    }
-
-    // Use recipe-specific download paths
-    if (info.recipe == "flm") {
-        download_from_flm(info.checkpoint(), do_not_upgrade, progress_callback);
-    } else {
-        download_from_huggingface(info, progress_callback);
-    }
+    // The backend's ops own the download (shared HF engine by default; flm pulls
+    // via the flm CLI; cloud is a no-op).
+    backends::BackendOpsContext octx;
+    octx.model_manager = this;
+    backends::ops_for(info.recipe)->download_model(info, do_not_upgrade, progress_callback, octx);
 
     // Update cache after successful download
     update_model_in_cache(info.model_name, true);
@@ -4019,224 +3789,6 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
     LOG(INFO, "ModelManager") << "Download location: " << reported_download_path << std::endl;
 }
 
-void ModelManager::download_from_flm(const std::string& checkpoint,
-                                     bool do_not_upgrade,
-                                     DownloadProgressCallback progress_callback) {
-    LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl;
-
-    // Ensure FLM is ready (single source of truth)
-    auto status = SystemInfoCache::get_flm_status();
-    if (!status.is_ready()) {
-        throw std::runtime_error(status.error_string());
-    }
-
-    std::string flm_path = find_flm_binary();
-    if (flm_path.empty()) {
-        throw std::runtime_error("FLM executable not found");
-    }
-
-    // Prepare arguments
-    std::vector<std::string> args = {"pull", checkpoint};
-    if (!do_not_upgrade) {
-        args.push_back("--force");
-    }
-
-    LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\"";
-    for (const auto& arg : args) {
-        LOG(INFO, "ProcessManager") << " \"" << arg << "\"";
-    }
-    LOG(INFO, "ProcessManager") << std::endl;
-
-    // State for parsing FLM output
-    int total_files = 0;
-    int current_file_index = 0;
-    std::string current_filename;
-    bool cancelled = false;
-
-    // Run flm pull command and parse output
-    int exit_code = utils::ProcessManager::run_process_with_output(
-        flm_path, args,
-        [&](const std::string& line) -> bool {
-            // Always print the line to console
-            LOG(INFO, "FLM") << line << std::endl;
-
-            // Parse FLM output to extract progress information
-            // Pattern: "[FLM]  Downloading X/Y: filename"
-            if (line.find("[FLM]  Downloading ") != std::string::npos &&
-                line.find("/") != std::string::npos &&
-                line.find(":") != std::string::npos) {
-
-                // Extract "X/Y: filename" from "[FLM]  Downloading X/Y: filename"
-                size_t start = line.find("Downloading ") + 12;
-                size_t slash = line.find("/", start);
-                size_t colon = line.find(":", slash);
-
-                if (slash != std::string::npos && colon != std::string::npos) {
-                    try {
-                        current_file_index = std::stoi(line.substr(start, slash - start));
-                        total_files = std::stoi(line.substr(slash + 1, colon - slash - 1));
-                        current_filename = line.substr(colon + 2);  // Skip ": "
-
-                        // Send progress update
-                        if (progress_callback) {
-                            DownloadProgress progress;
-                            progress.file = current_filename;
-                            progress.file_index = current_file_index;
-                            progress.total_files = total_files;
-                            progress.bytes_downloaded = 0;
-                            progress.bytes_total = 0;
-                            progress.percent = (total_files > 0) ?
-                                ((current_file_index - 1) * 100 / total_files) : 0;
-
-                            if (!progress_callback(progress)) {
-                                cancelled = true;
-                                return false;  // Kill the process
-                            }
-                        }
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-            // Pattern: "[FLM]  Downloading: XX.X% (XXX.XMB / XXX.XMB)"
-            else if (line.find("[FLM]  Downloading: ") != std::string::npos &&
-                     line.find("%") != std::string::npos) {
-
-                // Extract percentage and bytes
-                size_t start = line.find("Downloading: ") + 13;
-                size_t pct_end = line.find("%", start);
-
-                if (pct_end != std::string::npos) {
-                    try {
-                        std::string pct_str = line.substr(start, pct_end - start);
-                        double file_percent = std::stod(pct_str);
-
-                        // Try to extract bytes (XXX.XMB / XXX.XMB)
-                        size_t open_paren = line.find("(", pct_end);
-                        size_t slash = line.find("/", open_paren);
-                        size_t close_paren = line.find(")", slash);
-
-                        size_t bytes_downloaded = 0;
-                        size_t bytes_total = 0;
-
-                        if (open_paren != std::string::npos && slash != std::string::npos) {
-                            std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1);
-                            std::string total_str = line.substr(slash + 1, close_paren - slash - 1);
-
-                            // Parse "XXX.XMB" format
-                            auto parse_size = [](const std::string& s) -> size_t {
-                                double val = 0;
-                                size_t mb_pos = s.find("MB");
-                                size_t gb_pos = s.find("GB");
-                                size_t kb_pos = s.find("KB");
-
-                                if (mb_pos != std::string::npos) {
-                                    val = std::stod(s.substr(0, mb_pos));
-                                    return static_cast<size_t>(val * 1024 * 1024);
-                                } else if (gb_pos != std::string::npos) {
-                                    val = std::stod(s.substr(0, gb_pos));
-                                    return static_cast<size_t>(val * 1024 * 1024 * 1024);
-                                } else if (kb_pos != std::string::npos) {
-                                    val = std::stod(s.substr(0, kb_pos));
-                                    return static_cast<size_t>(val * 1024);
-                                }
-                                return 0;
-                            };
-
-                            bytes_downloaded = parse_size(downloaded_str);
-                            bytes_total = parse_size(total_str);
-                        }
-
-                        // Send progress update with byte-level info
-                        if (progress_callback) {
-                            DownloadProgress progress;
-                            progress.file = current_filename;
-                            progress.file_index = current_file_index;
-                            progress.total_files = total_files;
-                            progress.bytes_downloaded = bytes_downloaded;
-                            progress.bytes_total = bytes_total;
-                            // Use intra-file percent when we have byte-level progress
-                            progress.percent = static_cast<int>(file_percent);
-
-                            if (!progress_callback(progress)) {
-                                cancelled = true;
-                                return false;  // Kill the process
-                            }
-                        }
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-            // Pattern: "[FLM]  Overall progress: XX.X% (X/Y files)"
-            else if (line.find("[FLM]  Overall progress: ") != std::string::npos) {
-                size_t start = line.find("progress: ") + 10;
-                size_t pct_end = line.find("%", start);
-
-                if (pct_end != std::string::npos) {
-                    try {
-                        int overall_percent = static_cast<int>(std::stod(line.substr(start, pct_end - start)));
-
-                        if (progress_callback) {
-                            DownloadProgress progress;
-                            progress.file = current_filename;
-                            progress.file_index = current_file_index;
-                            progress.total_files = total_files;
-                            progress.bytes_downloaded = 0;  // Not available for overall progress
-                            progress.bytes_total = 0;
-                            progress.percent = overall_percent;
-
-                            if (!progress_callback(progress)) {
-                                cancelled = true;
-                                return false;  // Kill the process
-                            }
-                        }
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-            // Pattern: "[FLM]  Missing files (N):"
-            else if (line.find("[FLM]  Missing files (") != std::string::npos) {
-                size_t start = line.find("(") + 1;
-                size_t end = line.find(")", start);
-                if (end != std::string::npos) {
-                    try {
-                        total_files = std::stoi(line.substr(start, end - start));
-                    } catch (...) {
-                        // Ignore parse errors
-                    }
-                }
-            }
-
-            return true;  // Continue
-        },
-        "",  // Working directory
-        3600  // 1 hour timeout for large model downloads
-    );
-
-    if (cancelled) {
-        LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl;
-        throw std::runtime_error("Download cancelled");
-    }
-
-    if (exit_code != 0) {
-        LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl;
-        throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code));
-    }
-
-    // Send completion event
-    if (progress_callback) {
-        DownloadProgress progress;
-        progress.complete = true;
-        progress.file_index = total_files;
-        progress.total_files = total_files;
-        progress.percent = 100;
-        (void)progress_callback(progress);  // Ignore return - download already complete
-    }
-
-    LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl;
-}
 
 void ModelManager::delete_model(const std::string& model_name) {
     auto info = get_model_info(model_name);
@@ -4263,7 +3815,7 @@ void ModelManager::delete_model(const std::string& model_name) {
 
         // Find flm executable — on Windows flm.exe lives under the lemonade
         // cache dir, not on PATH, so we must resolve the full path.
-        std::string flm_path = find_flm_binary();
+        std::string flm_path = backends::fastflowlm::find_flm_binary();
         if (flm_path.empty()) {
             throw std::runtime_error("FLM executable not found");
         }

From 435426052fa1a9fd3ba044feebe18b4ade574f27 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:08:50 -0400
Subject: [PATCH 13/39] refactor(backends): migrate version detection to a
 resolve_version ops hook

get_recipe_version now reads version.txt generically and lets the backend ops
override, instead of branching on recipe. The per-backend version commands move
into their folders:

- system llama-server version (`llama-server --version` + regex) -> backends/
  llamacpp; LlamaCppOps::resolve_version returns it for the "system" backend.
- flm version (`flm version --json`) -> backends/fastflowlm (flm_version());
  FlmOps::resolve_version returns it when no version.txt is present.

Removes SystemInfo::get_system_llamacpp_version / get_flm_version and the
llamacpp-system / flm branches from system_info.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  |  11 ++
 .../backends/fastflowlm/fastflowlm_models.h   |   3 +
 src/cpp/include/lemon/system_info.h           |   3 -
 .../backends/fastflowlm/fastflowlm_models.cpp |  69 +++++++++
 .../backends/fastflowlm/fastflowlm_server.cpp |   8 +
 .../backends/llamacpp/llamacpp_server.cpp     |  48 ++++++
 src/cpp/server/system_info.cpp                | 137 ++----------------
 7 files changed, 151 insertions(+), 128 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index 35f434df3..854065f24 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -88,6 +88,17 @@ class BackendOps {
     // Whether the model cache must be rebuilt after this backend downloads a
     // model (e.g. flm, whose model list changes). Default: false.
     virtual bool invalidates_cache_after_download() const { return false; }
+
+    // Resolve a backend's installed version for a given backend variant. The
+    // caller passes the version read from the on-disk version.txt (or "" if
+    // absent); the default returns it unchanged. Backends that detect their
+    // version another way override: llamacpp's "system" build runs
+    // `llama-server --version`; flm queries `flm version` when no file is present.
+    virtual std::string resolve_version(const std::string& backend,
+                                        const std::string& file_version) const {
+        (void)backend;
+        return file_version;
+    }
 };
 
 // Shared default ops instance for backends that override nothing.
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
index f5d0f269d..20c7a96b8 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -34,6 +34,9 @@ std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo
 // Read the model's max context window from its FLM config.json (0 if unknown).
 int64_t read_flm_max_context_window(const ModelInfo& info);
 
+// Detect the installed FLM version via `flm version` ("unknown" if unavailable).
+std::string flm_version();
+
 // Download (pull) an FLM model by checkpoint via the `flm` CLI.
 void flm_download(const std::string& checkpoint, bool do_not_upgrade,
                   DownloadProgressCallback progress_callback);
diff --git a/src/cpp/include/lemon/system_info.h b/src/cpp/include/lemon/system_info.h
index 9b143ae47..a67c744b6 100644
--- a/src/cpp/include/lemon/system_info.h
+++ b/src/cpp/include/lemon/system_info.h
@@ -104,9 +104,6 @@ class SystemInfo {
     };
     static std::vector<RecipeStatus> get_all_recipe_statuses();
 
-    static std::string get_flm_version();
-    static std::string get_system_llamacpp_version();
-
     // Device support detection
     static std::string get_rocm_arch();
     static std::string get_cuda_arch();
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
index 2f2bb36b2..0331cc895 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -530,6 +530,75 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
 }
 
 
+std::string flm_version() {  // normally returns a "v"-prefixed version string, or "unknown" when detection fails
+    // Cache real version strings to avoid spawning the subprocess twice per
+    // build_recipes_info() pass. "unknown" is NOT cached so that post-install
+    // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed.
+    static std::string cached_version;  // NOTE(review): no lock is taken around this cache — confirm callers are serialized
+    if (!cached_version.empty()) {
+        return cached_version;
+    }
+
+    // Find the flm executable using shared utility
+    std::string flm_path = lemon::utils::find_flm_executable();
+    if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) {
+        return "unknown";  // not cached, so the next call probes again
+    }
+
+    std::string output;
+    #ifdef _WIN32
+    std::string command = "\"" + flm_path + "\" version --json 2>NUL";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);  // NOTE(review): rc is not checked below
+    #else
+    std::string command = "\"" + flm_path + "\" version --json 2>/dev/null";
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        return "unknown";
+    }
+
+    char buffer[256];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+
+    pclose(pipe);  // exit status is discarded
+    #endif
+
+    // Parse JSON output: { "version": "0.9.34" }
+    try {
+        json j = lemon::utils::JsonUtils::parse(output);
+        if (j.contains("version") && j["version"].is_string()) {
+            std::string version = j["version"].get<std::string>();
+            // If the version doesn't start with 'v', prepend it
+            // for backend_versions.json compatibility (e.g. "v0.9.34").
+            if (!version.empty() && version[0] != 'v') {
+                version = "v" + version;
+            }
+            cached_version = version;  // NOTE(review): an empty "version" value would be returned as "", not "unknown"
+            return cached_version;
+        }
+    } catch (...) {
+        // Fallback to legacy parsing if JSON parsing fails
+    }
+
+    // Legacy parsing from output like "FLM v0.9.4"
+    if (output.find("FLM v") != std::string::npos) {
+        size_t pos = output.find("FLM v");
+        // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34").
+        std::string version = output.substr(pos + 4);  // +4 skips "FLM " so the result starts at 'v'
+        // Trim whitespace and newlines
+        size_t end = version.find_first_of(" \t\n\r");
+        if (end != std::string::npos) {
+            version = version.substr(0, end);
+        }
+        cached_version = version;
+        return cached_version;
+    }
+
+    return "unknown";  // neither the JSON nor the legacy format was recognized; not cached
+}
+
+
 } // namespace fastflowlm
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 256fe339b..772fac2d3 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -507,6 +507,14 @@ class FlmOps : public BackendOps {
     }
 
     bool invalidates_cache_after_download() const override { return true; }
+
+    std::string resolve_version(const std::string&, const std::string& file_version) const override {
+        // No usable version.txt (e.g. Linux, where FLM is a system package): query the CLI.
+        if (file_version.empty() || file_version == "unknown") {
+            return flm_version();  // NOTE(review): this fallback is not platform-gated, unlike a Linux-only check
+        }
+        return file_version;
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index f1b265fb3..63254f155 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -7,6 +7,7 @@
 #include "lemon/model_manager.h"
 #include <algorithm>
 #include <filesystem>
+#include <regex>
 #include <system_error>
 #include "lemon/auto_tune.h"
 #include "lemon/backend_manager.h"
@@ -662,6 +663,44 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 namespace {
+std::string system_llamacpp_version() {  // runs `llama-server --version`; returns "b<digits>", "detected", or "unknown"
+    std::string output;
+    #ifdef _WIN32
+    std::string command = "llama-server --version 2>NUL";
+    int rc = lemon::utils::ProcessManager::run_command(command, output);  // NOTE(review): rc is not checked below
+    #else
+    FILE* pipe = popen("llama-server --version 2>/dev/null", "r");
+    if (!pipe) {
+        return "unknown";
+    }
+
+    char buffer[256];
+    if (fgets(buffer, sizeof(buffer), pipe) != nullptr) {  // single fgets: at most the first line (<= 255 chars) is kept
+        output = buffer;
+    }
+
+    pclose(pipe);  // exit status is discarded
+    #endif
+
+    // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432"
+    if (!output.empty()) {
+        // The regex has two alternatives with one capture group each; whichever group matched holds the digits.
+        std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))");
+        std::smatch match;
+        if (std::regex_search(output, match, version_regex)) {
+            for (size_t i = 1; i < match.size(); ++i) {
+                if (match[i].matched) {
+                    return "b" + match[i].str();  // normalized to "b<digits>" for either alternative
+                }
+            }
+        }
+        return "detected";  // some output was captured, but no version number was recognized in it
+    }
+
+    return "unknown";  // no output was captured
+}
+
+
 // llamacpp model-management behavior: GGUF metadata + capability labels.
 class LlamaCppOps : public BackendOps {
 public:
@@ -726,6 +765,15 @@ class LlamaCppOps : public BackendOps {
                   magic[0] == 'G' && magic[1] == 'G' && magic[2] == 'U' && magic[3] == 'F';
         return ok ? "" : "Invalid GGUF cache file";
     }
+
+    std::string resolve_version(const std::string& backend,
+                                const std::string& file_version) const override {
+        // The PATH-installed "system" llama-server has no version.txt; query it.
+        if (backend == "system") {
+            return system_llamacpp_version();  // file_version is ignored for this variant
+        }
+        return file_version;  // every other variant reports the value it was given
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 45335912d..6ae2ef03f 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -8,6 +8,7 @@
 #include "lemon/utils/process_manager.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backends/backend_descriptor_registry.h"
+#include "lemon/backends/backend_registry.h"
 #include "lemon/recipe_backend_def.h"
 #include <filesystem>
 #include <fstream>
@@ -609,31 +610,22 @@ static bool is_recipe_installed(const std::string& recipe, const std::string& ba
 }
 
 static std::string get_recipe_version(const std::string& recipe, const std::string& backend) {
-    if (recipe == "llamacpp" && backend == "system") {
-        return SystemInfo::get_system_llamacpp_version();
-    }
+    // Read the on-disk version.txt generically, then let the backend's ops
+    // override (llamacpp "system" runs llama-server --version; flm queries the
+    // CLI when no file is present). No per-recipe branches here.
     auto* spec = try_get_spec_for_recipe(recipe);
+    std::string file_version;  // stays empty when there is no spec or no version file path
     if (spec) {
         std::string version_file = BackendUtils::get_installed_version_file(*spec, backend);
-        if (version_file.empty()) {
-#ifndef _WIN32
-            // On Linux, FLM is a system package with no version.txt - query directly
-            if (recipe == "flm") {
-                return SystemInfo::get_flm_version();
-            }
-#endif
-            return "unknown";
+        if (!version_file.empty()) {
+            file_version = read_version_file(version_file);
         }
-        std::string version = read_version_file(version_file);
-#ifndef _WIN32
-        // On Linux, version.txt may not exist on disk for system-installed FLM
-        if (recipe == "flm" && (version.empty() || version == "unknown")) {
-            return SystemInfo::get_flm_version();
-        }
-#endif
-        return version;
     }
-    return "";
+    std::string resolved = backends::ops_for(recipe)->resolve_version(backend, file_version);  // NOTE(review): assumes ops_for() never returns null, even without a spec — confirm
+    if (!spec && resolved.empty()) {
+        return "";  // no spec and nothing resolved: report an empty version
+    }
+    return resolved.empty() ? "unknown" : resolved;  // spec exists but nothing resolved: "unknown"
 }
 
 static std::string get_install_command(const std::string& recipe, const std::string& backend) {
@@ -1681,43 +1673,6 @@ static std::string read_version_file(const fs::path& version_file) {
     return "unknown";
 }
 
-std::string SystemInfo::get_system_llamacpp_version() {
-    std::string output;
-    #ifdef _WIN32
-    std::string command = "llama-server --version 2>NUL";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-    #else
-    FILE* pipe = popen("llama-server --version 2>/dev/null", "r");
-    if (!pipe) {
-        return "unknown";
-    }
-
-    char buffer[256];
-    if (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output = buffer;
-    }
-
-    pclose(pipe);
-    #endif
-
-    // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432"
-    if (!output.empty()) {
-        // Try to find a version number
-        std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))");
-        std::smatch match;
-        if (std::regex_search(output, match, version_regex)) {
-            for (size_t i = 1; i < match.size(); ++i) {
-                if (match[i].matched) {
-                    return "b" + match[i].str();
-                }
-            }
-        }
-        return "detected";
-    }
-
-    return "unknown";
-}
-
 // Map a CUDA Compute Capability "MAJOR.MINOR" string (as reported by nvidia-smi
 // --query-gpu=compute_cap) to the sm_XX token used in llamacpp-cuda release filenames.
 // Returns empty if the value cannot be parsed.
@@ -2266,74 +2221,6 @@ bool SystemInfo::get_has_igpu() {
     return false;  // No iGPU detected
 }
 
-std::string SystemInfo::get_flm_version() {
-    // Cache real version strings to avoid spawning the subprocess twice per
-    // build_recipes_info() pass. "unknown" is NOT cached so that post-install
-    // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed.
-    static std::string cached_version;
-    if (!cached_version.empty()) {
-        return cached_version;
-    }
-
-    // Find the flm executable using shared utility
-    std::string flm_path = utils::find_flm_executable();
-    if (flm_path.empty() || !utils::is_safe_executable_path(flm_path)) {
-        return "unknown";
-    }
-
-    std::string output;
-    #ifdef _WIN32
-    std::string command = "\"" + flm_path + "\" version --json 2>NUL";
-    int rc = lemon::utils::ProcessManager::run_command(command, output);
-    #else
-    std::string command = "\"" + flm_path + "\" version --json 2>/dev/null";
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        return "unknown";
-    }
-
-    char buffer[256];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    pclose(pipe);
-    #endif
-
-    // Parse JSON output: { "version": "0.9.34" }
-    try {
-        json j = JsonUtils::parse(output);
-        if (j.contains("version") && j["version"].is_string()) {
-            std::string version = j["version"].get<std::string>();
-            // If the version doesn't start with 'v', prepend it
-            // for backend_versions.json compatibility (e.g. "v0.9.34").
-            if (!version.empty() && version[0] != 'v') {
-                version = "v" + version;
-            }
-            cached_version = version;
-            return cached_version;
-        }
-    } catch (...) {
-        // Fallback to legacy parsing if JSON parsing fails
-    }
-
-    // Legacy parsing from output like "FLM v0.9.4"
-    if (output.find("FLM v") != std::string::npos) {
-        size_t pos = output.find("FLM v");
-        // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34").
-        std::string version = output.substr(pos + 4);
-        // Trim whitespace and newlines
-        size_t end = version.find_first_of(" \t\n\r");
-        if (end != std::string::npos) {
-            version = version.substr(0, end);
-        }
-        cached_version = version;
-        return cached_version;
-    }
-
-    return "unknown";
-}
-
 // ============================================================================
 // Factory function
 // ============================================================================

From 2a9b38e30cc3456e9d12306b1f0795b5c8dc172e Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:13:42 -0400
Subject: [PATCH 14/39] polish(backends): drop redundant config_section
 (defaults to recipe)

config_section duplicated the recipe string in 8 descriptors; it defaults to the
recipe via effective_config_section(), so set those to "". Only sd-cpp ("sdcpp")
and ryzenai-llm ("ryzenai") keep an explicit section because theirs genuinely
differ from the recipe.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/cloud/cloud.h           | 2 +-
 src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 2 +-
 src/cpp/include/lemon/backends/kokoro/kokoro.h         | 2 +-
 src/cpp/include/lemon/backends/llamacpp/llamacpp.h     | 2 +-
 src/cpp/include/lemon/backends/moonshine/moonshine.h   | 2 +-
 src/cpp/include/lemon/backends/ryzenai/ryzenai.h       | 2 +-
 src/cpp/include/lemon/backends/vllm/vllm.h             | 2 +-
 src/cpp/include/lemon/backends/whispercpp/whispercpp.h | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/cpp/include/lemon/backends/cloud/cloud.h b/src/cpp/include/lemon/backends/cloud/cloud.h
index 9d4f5559b..976a84f70 100644
--- a/src/cpp/include/lemon/backends/cloud/cloud.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud.h
@@ -12,7 +12,7 @@ inline const BackendDescriptor descriptor = {
     /*recipe*/          "cloud",
     /*display_name*/    "Cloud",
     /*binary*/          "",  // no subprocess: runs on a remote provider
-    /*config_section*/  "cloud",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_NONE,
     /*slot_policy*/     SlotPolicy::Unmetered,  // never counts toward slots, never auto-evicted
     /*selectable_backend*/ false,
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index b9efb610b..24fc07470 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = {
 #else
     /*binary*/          "flm",
 #endif
-    /*config_section*/  "flm",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_NPU,
     /*slot_policy*/     SlotPolicy::CoexistByType,
     /*selectable_backend*/ false,
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
index 4663d3ad3..3ebb9efbd 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = {
 #else
     /*binary*/          "koko",
 #endif
-    /*config_section*/  "kokoro",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_CPU,
     /*slot_policy*/     SlotPolicy::Standard,
     /*selectable_backend*/ false,
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index fc43c4515..cbd6386fa 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = {
 #else
     /*binary*/          "llama-server",
 #endif
-    /*config_section*/  "llamacpp",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_GPU,   // cpu/system variants resolve to CPU via effective_device()
     /*slot_policy*/     SlotPolicy::Standard,
     /*selectable_backend*/ true,
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h
index 81f45dc25..5b8faafe2 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -12,7 +12,7 @@ inline const BackendDescriptor descriptor = {
     /*recipe*/          "moonshine",
     /*display_name*/    "Moonshine",
     /*binary*/          "moonshine-server",
-    /*config_section*/  "moonshine",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_CPU,
     /*slot_policy*/     SlotPolicy::Standard,
     /*selectable_backend*/ false,
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index 4171dbe93..c290c4dd1 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = {
 #else
     /*binary*/          "ryzenai-server",
 #endif
-    /*config_section*/  "ryzenai",
+    /*config_section*/  "ryzenai",  // differs from recipe "ryzenai-llm"
     /*default_device*/  DEVICE_NPU,
     /*slot_policy*/     SlotPolicy::ExclusiveNpu,
     /*selectable_backend*/ false,
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
index 6f468a1ed..4c35ad1ec 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -12,7 +12,7 @@ inline const BackendDescriptor descriptor = {
     /*recipe*/          "vllm",
     /*display_name*/    "vLLM ROCm (experimental)",
     /*binary*/          "vllm-server",
-    /*config_section*/  "vllm",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_GPU,
     /*slot_policy*/     SlotPolicy::Standard,
     /*selectable_backend*/ true,
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index ce2014dec..8c4a29815 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = {
 #else
     /*binary*/          "whisper-server",
 #endif
-    /*config_section*/  "whispercpp",
+    /*config_section*/  "",  // defaults to recipe
     /*default_device*/  DEVICE_CPU,   // npu variant resolves to NPU + ExclusiveNpu via effective_*()
     /*slot_policy*/     SlotPolicy::Standard,
     /*selectable_backend*/ true,

From 623334c3bb35e0065948101c220fbc41a19fc341 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:18:37 -0400
Subject: [PATCH 15/39] refactor(backends): gate Prometheus scraping on an
 exposes_prometheus_metrics descriptor flag

prometheus_metrics.cpp hardcoded `recipe == "llamacpp"` to decide whether to
scrape a backend subprocess's /metrics. Replace with a descriptor flag
(exposes_prometheus_metrics; llamacpp = true) so a new backend that exposes
Prometheus metrics opts in via its descriptor, not by editing the metrics code.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_descriptor.h | 4 ++++
 src/cpp/include/lemon/backends/llamacpp/llamacpp.h  | 1 +
 src/cpp/server/prometheus_metrics.cpp               | 4 +++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index 29ea2e0ea..aad473eba 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -71,6 +71,10 @@ struct BackendDescriptor {
     // channel clamp (a requested channel not listed here falls back to the first).
     std::vector<std::string> rocm_channels;
 
+    // True if the backend's subprocess exposes a Prometheus /metrics endpoint
+    // that lemond should scrape and re-export (llama-server does).
+    bool exposes_prometheus_metrics = false;
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index cbd6386fa..a17c24961 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -47,6 +47,7 @@ inline const BackendDescriptor descriptor = {
     /*web_display_name*/ "llama.cpp GPU",
     /*web_priority*/    1,
     /*rocm_channels*/   {"stable", "nightly"},
+    /*exposes_prometheus_metrics*/ true,
 };
 
 }  // namespace llamacpp
diff --git a/src/cpp/server/prometheus_metrics.cpp b/src/cpp/server/prometheus_metrics.cpp
index 8ecfdb288..88f7bdaf3 100644
--- a/src/cpp/server/prometheus_metrics.cpp
+++ b/src/cpp/server/prometheus_metrics.cpp
@@ -1,5 +1,6 @@
 #include "lemon/prometheus_metrics.h"
 
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/version.h"
 
 #include <algorithm>
@@ -274,7 +275,8 @@ void append_llamacpp_backend_metrics(PrometheusBuilder& metrics,
                                      const json& model,
                                      const std::map<std::string, std::string>& labels,
                                      std::set<std::string>& described_backend_metrics) {
-    if (model.value("recipe", "") != "llamacpp") {
+    const auto* desc = backends::descriptor_for(model.value("recipe", ""));
+    if (desc == nullptr || !desc->exposes_prometheus_metrics) {
         return;
     }
 

From ae8ca93330b296cfd351de564107274b3cfed1bb Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:23:44 -0400
Subject: [PATCH 16/39] refactor(backends): move hf_load and moonshine_arch to
 ModelInfo::extras

These backend-specific per-model fields no longer sit on the shared ModelInfo
struct: llamacpp reads info.extra<bool>("hf_load", false) and moonshine reads
info.extra<int>("moonshine_arch", -1). Removed the typed fields, their explicit
parse sites, and their kKnownKeys entries; added parse_extras() to the two
ModelInfo-building paths that lacked it (add_model_to_cache,
get_model_info_unfiltered) so extras populate everywhere a model is built
from JSON.

Verified: llamacpp models still resolve/download (hf_load path intact).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/model_manager.h              |  8 --------
 .../server/backends/llamacpp/llamacpp_server.cpp   |  6 +++---
 .../server/backends/moonshine/moonshine_server.cpp |  2 +-
 src/cpp/server/model_manager.cpp                   | 14 +++-----------
 4 files changed, 7 insertions(+), 23 deletions(-)

diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index 967d6d044..f12cd3e9a 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -76,11 +76,6 @@ struct ModelInfo {
     bool suggested = false;
     std::string source;  // "local_upload" for locally uploaded models
     bool downloaded = false;     // Whether model is downloaded and available
-    // When true, LlamaCppServer launches llama-server with `-hf <checkpoint>`
-    // instead of `-m <gguf> [--mmproj <mmproj>]`. Required for models like
-    // Qwen2.5-Omni where llama-server's manual-load path rejects audio content
-    // parts — the -hf path drives the dual-clip (vision+audio) context correctly.
-    bool hf_load = false;
     double size = 0.0;   // Model size in GB
     int64_t max_context_window = 0;  // Static model-supported text context, when known
 
@@ -107,9 +102,6 @@ struct ModelInfo {
     double cost_input_per_million = -1.0;
     double cost_output_per_million = -1.0;
 
-    // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING)
-    int moonshine_arch = -1;
-
     // Generic per-model fields a backend declares for itself. Any server_models.json
     // key not consumed by a typed field above lands here, so a new backend can read
     // custom per-model config in load() without editing this shared struct.
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 63254f155..0d8121a37 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -295,7 +295,7 @@ void LlamaCppServer::load(const std::string& model_name,
     // Use pre-resolved GGUF path. Skipped for hf_load models because llama-server
     // sources the weights itself via -hf; those models may not have local files.
     std::string gguf_path = model_info.resolved_path();
-    if (gguf_path.empty() && !model_info.hf_load) {
+    if (gguf_path.empty() && !model_info.extra<bool>("hf_load", false)) {
         throw std::runtime_error("GGUF file not found for checkpoint: " + model_info.checkpoint());
     }
 
@@ -331,7 +331,7 @@ void LlamaCppServer::load(const std::string& model_name,
     // is required for models like Qwen2.5-Omni where the manual -m + --mmproj
     // path rejects audio content parts in /v1/chat/completions — the -hf path
     // drives the dual-clip (vision+audio) context correctly.
-    if (model_info.hf_load) {
+    if (model_info.extra<bool>("hf_load", false)) {
         push_arg(args, reserved_flags, "-hf", model_info.checkpoint(),
                  std::vector<std::string>{"--hf-repo", "-mr", "--hf-file", "-mf"});
     } else {
@@ -353,7 +353,7 @@ void LlamaCppServer::load(const std::string& model_name,
 
     // Add mmproj file if present (for vision models). Skip when hf_load is set —
     // llama-server resolves the mmproj companion itself from the HF repo.
-    if (!mmproj_path.empty() && !model_info.hf_load) {
+    if (!mmproj_path.empty() && !model_info.extra<bool>("hf_load", false)) {
         push_arg(args, reserved_flags, "--mmproj", mmproj_path);
         if (!use_gpu) {
             LOG(DEBUG, "LlamaCpp") << "Skipping mmproj argument since GPU mode is not enabled" << std::endl;
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index ed709990b..b294f46ee 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -84,7 +84,7 @@ void MoonshineServer::load(const std::string& model_name,
 
     // Resolve model architecture. Prefer the explicit registry field; fall back
     // to inferring from the checkpoint variant (onnx/tiny, onnx/small, etc.).
-    int model_arch = model_info.moonshine_arch;
+    int model_arch = model_info.extra<int>("moonshine_arch", -1);
     if (model_arch < 0) {
         std::string variant = model_info.checkpoint();
         std::transform(variant.begin(), variant.end(), variant.begin(), ::tolower);
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 76a4a57a8..9210afd02 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -293,7 +293,7 @@ static void parse_image_defaults(ModelInfo& info, const json& model_json) {
 static void parse_extras(ModelInfo& info, const json& model_json) {
     static const std::set<std::string> kKnownKeys = {
         "checkpoint", "checkpoints", "components", "mmproj", "recipe", "suggested",
-        "hf_load", "source", "size", "cloud_provider", "moonshine_arch",
+        "source", "size", "cloud_provider",
         "labels", "image_defaults", "recipe_options"
     };
     if (!model_json.is_object()) return;
@@ -1375,11 +1375,9 @@ void ModelManager::build_cache() {
         parse_components(info, value);
         info.recipe = JsonUtils::get_or_default<std::string>(value, "recipe", "");
         info.suggested = JsonUtils::get_or_default<bool>(value, "suggested", false);
-        info.hf_load = JsonUtils::get_or_default<bool>(value, "hf_load", false);
         info.source = JsonUtils::get_or_default<std::string>(value, "source", "");
         info.size = JsonUtils::get_or_default<double>(value, "size", 0.0);
         info.cloud_provider = JsonUtils::get_or_default<std::string>(value, "cloud_provider", "");
-        info.moonshine_arch = JsonUtils::get_or_default<int>(value, "moonshine_arch", -1);
 
         // HF-backed collections store their components on Hugging Face — the
         // cached manifest is the single source of truth. Rebuild the component
@@ -1430,11 +1428,9 @@ void ModelManager::build_cache() {
         parse_components(info, value);
         info.recipe = JsonUtils::get_or_default<std::string>(value, "recipe", "");
         info.suggested = JsonUtils::get_or_default<bool>(value, "suggested", true);
-        info.hf_load = JsonUtils::get_or_default<bool>(value, "hf_load", false);
         info.source = JsonUtils::get_or_default<std::string>(value, "source", "");
         info.size = JsonUtils::get_or_default<double>(value, "size", 0.0);
         info.cloud_provider = JsonUtils::get_or_default<std::string>(value, "cloud_provider", "");
-        info.moonshine_arch = JsonUtils::get_or_default<int>(value, "moonshine_arch", -1);
 
         // HF-backed user collections (created by `lemonade pull <org>/<repo>`)
         // keep only a repo pointer in user_models.json; their components live in
@@ -1601,12 +1597,12 @@ void ModelManager::add_model_to_cache(const std::string& model_name) {
     info.cloud_provider = JsonUtils::get_or_default<std::string>(*model_json, "cloud_provider", "");
 
     parse_image_defaults(info, *model_json);
+    parse_extras(info, *model_json);
     json jro = (model_json->contains("recipe_options") && (*model_json)["recipe_options"].is_object())
         ? (*model_json)["recipe_options"] : json(nullptr);
     info.recipe_options = build_recipe_options(info, jro, cache_key_to_canonical_id(model_name), recipe_options_);
 
     info.suggested = JsonUtils::get_or_default<bool>(*model_json, "suggested", is_user_model);
-    info.hf_load = JsonUtils::get_or_default<bool>(*model_json, "hf_load", false);
     info.source = JsonUtils::get_or_default<std::string>(*model_json, "source", "");
 
     if (model_json->contains("labels") && (*model_json)["labels"].is_array()) {
@@ -4345,7 +4341,6 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name)
     parse_components(info, *model_json);
     info.recipe = JsonUtils::get_or_default<std::string>(*model_json, "recipe", "");
     info.suggested = JsonUtils::get_or_default<bool>(*model_json, "suggested", false);
-    info.hf_load = JsonUtils::get_or_default<bool>(*model_json, "hf_load", false);
     info.source = JsonUtils::get_or_default<std::string>(*model_json, "source", "");
 
     // Parse labels array
@@ -4364,10 +4359,7 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name)
         }
     }
 
-    // Parse moonshine_arch
-    if (model_json->contains("moonshine_arch") && (*model_json)["moonshine_arch"].is_number_integer()) {
-        info.moonshine_arch = (*model_json)["moonshine_arch"].get<int>();
-    }
+    parse_extras(info, *model_json);
 
     return info;
 }

From add34ed2af2a23e8e17c138f6c15f34c40a7d87f Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:27:30 -0400
Subject: [PATCH 17/39] refactor(backends): descriptor-drive the gfx1151 CWSR
 availability check

Replace the hardcoded (sd-cpp||llamacpp||vllm)&&rocm recipe-list in
is_recipe_installed and build_recipes_info with a rocm_requires_cwsr_fix
descriptor flag (set on those three backends). The kernel CWSR detection
(needs_gfx1151_cwsr_fix) stays in system_info as generic hardware detection;
only "which backends' rocm build needs it" is now descriptor data.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../lemon/backends/backend_descriptor.h       |  4 ++++
 .../lemon/backends/llamacpp/llamacpp.h        |  1 +
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h  |  2 ++
 src/cpp/include/lemon/backends/vllm/vllm.h    |  4 ++++
 src/cpp/server/system_info.cpp                | 24 +++++++++----------
 5 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index aad473eba..57dd5a7b7 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -75,6 +75,10 @@ struct BackendDescriptor {
     // that lemond should scrape and re-export (llama-server does).
     bool exposes_prometheus_metrics = false;
 
+    // True if this backend's ROCm build requires the gfx1151 (Strix Halo) kernel
+    // CWSR fix. Gates the availability/remediation check for the "rocm" backend.
+    bool rocm_requires_cwsr_fix = false;
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index a17c24961..8e5435f9f 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -48,6 +48,7 @@ inline const BackendDescriptor descriptor = {
     /*web_priority*/    1,
     /*rocm_channels*/   {"stable", "nightly"},
     /*exposes_prometheus_metrics*/ true,
+    /*rocm_requires_cwsr_fix*/ true,
 };
 
 }  // namespace llamacpp
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index b65fe4fd6..57ce30cbd 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -51,6 +51,8 @@ inline const BackendDescriptor descriptor = {
     /*web_display_name*/ "stable-diffusion.cpp",
     /*web_priority*/    5,
     /*rocm_channels*/   {"stable"},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ true,
 };
 
 }  // namespace sdcpp
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
index 4c35ad1ec..84a596168 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -32,6 +32,10 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    true,
     /*web_display_name*/ "",
+    /*web_priority*/    0,
+    /*rocm_channels*/   {},  // single rocm artifact, no stable/nightly channels
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ true,
 };
 
 }  // namespace vllm
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 6ae2ef03f..b117bf167 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -565,15 +565,13 @@ static bool device_matches_constraint(const std::string& device_family,
 
 // Generic installation check
 static bool is_recipe_installed(const std::string& recipe, const std::string& backend, std::string& error_message) {
-    bool is_llamacpp_rocm_backend = recipe == "llamacpp" && backend == "rocm";
-
-    // Special handling for ROCm backends on gfx1151 (Strix Halo) if kernel CWSR fix is missing
-    bool is_vllm_rocm_backend = recipe == "vllm" && backend == "rocm";
-    if ((recipe == "sd-cpp" && backend == "rocm") || is_llamacpp_rocm_backend || is_vllm_rocm_backend) {
-        if (needs_gfx1151_cwsr_fix()) {
-            error_message = "Linux kernel missing support";
-            return false;
-        }
+    // Special handling for ROCm backends on gfx1151 (Strix Halo) if the kernel
+    // CWSR fix is missing — which backends' rocm build needs it is a descriptor flag.
+    const auto* cwsr_desc = backends::descriptor_for(recipe);
+    if (backend == "rocm" && cwsr_desc && cwsr_desc->rocm_requires_cwsr_fix &&
+        needs_gfx1151_cwsr_fix()) {
+        error_message = "Linux kernel missing support";
+        return false;
     }
     auto* spec = try_get_spec_for_recipe(recipe);
     if (spec) {
@@ -1365,11 +1363,11 @@ json SystemInfo::build_recipes_info(const json& devices) {
                     : "Backend is supported but not installed.";
                 backend["message"] = install_error.empty() ? default_message : install_error;
 
-                bool is_rocm_backend = (def.recipe == "sd-cpp" && def.backend == "rocm") ||
-                    (def.recipe == "llamacpp" && def.backend == "rocm") ||
-                    (def.recipe == "vllm" && def.backend == "rocm");
+                const auto* cwsr_desc = backends::descriptor_for(def.recipe);
+                bool is_rocm_backend = def.backend == "rocm" && cwsr_desc &&
+                                       cwsr_desc->rocm_requires_cwsr_fix;
 
-                // Special action for ROCm backends on llamacpp/sd-cpp/vllm if CWSR fix is missing
+                // Special action for ROCm backends that need the gfx1151 CWSR fix.
                 if (is_rocm_backend
                     && !install_error.empty() && needs_gfx1151_cwsr_fix()) {
                     backend["action"] = "Visit https://lemonade-server.ai/gfx1151_linux.html";

From 94ebbab1c8e4fa5360201fb4bb36c0098bb2629a Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:32:42 -0400
Subject: [PATCH 18/39] refactor(backends): migrate install availability to a
 check_install ops hook

is_recipe_installed now finds the managed binary generically and asks the
backend's ops whether it's actually installed, instead of hardcoding the
llamacpp-system HIP check and the flm PATH fallback:

- check_install(backend, binary_found) ops hook; base = installed iff binary
  found. LlamaCppOps adds the ggml HIP-plugin requirement for the "system"
  build on AMD GPUs; FlmOps treats a PATH-installed flm as present.
- is_ggml_hip_plugin_available moves into backends/llamacpp; find_flm_executable
  and run_flm_validate move into backends/fastflowlm. Removed from path_utils
  (+ their orphaned decls/comments).

system_info no longer carries llamacpp/flm-specific availability knowledge.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  |  16 ++
 .../backends/fastflowlm/fastflowlm_models.h   |   6 +
 src/cpp/include/lemon/utils/path_utils.h      |  23 ---
 .../backends/fastflowlm/fastflowlm_models.cpp | 124 +++++++++++++-
 .../backends/fastflowlm/fastflowlm_server.cpp |  12 +-
 .../backends/llamacpp/llamacpp_server.cpp     |  59 +++++++
 src/cpp/server/system_info.cpp                |  38 ++---
 src/cpp/server/utils/path_utils.cpp           | 161 ------------------
 8 files changed, 224 insertions(+), 215 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index 854065f24..cf796cb37 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -99,6 +99,22 @@ class BackendOps {
         (void)backend;
         return file_version;
     }
+
+    // Result of a backend-specific install check: whether the backend variant is
+    // usable, plus an optional error explaining why not.
+    struct InstallCheck {
+        bool installed = false;
+        std::string error;
+    };
+
+    // Decide whether a backend variant is installed, given whether its managed
+    // binary was found on disk. Default: installed iff the binary was found.
+    // llamacpp's "system" build also requires the ggml HIP plugin when an AMD GPU
+    // is present; flm can be a system PATH package even without a managed binary.
+    virtual InstallCheck check_install(const std::string& backend, bool binary_found) const {
+        (void)backend;
+        return {binary_found, ""};
+    }
 };
 
 // Shared default ops instance for backends that override nothing.
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
index 20c7a96b8..910e25be6 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -34,6 +34,12 @@ std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo
 // Read the model's max context window from its FLM config.json (0 if unknown).
 int64_t read_flm_max_context_window(const ModelInfo& info);
 
+// Locate the flm executable on PATH / install dirs ("" if not found).
+std::string find_flm_executable();
+
+// Run `flm validate` and report readiness; error_message on failure.
+bool run_flm_validate(const std::string& flm_path, std::string& error_message);
+
 // Detect the installed FLM version via `flm version` ("unknown" if unavailable).
 std::string flm_version();
 
diff --git a/src/cpp/include/lemon/utils/path_utils.h b/src/cpp/include/lemon/utils/path_utils.h
index 96561186c..63f142ee6 100644
--- a/src/cpp/include/lemon/utils/path_utils.h
+++ b/src/cpp/include/lemon/utils/path_utils.h
@@ -35,22 +35,6 @@ bool is_safe_executable_path(const std::string& path);
  */
 bool looks_like_path(const std::string& v);
 
-/**
- * Find the FLM executable (flm.exe on Windows, flm on Unix).
- * Uses SearchPathA on Windows (same API as CreateProcessA) to search PATH,
- * then falls back to the default installation directory.
- * @return Full path to flm executable, or empty string if not found.
- */
-std::string find_flm_executable();
-
-/**
- * Run 'flm validate' command and check if it succeeds.
- * @param flm_path Optional path to flm executable. If empty, will search for it.
- * @param error_message Output parameter for error message if validation fails.
- * @return true if validation succeeds, false otherwise.
- */
-bool run_flm_validate(const std::string& flm_path, std::string& error_message);
-
 /**
  * Get an environment variable as UTF-8 text.
  */
@@ -73,13 +57,6 @@ std::string path_to_utf8(const std::filesystem::path& path);
  */
 std::string find_executable_in_path(const std::string& executable_name);
 
-/**
- * Check if the HIP plugin for GGML backends is available on the system.
- * This function checks common installation paths for libggml-hip.so.
- * @return true if the HIP plugin is found, false otherwise.
- */
-bool is_ggml_hip_plugin_available();
-
 /**
  * Set the lemonade cache directory. Must be called once at startup before
  * get_cache_dir(). After this call, get_cache_dir() returns this path.
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
index 0331cc895..a2c4ad52f 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -130,7 +130,7 @@ std::string find_flm_binary() {
         return BackendUtils::get_backend_binary_path(*spec, "npu");
     } catch (...) {
 #ifndef _WIN32
-        return lemon::utils::find_flm_executable();
+        return find_flm_executable();
 #else
         return "";
 #endif
@@ -540,7 +540,7 @@ std::string flm_version() {
     }
 
     // Find the flm executable using shared utility
-    std::string flm_path = lemon::utils::find_flm_executable();
+    std::string flm_path = find_flm_executable();
     if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) {
         return "unknown";
     }
@@ -599,6 +599,126 @@ std::string flm_version() {
 }
 
 
+std::string find_flm_executable() {
+#ifdef _WIN32
+    // On Windows, only check the Lemonade install directory (auto-installed zip).
+    // No system PATH fallback - FLM should be installed via install_backend().
+    std::string install_dir = (fs::path(lemon::utils::get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string();
+    if (fs::exists(install_dir)) {
+        for (const auto& entry : fs::recursive_directory_iterator(install_dir)) {
+            if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") {
+                std::string path = entry.path().string();
+                if (lemon::utils::is_safe_executable_path(path)) {
+                    return path;  // first flm.exe that passes the safety check wins
+                }
+            }
+        }
+    }
+    return "";  // no safe flm.exe anywhere under the install directory
+#else
+    // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`.
+    if (!lemon::utils::find_executable_in_path("flm").empty()) {
+        return "flm";  // the bare command name, not the resolved path
+    }
+    return "";  // flm was not found on PATH
+#endif
+}
+
+bool run_flm_validate(const std::string& flm_path, std::string& error_message) {
+    std::string flm_exe = flm_path.empty() ? find_flm_executable() : flm_path;
+    if (flm_exe.empty()) {
+        error_message = "FLM executable not found";
+        return false;
+    }
+    if (!lemon::utils::is_safe_executable_path(flm_exe)) {
+        error_message = "FLM path contains invalid characters";
+        return false;
+    }
+    // Run "<flm> validate --json", capturing its stdout and exit status.
+    std::string command = "\"" + flm_exe + "\" validate --json";
+    std::string output;
+    int exit_code;
+#ifdef _WIN32
+    exit_code = lemon::utils::ProcessManager::run_command(command, output);
+#else
+    FILE* pipe = popen(command.c_str(), "r");
+    if (!pipe) {
+        error_message = "Failed to execute " + flm_exe;
+        return false;
+    }
+
+    char buffer[1024];
+    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
+        output += buffer;
+    }
+    // pclose() returns a wait status; a signal-killed child must not decode as exit code 0.
+    exit_code = pclose(pipe);
+    if (exit_code != -1) {
+        exit_code = WIFEXITED(exit_code) ? WEXITSTATUS(exit_code) : -1;
+    }
+#endif
+    // A JSON object on stdout is authoritative: its "ready" flag decides the result.
+    try {
+        if (!output.empty()) {
+            json j = lemon::utils::JsonUtils::parse(output);
+            if (j.is_object()) {
+                // Check for overall status
+                bool validation_ok = false;
+                if (j.contains("ready")) {
+                    validation_ok = j["ready"].get<bool>();
+                }
+
+                if (validation_ok) {
+                    error_message.clear();
+                    return true;
+                }
+                // Not ready: turn each failed check in the report into a readable message.
+                std::vector<std::string> errors;
+
+                if (j.contains("amd_device_found") && !j["amd_device_found"].get<bool>()) {
+                    errors.push_back("No AMD NPU device found.");
+                }
+
+                if (j.contains("all_fw_ok") && !j["all_fw_ok"].get<bool>()) {
+                    errors.push_back("NPU firmware is incompatible.");
+                }
+                if (j.contains("kernel_ok") && !j["kernel_ok"].get<bool>()) {
+                    errors.push_back("Kernel version is incompatible.");
+                }
+
+                if (j.contains("memlock_ok") && !j["memlock_ok"].get<bool>()) {
+                    errors.push_back("Memlock limits are too low.");
+                }
+
+                if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get<bool>()) {
+                    errors.push_back("NPU driver version is too old.");
+                }
+
+                if (errors.empty()) {
+                    error_message = "NPU validation failed.";
+                } else {
+                    error_message = "";
+                    for (size_t i = 0; i < errors.size(); ++i) {
+                        error_message += errors[i] + (i == errors.size() - 1 ? "" : " ");
+                    }
+                }
+                return false;
+            }
+        }
+    } catch (...) {
+        // Fallback for non-JSON output or parsing error
+    }
+    // No usable JSON report: the process exit code decides.
+    if (exit_code != 0) {
+        error_message = "flm validate failed with exit code " + std::to_string(exit_code);
+        return false;
+    }
+
+    error_message.clear();
+    return true;
+}
+
+
 } // namespace fastflowlm
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 772fac2d3..e251e8240 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -170,7 +170,7 @@ void FastFlowLMServer::load(const std::string& model_name,
     // Validate NPU hardware/drivers
     std::string flm_path = get_flm_path();
     std::string validate_error;
-    if (!utils::run_flm_validate(flm_path, validate_error)) {
+    if (!fastflowlm::run_flm_validate(flm_path, validate_error)) {
         throw std::runtime_error("FLM NPU validation failed: " + validate_error +
             "\nVisit " + DRIVER_INSTALL_URL + " for driver installation instructions.");
     }
@@ -457,7 +457,7 @@ std::string FastFlowLMServer::get_flm_path() {
     }
 #else
     // On Linux, FLM is installed as a system package (in PATH)
-    std::string flm_path = utils::find_flm_executable();
+    std::string flm_path = fastflowlm::find_flm_executable();
     if (!flm_path.empty()) {
         LOG(INFO, "FastFlowLM") << "Found flm at: " << flm_path << std::endl;
     } else {
@@ -515,6 +515,14 @@ class FlmOps : public BackendOps {
         }
         return file_version;
     }
+
+    InstallCheck check_install(const std::string&, bool binary_found) const override {
+#ifndef _WIN32
+        // Non-Windows only: FLM may be a system package on PATH, not in the managed install dir.
+        binary_found = binary_found || !find_flm_executable().empty();
+#endif
+        return {binary_found, ""};
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 0d8121a37..d98ce46e6 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -701,6 +701,52 @@ std::string system_llamacpp_version() {
 }
 
 
+bool is_ggml_hip_plugin_available() {
+#ifdef __linux__
+    // Allow distros/packagers that install outside the FHS paths below
+    // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so.
+    if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) {
+        // Require the basename to look like the HIP plugin (libggml-hip*.so*,
+        // case-insensitive, versioned sonames allowed). This is a sanity check,
+        // not a security boundary: the path is not forwarded to ggml's loader,
+        // so we cannot verify it is actually loadable. It only guards against an
+        // accidental override pointing at an unrelated existing file.
+        std::string name = fs::path(env).filename().string();
+        std::transform(name.begin(), name.end(), name.begin(),
+                       [](unsigned char c) { return std::tolower(c); });
+        const bool name_matches = name.rfind("libggml-hip", 0) == 0 &&
+                                  name.find(".so") != std::string::npos;
+        // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing
+        // filesystem overload: an odd or malformed path resolves to "not a
+        // regular file" (ec set) instead of raising a filesystem_error.
+        std::error_code hip_path_ec;
+        if (name_matches && fs::is_regular_file(env, hip_path_ec)) {
+            return true;
+        }
+    }
+    // On Linux x86_64, check common system library paths for the HIP plugin
+    const std::vector<std::string> possible_paths = {
+        // Debian/Ubuntu multiarch path (most common)
+        "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so",
+        // Arch AUR path
+        "/usr/lib/libggml-hip.so",
+        // Standard Linux paths
+        "/usr/lib/ggml/backends0/libggml-hip.so",
+        "/usr/lib64/ggml/backends0/libggml-hip.so"
+    };
+
+    // Check all possible paths; non-throwing, so a filesystem error counts as "not present"
+    for (const auto& path : possible_paths) {
+        if (std::error_code probe_ec; fs::exists(path, probe_ec)) {
+            return true;
+        }
+    }
+#endif
+    // Reached on non-Linux builds, or on Linux when no plugin file was found.
+    return false;
+}
+
+
 // llamacpp model-management behavior: GGUF metadata + capability labels.
 class LlamaCppOps : public BackendOps {
 public:
@@ -774,6 +820,19 @@ class LlamaCppOps : public BackendOps {
         }
         return file_version;
     }
+
+    InstallCheck check_install(const std::string& backend, bool binary_found) const override {
+        // System llama-server needs the ggml HIP plugin for ROCm when an AMD GPU (KFD) exists.
+        if (binary_found && backend == "system") {
+#ifdef __linux__
+            std::error_code kfd_ec;  // non-throwing probe: an error means "no KFD", not a throw
+            if (std::filesystem::exists("/sys/class/kfd", kfd_ec) && !is_ggml_hip_plugin_available()) {
+                return {false, "HIP plugin libggml-hip.so not installed"};
+            }
+#endif
+        }
+        return {binary_found, ""};
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index b117bf167..d77b80830 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -573,38 +573,22 @@ static bool is_recipe_installed(const std::string& recipe, const std::string& ba
         error_message = "Linux kernel missing support";
         return false;
     }
-    auto* spec = try_get_spec_for_recipe(recipe);
-    if (spec) {
+    // Find the managed binary, then let the backend's ops decide installed-ness
+    // (llamacpp "system" also needs the HIP plugin; flm can be a PATH package).
+    bool binary_found = false;
+    if (auto* spec = try_get_spec_for_recipe(recipe)) {
         try {
             BackendUtils::get_backend_binary_path(*spec, backend);
-
-            // For system llamacpp backend, also verify the HIP plugin is available
-            // This is required for ROCm GPU acceleration with dynamically loaded backends
-            if (recipe == "llamacpp" && backend == "system") {
-#ifdef __linux__
-                // Check if AMD GPU driver is loaded (KFD indicates amdgpu driver)
-                if (fs::exists("/sys/class/kfd")) {
-                    // System has AMD GPU(s), so we need the HIP plugin
-                    if (!is_ggml_hip_plugin_available()) {
-                        error_message = "HIP plugin libggml-hip.so not installed";
-                        return false;
-                    }
-                }
-#endif
-            }
-
-            return true;
+            binary_found = true;
         } catch (...) {
-#ifndef _WIN32
-            // On Linux, FLM is installed as a system package (in PATH, not install dir)
-            if (recipe == "flm" && !utils::find_flm_executable().empty()) {
-                return true;
-            }
-#endif
-            return false;
+            binary_found = false;
         }
     }
-    return false;
+    auto check = backends::ops_for(recipe)->check_install(backend, binary_found);
+    if (!check.installed && !check.error.empty()) {
+        error_message = check.error;
+    }
+    return check.installed;
 }
 
 static std::string get_recipe_version(const std::string& recipe, const std::string& backend) {
diff --git a/src/cpp/server/utils/path_utils.cpp b/src/cpp/server/utils/path_utils.cpp
index dc7492295..fb8591337 100644
--- a/src/cpp/server/utils/path_utils.cpp
+++ b/src/cpp/server/utils/path_utils.cpp
@@ -103,30 +103,6 @@ bool looks_like_path(const std::string& v) {
     }
 }
 
-std::string find_flm_executable() {
-#ifdef _WIN32
-    // On Windows, only check the Lemonade install directory (auto-installed zip).
-    // No system PATH fallback - FLM should be installed via install_backend().
-    std::string install_dir = (fs::path(get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string();
-    if (fs::exists(install_dir)) {
-        for (const auto& entry : fs::recursive_directory_iterator(install_dir)) {
-            if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") {
-                std::string path = entry.path().string();
-                if (is_safe_executable_path(path)) {
-                    return path;
-                }
-            }
-        }
-    }
-    return "";
-#else
-    // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`.
-    if (!find_executable_in_path("flm").empty()) {
-        return "flm";
-    }
-    return "";
-#endif
-}
 
 std::string find_executable_in_path(const std::string& executable_name) {
     if (!is_safe_executable_path(executable_name)) {
@@ -180,50 +156,6 @@ std::string find_executable_in_path(const std::string& executable_name) {
 #endif
 }
 
-bool is_ggml_hip_plugin_available() {
-#ifdef __linux__
-    // Allow distros/packagers that install outside the FHS paths below
-    // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so.
-    if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) {
-        // Require the basename to look like the HIP plugin (libggml-hip*.so*,
-        // case-insensitive, versioned sonames allowed). This is a sanity check,
-        // not a security boundary: the path is not forwarded to ggml's loader,
-        // so we cannot verify it is actually loadable. It only guards against an
-        // accidental override pointing at an unrelated existing file.
-        std::string name = fs::path(env).filename().string();
-        std::transform(name.begin(), name.end(), name.begin(),
-                       [](unsigned char c) { return std::tolower(c); });
-        const bool name_matches = name.rfind("libggml-hip", 0) == 0 &&
-                                  name.find(".so") != std::string::npos;
-        // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing
-        // filesystem overload: an odd or malformed path resolves to "not a
-        // regular file" (ec set) instead of raising a filesystem_error.
-        std::error_code hip_path_ec;
-        if (name_matches && fs::is_regular_file(env, hip_path_ec)) {
-            return true;
-        }
-    }
-    // On Linux x86_64, check common system library paths for the HIP plugin
-    std::vector<std::string> possible_paths = {
-        // Debian/Ubuntu multiarch path (most common)
-        "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so",
-	// Arch AUR path
-	"/usr/lib/libggml-hip.so",
-        // Standard Linux paths
-        "/usr/lib/ggml/backends0/libggml-hip.so",
-        "/usr/lib64/ggml/backends0/libggml-hip.so"
-    };
-
-    // Check all possible paths
-    for (const auto& path : possible_paths) {
-        if (fs::exists(path)) {
-            return true;
-        }
-    }
-#endif
-
-    return false;
-}
 
 std::string get_cache_dir() {
     // If set_cache_dir() was called at startup, use that
@@ -295,98 +227,5 @@ std::string get_downloaded_bin_dir() {
     return bin_dir;
 }
 
-bool run_flm_validate(const std::string& flm_path, std::string& error_message) {
-    std::string flm_exe = flm_path.empty() ? find_flm_executable() : flm_path;
-    if (flm_exe.empty()) {
-        error_message = "FLM executable not found";
-        return false;
-    }
-    if (!is_safe_executable_path(flm_exe)) {
-        error_message = "FLM path contains invalid characters";
-        return false;
-    }
-
-    std::string command = "\"" + flm_exe + "\" validate --json";
-    std::string output;
-    int exit_code;
-#ifdef _WIN32
-    exit_code = ProcessManager::run_command(command, output);
-#else
-    FILE* pipe = popen(command.c_str(), "r");
-    if (!pipe) {
-        error_message = "Failed to execute " + flm_exe;
-        return false;
-    }
-
-    char buffer[1024];
-    while (fgets(buffer, sizeof(buffer), pipe) != nullptr) {
-        output += buffer;
-    }
-
-    exit_code = pclose(pipe);
-    if (exit_code != -1) {
-        exit_code = WEXITSTATUS(exit_code);
-    }
-#endif
-
-    try {
-        if (!output.empty()) {
-            json j = JsonUtils::parse(output);
-            if (j.is_object()) {
-                // Check for overall status
-                bool validation_ok = false;
-                if (j.contains("ready")) {
-                    validation_ok = j["ready"].get<bool>();
-                }
-
-                if (validation_ok) {
-                    error_message.clear();
-                    return true;
-                }
-
-                std::vector<std::string> errors;
-
-                if (j.contains("amd_device_found") && !j["amd_device_found"].get<bool>()) {
-                    errors.push_back("No AMD NPU device found.");
-                }
-
-                if (j.contains("all_fw_ok") && !j["all_fw_ok"].get<bool>()) {
-                    errors.push_back("NPU firmware is incompatible.");
-                }
-                if (j.contains("kernel_ok") && !j["kernel_ok"].get<bool>()) {
-                    errors.push_back("Kernel version is incompatible.");
-                }
-
-                if (j.contains("memlock_ok") && !j["memlock_ok"].get<bool>()) {
-                    errors.push_back("Memlock limits are too low.");
-                }
-
-                if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get<bool>()) {
-                    errors.push_back("NPU driver version is too old.");
-                }
-
-                if (errors.empty()) {
-                    error_message = "NPU validation failed.";
-                } else {
-                    error_message = "";
-                    for (size_t i = 0; i < errors.size(); ++i) {
-                        error_message += errors[i] + (i == errors.size() - 1 ? "" : " ");
-                    }
-                }
-                return false;
-            }
-        }
-    } catch (...) {
-        // Fallback for non-JSON output or parsing error
-    }
-
-    if (exit_code != 0) {
-        error_message = "flm validate failed with exit code " + std::to_string(exit_code);
-        return false;
-    }
-
-    error_message.clear();
-    return true;
-}
 
 } // namespace utils::lemon

From 1ced08c910b10eaa7e472294cb68518d5a9b8e19 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:37:28 -0400
Subject: [PATCH 19/39] refactor(backends): descriptor-drive version comparison
 policy (Exact vs AtLeast)

The update-required check special-cased recipe=="flm" to allow an installed
version newer than the pin. Replace with a version_policy descriptor field
(Exact default; flm = AtLeast for its system-managed package). system_info no
longer names flm in the version-comparison logic.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_descriptor.h    | 10 ++++++++++
 src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h |  4 ++++
 src/cpp/server/system_info.cpp                         |  7 ++++---
 3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index 57dd5a7b7..d640ca279 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -29,6 +29,12 @@ enum class SlotPolicy {
     Unmetered      // never counts toward slots, never auto-evicted (cloud)
 };
 
+// How an installed backend version is compared against the expected pin.
+enum class VersionPolicy {
+    Exact,    // installed must match the expected version
+    AtLeast   // installed >= expected is acceptable (system-managed packages, e.g. flm)
+};
+
 inline const char* slot_policy_to_string(SlotPolicy p) {
     switch (p) {
         case SlotPolicy::Standard:      return "standard";
@@ -79,6 +85,10 @@ struct BackendDescriptor {
     // CWSR fix. Gates the availability/remediation check for the "rocm" backend.
     bool rocm_requires_cwsr_fix = false;
 
+    // How the installed version is compared against the expected pin. Exact by
+    // default; system-managed packages (flm) accept any version >= expected.
+    VersionPolicy version_policy = VersionPolicy::Exact;
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index 24fc07470..b56c9e577 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -32,6 +32,10 @@ inline const BackendDescriptor descriptor = {
     /*experimental*/    false,
     /*web_display_name*/ "FastFlowLM NPU",
     /*web_priority*/    3,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::AtLeast,  // system-managed package
 };
 
 }  // namespace fastflowlm
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index d77b80830..4f5697b50 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -1401,9 +1401,10 @@ json SystemInfo::build_recipes_info(const json& devices) {
                 return installed.compare(0, prefix.size(), prefix) == 0;
             };
 #if !defined(_WIN32)
-            // On non-Windows, FLM is a system-managed package; a version newer
-            // than the minimum required is acceptable.
-            if (def.recipe == "flm") {
+            // System-managed packages (e.g. flm on Linux) accept a version newer
+            // than the minimum required.
+            const auto* ver_desc = backends::descriptor_for(def.recipe);
+            if (ver_desc && ver_desc->version_policy == VersionPolicy::AtLeast) {
                 auto installed_ver = utils::Version::parse(installed_version);
                 auto expected_ver = utils::Version::parse(expected_version);
                 // If either version cannot be parsed, fall back to exact equality check

From e89b47cee5d07c7a3cae985e177264d02107f991 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:41:59 -0400
Subject: [PATCH 20/39] refactor(backends): move FLM model deletion into the
 fastflowlm folder

The `flm remove` subprocess orchestration moves out of ModelManager::delete_model
into backends/fastflowlm (flm_remove). model_manager keeps only the generic
HF-cache deletion path; the flm branch is now a thin call into the backend.

Verified: server_endpoints 69 pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../backends/fastflowlm/fastflowlm_models.h   |  3 ++
 .../backends/fastflowlm/fastflowlm_models.cpp | 29 +++++++++++
 src/cpp/server/model_manager.cpp              | 50 +------------------
 3 files changed, 34 insertions(+), 48 deletions(-)

diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
index 910e25be6..87470300c 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -47,6 +47,9 @@ std::string flm_version();
 void flm_download(const std::string& checkpoint, bool do_not_upgrade,
                   DownloadProgressCallback progress_callback);
 
+// Run `flm remove <checkpoint>`; throws std::runtime_error on empty checkpoint, missing flm binary, non-zero exit, or 60 s timeout.
+void flm_remove(const std::string& checkpoint);
+
 } // namespace fastflowlm
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
index a2c4ad52f..83d2080bc 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -8,6 +8,8 @@
 #include "lemon/utils/json_utils.h"
 #include "lemon/utils/path_utils.h"
 #include <sstream>
+#include <thread>
+#include <chrono>
 #include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
@@ -719,6 +721,33 @@ bool run_flm_validate(const std::string& flm_path, std::string& error_message) {
 }
 
 
+void flm_remove(const std::string& checkpoint) {
+    if (checkpoint.empty()) {
+        throw std::runtime_error("FLM model has empty checkpoint field, cannot delete");
+    }
+    std::string flm_path = find_flm_binary();
+    if (flm_path.empty()) {
+        throw std::runtime_error("FLM executable not found");
+    }
+    std::vector<std::string> args = {"remove", checkpoint};
+    auto handle = lemon::utils::ProcessManager::start_process(flm_path, args, "", false);
+
+    int timeout_seconds = 60;
+    for (int i = 0; i < timeout_seconds * 10; ++i) {
+        if (!lemon::utils::ProcessManager::is_running(handle)) {
+            int exit_code = lemon::utils::ProcessManager::get_exit_code(handle);
+            if (exit_code != 0) {
+                throw std::runtime_error("FLM remove failed for " + checkpoint +
+                                         " (exit code " + std::to_string(exit_code) + ")");
+            }
+            return;
+        }
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    }
+    lemon::utils::ProcessManager::stop_process(handle);
+    throw std::runtime_error("FLM remove timed out for " + checkpoint);
+}
+
 } // namespace fastflowlm
 } // namespace backends
 } // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 9210afd02..abcbbe3b0 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -3800,55 +3800,9 @@ void ModelManager::delete_model(const std::string& model_name) {
                                  "Delete the file directly from: " + info.checkpoint());
     }
 
-    // Handle FLM models separately
+    // FLM models have no local HF cache; deletion is the backend's `flm remove`.
     if (info.recipe == "flm") {
-        LOG(INFO, "ModelManager") << "Deleting FLM model: " << info.checkpoint() << std::endl;
-
-        // Validate checkpoint is not empty
-        if (info.checkpoint().empty()) {
-            throw std::runtime_error("FLM model has empty checkpoint field, cannot delete");
-        }
-
-        // Find flm executable — on Windows flm.exe lives under the lemonade
-        // cache dir, not on PATH, so we must resolve the full path.
-        std::string flm_path = backends::fastflowlm::find_flm_binary();
-        if (flm_path.empty()) {
-            throw std::runtime_error("FLM executable not found");
-        }
-
-        // Prepare arguments for 'flm remove' command
-        std::vector<std::string> args = {"remove", info.checkpoint()};
-
-        LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\"";
-        for (const auto& arg : args) {
-            LOG(INFO, "ProcessManager") << " \"" << arg << "\"";
-        }
-        LOG(INFO, "ProcessManager") << std::endl;
-
-        // Run flm remove command
-        auto handle = utils::ProcessManager::start_process(flm_path, args, "", false);
-
-        // Wait for process to complete
-        int timeout_seconds = 60; // 1 minute timeout for removal
-        for (int i = 0; i < timeout_seconds * 10; ++i) {
-            if (!utils::ProcessManager::is_running(handle)) {
-                int exit_code = utils::ProcessManager::get_exit_code(handle);
-                if (exit_code != 0) {
-                    LOG(ERROR, "ModelManager") << "FLM remove failed with exit code: " << exit_code << std::endl;
-                    throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove failed with exit code " + std::to_string(exit_code));
-                }
-                break;
-            }
-            std::this_thread::sleep_for(std::chrono::milliseconds(100));
-        }
-
-        // Check if process is still running (timeout)
-        if (utils::ProcessManager::is_running(handle)) {
-            LOG(ERROR, "ModelManager") << "FLM remove timed out" << std::endl;
-            throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove timed out");
-        }
-
-        LOG(INFO, "ModelManager") << "Successfully deleted FLM model: " << canonical_model_name << std::endl;
+        backends::fastflowlm::flm_remove(info.checkpoint());
 
         // Remove from user models if it's a user model
         if (is_user_model_name(canonical_model_name)) {

From 55fa6f12dd12ff449f96cd03f8c6f5c42d8004ef Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:45:51 -0400
Subject: [PATCH 21/39] refactor(config): drive recipe_options() from
 descriptors, not per-recipe blocks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RuntimeConfig::recipe_options() had a hardcoded nested→flat translation block per
recipe (llamacpp/whispercpp/moonshine/sdcpp/vllm). Replace them with a single loop
over the descriptors: each option's config.json key is derived from the role
implied by its name suffix (*_backend → "backend", *_args → variant
"<backend>_args" then "args", *_device → "device", anything else → the option
name verbatim, e.g. sd-cpp's steps/cfg_scale/width/height). Adding a backend no
longer requires editing this function. Note that every *_args option now tries
"<backend>_args" before "args"; the old vllm block read only "args".

Verified: server_endpoints 69 pass (config/params translation unchanged).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/server/runtime_config.cpp | 81 +++++++++++++------------------
 1 file changed, 34 insertions(+), 47 deletions(-)

diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp
index 0a14f006b..cfc3546b7 100644
--- a/src/cpp/server/runtime_config.cpp
+++ b/src/cpp/server/runtime_config.cpp
@@ -323,56 +323,43 @@ json RuntimeConfig::recipe_options(const std::string& backend) const {
         return val;
     };
 
-    const std::string backend_args = backend + "_args";
-
-    if (config_.contains("llamacpp")) {
-        const auto& lc = config_["llamacpp"];
-        if (lc.contains("backend")) result["llamacpp_backend"] = resolve_auto(lc["backend"]);
-        if (lc.contains(backend_args) && lc[backend_args] != "") {
-            result["llamacpp_args"] = lc[backend_args];
-        } else if (lc.contains("args")) {
-            result["llamacpp_args"] = lc["args"];
-        }
-        if (lc.contains("device")) result["llamacpp_device"] = lc["device"];
-    }
-
-    if (config_.contains("whispercpp")) {
-        const auto& wc = config_["whispercpp"];
-        if (wc.contains("backend")) result["whispercpp_backend"] = resolve_auto(wc["backend"]);
-        if (wc.contains(backend_args) && wc[backend_args] != "") {
-            result["whispercpp_args"] = wc[backend_args];
-        } else if (wc.contains("args")) {
-            result["whispercpp_args"] = wc["args"];
-        }
-    }
+    auto ends_with = [](const std::string& s, const std::string& suf) {
+        return s.size() >= suf.size() && s.compare(s.size() - suf.size(), suf.size(), suf) == 0;
+    };
 
-    if (config_.contains("moonshine")) {
-        const auto& ms = config_["moonshine"];
-        if (ms.contains(backend_args) && ms[backend_args] != "") {
-            result["moonshine_args"] = ms[backend_args];
-        } else if (ms.contains("args")) {
-            result["moonshine_args"] = ms["args"];
-        }
-    }
+    const std::string backend_args = backend + "_args";
 
-    if (config_.contains("sdcpp")) {
-        const auto& sd = config_["sdcpp"];
-        if (sd.contains("backend")) result["sd-cpp_backend"] = resolve_auto(sd["backend"]);
-        if (sd.contains(backend_args) && sd[backend_args] != "") {
-            result["sdcpp_args"] = sd[backend_args];
-        } else if (sd.contains("args")) {
-            result["sdcpp_args"] = sd["args"];
+    // Translate each backend's nested config.json section into the flat
+    // recipe_options format, driven by the descriptor's option list — no
+    // per-recipe block. The flat key is the descriptor option name; the
+    // config.json key is derived from the option's role (its name suffix):
+    //   *_backend -> "backend"   *_args -> variant "<backend>_args" then "args"
+    //   *_device  -> "device"    everything else -> the option name verbatim
+    //                            (sd-cpp's steps/cfg_scale/width/height/…)
+    for (const auto* desc : lemon::backends::all_descriptors()) {
+        const std::string section = desc->effective_config_section();
+        if (!config_.contains(section) || !config_[section].is_object()) {
+            continue;
+        }
+        const auto& cfg = config_[section];
+        for (const auto& opt : desc->options) {
+            if (ends_with(opt.name, "_backend")) {
+                if (cfg.contains("backend")) {
+                    result[opt.name] = resolve_auto(cfg["backend"]);
+                }
+            } else if (ends_with(opt.name, "_args")) {
+                if (cfg.contains(backend_args) && cfg[backend_args] != "") {
+                    result[opt.name] = cfg[backend_args];
+                } else if (cfg.contains("args")) {
+                    result[opt.name] = cfg["args"];
+                }
+            } else {
+                const std::string ckey = ends_with(opt.name, "_device") ? "device" : opt.name;
+                if (cfg.contains(ckey)) {
+                    result[opt.name] = cfg[ckey];
+                }
+            }
         }
-        if (sd.contains("steps")) result["steps"] = sd["steps"];
-        if (sd.contains("cfg_scale")) result["cfg_scale"] = sd["cfg_scale"];
-        if (sd.contains("width")) result["width"] = sd["width"];
-        if (sd.contains("height")) result["height"] = sd["height"];
-    }
-
-    if (config_.contains("vllm")) {
-        const auto& vl = config_["vllm"];
-        if (vl.contains("backend")) result["vllm_backend"] = resolve_auto(vl["backend"]);
-        if (vl.contains("args")) result["vllm_args"] = vl["args"];
     }
 
     if (config_.contains("ctx_size")) result["ctx_size"] = config_["ctx_size"];

From c3aff5976ef6d78074f97c49d9feddea61b4f018 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:51:43 -0400
Subject: [PATCH 22/39] =?UTF-8?q?polish(backends):=20build=20BackendSpec?=
 =?UTF-8?q?=20from=20the=20descriptor=20(dedup=20binary=20across=20descrip?=
 =?UTF-8?q?tor=E2=86=94server.h)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The backend binary name (and recipe) was duplicated between the descriptor
(<stem>.h) and the BackendSpec literal (<stem>_server.h), a cross-file
redundancy. Remove the static SPEC member; each backend's spec() now builds the
BackendSpec lazily from descriptor.binary (+ descriptor.recipe, or the explicit
"ryzenai-server" install id where it differs) plus the class's get_install_params
and split flag. In-class binary lookups go through spec(); server.cpp's sd upscale
uses try_get_spec_for_recipe.

Net: the binary name now lives in exactly one place (the descriptor). Lazy
function-local statics also avoid any static-init-order coupling between the
descriptor and the spec.

Verified: builds green; system-info install detection unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../lemon/backends/fastflowlm/fastflowlm_server.h     | 11 -----------
 src/cpp/include/lemon/backends/kokoro/kokoro_server.h |  9 ---------
 .../include/lemon/backends/llamacpp/llamacpp_server.h |  9 ---------
 .../lemon/backends/moonshine/moonshine_server.h       |  5 -----
 .../include/lemon/backends/ryzenai/ryzenai_server.h   |  9 ---------
 src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h   |  9 ---------
 src/cpp/include/lemon/backends/vllm/vllm_server.h     |  6 ------
 .../lemon/backends/whispercpp/whispercpp_server.h     |  9 ---------
 .../server/backends/fastflowlm/fastflowlm_server.cpp  | 11 ++++++++---
 src/cpp/server/backends/kokoro/kokoro_server.cpp      | 11 ++++++++---
 src/cpp/server/backends/llamacpp/llamacpp_server.cpp  | 11 ++++++++---
 .../server/backends/moonshine/moonshine_server.cpp    | 11 ++++++++---
 src/cpp/server/backends/ryzenai/ryzenai_server.cpp    | 11 ++++++++---
 src/cpp/server/backends/sdcpp/sdcpp_server.cpp        | 11 ++++++++---
 src/cpp/server/backends/vllm/vllm_server.cpp          | 11 ++++++++---
 .../server/backends/whispercpp/whispercpp_server.cpp  | 11 ++++++++---
 src/cpp/server/server.cpp                             |  2 +-
 17 files changed, 65 insertions(+), 92 deletions(-)

diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
index c422f4a4d..e4bce74d8 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
@@ -13,17 +13,6 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        // recipe
-            "flm",
-        // executable
-    #ifdef _WIN32
-            "flm.exe"
-    #else
-            "flm"
-    #endif
-        , get_install_params
-    );
 
     FastFlowLMServer(const std::string& log_level, ModelManager* model_manager = nullptr,
                      BackendManager* backend_manager = nullptr);
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
index 9c628c076..ec8e74844 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
@@ -15,15 +15,6 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "kokoro",
-    #ifdef _WIN32
-            "koko.exe"
-    #else
-            "koko"
-    #endif
-        , get_install_params
-    );
 
     explicit KokoroServer(const std::string& log_level,
                           ModelManager* model_manager,
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
index 8b28296c4..f1447c1ce 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
@@ -13,15 +13,6 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "llamacpp",
-    #ifdef _WIN32
-            "llama-server.exe"
-    #else
-            "llama-server"
-    #endif
-        , get_install_params
-    );
 
     LlamaCppServer(const std::string& log_level,
                    ModelManager* model_manager,
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
index 611bfe51c..47ea21f58 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
@@ -14,11 +14,6 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        "moonshine",
-        "moonshine-server",
-        get_install_params
-    );
 
     explicit MoonshineServer(const std::string& log_level,
                             ModelManager* model_manager,
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
index f824cfde3..f3a6806e7 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h
@@ -17,15 +17,6 @@ class RyzenAIServer : public WrappedServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        "ryzenai-server",
-#ifdef _WIN32
-        "ryzenai-server.exe"
-#else
-        "ryzenai-server"
-#endif
-        , get_install_params
-    );
 
     RyzenAIServer(const std::string& model_name, bool debug, ModelManager* model_manager,
                   BackendManager* backend_manager);
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index 99be9e62c..65c470332 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -18,15 +18,6 @@ class SDServer : public WrappedServer, public IImageServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "sd-cpp",
-    #ifdef _WIN32
-            "sd-server.exe"
-    #else
-            "sd-server"
-    #endif
-        , get_install_params
-    );
 
     explicit SDServer(const std::string& log_level,
                       ModelManager* model_manager,
diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h
index 700296b97..0293fa811 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm_server.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h
@@ -13,12 +13,6 @@ class VLLMServer : public WrappedServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-            "vllm",
-            "vllm-server"
-        , get_install_params
-        , /*supports_split_archive=*/true
-    );
 
     VLLMServer(const std::string& log_level,
                ModelManager* model_manager,
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
index 8dc88bbb4..9ddd4f2af 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
@@ -15,15 +15,6 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer {
 public:
     static InstallParams get_install_params(const std::string& backend, const std::string& version);
 
-    inline static const BackendSpec SPEC = BackendSpec(
-        "whispercpp",
-#ifdef _WIN32
-        "whisper-server.exe"
-#else
-        "whisper-server"
-#endif
-        , get_install_params
-    );
 
     explicit WhisperServer(const std::string& log_level,
                           ModelManager* model_manager,
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index e251e8240..fc5ecef9b 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/fastflowlm/fastflowlm_server.h"
+#include "lemon/backends/fastflowlm/fastflowlm.h"
 #include "lemon/backends/fastflowlm/fastflowlm_models.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_ops.h"
@@ -164,7 +165,7 @@ void FastFlowLMServer::load(const std::string& model_name,
 
 #ifdef _WIN32
     // On Windows, auto-install FLM binary if needed (downloads zip and extracts)
-    backend_manager_->install_backend(SPEC.recipe, "npu");
+    backend_manager_->install_backend(fastflowlm::spec()->recipe, "npu");
 #endif
 
     // Validate NPU hardware/drivers
@@ -448,7 +449,7 @@ std::string FastFlowLMServer::get_flm_path() {
 #ifdef _WIN32
     // On Windows, use the standard install directory (auto-installed zip)
     try {
-        std::string path = BackendUtils::get_backend_binary_path(SPEC, "npu");
+        std::string path = BackendUtils::get_backend_binary_path(*fastflowlm::spec(), "npu");
         LOG(INFO, "FastFlowLM") << "Found flm at: " << path << std::endl;
         return path;
     } catch (const std::exception& e) {
@@ -526,7 +527,11 @@ class FlmOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() { return &FastFlowLMServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   FastFlowLMServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() {
     static const FlmOps kOps;
     return &kOps;
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index 80d502ead..aa8ad871e 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/kokoro/kokoro_server.h"
+#include "lemon/backends/kokoro/kokoro.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
@@ -74,7 +75,7 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
 
     // Install kokoros if needed
     const std::string backend = default_kokoro_backend();
-    backend_manager_->install_backend(SPEC.recipe, backend);
+    backend_manager_->install_backend(kokoro::spec()->recipe, backend);
 
     // Use pre-resolved model path
     fs::path model_path = fs::path(model_info.resolved_path());
@@ -94,7 +95,7 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
     LOG(INFO, "KokoroServer") << "Using model: " << model_index["model"] << std::endl;
 
     // Get koko executable path
-    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend);
+    std::string exe_path = BackendUtils::get_backend_binary_path(*kokoro::spec(), backend);
 
     // Choose a port
     port_ = choose_port();
@@ -239,7 +240,11 @@ class KokoroOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() { return &KokoroServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   KokoroServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() {
     static const KokoroOps kOps;
     return &kOps;
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index d98ce46e6..9b75b6eaa 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/llamacpp/llamacpp_server.h"
+#include "lemon/backends/llamacpp/llamacpp.h"
 #include "lemon/backends/llamacpp/llamacpp_gguf.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_ops.h"
@@ -290,7 +291,7 @@ void LlamaCppServer::load(const std::string& model_name,
     device_type_ = use_gpu ? DEVICE_GPU : DEVICE_CPU;
 
     // Install llama-server if needed (use per-model backend)
-    backend_manager_->install_backend(SPEC.recipe, llamacpp_backend);
+    backend_manager_->install_backend(llamacpp::spec()->recipe, llamacpp_backend);
 
     // Use pre-resolved GGUF path. Skipped for hf_load models because llama-server
     // sources the weights itself via -hf; those models may not have local files.
@@ -310,7 +311,7 @@ void LlamaCppServer::load(const std::string& model_name,
     port_ = choose_port();
 
     // Get executable path
-    std::string executable = BackendUtils::get_backend_binary_path(SPEC, llamacpp_backend);
+    std::string executable = BackendUtils::get_backend_binary_path(*llamacpp::spec(), llamacpp_backend);
 
     // Check for embeddings and reranking support based on model type
     bool supports_embeddings = (model_info.type == ModelType::EMBEDDING);
@@ -836,7 +837,11 @@ class LlamaCppOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() { return &LlamaCppServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   LlamaCppServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() {
     static const LlamaCppOps kOps;
     return &kOps;
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index b294f46ee..b9c8ebd34 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/moonshine/moonshine_server.h"
+#include "lemon/backends/moonshine/moonshine.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
@@ -72,7 +73,7 @@ void MoonshineServer::load(const std::string& model_name,
     device_type_ = DEVICE_CPU;
 
     // Install moonshine-server if needed
-    backend_manager_->install_backend(SPEC.recipe, "cpu");
+    backend_manager_->install_backend(moonshine::spec()->recipe, "cpu");
 
     // Resolve model path from ModelManager (standard HF cache)
     std::string model_path = model_info.resolved_path();
@@ -98,7 +99,7 @@ void MoonshineServer::load(const std::string& model_name,
     }
 
     // Get executable path
-    std::string executable = BackendUtils::get_backend_binary_path(SPEC, "cpu");
+    std::string executable = BackendUtils::get_backend_binary_path(*moonshine::spec(), "cpu");
     LOG(INFO, "MoonshineServer") << "Using executable: " << executable << std::endl;
 
     // moonshine-server binds three consecutive ports: HTTP, WS (+1), TCP (+2).
@@ -341,7 +342,11 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
-const BackendSpec* spec() { return &MoonshineServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   MoonshineServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace moonshine
 }  // namespace backends
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index f6ba8f457..c175301f6 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/ryzenai/ryzenai_server.h"
+#include "lemon/backends/ryzenai/ryzenai.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/model_manager.h"
 #include "lemon/backends/backend_ops.h"
@@ -43,7 +44,7 @@ RyzenAIServer::~RyzenAIServer() {
 
 bool RyzenAIServer::is_available() {
     try {
-        return !backends::BackendUtils::get_backend_binary_path(SPEC, "npu").empty();
+        return !backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu").empty();
     } catch (...) {
         return false;
     }
@@ -60,7 +61,7 @@ void RyzenAIServer::load(const std::string& model_name,
     backend_manager_->install_backend("ryzenai-llm", "npu");
 
     // Get the path to ryzenai-server
-    std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(SPEC, "npu");
+    std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu");
     if (ryzenai_server_path.empty()) {
         throw std::runtime_error("RyzenAI-Server executable not found even after installation attempt");
     }
@@ -208,7 +209,11 @@ class RyzenAiOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec("ryzenai-server", descriptor.binary,
+                                   ::lemon::RyzenAIServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() {
     static const RyzenAiOps kOps;
     return &kOps;
diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index 718855d8f..98f19e5ea 100644
--- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/sdcpp/sdcpp_server.h"
+#include "lemon/backends/sdcpp/sdcpp.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/backend_manager.h"
@@ -211,7 +212,7 @@ void SDServer::load(const std::string& model_name,
     }
 
     // Install sd-server if needed
-    backend_manager_->install_backend(SPEC.recipe, backend);
+    backend_manager_->install_backend(sdcpp::spec()->recipe, backend);
 
     // Get model path
     std::string model_path = model_info.resolved_path("main");
@@ -233,7 +234,7 @@ void SDServer::load(const std::string& model_name,
     LOG(DEBUG, "SDServer") << "Using model: " << model_path << std::endl;
 
     // Get sd-server executable path
-    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend);
+    std::string exe_path = BackendUtils::get_backend_binary_path(*sdcpp::spec(), backend);
 
     // Choose a port
     port_ = choose_port();
@@ -757,7 +758,11 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
-const BackendSpec* spec() { return &SDServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   SDServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace sdcpp
 }  // namespace backends
diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
index 1ab4e22fc..085cd0f2a 100644
--- a/src/cpp/server/backends/vllm/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/vllm/vllm_server.h"
+#include "lemon/backends/vllm/vllm.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_utils.h"
 #include "lemon/model_manager.h"
@@ -123,7 +124,7 @@ void VLLMServer::load(const std::string& model_name,
     RuntimeConfig::validate_backend_choice("vllm", vllm_backend);
 
     // Install vllm-server if needed
-    backend_manager_->install_backend(SPEC.recipe, vllm_backend);
+    backend_manager_->install_backend(vllm::spec()->recipe, vllm_backend);
 
     // vLLM uses HuggingFace model names, not local file paths.
     // The checkpoint field in server_models.json is the HF model ID.
@@ -138,7 +139,7 @@ void VLLMServer::load(const std::string& model_name,
     port_ = choose_port();
 
     // Get executable path
-    std::string executable = BackendUtils::get_backend_binary_path(SPEC, vllm_backend);
+    std::string executable = BackendUtils::get_backend_binary_path(*vllm::spec(), vllm_backend);
 
     // Build command line arguments
     std::vector<std::string> args;
@@ -322,7 +323,11 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
-const BackendSpec* spec() { return &VLLMServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   VLLMServer::get_install_params, /*split=*/true);
+    return &kSpec;
+}
 const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace vllm
 }  // namespace backends
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index dfa0ebea9..dcccaf7ac 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -1,4 +1,5 @@
 #include "lemon/backends/whispercpp/whispercpp_server.h"
+#include "lemon/backends/whispercpp/whispercpp.h"
 #include "lemon/backends/backend_registry.h"
 #include "lemon/backends/backend_ops.h"
 #include "lemon/backends/backend_utils.h"
@@ -242,7 +243,7 @@ void WhisperServer::load(const std::string& model_name,
         device_type_ = DEVICE_CPU;
     }
 
-    backend_manager_->install_backend(SPEC.recipe, whispercpp_backend);
+    backend_manager_->install_backend(whispercpp::spec()->recipe, whispercpp_backend);
 
     std::string model_path = model_info.resolved_path();
     if (model_path.empty()) {
@@ -258,7 +259,7 @@ void WhisperServer::load(const std::string& model_name,
     }
 
     // Get whisper-server executable path
-    std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, whispercpp_backend);
+    std::string exe_path = BackendUtils::get_backend_binary_path(*whispercpp::spec(), whispercpp_backend);
 
     // Choose a port
     port_ = choose_port();
@@ -733,7 +734,11 @@ class WhisperOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() { return &WhisperServer::SPEC; }
+const BackendSpec* spec() {
+    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
+                                   WhisperServer::get_install_params, /*split=*/false);
+    return &kSpec;
+}
 const BackendOps* ops() {
     static const WhisperOps kOps;
     return &kOps;
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index 384412753..e4b7122f8 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -3118,7 +3118,7 @@ void Server::handle_image_upscale(const httplib::Request& req, httplib::Response
         // as a separate request from generation, which lets the frontend show
         // the original and upscaled images side by side with independent timing.
         std::string exe_dir = lemon::backends::BackendUtils::get_backend_binary_path(
-            lemon::backends::SDServer::SPEC, backend);
+            *lemon::backends::try_get_spec_for_recipe("sd-cpp"), backend);
         std::filesystem::path cli_exe = std::filesystem::path(exe_dir).parent_path() /
 #ifdef _WIN32
             "sd-cli.exe";

From de6d3b1b9df5663a31646554458f827439c7181c Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 17:58:11 -0400
Subject: [PATCH 23/39] polish(backends): drop redundant recipe from descriptor
 support rows

The recipe was repeated on every support row (6x in llamacpp.h). Introduce
a recipe-free BackendSupport struct; the owning descriptor's recipe is filled
in by recipe_defs() when flattening to RecipeBackendDef.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_descriptor.h  |  2 +-
 .../include/lemon/backends/fastflowlm/fastflowlm.h   |  2 +-
 src/cpp/include/lemon/backends/kokoro/kokoro.h       |  4 ++--
 src/cpp/include/lemon/backends/llamacpp/llamacpp.h   | 12 ++++++------
 src/cpp/include/lemon/backends/moonshine/moonshine.h |  6 +++---
 src/cpp/include/lemon/backends/ryzenai/ryzenai.h     |  2 +-
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h         | 10 +++++-----
 src/cpp/include/lemon/backends/vllm/vllm.h           |  2 +-
 .../include/lemon/backends/whispercpp/whispercpp.h   | 10 +++++-----
 src/cpp/include/lemon/recipe_backend_def.h           | 10 ++++++++++
 src/cpp/server/system_info.cpp                       |  3 ++-
 11 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index d640ca279..efe938404 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -61,7 +61,7 @@ struct BackendDescriptor {
     bool dynamic_models       = false;  // true = class supplies models at runtime (cloud), not server_models.json
 
     std::vector<BackendOption>    options;                       // backend-specific knobs (common ones are automatic)
-    std::vector<RecipeBackendDef> support;                       // which OS / GPU families it runs on ({} = no local gating)
+    std::vector<BackendSupport>   support;                       // which OS / GPU families it runs on ({} = no local gating)
     std::vector<std::string>      default_labels;                // labels injected when a model omits them
     std::vector<std::string>      required_checkpoints{"main"};  // unconditional files; conditional ones checked in load()
 
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index b56c9e577..0c621f053 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -24,7 +24,7 @@ inline const BackendDescriptor descriptor = {
     /*dynamic_models*/  false,
     /*options*/ {},
     /*support*/ {
-        {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
+        {"npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
index 3ebb9efbd..b1e52eba4 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -24,8 +24,8 @@ inline const BackendDescriptor descriptor = {
     /*dynamic_models*/  false,
     /*options*/ {},
     /*support*/ {
-        {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
-        {"kokoro", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
     },
     /*default_labels*/  {},  // kokoro models carry "tts" explicitly in server_models.json
     /*required_checkpoints*/ {"main"},
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index 8e5435f9f..02ed728d7 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -31,14 +31,14 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"},
     },
     /*support*/ {
-        {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"},
-        {"llamacpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
-        {"llamacpp", "cuda", {"windows", "linux"},
+        {"system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+        {"cuda", {"windows", "linux"},
          {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"},
-        {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"},
-        {"llamacpp", "rocm", {"windows", "linux"},
+        {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"},
+        {"rocm", {"windows", "linux"},
          {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
-        {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"},
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h
index 5b8faafe2..2c9feed2b 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -23,9 +23,9 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to moonshine-server", ""},
     },
     /*support*/ {
-        {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"},
-        {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"},
-        {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"},
+        {"cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"},
+        {"cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"},
+        {"cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"},
     },
     /*default_labels*/  {"transcription", "realtime-transcription"},
     /*required_checkpoints*/ {"main"},
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index c290c4dd1..13ebb9a7c 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -24,7 +24,7 @@ inline const BackendDescriptor descriptor = {
     /*dynamic_models*/  false,
     /*options*/ {},
     /*support*/ {
-        {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
+        {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 57ce30cbd..8cf299a2c 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -36,13 +36,13 @@ inline const BackendDescriptor descriptor = {
         {"flow_shift", "", 0.0, "SIZE", "Flow shift", "Stable Diffusion Options"},
     },
     /*support*/ {
-        {"sd-cpp", "rocm", {"windows", "linux"},
+        {"rocm", {"windows", "linux"},
          {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
-        {"sd-cpp", "cuda", {"linux"},
+        {"cuda", {"linux"},
          {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"},
-        {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"},
-        {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
-        {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+        {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"},
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
     },
     /*default_labels*/  {"image"},
     /*required_checkpoints*/ {"main"},  // flux text_encoder+vae validated together in load()
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
index 84a596168..97c58c715 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -25,7 +25,7 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to vllm-server", "vLLM Options"},
     },
     /*support*/ {
-        {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"},
+        {"rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"},
     },
     /*default_labels*/  {},
     /*required_checkpoints*/ {"main"},
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index 8c4a29815..e62ee029c 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -29,12 +29,12 @@ inline const BackendDescriptor descriptor = {
          "Custom arguments to pass to whisper-server", "Whisper.cpp Options"},
     },
     /*support*/ {
-        {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
-        {"whispercpp", "rocm", {"windows", "linux"},
+        {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},
+        {"rocm", {"windows", "linux"},
          {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"},
-        {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"},
-        {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
-        {"whispercpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
+        {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"},
+        {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"},
+        {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"},
     },
     /*default_labels*/  {"transcription", "realtime-transcription"},
     /*required_checkpoints*/ {"main"},  // npu_cache validated in load() (npu variant only)
diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h
index 829ff0f78..ec0af9a9d 100644
--- a/src/cpp/include/lemon/recipe_backend_def.h
+++ b/src/cpp/include/lemon/recipe_backend_def.h
@@ -26,4 +26,14 @@ struct RecipeBackendDef {
     std::string device_summary = "";
 };
 
+// A backend descriptor's support row, without the recipe (it's always the
+// owning descriptor's recipe — assembling a RecipeBackendDef fills it in). Keeps
+// the descriptor literals from repeating their own recipe on every row.
+struct BackendSupport {
+    std::string backend;
+    std::set<std::string> supported_os;
+    DeviceConstraints devices;
+    std::string device_summary = "";
+};
+
 } // namespace lemon
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 4f5697b50..e30108fe7 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -427,7 +427,8 @@ static const std::vector<RecipeBackendDef>& recipe_defs() {
         std::vector<RecipeBackendDef> v;
         for (const auto* desc : lemon::backends::all_descriptors()) {
             for (const auto& row : desc->support) {
-                v.push_back(row);
+                // Fill in the recipe (the owning descriptor's) per support row.
+                v.push_back({desc->recipe, row.backend, row.supported_os, row.devices, row.device_summary});
             }
         }
         return v;

From 070fcbcbebf59014163d381ad3aefa425426995c Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 18:06:28 -0400
Subject: [PATCH 24/39] polish(backends): remove dead llamacpp-special branch
 in version lookup

The preceding generic block already handles backend_versions[recipe] for any
recipe, so the recipe=="llamacpp" branch was unreachable duplicate code.
Removing it also drops a hardcoded backend name from shared code.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/server/backend_manager.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp
index 2983d49ca..83b61e80c 100644
--- a/src/cpp/server/backend_manager.cpp
+++ b/src/cpp/server/backend_manager.cpp
@@ -64,15 +64,6 @@ std::string get_backend_runtime_version(const json& backend_versions,
         return backend_versions[recipe][runtime_key].get<std::string>();
     }
 
-    // Only fall back to llamacpp runtime version if the recipe is llamacpp
-    if (recipe == "llamacpp" &&
-        backend_versions.contains("llamacpp") &&
-        backend_versions["llamacpp"].is_object() &&
-        backend_versions["llamacpp"].contains(runtime_key) &&
-        backend_versions["llamacpp"][runtime_key].is_string()) {
-        return backend_versions["llamacpp"][runtime_key].get<std::string>();
-    }
-
     throw std::runtime_error("backend_versions.json is missing runtime version for: " + recipe + ":" + runtime_key);
 }
 

From 554ab6c567c5ed70e8149bc98fea960b1eca786f Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 18:13:20 -0400
Subject: [PATCH 25/39] polish(router): replace flm/cloud recipe-string checks
 with slot policy

Rename find_flm_server_by_type to find_coexisting_server_by_type, which now
matches on SlotPolicy::CoexistByType instead of recipe=="flm";
count_pinned_servers_by_type skips SlotPolicy::Unmetered recipes instead of
recipe=="cloud". router.cpp no longer compares recipes against backend-name
string literals (some log text and comments still say "FLM"); both behaviors
are unchanged (flm is the only CoexistByType backend, cloud the only
Unmetered one).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/router.h |  2 +-
 src/cpp/server/router.cpp      | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/cpp/include/lemon/router.h b/src/cpp/include/lemon/router.h
index e98a8b11d..a4f2d9629 100644
--- a/src/cpp/include/lemon/router.h
+++ b/src/cpp/include/lemon/router.h
@@ -167,7 +167,7 @@ class Router {
     bool has_npu_server() const;
     WrappedServer* find_npu_server() const;
     WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const;
-    WrappedServer* find_flm_server_by_type(ModelType type) const;
+    WrappedServer* find_coexisting_server_by_type(ModelType type) const;
     void evict_all_npu_servers();
     void evict_server(WrappedServer* server, int timeout_seconds = -1);
     void evict_all_servers();
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index 307c51294..514a9773e 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -225,10 +225,11 @@ WrappedServer* Router::find_npu_server_by_recipe(const std::string& recipe) cons
     return nullptr;
 }
 
-WrappedServer* Router::find_flm_server_by_type(ModelType type) const {
+WrappedServer* Router::find_coexisting_server_by_type(ModelType type) const {
     for (const auto& server : loaded_servers_) {
         if (server->is_backend_alive() &&
-            server->get_recipe_options().get_recipe() == "flm" &&
+            slot_policy_for_recipe(server->get_recipe_options().get_recipe()) ==
+                SlotPolicy::CoexistByType &&
             server->get_model_type() == type) {
             return server.get();
         }
@@ -455,7 +456,7 @@ void Router::load_model(const std::string& model_name,
                     evict_server(peer);
                 }
                 // 2. Evict FLM of the SAME model type (max 1 per type: 1 LLM, 1 transcription, 1 embed)
-                WrappedServer* same_type_flm = find_flm_server_by_type(model_type);
+                WrappedServer* same_type_flm = find_coexisting_server_by_type(model_type);
                 if (same_type_flm) {
                     LOG(INFO, "Router") << "FLM " << model_type_to_string(model_type)
                               << " slot occupied by: " << same_type_flm->get_model_name()
@@ -1449,7 +1450,8 @@ void Router::responses_stream(const std::string& request_body, httplib::DataSink
 int Router::count_pinned_servers_by_type(ModelType type) const {
     int count = 0;
     for (const auto& server : loaded_servers_) {
-        if (server->get_recipe_options().get_recipe() == "cloud") {
+        // Unmetered servers (cloud) never occupy a slot, so they don't count.
+        if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) {
             continue;
         }
         if (server->is_backend_alive() && server->get_model_type() == type && server->is_pinned()) {

From b4547cdbb5355950506a1efb24d73fb4b3dcdb4f Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 18:18:21 -0400
Subject: [PATCH 26/39] polish(backends): descriptor flag for self-managed
 downloads, not recipe==flm

Add BackendDescriptor::self_manages_downloads (true only for flm) and
ModelManager::backend_self_manages_downloads(). The two load-time
auto-download guards in server.cpp/ollama_api.cpp now consult it instead of
hardcoding recipe != "flm". flm is the only backend with the flag set, so
behavior is identical.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_descriptor.h    | 5 +++++
 src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 1 +
 src/cpp/include/lemon/model_manager.h                  | 4 ++++
 src/cpp/server/model_manager.cpp                       | 5 +++++
 src/cpp/server/ollama_api.cpp                          | 5 +++--
 src/cpp/server/server.cpp                              | 3 ++-
 6 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index efe938404..a5dc97603 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -89,6 +89,11 @@ struct BackendDescriptor {
     // default; system-managed packages (flm) accept any version >= expected.
     VersionPolicy version_policy = VersionPolicy::Exact;
 
+    // True if the backend pulls its own models on demand (flm self-pulls via its
+    // CLI) rather than being pre-downloaded from Hugging Face by the router. Such
+    // backends are skipped by the load-time auto-download path.
+    bool self_manages_downloads = false;
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index 0c621f053..d773f1bc4 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -36,6 +36,7 @@ inline const BackendDescriptor descriptor = {
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ false,
     /*version_policy*/  VersionPolicy::AtLeast,  // system-managed package
+    /*self_manages_downloads*/ true,  // flm pulls its own models via the flm CLI
 };
 
 }  // namespace fastflowlm
diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h
index f12cd3e9a..abdbeae71 100644
--- a/src/cpp/include/lemon/model_manager.h
+++ b/src/cpp/include/lemon/model_manager.h
@@ -216,6 +216,10 @@ class ModelManager {
     // Check if model is downloaded
     bool is_model_downloaded(const std::string& model_name);
 
+    // True if the model's backend pulls its own models on demand (e.g. flm) and
+    // so should be skipped by the router's load-time auto-download path.
+    bool backend_self_manages_downloads(const std::string& recipe) const;
+
     // Shared Hugging Face completeness check: true if all required checkpoints
     // are present and complete (per-backend file validation runs via ops). The
     // default BackendOps::is_downloaded delegates here for HF-backed backends.
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index abcbbe3b0..7ed3c737a 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -2277,6 +2277,11 @@ bool ModelManager::is_model_downloaded(const std::string& model_name) {
     return false;
 }
 
+bool ModelManager::backend_self_manages_downloads(const std::string& recipe) const {
+    const auto* desc = backends::descriptor_for(recipe);
+    return desc && desc->self_manages_downloads;
+}
+
 void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) {
     // The backend's ops own the download (shared HF engine by default; flm pulls
     // via the flm CLI; cloud is a no-op).
diff --git a/src/cpp/server/ollama_api.cpp b/src/cpp/server/ollama_api.cpp
index 7687caab4..0604a3935 100644
--- a/src/cpp/server/ollama_api.cpp
+++ b/src/cpp/server/ollama_api.cpp
@@ -238,8 +238,9 @@ void OllamaApi::auto_load_model(const std::string& model) {
 
     auto info = model_manager_->get_model_info(name);
 
-    // Download if not cached
-    if (info.recipe != "flm" && !model_manager_->is_model_downloaded(name)) {
+    // Download if not cached (backends that self-manage downloads pull on load)
+    if (!model_manager_->backend_self_manages_downloads(info.recipe) &&
+        !model_manager_->is_model_downloaded(name)) {
         LOG(INFO, "OllamaApi") << "Model not cached, downloading..." << std::endl;
         model_manager_->download_registered_model(info, true);
         info = model_manager_->get_model_info(name);
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index e4b7122f8..22af41fda 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -1575,7 +1575,8 @@ void Server::auto_load_model_if_needed(const std::string& requested_model) {
     //   - If model is NOT downloaded: Download it from HuggingFace
     //   - If model IS downloaded: Skip HuggingFace API check entirely (use cached version)
     // Only the /pull endpoint should check for updates (uses do_not_upgrade=false)
-    if (info.recipe != "flm" && !model_manager_->is_model_downloaded(requested_model)) {
+    if (!model_manager_->backend_self_manages_downloads(info.recipe) &&
+        !model_manager_->is_model_downloaded(requested_model)) {
         LOG(INFO, "Server") << "Model not cached, downloading from Hugging Face..." << std::endl;
         LOG(INFO, "Server") << "This may take several minutes for large models." << std::endl;
         model_manager_->download_registered_model(info, true);

From 71c1bb134a78127a5e1341a682178da22cfcb314 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 18:27:50 -0400
Subject: [PATCH 27/39] polish(backends): move local-import checkpoint scan
 into BackendOps

resolve_and_register_local_model() had a recipe if/else scanning the imported
directory for each backend's primary artifact (.gguf / .bin / genai_config.json
dir). Replace with BackendOps::find_imported_checkpoint(dir): default ""
registers the directory (sd-cpp/kokoro/moonshine); llamacpp reuses
resolve_gguf_path, whisper finds the .bin, ryzenai finds genai_config.json's
dir (and its resolve_checkpoint_path now reuses the same scan). server.cpp
holds no per-recipe import logic. Verified via local_import smoke tests for
llamacpp (ignores mmproj), whisper, and a default backend.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  |  9 +++
 .../backends/llamacpp/llamacpp_server.cpp     |  5 ++
 .../backends/ryzenai/ryzenai_server.cpp       | 10 +++-
 .../backends/whispercpp/whispercpp_server.cpp | 16 ++++++
 src/cpp/server/server.cpp                     | 56 ++-----------------
 5 files changed, 42 insertions(+), 54 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index cf796cb37..2daaee228 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -59,6 +59,15 @@ class BackendOps {
     virtual std::string resolve_checkpoint_path(const ModelInfo& info,
                                                 const CheckpointResolveContext& ctx) const;
 
+    // Find the primary checkpoint artifact inside a freshly-imported local
+    // directory (a local_import pull), e.g. the .gguf / .bin file or the
+    // genai_config.json directory. Returns the absolute path to register, or ""
+    // to register the directory itself. Default: "" (register the directory).
+    virtual std::string find_imported_checkpoint(const std::string& import_dir) const {
+        (void)import_dir;
+        return "";
+    }
+
     // Models supplied at runtime rather than from server_models.json (descriptor
     // dynamic_models = true). Default: none. cloud/flm override.
     virtual std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const {
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 9b75b6eaa..51e040893 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -792,6 +792,11 @@ class LlamaCppOps : public BackendOps {
         return BackendOps::resolve_checkpoint_path(info, ctx);
     }
 
+    std::string find_imported_checkpoint(const std::string& import_dir) const override {
+        // The primary artifact is the (non-mmproj) GGUF file.
+        return resolve_gguf_path(import_dir, "");
+    }
+
     std::string validate_checkpoint_file(const std::string& resolved_path) const override {
         // A .gguf file in the cache must start with the GGUF magic, else it's a
         // truncated/corrupt download and the model is not really present.
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index c175301f6..7bee8e46d 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -195,7 +195,13 @@ class RyzenAiOps : public BackendOps {
     std::string resolve_checkpoint_path(const ModelInfo&,
                                         const CheckpointResolveContext& ctx) const override {
         // RyzenAI models are a directory containing genai_config.json.
-        std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path);
+        std::string found = find_imported_checkpoint(ctx.model_cache_path);
+        return found.empty() ? ctx.model_cache_path : found;  // dir if not found
+    }
+
+    std::string find_imported_checkpoint(const std::string& import_dir) const override {
+        // The primary artifact is the directory holding genai_config.json.
+        std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir);
         if (hf_cache::exists(dir)) {
             for (const auto& entry :
                  std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) {
@@ -204,7 +210,7 @@ class RyzenAiOps : public BackendOps {
                 }
             }
         }
-        return ctx.model_cache_path;  // directory even if genai_config not found
+        return "";  // register the directory itself
     }
 };
 }  // namespace
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index dcccaf7ac..ef1d9a7e5 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -731,6 +731,22 @@ class WhisperOps : public BackendOps {
         }
         return BackendOps::resolve_checkpoint_path(info, ctx);
     }
+
+    std::string find_imported_checkpoint(const std::string& import_dir) const override {
+        // The primary artifact is the .bin model file.
+        std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir);
+        if (!hf_cache::exists(dir)) {
+            return "";
+        }
+        for (const auto& entry :
+             std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) {
+            if (entry.is_regular_file() &&
+                entry.path().filename().string().find(".bin") != std::string::npos) {
+                return lemon::utils::path_to_utf8(entry.path());
+            }
+        }
+        return "";
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index 22af41fda..30cd919d3 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -3958,60 +3958,12 @@ void Server::resolve_and_register_local_model(
     std::string recipe = model_data.value("recipe", "");
     bool vision = model_data.value("vision", false);
 
-    std::string resolved_checkpoint;
+    // The backend's ops locate its primary artifact within the imported
+    // directory (.gguf / .bin file, genai_config.json dir, …); "" means register
+    // the directory itself.
+    std::string resolved_checkpoint = backends::ops_for(recipe)->find_imported_checkpoint(dest_path);
     std::string resolved_mmproj;
 
-    // For RyzenAI LLM models, find genai_config.json
-    if (recipe == "ryzenai-llm") {
-        for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
-            if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") {
-                resolved_checkpoint = entry.path().parent_path().string();
-                break;
-            }
-        }
-        if (resolved_checkpoint.empty()) {
-            resolved_checkpoint = dest_path;
-        }
-    }
-    // For llamacpp models, find the GGUF file
-    else if (recipe == "llamacpp") {
-        std::string gguf_file_found;
-
-        // If no variant or variant not found, search for any .gguf file (excluding mmproj)
-        if (gguf_file_found.empty()) {
-            for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
-                if (entry.is_regular_file()) {
-                    std::string filename = entry.path().filename().string();
-                    std::string filename_lower = filename;
-                    std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower);
-
-                    if (filename_lower.find(".gguf") != std::string::npos &&
-                        filename_lower.find("mmproj") == std::string::npos) {
-                        gguf_file_found = entry.path().string();
-                        break;
-                    }
-                }
-            }
-        }
-
-        resolved_checkpoint = gguf_file_found.empty() ? dest_path : gguf_file_found;
-    }
-    // For whispercpp, find .bin file
-    else if (recipe == "whispercpp") {
-        for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {
-            if (entry.is_regular_file()) {
-                std::string filename = entry.path().filename().string();
-                if (filename.find(".bin") != std::string::npos) {
-                    resolved_checkpoint = entry.path().string();
-                    break;
-                }
-            }
-        }
-        if (resolved_checkpoint.empty()) {
-            resolved_checkpoint = dest_path;
-        }
-    }
-
     // Search for mmproj file if vision is enabled or mmproj hint provided
     if (vision || !mmproj.empty()) {
         for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) {

From 7d822208838238e63de3895d5d703fd041ce89b6 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 19:18:40 -0400
Subject: [PATCH 28/39] fix(llamacpp): parenthesize numeric_limits::max() for
 MSVC

On Windows the merged include chain pulls the windows.h max() macro into
this TU, turning std::numeric_limits<T>::max() into a syntax error (C2589).
Wrap the calls as (std::numeric_limits<T>::max)() so the macro cannot expand.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
index aeb4f2260..671900adb 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
@@ -60,7 +60,7 @@ static bool read_gguf_string(std::istream& in, std::string& value) {
 }
 
 static bool skip_bytes(std::istream& in, uint64_t bytes) {
-    if (bytes > static_cast<uint64_t>(std::numeric_limits<std::streamoff>::max())) return false;
+    if (bytes > static_cast<uint64_t>((std::numeric_limits<std::streamoff>::max)())) return false;
     in.seekg(static_cast<std::streamoff>(bytes), std::ios::cur);
     return static_cast<bool>(in);
 }
@@ -100,7 +100,7 @@ static bool read_gguf_integer_value(std::istream& in, uint32_t type, int64_t& va
         case 10: {
             uint64_t v = 0;
             if (!read_le(in, v)) return false;
-            if (v > static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) return false;
+            if (v > static_cast<uint64_t>((std::numeric_limits<int64_t>::max)())) return false;
             value = static_cast<int64_t>(v);
             return true;
         }
@@ -132,7 +132,7 @@ static bool skip_gguf_value(std::istream& in, uint32_t type) {
         if (elem_type == 9) return false;
         uint64_t elem_size = gguf_scalar_size(elem_type);
         if (elem_size == 0) return false;
-        if (count > std::numeric_limits<uint64_t>::max() / elem_size) return false;
+        if (count > (std::numeric_limits<uint64_t>::max)() / elem_size) return false;
         return skip_bytes(in, count * elem_size);
     }
 

From 6492260882f219155f895a4e784608669990c26f Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Mon, 22 Jun 2026 19:18:40 -0400
Subject: [PATCH 29/39] fix(flm): mark backend dynamic_models so its models
 register

flm models come from flm's model_list.json at runtime (0 entries in
server_models.json), but the descriptor had dynamic_models=false, so
build_cache skipped flm's ops->discover_models(). As a result, flm models
(e.g. llama3.2-1b-FLM) never registered and requests for them returned 404.
The build_cache comment already documents flm as a dynamic-discovery backend
alongside cloud; align the descriptor with that intent.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_descriptor.h    | 2 +-
 src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index a5dc97603..bd46c98c4 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -58,7 +58,7 @@ struct BackendDescriptor {
     SlotPolicy slot_policy    = SlotPolicy::Standard; // default; override effective_slot_policy() if variant-dependent
     bool selectable_backend   = false;  // auto-creates "<recipe>_backend" option + "--<recipe>" flag
     bool uses_ctx_size        = false;  // opt in to the shared ctx_size option
-    bool dynamic_models       = false;  // true = class supplies models at runtime (cloud), not server_models.json
+    bool dynamic_models       = false;  // true = ops supply models at runtime (cloud, flm), not server_models.json
 
     std::vector<BackendOption>    options;                       // backend-specific knobs (common ones are automatic)
     std::vector<BackendSupport>   support;                       // which OS / GPU families it runs on ({} = no local gating)
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index d773f1bc4..7b812bfb9 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -21,7 +21,7 @@ inline const BackendDescriptor descriptor = {
     /*slot_policy*/     SlotPolicy::CoexistByType,
     /*selectable_backend*/ false,
     /*uses_ctx_size*/   true,
-    /*dynamic_models*/  false,
+    /*dynamic_models*/  true,  // models come from flm's model_list.json, not server_models.json
     /*options*/ {},
     /*support*/ {
         {"npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"},

From f7ec14caf32ff196c7743a4c971a1619b8f8d132 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 15:15:08 -0400
Subject: [PATCH 30/39] polish(backends): move moonshine download
 file-selection into ops

model_manager's download path hardcoded recipe == "moonshine" to fetch a
variant directory of files. Add BackendOps::select_checkpoint_files (default:
nullopt, meaning the existing direct-file and GGUF selection paths apply) and
override it in MoonshineOps. The download path no longer names a backend.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  | 13 ++++++
 .../backends/moonshine/moonshine_server.cpp   | 42 ++++++++++++++++++-
 src/cpp/server/model_manager.cpp              | 26 ++++--------
 3 files changed, 63 insertions(+), 18 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index 2daaee228..49faa68b4 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <optional>
 #include <string>
 #include <vector>
 #include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback (server-side only)
@@ -68,6 +69,18 @@ class BackendOps {
         return "";
     }
 
+    // Select the repo-relative files to download for the main checkpoint
+    // `main_variant`, for backends whose artifact layout isn't a GGUF file.
+    // Return nullopt to use the default GGUF selection. (Direct single-file
+    // variants — .safetensors/.pth/.ckpt — are handled generically upstream.)
+    // moonshine overrides: its variant names a directory of files to fetch.
+    virtual std::optional<std::vector<std::string>> select_checkpoint_files(
+        const std::string& main_variant, const std::vector<std::string>& repo_files) const {
+        (void)main_variant;
+        (void)repo_files;
+        return std::nullopt;
+    }
+
     // Models supplied at runtime rather than from server_models.json (descriptor
     // dynamic_models = true). Default: none. cloud/flm override.
     virtual std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const {
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index 41545837e..a84506e35 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -8,8 +8,10 @@
 #include "lemon/utils/http_client.h"
 #include "lemon/utils/process_manager.h"
 #include "lemon/error_types.h"
+#include <cctype>
 #include <iostream>
 #include <filesystem>
+#include <optional>
 #include <set>
 #include <vector>
 #include <lemon/utils/aixlog.hpp>
@@ -370,12 +372,50 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
+namespace {
+class MoonshineOps : public BackendOps {
+public:
+    std::optional<std::vector<std::string>> select_checkpoint_files(
+        const std::string& main_variant, const std::vector<std::string>& repo_files) const override {
+        // A Moonshine variant names a directory (e.g. "medium-streaming-en/quantized");
+        // download every file under it.
+        std::string folder_prefix = main_variant;
+        if (!folder_prefix.empty() && folder_prefix.back() != '/') {
+            folder_prefix += "/";
+        }
+        auto starts_with_ci = [](const std::string& s, const std::string& p) {
+            if (s.size() < p.size()) return false;
+            for (size_t i = 0; i < p.size(); ++i) {
+                if (std::tolower(static_cast<unsigned char>(s[i])) !=
+                    std::tolower(static_cast<unsigned char>(p[i]))) {
+                    return false;
+                }
+            }
+            return true;
+        };
+        std::vector<std::string> files;
+        for (const auto& f : repo_files) {
+            if (starts_with_ci(f, folder_prefix)) {
+                files.push_back(f);
+            }
+        }
+        if (files.empty()) {
+            throw std::runtime_error("No Moonshine model files found in folder: " + main_variant);
+        }
+        return files;
+    }
+};
+}  // namespace
+
 const BackendSpec* spec() {
     static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
                                    MoonshineServer::get_install_params, /*split=*/false);
     return &kSpec;
 }
-const BackendOps* ops() { return default_backend_ops(); }
+const BackendOps* ops() {
+    static const MoonshineOps kOps;
+    return &kOps;
+}
 }  // namespace moonshine
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 7ed3c737a..824a1ecce 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -3535,7 +3535,11 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
         bool is_direct_file = ends_with(main_variant, ".safetensors") ||
                               ends_with(main_variant, ".pth") ||
                               ends_with(main_variant, ".ckpt");
-        bool is_moonshine = info.recipe == "moonshine";
+
+        // Backends with a bespoke artifact layout (moonshine = a directory of
+        // files) select their own download set; nullopt = the default paths.
+        auto backend_files =
+            backends::ops_for(info.recipe)->select_checkpoint_files(main_variant, repo_files);
 
         if (is_direct_file) {
             // For non-GGUF model files, download the specified file directly
@@ -3545,22 +3549,10 @@ void ModelManager::download_from_huggingface(const ModelInfo& info,
             } else {
                 throw std::runtime_error("Model file not found in repository: " + main_variant);
             }
-        } else if (is_moonshine) {
-            // Moonshine variant is a directory path (e.g., "medium-streaming-en/quantized")
-            // Download all files under that directory
-            std::string folder_prefix = main_variant;
-            if (!folder_prefix.empty() && folder_prefix.back() != '/') {
-                folder_prefix += "/";
-            }
-            for (const auto& file : repo_files) {
-                if (starts_with_ignore_case(file, folder_prefix)) {
-                    files_to_download[main_repo_id].push_back(file);
-                }
-            }
-            if (files_to_download[main_repo_id].empty()) {
-                throw std::runtime_error("No Moonshine model files found in folder: " + main_variant);
-            }
-            LOG(INFO, "ModelManager") << "Moonshine: downloading " << files_to_download[main_repo_id].size()
+        } else if (backend_files) {
+            files_to_download[main_repo_id] = std::move(*backend_files);
+            LOG(INFO, "ModelManager") << info.recipe << ": downloading "
+                                      << files_to_download[main_repo_id].size()
                                       << " files from " << main_variant << std::endl;
         } else {
             // GGUF model: Use identify_gguf_models to determine which files to download

From 6cc95524b1c8249059ef9a9d784744074b9a9850 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 15:20:29 -0400
Subject: [PATCH 31/39] polish(backends): move FLM unavailable-state machine
 into flm ops

system_info hardcoded a recipe == "flm" block to classify FLM's
supported-but-unavailable state (.deb/driver manual setup) and emit
troubleshoot links. Add BackendOps::classify_unavailable (default nullopt =
the generic installable/no-fetch path) and implement it in FlmOps. system_info
no longer names a backend in its install-state machine.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  | 23 ++++++++++
 .../backends/fastflowlm/fastflowlm_server.cpp | 37 ++++++++++++++++
 src/cpp/server/system_info.cpp                | 42 +++++--------------
 3 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index 49faa68b4..c03c111f3 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -137,6 +137,29 @@ class BackendOps {
         (void)backend;
         return {binary_found, ""};
     }
+
+    // The /system-info state for a backend variant that is supported but not
+    // currently available (install probe failed).
+    struct UnavailableState {
+        std::string state;    // "installable" | "update_required" | "action_required"
+        std::string message;  // shown to the user
+        std::string action;   // remediation (a URL or an install command)
+        bool attach_installed_version = false;  // surface the installed version too
+    };
+
+    // Classify a "supported but not available" backend variant for /system-info,
+    // given the install probe's error text and the generic install command the
+    // caller would otherwise use. Return nullopt to use the generic
+    // installable/no-fetch default. flm overrides: it is a system .deb + drivers
+    // needing manual setup, so its states and remediation links differ.
+    virtual std::optional<UnavailableState> classify_unavailable(
+        const std::string& backend, const std::string& install_error,
+        const std::string& default_install_command) const {
+        (void)backend;
+        (void)install_error;
+        (void)default_install_command;
+        return std::nullopt;
+    }
 };
 
 // Shared default ops instance for backends that override nothing.
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index fc5ecef9b..4a84ecd10 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <filesystem>
 #include <cstdlib>
+#include <optional>
 #include <thread>
 #include <chrono>
 #include <fstream>
@@ -524,6 +525,42 @@ class FlmOps : public BackendOps {
         }
         return {binary_found, ""};
     }
+
+    std::optional<UnavailableState> classify_unavailable(
+        const std::string&, const std::string& install_error,
+        const std::string& default_install_command) const override {
+        // FLM needs richer state to guide users through manual setup (installing
+        // the .deb, xrt drivers, etc.) rather than an automatic backend install.
+        bool is_not_installed = install_error.empty()
+                             || install_error.find("not installed") != std::string::npos
+                             || install_error.find("not found") != std::string::npos;
+        bool is_version_mismatch = install_error.find("requires") != std::string::npos;
+
+        UnavailableState s;
+        if (is_not_installed) {
+            s.state = "installable";
+        } else if (is_version_mismatch) {
+            s.state = "update_required";
+        } else {
+            s.state = "action_required";
+        }
+        s.message = install_error;
+        s.attach_installed_version = !is_not_installed;
+
+#ifdef __linux__
+        (void)default_install_command;
+        s.action = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot";
+#elif defined(_WIN32)
+        if (!is_not_installed && !is_version_mismatch) {
+            s.action = "Visit https://lemonade-server.ai/driver_install.html";
+        } else {
+            s.action = default_install_command;
+        }
+#else
+        s.action = default_install_command;
+#endif
+        return s;
+    }
 };
 }  // namespace
 
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index 3f7849a05..2c67a8f94 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -1316,41 +1316,21 @@ json SystemInfo::build_recipes_info(const json& devices) {
             backend["message"] = message;
             backend["action"] = "";
         } else if (!available) {
-            // FLM on Linux needs richer state to guide users through manual setup
-            // (installing .deb, xrt drivers, etc.)
-            if (def.recipe == "flm") {
-                bool is_not_installed = install_error.empty()
-                                     || install_error.find("not installed") != std::string::npos
-                                     || install_error.find("not found") != std::string::npos;
-                bool is_version_mismatch = install_error.find("requires") != std::string::npos;
-
-                if (is_not_installed) {
-                    backend["state"] = "installable";
-                } else if (is_version_mismatch) {
-                    backend["state"] = "update_required";
-                } else {
-                    backend["state"] = "action_required";
-                }
-                backend["message"] = install_error;
-
-                if (!is_not_installed) {
+            // Backends with bespoke unavailable-state guidance (flm: a system .deb
+            // + drivers needing manual setup) classify themselves; everyone else
+            // uses the generic installable/no-fetch default below.
+            const std::string default_install_command = get_install_command(def.recipe, def.backend);
+            if (auto st = backends::ops_for(def.recipe)->classify_unavailable(
+                    def.backend, install_error, default_install_command)) {
+                backend["state"] = st->state;
+                backend["message"] = st->message;
+                backend["action"] = st->action;
+                if (st->attach_installed_version) {
                     std::string installed_version = get_recipe_version(def.recipe, def.backend);
                     if (!installed_version.empty() && installed_version != "unknown") {
                         backend["version"] = installed_version;
                     }
                 }
-
-#ifdef __linux__
-                backend["action"] = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot";
-#elif defined(_WIN32)
-                if (!is_not_installed && !is_version_mismatch) {
-                    backend["action"] = "Visit https://lemonade-server.ai/driver_install.html";
-                } else {
-                    backend["action"] = get_install_command(def.recipe, def.backend);
-                }
-#else
-                backend["action"] = get_install_command(def.recipe, def.backend);
-#endif
             } else {
                 auto* cfg = RuntimeConfig::global();
                 bool no_fetch = cfg && cfg->no_fetch_executables();
@@ -1369,7 +1349,7 @@ json SystemInfo::build_recipes_info(const json& devices) {
                     && !install_error.empty() && needs_gfx1151_cwsr_fix()) {
                     backend["action"] = "Visit https://lemonade-server.ai/gfx1151_linux.html";
                 } else {
-                    backend["action"] = get_install_command(def.recipe, def.backend);
+                    backend["action"] = default_install_command;
                 }
             }
         } else {

From d0368daf30f631f1a8fb89cc2abf9668a03a1112 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 15:22:11 -0400
Subject: [PATCH 32/39] polish(cli): drive bench backend override from
 descriptor, not recipe==llamacpp

bench hardcoded recipe == "llamacpp" to send the llamacpp_backend override.
Use the CLI-safe descriptor registry: any recipe with selectable_backend gets
its <config_section>_backend override (llamacpp and vllm today).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/cli/bench.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/cpp/cli/bench.cpp b/src/cpp/cli/bench.cpp
index 6cf1b1a5b..280b26d33 100644
--- a/src/cpp/cli/bench.cpp
+++ b/src/cpp/cli/bench.cpp
@@ -1,5 +1,6 @@
 #include "lemon_cli/bench.h"
 #include "lemon_cli/lemonade_client.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include <CLI/CLI.hpp>
 #include <lemon/utils/path_utils.h>
 #include <algorithm>
@@ -406,9 +407,10 @@ bool load_model_for_backend(lemonade::LemonadeClient& client,
         request_body["model_name"] = model;
         request_body["save_options"] = false;
 
-        // For llamacpp recipe, pass backend override
-        if (recipe == "llamacpp") {
-            request_body["llamacpp_backend"] = backend;
+        // For recipes that expose a selectable backend, pass the override.
+        if (const auto* desc = lemon::backends::descriptor_for(recipe);
+            desc && desc->selectable_backend) {
+            request_body[desc->effective_config_section() + "_backend"] = backend;
         }
 
         if (ctx_size > 0) {

From e14fc2a1917677633d9d996c4acbd1d4511bd0d4 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 15:26:59 -0400
Subject: [PATCH 33/39] polish(backends): move GGUF :variant registration check
 into llamacpp ops

model_manager hardcoded actual_recipe == "llamacpp" to require a :variant on
GGUF checkpoints at registration. Add BackendOps::validate_registration_checkpoint
(default accept) and implement the GGUF rule in LlamaCppOps. Verified: a GGUF
checkpoint without :variant is still rejected; other recipes are unaffected.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_ops.h  |  8 ++++++++
 .../backends/llamacpp/llamacpp_server.cpp     | 14 ++++++++++++++
 src/cpp/server/model_manager.cpp              | 19 +++++--------------
 3 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index c03c111f3..047c6795d 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -69,6 +69,14 @@ class BackendOps {
         return "";
     }
 
+    // Validate a user-supplied checkpoint string when registering a new model.
+    // Return an error message if invalid, "" if acceptable. Default: accept.
+    // llamacpp requires a :variant on GGUF checkpoints.
+    virtual std::string validate_registration_checkpoint(const std::string& checkpoint) const {
+        (void)checkpoint;
+        return "";
+    }
+
     // Select the repo-relative files to download for the main checkpoint
     // `main_variant`, for backends whose artifact layout isn't a GGUF file.
     // Return nullopt to use the default GGUF selection. (Direct single-file
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index d822250fc..d441f998e 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -804,6 +804,20 @@ class LlamaCppOps : public BackendOps {
         return resolve_gguf_path(import_dir, "");
     }
 
+    std::string validate_registration_checkpoint(const std::string& checkpoint) const override {
+        // A GGUF checkpoint must name its quant via CHECKPOINT:VARIANT.
+        std::string lower = checkpoint;
+        std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower);
+        if (lower.find("gguf") != std::string::npos &&
+            checkpoint.find(':') == std::string::npos) {
+            return "You are required to provide a 'variant' in the checkpoint field when "
+                   "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. "
+                   "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
+                   "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf";
+        }
+        return "";
+    }
+
     std::string validate_checkpoint_file(const std::string& resolved_path) const override {
         // A .gguf file in the cache must start with the GGUF magic, else it's a
         // truncated/corrupt download and the model is not really present.
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 824a1ecce..c904bf55b 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -2681,20 +2681,11 @@ void ModelManager::download_model(const std::string& model_name,
                 );
             }
 
-            // Validate GGUF models (llamacpp recipe) require a variant
-            if (actual_recipe == "llamacpp") {
-                std::string checkpoint_lower = actual_checkpoint;
-                std::transform(checkpoint_lower.begin(), checkpoint_lower.end(),
-                              checkpoint_lower.begin(), ::tolower);
-                if (checkpoint_lower.find("gguf") != std::string::npos &&
-                    actual_checkpoint.find(':') == std::string::npos) {
-                    throw std::runtime_error(
-                        "You are required to provide a 'variant' in the checkpoint field when "
-                        "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. "
-                        "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or "
-                        "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"
-                    );
-                }
+            // Backend-specific checkpoint validation (llamacpp: GGUF needs :variant).
+            if (auto err = backends::ops_for(actual_recipe)->validate_registration_checkpoint(
+                    actual_checkpoint);
+                !err.empty()) {
+                throw std::runtime_error(err);
             }
 
             LOG(INFO, "ModelManager") << "Registering new user model: " << model_name << std::endl;

From 5daebd52d396f09ffe835fa9a08418c367a7bbc2 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 15:36:47 -0400
Subject: [PATCH 34/39] polish(backends): make_server<T> helper + collapse
 redundant namespaces

DRY pass across the backend folders:
- Add backends::make_server<T>(ctx) for the standard (log_level, model_manager,
  backend_manager) construction; the 6 plain create() bodies now call it instead
  of repeating the three context fields. cloud/ryzenai keep bespoke create().
- Each *_server.h closed and re-opened namespace lemon::backends just to nest the
  per-backend namespace; nest it inline instead (8 headers). ryzenai is left as-is
  (its legacy RyzenAIServer lives in namespace lemon, not lemon::backends).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_registry.h         | 8 ++++++++
 src/cpp/include/lemon/backends/cloud/cloud_server.h       | 5 -----
 .../include/lemon/backends/fastflowlm/fastflowlm_server.h | 5 -----
 src/cpp/include/lemon/backends/kokoro/kokoro_server.h     | 5 -----
 src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h | 5 -----
 .../include/lemon/backends/moonshine/moonshine_server.h   | 5 -----
 src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h       | 5 -----
 src/cpp/include/lemon/backends/vllm/vllm_server.h         | 5 -----
 .../include/lemon/backends/whispercpp/whispercpp_server.h | 5 -----
 src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp  | 2 +-
 src/cpp/server/backends/kokoro/kokoro_server.cpp          | 2 +-
 src/cpp/server/backends/llamacpp/llamacpp_server.cpp      | 2 +-
 src/cpp/server/backends/moonshine/moonshine_server.cpp    | 2 +-
 src/cpp/server/backends/sdcpp/sdcpp_server.cpp            | 2 +-
 src/cpp/server/backends/vllm/vllm_server.cpp              | 2 +-
 15 files changed, 14 insertions(+), 46 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
index 75709781d..868a0a584 100644
--- a/src/cpp/include/lemon/backends/backend_registry.h
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -30,6 +30,14 @@ struct BackendContext {
 
 using BackendCreateFn = std::unique_ptr<WrappedServer> (*)(const BackendContext&);
 
+// Convenience for the common create(): build server class T from the standard
+// (log_level, model_manager, backend_manager) context fields, owned through a
+// WrappedServer pointer. Backends needing extra arguments (cloud, ryzenai) build by hand.
+template <typename T>
+std::unique_ptr<WrappedServer> make_server(const BackendContext& ctx) {
+    return std::make_unique<T>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+}
+
 // Binds a descriptor (what the backend is) to its server class's create() (how
 // it runs). The generated factory registry supplies one per backend. This API is
 // server-only: it references server classes via create(), so it is compiled into
diff --git a/src/cpp/include/lemon/backends/cloud/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h
index 774c44300..a2dc3a6e9 100644
--- a/src/cpp/include/lemon/backends/cloud/cloud_server.h
+++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h
@@ -108,11 +108,6 @@ class CloudServer : public WrappedServer {
     bool loaded_ = false;
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace cloud {
 // Factory for the cloud backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
index e4bce74d8..bdcb1d88a 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h
@@ -61,11 +61,6 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public
     bool is_loaded_ = false;
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace fastflowlm {
 // Factory for the fastflowlm backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
index ec8e74844..6a9738252 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h
@@ -38,11 +38,6 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer {
     void audio_speech(const json& request, httplib::DataSink& sink) override;
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace kokoro {
 // Factory for the kokoro backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
index f1447c1ce..8a7a8405f 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h
@@ -49,11 +49,6 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR
     json tokenize(const json& request) override;
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace llamacpp {
 // Factory for the llamacpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
index 47ea21f58..e6535a34b 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h
@@ -48,11 +48,6 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi
     int tcp_port_ = 0;     // Port for line-delimited JSON streaming
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace moonshine {
 // Factory for the moonshine backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index 65c470332..185108afc 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -86,11 +86,6 @@ class SDServer : public WrappedServer, public IImageServer {
     std::string resolve_size(const nlohmann::json& request) const;
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace sdcpp {
 // Factory for the sdcpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h
index 0293fa811..1ac9438ed 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm_server.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h
@@ -41,11 +41,6 @@ class VLLMServer : public WrappedServer {
 
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace vllm {
 // Factory for the vllm backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
index 9ddd4f2af..dc97cbd9f 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h
@@ -67,11 +67,6 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer {
     std::filesystem::path temp_dir_;  // Directory for temporary audio files
 };
 
-} // namespace backends
-} // namespace lemon
-
-namespace lemon {
-namespace backends {
 namespace whispercpp {
 // Factory for the whispercpp backend (constructs the server class — lemond only).
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx);
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 4a84ecd10..25bbc444d 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -477,7 +477,7 @@ namespace backends {
 namespace fastflowlm {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    return std::make_unique<FastFlowLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+    return make_server<FastFlowLMServer>(ctx);
 }
 
 namespace {
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index aa8ad871e..154973501 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -216,7 +216,7 @@ namespace backends {
 namespace kokoro {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    return std::make_unique<KokoroServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+    return make_server<KokoroServer>(ctx);
 }
 
 
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index d441f998e..b1fd4ee83 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -667,7 +667,7 @@ namespace backends {
 namespace llamacpp {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    return std::make_unique<LlamaCppServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+    return make_server<LlamaCppServer>(ctx);
 }
 
 namespace {
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index a84506e35..ced8e716f 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -368,7 +368,7 @@ namespace backends {
 namespace moonshine {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    return std::make_unique<MoonshineServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+    return make_server<MoonshineServer>(ctx);
 }
 
 
diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index 98f19e5ea..4e23c046e 100644
--- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -754,7 +754,7 @@ namespace backends {
 namespace sdcpp {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    return std::make_unique<SDServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+    return make_server<SDServer>(ctx);
 }
 
 
diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
index 085cd0f2a..8bfaced2d 100644
--- a/src/cpp/server/backends/vllm/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -319,7 +319,7 @@ namespace backends {
 namespace vllm {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    return std::make_unique<VLLMServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
+    return make_server<VLLMServer>(ctx);
 }
 
 

From 43ec4f235ccbf6343b96376e51f8d09825b27ddf Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 16:51:34 -0400
Subject: [PATCH 35/39] polish(backends): make_spec<T>/single_ops<T> helpers
 shrink spec()/ops()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-backend spec()/ops() are the name-based adapter the CMake codegen binds
(<stem>::spec/ops), so the functions must exist — but their bodies were
repetitive. Add make_spec<T>(descriptor[, split]) (backend_utils.h, where
BackendSpec is complete) and single_ops<T>() (backend_registry.h, next to
make_server) so the 7 standard spec() and 7 custom ops() collapse to one line
each. ryzenai (install key != recipe) and cloud (no spec) keep bespoke spec();
sd-cpp/vllm keep default_backend_ops(). Pure refactor — registry binding,
71/71 endpoints, and all-backends-registered smoke unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/include/lemon/backends/backend_registry.h    |  9 +++++++++
 src/cpp/include/lemon/backends/backend_utils.h       | 12 ++++++++++++
 src/cpp/server/backends/cloud/cloud_server.cpp       |  5 +----
 .../server/backends/fastflowlm/fastflowlm_server.cpp | 11 ++---------
 src/cpp/server/backends/kokoro/kokoro_server.cpp     | 11 ++---------
 src/cpp/server/backends/llamacpp/llamacpp_server.cpp | 11 ++---------
 .../server/backends/moonshine/moonshine_server.cpp   | 11 ++---------
 src/cpp/server/backends/ryzenai/ryzenai_server.cpp   |  5 +----
 src/cpp/server/backends/sdcpp/sdcpp_server.cpp       |  6 +-----
 src/cpp/server/backends/vllm/vllm_server.cpp         |  6 +-----
 .../server/backends/whispercpp/whispercpp_server.cpp | 11 ++---------
 11 files changed, 35 insertions(+), 63 deletions(-)

diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
index 868a0a584..240ddf728 100644
--- a/src/cpp/include/lemon/backends/backend_registry.h
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -38,6 +38,15 @@ std::unique_ptr<WrappedServer> make_server(const BackendContext& ctx) {
     return std::make_unique<T>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
 }
 
+// Construct-on-first-use singleton for a stateless ops class T (default-constructible,
+// derived from BackendOps), giving the registry a stable pointer. Backends with no
+// custom behavior return default_backend_ops() from their ops() instead.
+template <typename T>
+const BackendOps* single_ops() {
+    static const T kOps;
+    return &kOps;
+}
+
 // Binds a descriptor (what the backend is) to its server class's create() (how
 // it runs). The generated factory registry supplies one per backend. This API is
 // server-only: it references server classes via create(), so it is compiled into
diff --git a/src/cpp/include/lemon/backends/backend_utils.h b/src/cpp/include/lemon/backends/backend_utils.h
index bfc37734d..bdbfe0869 100644
--- a/src/cpp/include/lemon/backends/backend_utils.h
+++ b/src/cpp/include/lemon/backends/backend_utils.h
@@ -5,6 +5,7 @@
 #include <filesystem>
 #include <utility>
 #include <vector>
+#include "lemon/backends/backend_descriptor.h"
 
 namespace fs = std::filesystem;
 
@@ -42,6 +43,17 @@ namespace lemon::backends {
         std::string log_name() const { return recipe + " Server"; };
     };
 
+    // Build a backend's install/download spec from its descriptor's recipe/binary
+    // and the server class T's get_install_params. The spec is a function-local
+    // static per T: `d` and `split` are read on the first call only, and later
+    // calls return that same pointer. Backends whose install key differs from the
+    // recipe (ryzenai) or with no installable artifact (cloud) build theirs by hand.
+    template <typename T>
+    const BackendSpec* make_spec(const BackendDescriptor& d, bool split = false) {
+        static const BackendSpec kSpec(d.recipe, d.binary, T::get_install_params, split);
+        return &kSpec;
+    }
+
     // Return the backend spec for recipes that use the standard BackendSpec flow.
     // Returns nullptr for recipes that require custom handling (e.g., flm) or unknown recipes.
     const BackendSpec* try_get_spec_for_recipe(const std::string& recipe);
diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
index f68bc0c92..3c61c213b 100644
--- a/src/cpp/server/backends/cloud/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -906,10 +906,7 @@ class CloudOps : public BackendOps {
 }  // namespace
 
 const BackendSpec* spec() { return nullptr; }
-const BackendOps* ops() {
-    static const CloudOps kOps;
-    return &kOps;
-}
+const BackendOps* ops() { return single_ops<CloudOps>(); }
 }  // namespace cloud
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 25bbc444d..050b5a961 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -564,15 +564,8 @@ class FlmOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   FastFlowLMServer::get_install_params, /*split=*/false);
-    return &kSpec;
-}
-const BackendOps* ops() {
-    static const FlmOps kOps;
-    return &kOps;
-}
+const BackendSpec* spec() { return make_spec<FastFlowLMServer>(descriptor); }
+const BackendOps* ops() { return single_ops<FlmOps>(); }
 }  // namespace fastflowlm
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index 154973501..95d46de6a 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -240,15 +240,8 @@ class KokoroOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   KokoroServer::get_install_params, /*split=*/false);
-    return &kSpec;
-}
-const BackendOps* ops() {
-    static const KokoroOps kOps;
-    return &kOps;
-}
+const BackendSpec* spec() { return make_spec<KokoroServer>(descriptor); }
+const BackendOps* ops() { return single_ops<KokoroOps>(); }
 }  // namespace kokoro
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index 6bdb0ae98..eb766e798 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -861,15 +861,8 @@ class LlamaCppOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   LlamaCppServer::get_install_params, /*split=*/false);
-    return &kSpec;
-}
-const BackendOps* ops() {
-    static const LlamaCppOps kOps;
-    return &kOps;
-}
+const BackendSpec* spec() { return make_spec<LlamaCppServer>(descriptor); }
+const BackendOps* ops() { return single_ops<LlamaCppOps>(); }
 }  // namespace llamacpp
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index ced8e716f..bcf263d67 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -407,15 +407,8 @@ class MoonshineOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   MoonshineServer::get_install_params, /*split=*/false);
-    return &kSpec;
-}
-const BackendOps* ops() {
-    static const MoonshineOps kOps;
-    return &kOps;
-}
+const BackendSpec* spec() { return make_spec<MoonshineServer>(descriptor); }
+const BackendOps* ops() { return single_ops<MoonshineOps>(); }
 }  // namespace moonshine
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index 7bee8e46d..69e1eed16 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -220,10 +220,7 @@ const BackendSpec* spec() {
                                    ::lemon::RyzenAIServer::get_install_params, /*split=*/false);
     return &kSpec;
 }
-const BackendOps* ops() {
-    static const RyzenAiOps kOps;
-    return &kOps;
-}
+const BackendOps* ops() { return single_ops<RyzenAiOps>(); }
 }  // namespace ryzenai
 }  // namespace backends
 }  // namespace lemon
diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index 4e23c046e..a4b1787f9 100644
--- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -758,11 +758,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   SDServer::get_install_params, /*split=*/false);
-    return &kSpec;
-}
+const BackendSpec* spec() { return make_spec<SDServer>(descriptor); }
 const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace sdcpp
 }  // namespace backends
diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
index 8bfaced2d..60a79c95f 100644
--- a/src/cpp/server/backends/vllm/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -323,11 +323,7 @@ std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
 }
 
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   VLLMServer::get_install_params, /*split=*/true);
-    return &kSpec;
-}
+const BackendSpec* spec() { return make_spec<VLLMServer>(descriptor, /*split=*/true); }
 const BackendOps* ops() { return default_backend_ops(); }
 }  // namespace vllm
 }  // namespace backends
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index bd245b9e5..d1222e551 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -763,15 +763,8 @@ class WhisperOps : public BackendOps {
 };
 }  // namespace
 
-const BackendSpec* spec() {
-    static const BackendSpec kSpec(descriptor.recipe, descriptor.binary,
-                                   WhisperServer::get_install_params, /*split=*/false);
-    return &kSpec;
-}
-const BackendOps* ops() {
-    static const WhisperOps kOps;
-    return &kOps;
-}
+const BackendSpec* spec() { return make_spec<WhisperServer>(descriptor); }
+const BackendOps* ops() { return single_ops<WhisperOps>(); }
 }  // namespace whispercpp
 }  // namespace backends
 }  // namespace lemon

From 8f6f36e1e337adeedf5307562c248529ebc99eb9 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Thu, 25 Jun 2026 16:51:34 -0400
Subject: [PATCH 36/39] docs(nav): add Adding a Backend + Backends Reference to
 mkdocs nav

The two backend dev docs added by this work (dev/adding-a-backend.md and the
generated dev/backends-reference.md) were not wired into the Development nav.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 mkdocs.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mkdocs.yml b/mkdocs.yml
index 18201bba3..73ecc9981 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -83,6 +83,8 @@ nav:
     - Contribute: dev/contribute.md
     - Documentation Guide: dev/documentation.md
     - C++: dev/getting-started.md
+    - Adding a Backend: dev/adding-a-backend.md
+    - Backends Reference: dev/backends-reference.md
     - Desktop App: dev/app.md
     - Web UI: dev/web-ui.md
     - Lemonade Omni Models: dev/lemonade-omni.md

From 2ac10fdfdf40f55802136ef152b905e094c71b83 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 26 Jun 2026 10:23:35 -0400
Subject: [PATCH 37/39] feat(config): generate defaults.json from descriptors
 via /internal/config/defaults

Per-recipe config defaults are now declared in each backend descriptor
(takes_args / arg_variants / bin_variants / config_extra -> config_defaults())
instead of hand-maintained blocks in defaults.json. The committed
resources/defaults.json stays fully populated (so it remains the discoverable
reference for factory defaults) but is now generated:

- New GET /internal/config/defaults emits the canonical default config
  (ConfigFile::base_defaults(): global keys + descriptor-derived per-recipe
  sections, host/deployment-independent). Documented alongside /internal/config.
- gen_backend_docs.py -> gen_backend_boilerplate.py, which mirrors that endpoint
  verbatim into resources/defaults.json (whole-file) in addition to the doc
  regions. The existing CI --check now also fails if defaults.json drifts.

config_file keeps reading defaults.json at runtime; base_defaults() re-seeds the
descriptor blocks so the descriptor stays authoritative even if the file lags.
Verified: a fresh config.json reproduces every prior default; endpoints 71/71;
generator --check clean; black clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/workflows/docs_and_style.yml          |  2 +-
 docs/dev/backends-reference.md                |  2 +-
 docs/dev/getting-started.md                   | 10 ++
 docs/embeddable/runtime.md                    | 18 ++++
 ...end_docs.py => gen_backend_boilerplate.py} | 55 ++++++++---
 .../lemon/backends/backend_descriptor.h       | 25 +++++
 .../lemon/backends/fastflowlm/fastflowlm.h    |  4 +
 .../include/lemon/backends/kokoro/kokoro.h    |  9 ++
 .../lemon/backends/llamacpp/llamacpp.h        |  6 ++
 .../lemon/backends/moonshine/moonshine.h      | 10 ++
 .../include/lemon/backends/ryzenai/ryzenai.h  |  9 ++
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h  |  6 ++
 src/cpp/include/lemon/backends/vllm/vllm.h    |  6 ++
 .../lemon/backends/whispercpp/whispercpp.h    |  9 ++
 src/cpp/include/lemon/config_file.h           | 11 ++-
 src/cpp/include/lemon/server.h                |  1 +
 src/cpp/resources/defaults.json               | 92 +++++++++----------
 src/cpp/server/config_file.cpp                | 21 ++++-
 src/cpp/server/server.cpp                     | 17 ++++
 19 files changed, 249 insertions(+), 64 deletions(-)
 rename docs/tools/{gen_backend_docs.py => gen_backend_boilerplate.py} (90%)

diff --git a/.github/workflows/docs_and_style.yml b/.github/workflows/docs_and_style.yml
index 35aa4cf50..3354f50fd 100644
--- a/.github/workflows/docs_and_style.yml
+++ b/.github/workflows/docs_and_style.yml
@@ -39,7 +39,7 @@ jobs:
       - name: Build lemond
         run: cmake --build --preset default --target lemond
       - name: Check backend reference docs are up to date
-        run: python3 docs/tools/gen_backend_docs.py --check
+        run: python3 docs/tools/gen_backend_boilerplate.py --check
 
   markdown-link-check:
     runs-on: ubuntu-latest
diff --git a/docs/dev/backends-reference.md b/docs/dev/backends-reference.md
index 5faad0189..3993fe8fe 100644
--- a/docs/dev/backends-reference.md
+++ b/docs/dev/backends-reference.md
@@ -1,6 +1,6 @@
 # Backend reference
 
-<!-- This file is generated by docs/tools/gen_backend_docs.py from the C++ backend
+<!-- This file is generated by docs/tools/gen_backend_boilerplate.py from the C++ backend
 descriptors. Do not edit the regions between the GENERATED markers by hand; run
 the generator instead. Prose outside the markers is preserved. -->
 
diff --git a/docs/dev/getting-started.md b/docs/dev/getting-started.md
index b8e487c4c..ef1769059 100644
--- a/docs/dev/getting-started.md
+++ b/docs/dev/getting-started.md
@@ -625,6 +625,7 @@ Internal endpoints accept connections from any address, so first-party clients o
 | `POST` | `/internal/shutdown` | Unloads all models and shuts down the server |
 | `POST` | `/internal/set` | Unified config setter (see below) |
 | `GET`  | `/internal/config` | Returns the full runtime config snapshot |
+| `GET`  | `/internal/config/defaults` | Returns the canonical default config (factory defaults) |
 | `POST` | `/internal/cleanup-cache` | Cleans up orphaned files in the Hugging Face cache |
 | `POST` | `/internal/pin` | Pin or unpin a loaded model |
 
@@ -676,6 +677,15 @@ Returns the full runtime configuration as a flat JSON object containing all serv
 curl http://localhost:13305/internal/config
 ```
 
+#### `GET /internal/config/defaults`
+
+Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or deployment overrides. The per-recipe sections come from the backend descriptors (each descriptor's `config_defaults()`), making this the authoritative source of the factory defaults. `docs/tools/gen_backend_boilerplate.py` reads this endpoint to regenerate the committed `src/cpp/resources/defaults.json`, and a CI `--check` fails if that file drifts from the descriptors.
+
+**Example:**
+```bash
+curl http://localhost:13305/internal/config/defaults
+```
+
 ### Dependencies
 
 All dependencies are automatically fetched by CMake via FetchContent:
diff --git a/docs/embeddable/runtime.md b/docs/embeddable/runtime.md
index a50b8c4af..983038e95 100644
--- a/docs/embeddable/runtime.md
+++ b/docs/embeddable/runtime.md
@@ -114,6 +114,7 @@ Your app can manage its `lemond` instance at runtime by using `/internal` endpoi
 |--------|------|-------------|
 | `POST` | `/internal/set` | Unified config setter (see below) |
 | `GET`  | `/internal/config` | Returns the full runtime config snapshot |
+| `GET`  | `/internal/config/defaults` | Returns the canonical default config (factory defaults) |
 | `POST` | `/internal/pin` | Pin or unpin a loaded model (prevents auto-eviction) |
 
 The settings defined in `config.json` can all be changed at runtime without restarting `lemond` with the `/internal/set` endpoint. See the [Configuration Guide](../guide/configuration/README.md) for details on all settings.
@@ -137,6 +138,23 @@ Returns the full runtime configuration as a flat JSON object containing all serv
     curl http://localhost:8000/internal/config
     ```
 
+#### `GET /internal/config/defaults`
+
+Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or any deployment override. The per-recipe sections are derived from the backend descriptors (each descriptor's `config_defaults()`), so this endpoint is the authoritative source of the factory defaults. `docs/tools/gen_backend_boilerplate.py` reads it to regenerate `src/cpp/resources/defaults.json`.
+
+**Example:**
+=== "Windows (cmd.exe)"
+
+    ```cmd
+    curl http://localhost:8000/internal/config/defaults
+    ```
+
+=== "Linux (bash)"
+
+    ```bash
+    curl http://localhost:8000/internal/config/defaults
+    ```
+
 #### `POST /internal/set`
 
 Accepts a JSON object with one or more keys to update atomically. Returns `{"status":"success","updated":{...}}` on success, or `400` with an error message on validation failure.
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_boilerplate.py
similarity index 90%
rename from docs/tools/gen_backend_docs.py
rename to docs/tools/gen_backend_boilerplate.py
index 8e5bf3133..b4e8ac8d9 100644
--- a/docs/tools/gen_backend_docs.py
+++ b/docs/tools/gen_backend_boilerplate.py
@@ -1,19 +1,24 @@
 #!/usr/bin/env python3
-"""Generate backend reference docs from the self-describing backend descriptors.
+"""Generate backend boilerplate (docs + config defaults) from the descriptors.
 
 The C++ backend descriptors (src/cpp/include/lemon/backends/<stem>/<stem>.h) are
 the single source of truth for what each backend is. This script boots a `lemond`
-server, reads the descriptor-generated ``/system-info`` ``recipes`` object and
-``server_models.json``, and rewrites the marker-delimited regions of the target
-doc(s). A CI step runs it with ``--check`` and fails if the committed docs drift.
+server and regenerates the committed artifacts that would otherwise be
+hand-maintained:
 
-Usage:
-    python docs/tools/gen_backend_docs.py [--lemond PATH] [--check]
+  * Marker-delimited regions of the backend reference docs, from
+    ``/system-info`` ``recipes`` + ``server_models.json``.
+  * The whole of ``src/cpp/resources/defaults.json``, mirrored verbatim from
+    ``/internal/config/defaults`` (its per-recipe blocks come from each
+    descriptor's ``config_defaults()``).
+
+A CI step runs it with ``--check`` and fails if any committed artifact drifts.
 
-``--check`` regenerates in memory and exits non-zero if the on-disk docs differ,
-without modifying them.
+Usage:
+    python docs/tools/gen_backend_boilerplate.py [--lemond PATH] [--check]
 
-Only the regions between::
+``--check`` regenerates in memory and exits non-zero if any on-disk artifact
+differs, without modifying it. For the docs, only the regions between::
 
     <!-- BEGIN GENERATED: <id> -->
     <!-- END GENERATED: <id> -->
@@ -108,6 +113,12 @@ def system_info(self) -> dict:
     def config(self) -> dict:
         return json.loads(self._get("/internal/config", timeout=10))
 
+    def config_defaults_text(self) -> str:
+        # Server's own serialization of the canonical default config, with a trailing
+        # newline ensured, so the committed resources/defaults.json is byte-stable.
+        text = self._get("/internal/config/defaults", timeout=10).decode("utf-8")
+        return text if text.endswith("\n") else text + "\n"
+
 
 def md_escape(text: str) -> str:
     return str(text).replace("|", "\\|")
@@ -429,7 +440,7 @@ def render_models(recipes: dict) -> str:
 
 DEFAULT_TEMPLATE = """# Backend reference
 
-<!-- This file is generated by docs/tools/gen_backend_docs.py from the C++ backend
+<!-- This file is generated by docs/tools/gen_backend_boilerplate.py from the C++ backend
 descriptors. Do not edit the regions between the GENERATED markers by hand; run
 the generator instead. Prose outside the markers is preserved. -->
 
@@ -504,6 +515,7 @@ def main() -> int:
     with Lemond(binary) as server:
         info = server.system_info()
         config = server.config()
+        defaults_text = server.config_defaults_text()
     recipes = info.get("recipes", {})
     if not recipes:
         sys.exit("/system-info returned no recipes")
@@ -562,7 +574,24 @@ def main() -> int:
         },
     }
 
+    # Whole-file generated artifacts (not marker-delimited): resources/defaults.json
+    # is the canonical default config, mirrored verbatim from GET
+    # /internal/config/defaults (per-recipe blocks come from the descriptors).
+    raw_targets: dict = {
+        REPO_ROOT / "src" / "cpp" / "resources" / "defaults.json": defaults_text,
+    }
+
     stale = []
+    for path, content in raw_targets.items():
+        rel = path.relative_to(REPO_ROOT)
+        if args.check:
+            if not path.exists() or path.read_text() != content:
+                stale.append(str(rel))
+        else:
+            path.parent.mkdir(parents=True, exist_ok=True)
+            path.write_text(content)
+            print(f"Wrote {rel}")
+
     for path, spec in targets.items():
         rel = path.relative_to(REPO_ROOT)
         current = path.read_text() if path.exists() else spec.get("template", "")
@@ -580,11 +609,11 @@ def main() -> int:
     if args.check:
         if stale:
             sys.exit(
-                "Stale generated docs: "
+                "Stale generated files: "
                 + ", ".join(stale)
-                + "\nRun: python docs/tools/gen_backend_docs.py"
+                + "\nRun: python docs/tools/gen_backend_boilerplate.py"
             )
-        print("All generated docs are up to date.")
+        print("All generated files are up to date.")
     return 0
 
 
diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index bd46c98c4..03ca71e69 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -94,10 +94,35 @@ struct BackendDescriptor {
     // backends are skipped by the load-time auto-download path.
     bool self_manages_downloads = false;
 
+    // --- config.json per-recipe defaults schema ---
+    // The backend's section of config.json is derived from these fields, so a new
+    // backend's defaults live in its descriptor instead of a hand-maintained
+    // defaults.json block. (selectable_backend additionally emits `backend: "auto"`.)
+    bool takes_args = false;                       // emits `args: ""`
+    std::vector<std::string> arg_variants;         // each emits `<variant>_args: ""`
+    std::vector<std::string> bin_variants;         // each emits `<variant>_bin: "builtin"`
+    nlohmann::json config_extra = nlohmann::json::object();  // fixed extras (e.g. prefer_system, image defaults)
+
     // The config.json section name for this backend, falling back to the recipe.
     std::string effective_config_section() const {
         return config_section.empty() ? recipe : config_section;
     }
+
+    // Build this backend's config.json default section from the schema above;
+    // config_extra is applied last, so its keys win. Empty object = no configurable section.
+    nlohmann::json config_defaults() const {
+        nlohmann::json block = nlohmann::json::object();
+        if (selectable_backend) block["backend"] = "auto";
+        if (takes_args) block["args"] = "";
+        for (const auto& v : arg_variants) block[v + "_args"] = "";
+        for (const auto& v : bin_variants) block[v + "_bin"] = "builtin";
+        if (config_extra.is_object()) {
+            for (auto it = config_extra.begin(); it != config_extra.end(); ++it) {
+                block[it.key()] = it.value();
+            }
+        }
+        return block;
+    }
 };
 
 } // namespace lemon
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index 7b812bfb9..24049ab31 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -37,6 +37,10 @@ inline const BackendDescriptor descriptor = {
     /*rocm_requires_cwsr_fix*/ false,
     /*version_policy*/  VersionPolicy::AtLeast,  // system-managed package
     /*self_manages_downloads*/ true,  // flm pulls its own models via the flm CLI
+    /*takes_args*/      true,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {},
+    /*config_extra*/    nlohmann::json::object(),
 };
 
 }  // namespace fastflowlm
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
index b1e52eba4..5f3fbf97c 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -33,6 +33,15 @@ inline const BackendDescriptor descriptor = {
     /*experimental*/    false,
     /*web_display_name*/ "",
     /*web_priority*/    6,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      false,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {"cpu"},
+    /*config_extra*/    nlohmann::json::object(),
 };
 
 }  // namespace kokoro
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index 02ed728d7..7c58a73f3 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -49,6 +49,12 @@ inline const BackendDescriptor descriptor = {
     /*rocm_channels*/   {"stable", "nightly"},
     /*exposes_prometheus_metrics*/ true,
     /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"rocm", "vulkan", "cpu"},
+    /*bin_variants*/    {"rocm", "vulkan", "cuda", "cpu"},
+    /*config_extra*/    {{"prefer_system", true}},
 };
 
 }  // namespace llamacpp
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h
index 2c9feed2b..ae7313714 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -32,6 +32,16 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Speech-to-text",
     /*experimental*/    false,
     /*web_display_name*/ "",
+    /*web_priority*/    0,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"cpu"},
+    /*bin_variants*/    {"cpu"},
+    /*config_extra*/    nlohmann::json::object(),
 };
 
 }  // namespace moonshine
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index 13ebb9a7c..dbc15d7f3 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -32,6 +32,15 @@ inline const BackendDescriptor descriptor = {
     /*experimental*/    false,
     /*web_display_name*/ "Ryzen AI SW NPU",
     /*web_priority*/    2,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      false,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {"server"},
+    /*config_extra*/    nlohmann::json::object(),
 };
 
 }  // namespace ryzenai
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 8cf299a2c..986d26fbe 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -53,6 +53,12 @@ inline const BackendDescriptor descriptor = {
     /*rocm_channels*/   {"stable"},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"cpu", "rocm", "vulkan"},
+    /*bin_variants*/    {"cpu", "rocm", "vulkan"},
+    /*config_extra*/    {{"steps", 20}, {"cfg_scale", 7.0}, {"width", 512}, {"height", 512}},
 };
 
 }  // namespace sdcpp
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
index 97c58c715..8984e15b3 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -36,6 +36,12 @@ inline const BackendDescriptor descriptor = {
     /*rocm_channels*/   {},  // single rocm artifact, no stable/nightly channels
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ true,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {},
+    /*bin_variants*/    {},
+    /*config_extra*/    nlohmann::json::object(),
 };
 
 }  // namespace vllm
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index e62ee029c..9c38b66d5 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -42,6 +42,15 @@ inline const BackendDescriptor descriptor = {
     /*experimental*/    false,
     /*web_display_name*/ "whisper.cpp",
     /*web_priority*/    4,
+    /*rocm_channels*/   {},
+    /*exposes_prometheus_metrics*/ false,
+    /*rocm_requires_cwsr_fix*/ false,
+    /*version_policy*/  VersionPolicy::Exact,
+    /*self_manages_downloads*/ false,
+    /*takes_args*/      true,
+    /*arg_variants*/    {"cpu", "npu"},
+    /*bin_variants*/    {"cpu", "npu"},
+    /*config_extra*/    nlohmann::json::object(),
 };
 
 }  // namespace whispercpp
diff --git a/src/cpp/include/lemon/config_file.h b/src/cpp/include/lemon/config_file.h
index ec56c17fb..8c46e125f 100644
--- a/src/cpp/include/lemon/config_file.h
+++ b/src/cpp/include/lemon/config_file.h
@@ -84,8 +84,15 @@ static inline bool config_migrate(json& config,
 /// Manages reading and writing config.json in the lemonade cache dir.
 class ConfigFile {
 public:
-    /// Returns the full default config loaded from installed resource JSON.
-    /// On Linux, an optional distro override at /usr/share/lemonade/defaults.json
+    /// The canonical default config: resources/defaults.json (global keys) with
+    /// each backend's per-recipe section seeded from its descriptor. Host- and
+    /// deployment-independent, so it is reproducible — this is what
+    /// GET /internal/config/defaults emits and gen_backend_boilerplate.py writes
+    /// back into resources/defaults.json.
+    static json base_defaults();
+
+    /// base_defaults() plus deployment overrides. On Linux, an optional distro
+    /// override at /usr/share/lemonade/defaults.json (and LEMONADE_DEFAULTS_PATH)
     /// is merged on top when present.
     static json get_defaults();
 
diff --git a/src/cpp/include/lemon/server.h b/src/cpp/include/lemon/server.h
index d481f5b80..3e39357d3 100644
--- a/src/cpp/include/lemon/server.h
+++ b/src/cpp/include/lemon/server.h
@@ -73,6 +73,7 @@ class Server {
     // Unified config endpoints
     void handle_config_set(const httplib::Request& req, httplib::Response& res);
     void handle_config_get(const httplib::Request& req, httplib::Response& res);
+    void handle_config_defaults_get(const httplib::Request& req, httplib::Response& res);
 
     // Side-effect callback for RuntimeConfig::set(). Receives a nested JSON
     // mirroring the input shape, containing only entries that actually changed.
diff --git a/src/cpp/resources/defaults.json b/src/cpp/resources/defaults.json
index f79396266..ab86404dd 100644
--- a/src/cpp/resources/defaults.json
+++ b/src/cpp/resources/defaults.json
@@ -1,71 +1,71 @@
 {
+  "cloud_providers": [],
   "config_version": 2,
-  "port": 13305,
-  "host": "localhost",
-  "websocket_port": "auto",
-  "log_level": "info",
-  "global_timeout": 600,
-  "max_loaded_models": 1,
-  "no_broadcast": false,
-  "extra_models_dir": "",
-  "models_dir": "auto",
   "ctx_size": -1,
-  "offline": false,
-  "no_fetch_executables": false,
   "disable_model_filtering": false,
   "enable_dgpu_gtt": false,
-  "rocm_channel": "stable",
+  "extra_models_dir": "",
+  "flm": {
+    "args": ""
+  },
+  "global_timeout": 600,
+  "host": "localhost",
+  "kokoro": {
+    "cpu_bin": "builtin"
+  },
   "llamacpp": {
-    "backend": "auto",
     "args": "",
-    "rocm_args": "",
-    "vulkan_args": "",
+    "backend": "auto",
     "cpu_args": "",
+    "cpu_bin": "builtin",
+    "cuda_bin": "builtin",
     "prefer_system": true,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin",
-    "cuda_bin": "builtin",
-    "cpu_bin": "builtin"
+    "vulkan_args": "",
+    "vulkan_bin": "builtin"
   },
-  "whispercpp": {
-    "backend": "auto",
+  "log_level": "info",
+  "max_loaded_models": 1,
+  "models_dir": "auto",
+  "moonshine": {
     "args": "",
     "cpu_args": "",
-    "npu_args": "",
-    "cpu_bin": "builtin",
-    "npu_bin": "builtin"
+    "cpu_bin": "builtin"
+  },
+  "no_broadcast": false,
+  "no_fetch_executables": false,
+  "offline": false,
+  "port": 13305,
+  "rocm_channel": "stable",
+  "ryzenai": {
+    "server_bin": "builtin"
   },
   "sdcpp": {
-    "backend": "auto",
     "args": "",
-    "cpu_args": "",
-    "rocm_args": "",
-    "vulkan_args": "",
-    "steps": 20,
+    "backend": "auto",
     "cfg_scale": 7.0,
-    "width": 512,
-    "height": 512,
+    "cpu_args": "",
     "cpu_bin": "builtin",
+    "height": 512,
+    "rocm_args": "",
     "rocm_bin": "builtin",
-    "vulkan_bin": "builtin"
-  },
-  "flm": {
-    "args": ""
+    "steps": 20,
+    "vulkan_args": "",
+    "vulkan_bin": "builtin",
+    "width": 512
   },
   "vllm": {
-    "backend": "auto",
-    "args": ""
-  },
-  "ryzenai": {
-    "server_bin": "builtin"
-  },
-  "kokoro": {
-    "cpu_bin": "builtin"
+    "args": "",
+    "backend": "auto"
   },
-  "moonshine": {
+  "websocket_port": "auto",
+  "whispercpp": {
     "args": "",
+    "backend": "auto",
     "cpu_args": "",
-    "cpu_bin": "builtin"
-  },
-  "cloud_providers": []
+    "cpu_bin": "builtin",
+    "npu_args": "",
+    "npu_bin": "builtin"
+  }
 }
diff --git a/src/cpp/server/config_file.cpp b/src/cpp/server/config_file.cpp
index d8f6955af..2787c0167 100644
--- a/src/cpp/server/config_file.cpp
+++ b/src/cpp/server/config_file.cpp
@@ -1,4 +1,5 @@
 #include "lemon/config_file.h"
+#include "lemon/backends/backend_descriptor_registry.h"
 #include "lemon/utils/json_utils.h"
 #include "lemon/utils/path_utils.h"
 
@@ -27,10 +28,28 @@ static json load_json_file(const fs::path& path) {
     }
 }
 
-json ConfigFile::get_defaults() {
+json ConfigFile::base_defaults() {
     json defaults = load_json_file(utils::path_from_utf8(
         utils::get_resource_path("resources/defaults.json")));
 
+    // Seed each backend's config.json section from its descriptor. The per-recipe
+    // defaults are authored in the backend's descriptor; resources/defaults.json
+    // is the generated, committed mirror (see GET /internal/config/defaults and
+    // docs/tools/gen_backend_boilerplate.py). Re-seeding here keeps the descriptor
+    // authoritative even if the committed file lags. Empty result = no section.
+    for (const auto* d : backends::all_descriptors()) {
+        json block = d->config_defaults();
+        if (!block.empty()) {
+            defaults[d->effective_config_section()] = block;
+        }
+    }
+
+    return defaults;
+}
+
+json ConfigFile::get_defaults() {
+    json defaults = base_defaults();
+
 #ifndef _WIN32
     fs::path distro_defaults = "/usr/share/lemonade/defaults.json";
     if (fs::exists(distro_defaults)) {
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index b4c529951..511aa080c 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -677,6 +677,9 @@ void Server::setup_routes(httplib::Server &web_server) {
     web_server.Get("/internal/config", [this](const httplib::Request& req, httplib::Response& res) {
         handle_config_get(req, res);
     });
+    web_server.Get("/internal/config/defaults", [this](const httplib::Request& req, httplib::Response& res) {
+        handle_config_defaults_get(req, res);
+    });
     web_server.Post("/internal/cleanup-cache", [this](const httplib::Request& req, httplib::Response& res) {
         handle_cleanup_cache(req, res);
     });
@@ -4468,6 +4471,20 @@ void Server::handle_config_get(const httplib::Request& /*req*/, httplib::Respons
     }
 }
 
+void Server::handle_config_defaults_get(const httplib::Request& /*req*/, httplib::Response& res) {
+    try {
+        // The canonical default config (global keys + descriptor-derived per-recipe
+        // sections), independent of this host's config.json or deployment overrides.
+        // gen_backend_boilerplate.py reads this to regenerate resources/defaults.json.
+        res.set_content(ConfigFile::base_defaults().dump(2), "application/json");
+    } catch (const std::exception& e) {
+        LOG(ERROR, "Server") << "ERROR in handle_config_defaults_get: " << e.what() << std::endl;
+        res.status = 500;
+        nlohmann::json error = {{"error", e.what()}};
+        res.set_content(error.dump(), "application/json");
+    }
+}
+
 void Server::handle_bin_change(const std::string& section,
                                 const std::string& bin_key,
                                 const std::string& new_value) {

From 283297aa7b4315269150191db733f4e6fcb252e1 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Fri, 26 Jun 2026 10:25:02 -0400
Subject: [PATCH 38/39] polish(cli): branch hf pull on repo_kind, not
 recipe==llamacpp

The single-installable-unit path keyed off recipe != "llamacpp"; switch it to
repo_kind != "gguf", the same server-provided classification the function
already uses for the collection branch. Behavior-equivalent (collections are
handled earlier, so at this point repo_kind is gguf or onnx-ryzenai), and it
drops the last backend-name literal from hf_pull.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/cpp/cli/hf_pull.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/cpp/cli/hf_pull.cpp b/src/cpp/cli/hf_pull.cpp
index 8ed30ca0a..f5a84c051 100644
--- a/src/cpp/cli/hf_pull.cpp
+++ b/src/cpp/cli/hf_pull.cpp
@@ -255,11 +255,12 @@ int hf_pull_flow(lemonade::LemonadeClient& client,
 
     const auto& variants = variants_response["variants"];
     std::string recipe = variants_response.value("recipe", std::string("llamacpp"));
+    std::string repo_kind = variants_response.value("repo_kind", std::string("gguf"));
 
-    // Non-llamacpp recipes (currently: ONNX RyzenAI) ship as a single
-    // installable unit — no per-variant menu, no `:variant` checkpoint
-    // suffix, no `-VARIANT` model name tail.
-    if (recipe != "llamacpp") {
+    // Non-GGUF repos (currently: ONNX RyzenAI) ship as a single installable
+    // unit — no per-variant menu, no `:variant` checkpoint suffix, no
+    // `-VARIANT` model name tail. (Collections have already returned above.)
+    if (repo_kind != "gguf") {
         if (!variant.empty()) {
             std::cerr << "warning: variant '" << variant << "' ignored for "
                       << recipe << " checkpoints" << std::endl;

From ce221b466e6be489b1bbab987e8fac2ed49f6c03 Mon Sep 17 00:00:00 2001
From: jeremyfowers <jeremy.fowers@amd.com>
Date: Sat, 27 Jun 2026 21:24:07 -0400
Subject: [PATCH 39/39] refactor(backends): functional-only comments; complete,
 alphabetical website model list (#2320 review)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Address Geramy's review on #2320:

- Comments: sweep every backend file to functional-only, removing commentary
  that narrates self-explanatory code or references backends/flows it can't
  know about (net ~330 comment lines cut). Short purpose comments on
  declarations and genuine WHY/invariant notes are kept.
- Website model list: drop the per-backend `web_priority` descriptor field
  (a 0-default that silently dropped backends like moonshine and vllm) and
  instead list every descriptor-backed recipe, ordered alphabetically by
  display name — a new backend can no longer be accidentally excluded.
- CMakeLists: move the authoritative LEMON_BACKENDS list near the top; the
  codegen foreach stays where it was.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt                                |  64 ++++----
 docs/assets/models.js                         |   8 +-
 docs/tools/gen_backend_boilerplate.py         |  14 +-
 .../lemon/backends/backend_descriptor.h       |   4 +-
 src/cpp/include/lemon/backends/backend_ops.h  |  46 ++----
 .../include/lemon/backends/backend_registry.h |   6 +-
 .../include/lemon/backends/backend_utils.h    |   4 +-
 .../lemon/backends/fastflowlm/fastflowlm.h    |   1 -
 .../backends/fastflowlm/fastflowlm_models.h   |   3 +-
 .../include/lemon/backends/kokoro/kokoro.h    |   1 -
 .../lemon/backends/llamacpp/llamacpp.h        |   1 -
 .../lemon/backends/moonshine/moonshine.h      |   1 -
 .../include/lemon/backends/ryzenai/ryzenai.h  |   1 -
 src/cpp/include/lemon/backends/sdcpp/sdcpp.h  |   1 -
 .../lemon/backends/sdcpp/sdcpp_server.h       |  14 --
 src/cpp/include/lemon/backends/vllm/vllm.h    |   1 -
 .../lemon/backends/whispercpp/whispercpp.h    |   1 -
 src/cpp/include/lemon/config_file.h           |   4 +-
 src/cpp/include/lemon/gguf_reader.h           |   1 -
 src/cpp/include/lemon/model_types.h           |   6 +-
 src/cpp/include/lemon/wrapped_server.h        |   1 -
 src/cpp/server/backends/backend_ops.cpp       |   3 +-
 src/cpp/server/backends/backend_utils.cpp     |   9 +-
 .../server/backends/cloud/cloud_server.cpp    | 148 ++++--------------
 .../backends/fastflowlm/fastflowlm_models.cpp |  22 +--
 .../backends/fastflowlm/fastflowlm_server.cpp |  29 +---
 .../server/backends/kokoro/kokoro_server.cpp  |   8 -
 .../backends/llamacpp/llamacpp_gguf.cpp       |  65 +++-----
 .../backends/llamacpp/llamacpp_server.cpp     |   8 -
 .../backends/moonshine/moonshine_server.cpp   |   6 -
 .../backends/ryzenai/ryzenai_server.cpp       |  13 +-
 .../server/backends/sdcpp/sdcpp_server.cpp    |  15 +-
 src/cpp/server/backends/vllm/vllm_server.cpp  |  14 +-
 .../backends/whispercpp/whispercpp_server.cpp |  38 +----
 src/cpp/server/config_file.cpp                |   8 +-
 src/cpp/server/model_manager.cpp              |   6 +-
 src/cpp/server/recipe_options.cpp             |   6 +-
 src/cpp/server/router.cpp                     |   6 +-
 src/cpp/server/runtime_config.cpp             |  11 +-
 src/cpp/server/server.cpp                     |   1 -
 src/cpp/server/system_info.cpp                |  20 +--
 test/cpp/test_auto_tune.cpp                   |  52 +-----
 42 files changed, 169 insertions(+), 502 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2b7dee8a4..78937c046 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,38 @@ include(GNUInstallDirs)
           ${CMAKE_SOURCE_DIR}/docs/man/man1/lemonade.1
           DESTINATION ${CMAKE_INSTALL_MANDIR}/man1
      )
+
+# ============================================================
+# Self-describing backends registry
+# ============================================================
+# The authoritative backend list. Each entry is "<recipe>|<stem>":
+#   recipe - the recipe string used in server_models.json (may contain dashes)
+#   stem   - identifier-safe name and folder. Each backend lives in its own
+#            folder, shipping (in namespace lemon::backends::<stem>):
+#              include/lemon/backends/<stem>/<stem>.h         inline const descriptor (CLI-safe data)
+#              include/lemon/backends/<stem>/<stem>_server.h  WrappedServer subclass + create() decl
+#              server/backends/<stem>/<stem>_server.cpp       implementation + create() def
+#
+# Adding a backend is one line here plus that folder. The codegen later in this
+# file (see "Self-describing backends registry codegen") compiles the server
+# source and regenerates the registry headers, which bind each descriptor to its
+# create(). Because this list is a tracked input, editing it forces regeneration
+# on the next build (a file(GLOB) would silently miss a newly added backend). The
+# descriptor is a header-only inline const, so it links into both the lemonade
+# CLI and lemond; only lemond links the server sources.
+set(LEMON_BACKENDS
+    # "<recipe>|<stem>"
+    "llamacpp|llamacpp"
+    "whispercpp|whispercpp"
+    "moonshine|moonshine"
+    "kokoro|kokoro"
+    "sd-cpp|sdcpp"
+    "flm|fastflowlm"
+    "ryzenai-llm|ryzenai"
+    "vllm|vllm"
+    "cloud|cloud"
+)
+
 # ============================================================
 # Tauri app source paths (used by both runtime and installers)
 # ============================================================
@@ -639,35 +671,11 @@ elseif(UNIX)
 endif()
 
 # ============================================================
-# Self-describing backends registry
+# Self-describing backends registry codegen
 # ============================================================
-# The authoritative backend list. Each entry is "<recipe>|<stem>":
-#   recipe - the recipe string used in server_models.json (may contain dashes)
-#   stem   - identifier-safe name and folder. Each backend lives in its own
-#            folder, shipping (in namespace lemon::backends::<stem>):
-#              include/lemon/backends/<stem>/<stem>.h         inline const descriptor (CLI-safe data)
-#              include/lemon/backends/<stem>/<stem>_server.h  WrappedServer subclass + create() decl
-#              server/backends/<stem>/<stem>_server.cpp       implementation + create() def
-#
-# Adding a backend is one line here plus that folder. The foreach below compiles
-# the server source and regenerates the registry headers, which bind each
-# descriptor to its create(). Because this list is a tracked input, editing it
-# forces regeneration on the next build (a file(GLOB) would silently miss a
-# newly added backend). The descriptor is a header-only inline const, so it links
-# into both the lemonade CLI and lemond; only lemond links the server sources.
-set(LEMON_BACKENDS
-    # "<recipe>|<stem>"
-    "llamacpp|llamacpp"
-    "whispercpp|whispercpp"
-    "moonshine|moonshine"
-    "kokoro|kokoro"
-    "sd-cpp|sdcpp"
-    "flm|fastflowlm"
-    "ryzenai-llm|ryzenai"
-    "vllm|vllm"
-    "cloud|cloud"
-)
-
+# Consumes LEMON_BACKENDS (defined near the top of this file): the foreach below
+# compiles each backend's server source and regenerates the registry headers that
+# bind every descriptor to its create().
 set(LEMON_DESCRIPTOR_INCLUDES "")
 set(LEMON_DESCRIPTOR_ENTRIES "")
 set(LEMON_FACTORY_INCLUDES "")
diff --git a/docs/assets/models.js b/docs/assets/models.js
index d9814cccb..e3c7762d7 100644
--- a/docs/assets/models.js
+++ b/docs/assets/models.js
@@ -4,12 +4,14 @@ const RAW_BASE = 'https://raw.githubusercontent.com/lemonade-sdk/lemonade';
 
 /* BEGIN GENERATED: models-js-recipes */
 const RECIPE_PRIORITY = [
+  'flm',
+  'kokoro',
   'llamacpp',
+  'moonshine',
   'ryzenai-llm',
-  'flm',
-  'whispercpp',
   'sd-cpp',
-  'kokoro'
+  'vllm',
+  'whispercpp'
 ];
 
 const RECIPE_DISPLAY_NAMES = {
diff --git a/docs/tools/gen_backend_boilerplate.py b/docs/tools/gen_backend_boilerplate.py
index b4e8ac8d9..940459288 100644
--- a/docs/tools/gen_backend_boilerplate.py
+++ b/docs/tools/gen_backend_boilerplate.py
@@ -279,12 +279,16 @@ def _js_key(recipe: str) -> str:
     return recipe if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", recipe) else f"'{recipe}'"
 
 
+def _web_display_name(info: dict, recipe: str) -> str:
+    return info.get("web_display_name") or info.get("display_name", recipe)
+
+
 def render_models_js(recipes: dict) -> str:
-    # RECIPE_PRIORITY: recipes with web_priority > 0, in that order (legacy oga-*
-    # recipes have no descriptor and are intentionally dropped).
+    # RECIPE_PRIORITY: every descriptor-backed recipe, sorted case-insensitively
+    # by website name (web_display_name, else display_name). Listing all of them,
+    # not an opt-in subset, means no new backend is silently dropped from the site.
     prioritized = sorted(
-        (r for r, i in recipes.items() if i.get("web_priority", 0) > 0),
-        key=lambda r: recipes[r]["web_priority"],
+        recipes, key=lambda r: _web_display_name(recipes[r], r).lower()
     )
     pri_lines = ",\n".join(f"  '{r}'" for r in prioritized)
 
@@ -292,7 +296,7 @@ def render_models_js(recipes: dict) -> str:
     # fallback (matching the curated map, which omits redundant entries).
     name_lines = []
     for r, info in _ordered(recipes):
-        name = info.get("web_display_name") or info.get("display_name", r)
+        name = _web_display_name(info, r)
         if name and name != _js_to_title(r):
             name_lines.append(f"  {_js_key(r)}: '{name}'")
     names = ",\n".join(name_lines)
diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h
index 03ca71e69..51b357154 100644
--- a/src/cpp/include/lemon/backends/backend_descriptor.h
+++ b/src/cpp/include/lemon/backends/backend_descriptor.h
@@ -20,8 +20,7 @@ struct BackendOption {
     std::string group;                // CLI help group, e.g. "General Options"
 };
 
-// How a backend shares the accelerator. Replaces the router's recipe-string
-// checks for NPU exclusivity and LRU slot accounting.
+// How a backend shares the accelerator.
 enum class SlotPolicy {
     Standard,      // counts toward the LRU slots, no device exclusivity (llamacpp, sd-cpp)
     ExclusiveNpu,  // evict ALL npu servers before loading (ryzenai-llm, whispercpp-npu)
@@ -69,7 +68,6 @@ struct BackendDescriptor {
     std::string modality;           // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation"
     bool        experimental = false; // true renders "(experimental)" next to the recipe in generated docs
     std::string web_display_name;   // name used on the docs website ("" = fall back to display_name)
-    int         web_priority = 0;    // model-grouping order on the docs website (lower = higher; 0 = unlisted)
 
     // ROCm release channels this backend publishes (e.g. {"stable","nightly"}).
     // Empty = the backend has no ROCm channels (its "rocm" build is a single
diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h
index 047c6795d..29bfa0b2d 100644
--- a/src/cpp/include/lemon/backends/backend_ops.h
+++ b/src/cpp/include/lemon/backends/backend_ops.h
@@ -3,7 +3,7 @@
 #include <optional>
 #include <string>
 #include <vector>
-#include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback (server-side only)
+#include "lemon/model_manager.h"  // ModelInfo, DownloadProgressCallback
 
 namespace lemon {
 
@@ -11,16 +11,14 @@ class CloudProviderRegistry;
 
 namespace backends {
 
-// Context handed to BackendOps methods — the bits of server state model
-// management needs without a running subprocess. Grows as migrations require.
+// Context handed to BackendOps methods: the server state model management needs
+// without a running subprocess.
 struct BackendOpsContext {
     ModelManager* model_manager = nullptr;
     CloudProviderRegistry* cloud_registry = nullptr;  // for dynamic cloud discovery
 };
 
-// Inputs for resolving a checkpoint's on-disk path. The model manager computes
-// the HF-cache locations generically; each backend's ops decide how to find its
-// artifact within (a .gguf file, a genai_config.json directory, a .bin, …).
+// Inputs for resolving a checkpoint's on-disk path.
 struct CheckpointResolveContext {
     std::string hf_cache;          // HF cache root dir
     std::string model_cache_path;  // hf_cache/<checkpoint repo cache dir>
@@ -35,13 +33,10 @@ struct CheckpointResolveContext {
 // running subprocess: checkpoint-path resolution, download, dynamic discovery,
 // per-model metadata, version detection, availability. One singleton per
 // backend, exposed via lemon::backends::<stem>::ops() and bound in the registry
-// (see BackendRegistration::ops).
-//
-// The base class is the shared default behavior (the common HF-backed case);
-// each backend folder overrides ONLY the policy points it needs, so shared
-// logic is inherited rather than copied. Methods are added here incrementally as
-// switchboards in model_manager / system_info are migrated; every method has a
-// default so adding one never forces edits to backends that don't override it.
+// (see BackendRegistration::ops). The base class provides shared default
+// behavior; backends override only the policy points they need. Every method
+// has a default, so adding one never forces edits to backends that don't
+// override it.
 class BackendOps {
 public:
     virtual ~BackendOps() = default;
@@ -55,8 +50,7 @@ class BackendOps {
 
     // Resolve a checkpoint to its absolute on-disk path (file or directory).
     // Default: the shared HF behavior — locate the variant/aux file in the active
-    // snapshot, else fall back to the model cache directory. Backends with a
-    // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override.
+    // snapshot, else fall back to the model cache directory.
     virtual std::string resolve_checkpoint_path(const ModelInfo& info,
                                                 const CheckpointResolveContext& ctx) const;
 
@@ -71,7 +65,6 @@ class BackendOps {
 
     // Validate a user-supplied checkpoint string when registering a new model.
     // Return an error message if invalid, "" if acceptable. Default: accept.
-    // llamacpp requires a :variant on GGUF checkpoints.
     virtual std::string validate_registration_checkpoint(const std::string& checkpoint) const {
         (void)checkpoint;
         return "";
@@ -81,7 +74,6 @@ class BackendOps {
     // `main_variant`, for backends whose artifact layout isn't a GGUF file.
     // Return nullopt to use the default GGUF selection. (Direct single-file
     // variants — .safetensors/.pth/.ckpt — are handled generically upstream.)
-    // moonshine overrides: its variant names a directory of files to fetch.
     virtual std::optional<std::vector<std::string>> select_checkpoint_files(
         const std::string& main_variant, const std::vector<std::string>& repo_files) const {
         (void)main_variant;
@@ -90,40 +82,35 @@ class BackendOps {
     }
 
     // Models supplied at runtime rather than from server_models.json (descriptor
-    // dynamic_models = true). Default: none. cloud/flm override.
+    // dynamic_models = true). Default: none.
     virtual std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const {
         (void)ctx;
         return {};
     }
 
     // Whether a model's local artifacts are present. Default: the shared HF
-    // checkpoint-completeness check (ModelManager::checkpoints_complete). cloud
-    // (always true) and flm (installed-set membership) override.
+    // checkpoint-completeness check (ModelManager::checkpoints_complete).
     virtual bool is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const;
 
     // Validate a resolved checkpoint file for the cache. Returns "" if valid, or
-    // a reason it should be treated as not-downloaded. Default: always valid;
-    // llamacpp checks GGUF magic.
+    // a reason it should be treated as not-downloaded. Default: always valid.
     virtual std::string validate_checkpoint_file(const std::string& resolved_path) const {
         (void)resolved_path;
         return "";
     }
 
     // Download a model's artifacts. Default: the shared Hugging Face download.
-    // cloud (no-op) and flm (flm pull) override.
     virtual void download_model(const ModelInfo& info, bool do_not_upgrade,
                                 DownloadProgressCallback progress,
                                 const BackendOpsContext& ctx) const;
 
     // Whether the model cache must be rebuilt after this backend downloads a
-    // model (e.g. flm, whose model list changes). Default: false.
+    // model. Default: false.
     virtual bool invalidates_cache_after_download() const { return false; }
 
     // Resolve a backend's installed version for a given backend variant. The
     // caller passes the version read from the on-disk version.txt (or "" if
-    // absent); the default returns it unchanged. Backends that detect their
-    // version another way override: llamacpp's "system" build runs
-    // `llama-server --version`; flm queries `flm version` when no file is present.
+    // absent); the default returns it unchanged.
     virtual std::string resolve_version(const std::string& backend,
                                         const std::string& file_version) const {
         (void)backend;
@@ -139,8 +126,6 @@ class BackendOps {
 
     // Decide whether a backend variant is installed, given whether its managed
     // binary was found on disk. Default: installed iff the binary was found.
-    // llamacpp's "system" build also requires the ggml HIP plugin when an AMD GPU
-    // is present; flm can be a system PATH package even without a managed binary.
     virtual InstallCheck check_install(const std::string& backend, bool binary_found) const {
         (void)backend;
         return {binary_found, ""};
@@ -158,8 +143,7 @@ class BackendOps {
     // Classify a "supported but not available" backend variant for /system-info,
     // given the install probe's error text and the generic install command the
     // caller would otherwise use. Return nullopt to use the generic
-    // installable/no-fetch default. flm overrides: it is a system .deb + drivers
-    // needing manual setup, so its states and remediation links differ.
+    // installable/no-fetch default.
     virtual std::optional<UnavailableState> classify_unavailable(
         const std::string& backend, const std::string& install_error,
         const std::string& default_install_command) const {
diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h
index 240ddf728..4f4e7b6f8 100644
--- a/src/cpp/include/lemon/backends/backend_registry.h
+++ b/src/cpp/include/lemon/backends/backend_registry.h
@@ -18,8 +18,7 @@ namespace backends {
 
 struct BackendSpec;  // install/download spec, defined in backend_utils.h
 
-// Everything a backend's create() needs to build an instance. Mirrors the
-// arguments the old router factory passed to each backend constructor.
+// Everything a backend's create() needs to build an instance.
 struct BackendContext {
     std::string log_level;
     ModelManager* model_manager = nullptr;
@@ -31,8 +30,7 @@ struct BackendContext {
 using BackendCreateFn = std::unique_ptr<WrappedServer> (*)(const BackendContext&);
 
 // Convenience for the common create(): construct a server class from the
-// standard (log_level, model_manager, backend_manager) context fields. Backends
-// needing extra constructor arguments (cloud, ryzenai) build theirs by hand.
+// standard (log_level, model_manager, backend_manager) context fields.
 template <typename T>
 std::unique_ptr<WrappedServer> make_server(const BackendContext& ctx) {
     return std::make_unique<T>(ctx.log_level, ctx.model_manager, ctx.backend_manager);
diff --git a/src/cpp/include/lemon/backends/backend_utils.h b/src/cpp/include/lemon/backends/backend_utils.h
index bdbfe0869..8e2e532a5 100644
--- a/src/cpp/include/lemon/backends/backend_utils.h
+++ b/src/cpp/include/lemon/backends/backend_utils.h
@@ -45,9 +45,7 @@ namespace lemon::backends {
 
     // Build a backend's install/download spec from its descriptor's recipe/binary
     // and the server class T's get_install_params. The construct-on-first-use
-    // static gives the registry a stable pointer. Backends whose install key
-    // differs from the recipe (ryzenai) or that have no installable artifact
-    // (cloud) build their BackendSpec by hand instead of using this.
+    // static gives the registry a stable pointer.
     template <typename T>
     const BackendSpec* make_spec(const BackendDescriptor& d, bool split = false) {
         static const BackendSpec kSpec(d.recipe, d.binary, T::get_install_params, split);
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
index 24049ab31..dcdf345ac 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h
@@ -31,7 +31,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    false,
     /*web_display_name*/ "FastFlowLM NPU",
-    /*web_priority*/    3,
     /*rocm_channels*/   {},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ false,
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
index 87470300c..ee690e16a 100644
--- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
+++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h
@@ -22,8 +22,7 @@ std::vector<std::string> flm_installed_checkpoints();
 std::vector<ModelInfo> flm_discover_models();
 
 // FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH /
-// platform-default roots and describes them with a config.json; this knowledge
-// lives in the fastflowlm backend folder rather than in the shared model manager.
+// platform-default roots and describes them with a config.json.
 
 // Derive the on-disk repo directory name from an FLM model URL.
 std::string repo_dir_from_url(const std::string& url);
diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h
index 5f3fbf97c..3c34dc268 100644
--- a/src/cpp/include/lemon/backends/kokoro/kokoro.h
+++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h
@@ -32,7 +32,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text-to-speech",
     /*experimental*/    false,
     /*web_display_name*/ "",
-    /*web_priority*/    6,
     /*rocm_channels*/   {},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ false,
diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
index 7c58a73f3..f0308cfb2 100644
--- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
+++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h
@@ -45,7 +45,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    false,
     /*web_display_name*/ "llama.cpp GPU",
-    /*web_priority*/    1,
     /*rocm_channels*/   {"stable", "nightly"},
     /*exposes_prometheus_metrics*/ true,
     /*rocm_requires_cwsr_fix*/ true,
diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h
index ae7313714..171b8a51f 100644
--- a/src/cpp/include/lemon/backends/moonshine/moonshine.h
+++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h
@@ -32,7 +32,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Speech-to-text",
     /*experimental*/    false,
     /*web_display_name*/ "",
-    /*web_priority*/    0,
     /*rocm_channels*/   {},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ false,
diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
index dbc15d7f3..6df5511e4 100644
--- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
+++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h
@@ -31,7 +31,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    false,
     /*web_display_name*/ "Ryzen AI SW NPU",
-    /*web_priority*/    2,
     /*rocm_channels*/   {},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ false,
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
index 986d26fbe..dadd24fe9 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h
@@ -49,7 +49,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Image generation",
     /*experimental*/    false,
     /*web_display_name*/ "stable-diffusion.cpp",
-    /*web_priority*/    5,
     /*rocm_channels*/   {"stable"},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ true,
diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
index 185108afc..7e2acf048 100644
--- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
+++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h
@@ -46,20 +46,6 @@ class SDServer : public WrappedServer, public IImageServer {
     //
     // sd-server's HTTP API does not expose an upscaling endpoint, so we use the
     // sd-cli binary's -M upscale mode as a subprocess.
-    //
-    // Called by Server::handle_image_upscale (server.cpp), which is registered
-    // as the route handler for POST /api/v1/images/upscale (see register_post
-    // in Server::Server).
-    //
-    // Endpoint: POST /api/v1/images/upscale
-    //   Request body (JSON):
-    //     { "image": "<base64 PNG>", "model": "<model name, e.g. RealESRGAN-x4plus>" }
-    //   Success response (200):
-    //     { "created": <timestamp>, "data": [{ "b64_json": "<base64 PNG>" }] }
-    //   Error responses:
-    //     400 - missing "image" or "model" field
-    //     404 - model name not found in server_models.json
-    //     500 - upscale subprocess failed or sd-cli binary not found
     static std::string upscale_via_cli(
         const std::string& b64_image,
         const std::string& upscale_model_path,
diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h
index 8984e15b3..b62fbd83f 100644
--- a/src/cpp/include/lemon/backends/vllm/vllm.h
+++ b/src/cpp/include/lemon/backends/vllm/vllm.h
@@ -32,7 +32,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Text generation",
     /*experimental*/    true,
     /*web_display_name*/ "",
-    /*web_priority*/    0,
     /*rocm_channels*/   {},  // single rocm artifact, no stable/nightly channels
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ true,
diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
index 9c38b66d5..28c617ec2 100644
--- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
+++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h
@@ -41,7 +41,6 @@ inline const BackendDescriptor descriptor = {
     /*modality*/        "Speech-to-text",
     /*experimental*/    false,
     /*web_display_name*/ "whisper.cpp",
-    /*web_priority*/    4,
     /*rocm_channels*/   {},
     /*exposes_prometheus_metrics*/ false,
     /*rocm_requires_cwsr_fix*/ false,
diff --git a/src/cpp/include/lemon/config_file.h b/src/cpp/include/lemon/config_file.h
index 8c46e125f..f0353a345 100644
--- a/src/cpp/include/lemon/config_file.h
+++ b/src/cpp/include/lemon/config_file.h
@@ -86,9 +86,7 @@ class ConfigFile {
 public:
     /// The canonical default config: resources/defaults.json (global keys) with
     /// each backend's per-recipe section seeded from its descriptor. Host- and
-    /// deployment-independent, so it is reproducible — this is what
-    /// GET /internal/config/defaults emits and gen_backend_boilerplate.py writes
-    /// back into resources/defaults.json.
+    /// deployment-independent.
     static json base_defaults();
 
     /// base_defaults() plus deployment overrides. On Linux, an optional distro
diff --git a/src/cpp/include/lemon/gguf_reader.h b/src/cpp/include/lemon/gguf_reader.h
index 8eb4be3ad..865058067 100644
--- a/src/cpp/include/lemon/gguf_reader.h
+++ b/src/cpp/include/lemon/gguf_reader.h
@@ -219,7 +219,6 @@ inline bool read_gguf_metadata(GgufMetadata& out, const std::string& path) {
         uint32_t type = 0;
         if (!read_gguf_string(in, key) || !read_gguf_le(in, type)) return false;
 
-        // Read architecture
         if (key == "general.architecture" && type == 8) {
             if (!read_gguf_string(in, out.architecture)) return false;
             if (pending_context_length > 0) {
diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h
index c92bedb37..855912f16 100644
--- a/src/cpp/include/lemon/model_types.h
+++ b/src/cpp/include/lemon/model_types.h
@@ -140,11 +140,7 @@ inline ModelType get_model_type_from_labels(const std::vector<std::string>& labe
 }
 
 // Fallback device type for recipes with no registered backend descriptor
-// (collections and unknown recipes). The authoritative per-backend default lives
-// in BackendDescriptor::default_device; ModelManager::device_type_for_recipe
-// consults the descriptor registry first and only falls back here. Kept in this
-// low-level header (which must not depend on the backend registry) for that
-// fallback alone — it intentionally carries no per-backend knowledge.
+// (collections and unknown recipes); the descriptor registry is authoritative.
 inline DeviceType get_device_type_from_recipe(const std::string& recipe) {
     (void)recipe;
     return DEVICE_NONE;
diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h
index 41e91595b..3ad8465f7 100644
--- a/src/cpp/include/lemon/wrapped_server.h
+++ b/src/cpp/include/lemon/wrapped_server.h
@@ -308,7 +308,6 @@ class WrappedServer : public ICompletionServer {
         // No-op by default
     }
 
-    // ICompletionServer implementation - forward requests to the wrapped server.
     // Default to an "unsupported" error so non-chat backends (TTS, image,
     // transcription) inherit a sensible response instead of stubbing each one.
     virtual json chat_completion(const json& request) override {
diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp
index 2f4cdf48c..0fc96623a 100644
--- a/src/cpp/server/backends/backend_ops.cpp
+++ b/src/cpp/server/backends/backend_ops.cpp
@@ -15,8 +15,7 @@ using lemon::utils::path_to_utf8;
 
 // Default checkpoint resolution: the shared Hugging Face behavior. Locate the
 // requested variant (or auxiliary file like mmproj) within the active snapshot,
-// falling back to the main repo and finally the model cache directory. Backends
-// with bespoke layouts override resolve_checkpoint_path().
+// falling back to the main repo and finally the model cache directory.
 std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info,
                                                 const CheckpointResolveContext& ctx) const {
     (void)info;
diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp
index 42c0d1709..8ccf12632 100644
--- a/src/cpp/server/backends/backend_utils.cpp
+++ b/src/cpp/server/backends/backend_utils.cpp
@@ -34,8 +34,7 @@ using json = nlohmann::json;
 namespace lemon::backends {
 
     const BackendSpec* try_get_spec_for_recipe(const std::string& recipe) {
-        // Each backend exposes its install/download spec through the registry
-        // (see <stem>::spec()); no per-recipe branches or server includes here.
+        // Each backend exposes its install/download spec through the registry.
         return spec_for(recipe);
     }
 
@@ -545,8 +544,6 @@ namespace lemon::backends {
             // Remove the downloaded archive on ANY exit from here on — success
             // OR exception, including a throw from commit_staged_install() below
             // (a swap/rename failure) — so the cache archive is never leaked.
-            // Mirrors StagingGuard above; replaces the per-throw fs::remove(zip_path)
-            // calls that did not cover the commit_staged_install throw path.
             struct ZipGuard {
                 const std::string& path;
                 ~ZipGuard() {
@@ -754,9 +751,7 @@ namespace lemon::backends {
                 LOG(ERROR, spec.log_name()) << "Extraction completed but executable not found" << std::endl;
                 throw std::runtime_error("Extraction failed: executable not found");
             }
-            // Swap succeeded: staging was consumed by the rename, so disarm the
-            // guard (its cleanup would now be a no-op, but disarm to make intent
-            // explicit and skip a pointless filesystem call).
+            // Swap succeeded: staging was consumed by the rename, so disarm the guard.
             staging_guard.active = false;
 
             LOG(DEBUG, spec.log_name()) << "Executable verified at: " << exe_path << std::endl;
diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp
index 3c61c213b..0db3c3344 100644
--- a/src/cpp/server/backends/cloud/cloud_server.cpp
+++ b/src/cpp/server/backends/cloud/cloud_server.cpp
@@ -23,19 +23,8 @@ bool id_contains(const std::string& id, const std::string& needle) {
     return id.find(needle) != std::string::npos;
 }
 
-// Pattern-based fallback for /v1/models entries that don't publish any
-// capability metadata (notably OpenAI, whose response is just
-// {id, object, owned_by, created}). The patterns cover the model
-// families we currently know about:
-//   - Image/video: flux, stable-diffusion, sdxl, sd-, dall-e, gpt-image,
-//                  chatgpt-image, sora
-//   - Audio:       whisper, tts, *-transcribe, gpt-realtime, gpt-audio
-//   - Reranking:   rerank
-//   - Embeddings:  embed, bge-, nomic-
-//   - Classifiers: moderation
-// Anything else falls through to LLM. New providers that publish
-// capability metadata (see is_chat_model below) bypass this entirely
-// and don't need new patterns.
+// Id-pattern fallback for /v1/models entries that don't publish capability
+// metadata (notably OpenAI). Anything unmatched falls through to LLM.
 ModelType infer_type(const std::string& id) {
     if (id_contains(id, "flux") || id_contains(id, "stable-diffusion") ||
         id_contains(id, "sdxl") || id_contains(id, "sd-") ||
@@ -61,21 +50,12 @@ ModelType infer_type(const std::string& id) {
 }
 
 // Decide whether a /v1/models entry should be surfaced as a chat model.
-//
-// Strategy: trust provider-supplied capability metadata when it exists,
-// fall back to id pattern matching only when there is none. This keeps
-// the substring list bounded — adding a new provider that publishes
-// capabilities does not require adding new patterns.
-//
-// Signals checked, in priority order:
+// Provider capability metadata is trusted first; bare responses without any
+// fall back to infer_type(id). Metadata signals, in priority order:
 //   1. supports_chat: bool       — Fireworks
-//   2. capabilities: [string]    — generic ("chat", "chat.completions",
-//                                  "embeddings", "image_generation", ...)
-//   3. architecture.modality     — OpenRouter ("text->text",
-//                                  "text+image->text", "text->image", ...)
-//                                  Anything that produces text via chat is
-//                                  considered chat-capable.
-//   4. infer_type(id) == LLM     — fallback for bare responses (OpenAI).
+//   2. capabilities: [string]    — generic
+//   3. architecture.modality     — OpenRouter
+//   4. type: string              — Together AI
 bool is_chat_model(const json& m) {
     if (!m.is_object() || !m.contains("id") || !m["id"].is_string()) {
         return false;
@@ -142,24 +122,11 @@ std::vector<std::string> chat_labels() {
 }
 
 // Detect capability labels (vision / tool-calling / reasoning) from a
-// /v1/models entry and normalise the divergent fields providers use into
-// lemonade's shared label vocabulary, so cloud models gate inputs exactly
-// like local ones (the UI offers image upload iff "vision" is present, etc.).
-//
-// Strategy mirrors is_chat_model: trust structured provider metadata first,
-// fall back to id patterns only for providers that publish none (OpenAI).
-// When a signal is absent the capability defaults OFF — under-offering an
-// input is safer than letting the client attach an image the provider rejects
-// (the per-model override exists for the cases auto-detection can't cover).
-//
-// Recognised signals:
-//   vision — supports_image_input (Fireworks); supports_vision/vision bools;
-//            architecture.input_modalities ⊇ "image" (OpenRouter);
-//            modalities/input_modalities ⊇ "image".
-//   tools  — supports_tools (Fireworks); supported_parameters ⊇ "tools"
-//            (OpenRouter); capabilities ⊇ "tools"/"function_calling";
-//            function_calling/supports_function_calling bools.
-//   reason — supported_parameters ⊇ "reasoning"; reasoning/supports_reasoning.
+// /v1/models entry, normalising the divergent fields providers use into
+// lemonade's shared label vocabulary so cloud models gate inputs like local
+// ones. When a signal is absent the capability defaults OFF — under-offering
+// an input is safer than letting the client attach an image the provider
+// rejects (the per-model override covers cases auto-detection can't).
 std::vector<std::string> capability_labels(const json& m) {
     std::vector<std::string> labels;
     if (!m.is_object()) return labels;
@@ -175,7 +142,6 @@ std::vector<std::string> capability_labels(const json& m) {
         return false;
     };
 
-    // ---- vision ----
     bool vision = flag("supports_image_input") || flag("supports_vision") ||
                   flag("vision") ||
                   array_has(m.value("modalities", json::array()), "image") ||
@@ -185,7 +151,6 @@ std::vector<std::string> capability_labels(const json& m) {
                            "image");
     }
 
-    // ---- tool-calling ----
     const json params = m.value("supported_parameters", json::array());
     const json caps = m.value("capabilities", json::array());
     bool tools = flag("supports_tools") || flag("function_calling") ||
@@ -194,14 +159,12 @@ std::vector<std::string> capability_labels(const json& m) {
                  array_has(caps, "tools") || array_has(caps, "function_calling") ||
                  array_has(caps, "tool_calling");
 
-    // ---- reasoning ----
     bool reasoning = flag("reasoning") || flag("supports_reasoning") ||
                      array_has(params, "reasoning") ||
                      array_has(params, "include_reasoning");
 
-    // ---- id-pattern fallback for metadata-barren providers (OpenAI) ----
-    // Only consulted when the entry carries no structured capability hints at
-    // all, so an authoritative "false" from a provider is never overridden.
+    // Id-pattern fallback, consulted only when the entry carries no structured
+    // capability hints, so an authoritative "false" from a provider stands.
     const bool has_meta = m.contains("supports_image_input") ||
                           m.contains("supports_vision") || m.contains("vision") ||
                           m.contains("supports_tools") ||
@@ -274,34 +237,10 @@ std::pair<double, double> parse_cloud_cost(const json& m) {
     return cost;
 }
 
-// Build the user-facing model name from a provider's upstream id, applying
-// two universal cleanup rules (no provider-specific code):
-//
-//   1. Collapse "accounts/<x>/models/<y>" -> "<x>/<y>". This is a
-//      content-pattern match (the GCP-style resource-path convention used
-//      by Fireworks). Any provider that adopts the same shape benefits
-//      automatically; providers using flat ids ("gpt-4o") or other
-//      namespaces ("meta-llama/Llama-3.3-70B-Instruct-Turbo") pass through
-//      untouched.
-//
-//   2. If the cleaned id leads with "<provider>/", strip it before adding
-//      the wrapping "<provider>/" prefix — otherwise Fireworks's first-
-//      party models ("fireworks/...") would render as
-//      "fireworks/fireworks/...".
-//
-// The provider namespace is joined with a "." separator (matching the
-// "user."/"extra." namespacing used elsewhere); the cleaned upstream id keeps
-// its own native "/" separators.
-//
-// Examples:
-//   provider="fireworks", id="accounts/fireworks/models/deepseek-v4-pro"
-//     -> "fireworks.deepseek-v4-pro"
-//   provider="fireworks", id="accounts/trilogy/models/cogsci-..."
-//     -> "fireworks.trilogy/cogsci-..."
-//   provider="openai",    id="gpt-4o"
-//     -> "openai.gpt-4o"
-//   provider="together",  id="meta-llama/Llama-3.3-70B-Instruct-Turbo"
-//     -> "together.meta-llama/Llama-3.3-70B-Instruct-Turbo"
+// Build the user-facing model name "<provider>.<cleaned_upstream_id>" by
+// applying two content-pattern cleanup rules (no provider-specific code).
+// Example: provider="fireworks", id="accounts/fireworks/models/deepseek-v4-pro"
+// -> "fireworks.deepseek-v4-pro".
 std::string build_public_name(const std::string& provider, const std::string& upstream_id) {
     std::string cleaned = upstream_id;
 
@@ -500,11 +439,8 @@ json CloudServer::post_with_auth(const std::string& path, const json& request,
     try {
         auto response = utils::HttpClient::post(url, request.dump(), headers, timeout_seconds);
         if (response.status_code == 200) {
-            // Telemetry: the chat/completions handler in server.cpp parses
-            // the `usage` field off the returned JSON and calls
-            // Router::update_telemetry / update_prompt_tokens. CloudServer
-            // returns the body unchanged so that path picks up the same
-            // prompt/completion counts every other backend reports.
+            // Return the body unchanged so the server.cpp handler picks up the
+            // `usage` telemetry like every other backend.
             return json::parse(response.body);
         }
 
@@ -549,12 +485,8 @@ void CloudServer::forward_streaming_request(const std::string& endpoint,
                                             bool sse,
                                             long timeout_seconds,
                                             TelemetryCallback telemetry_callback) {
-    // Telemetry from cloud streaming responses: OpenAI-shape SSE puts the
-    // usage block in the final pre-[DONE] chunk. We don't parse it here —
-    // the Router-level streaming path delivers cleaner numbers than we can
-    // reconstruct from chunked output, and matching local backends here
-    // would only diverge subtly. Passing the callback through preserves the
-    // contract for callers that pass one in.
+    // Streaming telemetry is left to the Router-level path, which produces
+    // cleaner numbers than reconstructing them from chunked SSE output.
     (void) telemetry_callback;
     auto sse_error = [](const std::string& message, const std::string& type,
                         const json& extra = json::object()) {
@@ -638,7 +570,6 @@ void CloudServer::forward_streaming_request(const std::string& endpoint,
                     if (length == 0) return true;
                     if (first_chunk) {
                         first_chunk = false;
-                        // Skip leading whitespace before classifying.
                         size_t i = 0;
                         while (i < length && std::isspace(static_cast<unsigned char>(data[i]))) ++i;
                         if (i < length && (data[i] == 'd' || data[i] == ':')) {
@@ -777,32 +708,20 @@ std::vector<ModelInfo> CloudServer::discover_models(const std::string& provider,
     }
 
     for (const auto& m : *model_array) {
-        // Chat-only by design. CloudServer implements chat_completion /
-        // completion against OpenAI v1; embeddings, audio, reranking, and
-        // image use diverging wire formats across providers and belong in
-        // sibling backends. is_chat_model() trusts provider-supplied
-        // capability metadata first (supports_chat, capabilities,
-        // architecture.modality) and falls back to id pattern matching for
-        // bare responses, so the router never sees a cloud model it cannot
-        // dispatch.
+        // Chat-only by design; embeddings/audio/reranking/image belong in
+        // sibling backends with diverging wire formats.
         if (!is_chat_model(m)) {
             continue;
         }
         std::string upstream_id = m["id"].get<std::string>();
 
         ModelInfo info;
-        // Public name = "<provider>.<cleaned_upstream_id>". The cleanup
-        // rules in build_public_name() are content-pattern based and apply
-        // universally to any provider — see the function comment for the
-        // examples and rationale.
         info.model_name = build_public_name(provider, upstream_id);
         info.checkpoints["main"] = upstream_id;
         info.recipe = "cloud";
         info.cloud_provider = provider;
-        // Discovered models are "suggested" because the user explicitly
-        // configured this provider — they wouldn't have a working API key
-        // otherwise. Without this, the Model Manager UI's default
-        // suggested-only filter hides every cloud model.
+        // Mark suggested so the Model Manager's default suggested-only filter
+        // doesn't hide every cloud model the user explicitly configured.
         info.suggested = true;
         info.downloaded = true;  // Cloud models have no local artifacts.
         info.size = 0.0;
@@ -812,9 +731,7 @@ std::vector<ModelInfo> CloudServer::discover_models(const std::string& provider,
         for (auto& cap : capability_labels(m)) {
             info.labels.push_back(std::move(cap));
         }
-        // Static metadata the providers publish (all three give context_length;
-        // OpenRouter/Together also give pricing). Surfaced in /models, /health
-        // and the discover response — display only, never affects routing.
+        // Display-only metadata; never affects routing.
         if (m.contains("context_length") && m["context_length"].is_number_integer()) {
             info.max_context_window = m["context_length"].get<int64_t>();
         }
@@ -849,24 +766,21 @@ class CloudOps : public BackendOps {
 public:
     std::string resolve_checkpoint_path(const ModelInfo&,
                                         const CheckpointResolveContext&) const override {
-        // Cloud-offloaded models have no local artifacts; the checkpoint is the
-        // upstream provider's model id, used directly when forwarding requests.
+        // Cloud models have no local artifacts; the checkpoint is the upstream
+        // provider's model id, used directly when forwarding requests.
         return "";
     }
 
-    // Cloud models have no local artifacts — always "downloaded".
     bool is_downloaded(const ModelInfo&, const BackendOpsContext&) const override {
         return true;
     }
 
-    // "Downloading" a cloud model is a no-op.
     void download_model(const ModelInfo&, bool, DownloadProgressCallback,
                         const BackendOpsContext&) const override {}
 
     // Discover models from each installed cloud provider with a resolvable
-    // credential. Per AGENTS.md invariant #11 the registry persists only
-    // {provider, base_url}; keys come from env vars / process memory. Failures
-    // are logged, never propagated, so one offline provider can't block discovery.
+    // credential. Failures are logged, never propagated, so one offline
+    // provider can't block discovery.
     std::vector<ModelInfo> discover_models(const BackendOpsContext& ctx) const override {
         std::vector<ModelInfo> out;
         if (ctx.cloud_registry == nullptr) {
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
index 83d2080bc..8a424234e 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp
@@ -145,7 +145,6 @@ std::vector<std::string> flm_installed_checkpoints() {
     std::string flm_path = find_flm_binary();
     if (flm_path.empty()) return installed_models;
 
-    // Run 'flm list --filter installed --quiet --json' to get only installed models
     std::string output;
 #ifdef _WIN32
     std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL";
@@ -180,7 +179,7 @@ std::vector<std::string> flm_installed_checkpoints() {
         // Fallback to legacy parsing if JSON parsing fails
     }
 
-    // Legacy parsing - cleaner format without emojis
+    // Legacy parsing
     // Expected format:
     //   Models:
     //     - modelname:tag
@@ -188,11 +187,9 @@ std::vector<std::string> flm_installed_checkpoints() {
     std::istringstream stream(output);
     std::string line;
     while (std::getline(stream, line)) {
-        // Trim whitespace
         line.erase(0, line.find_first_not_of(" \t\r\n"));
         line.erase(line.find_last_not_of(" \t\r\n") + 1);
 
-        // Skip the "Models:" header line or empty lines
         if (line == "Models:" || line.empty()) {
             continue;
         }
@@ -200,7 +197,6 @@ std::vector<std::string> flm_installed_checkpoints() {
         // Parse model checkpoint (format: "  - modelname:tag")
         if (line.find("- ") == 0) {
             std::string checkpoint = line.substr(2);
-            // Trim any remaining whitespace
             checkpoint.erase(0, checkpoint.find_first_not_of(" \t"));
             checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1);
             if (!checkpoint.empty()) {
@@ -223,7 +219,6 @@ std::vector<ModelInfo> flm_discover_models() {
 
     LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl;
 
-    // Run 'flm list --json' to get all available models
     std::string output;
 #ifdef _WIN32
     std::string command = "\"" + flm_path + "\" list --json";
@@ -260,7 +255,6 @@ std::vector<ModelInfo> flm_discover_models() {
                     // Format display name: replace : with -, append -FLM
                     // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM"
                     std::string display_name = checkpoint;
-                    // Replace : with -
                     std::replace(display_name.begin(), display_name.end(), ':', '-');
 
                     std::string model_name = display_name + "-FLM";
@@ -285,7 +279,6 @@ std::vector<ModelInfo> flm_discover_models() {
                         info.size = m["footprint"].get<double>();
                     }
 
-                    // Labels from FLM metadata
                     if (m.contains("label") && m["label"].is_array()) {
                         for (const auto& l : m["label"]) {
                             if (l.is_string()) {
@@ -294,7 +287,6 @@ std::vector<ModelInfo> flm_discover_models() {
                         }
                     }
 
-                    // Populate type and device fields (multi-model support)
                     info.type = get_model_type_from_labels(info.labels);
                     const BackendDescriptor* flm_desc = descriptor_for("flm");
                     info.device = flm_desc ? flm_desc->default_device : DEVICE_NPU;
@@ -317,7 +309,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
                   DownloadProgressCallback progress_callback) {
     LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl;
 
-    // Ensure FLM is ready (single source of truth)
     auto status = SystemInfoCache::get_flm_status();
     if (!status.is_ready()) {
         throw std::runtime_error(status.error_string());
@@ -328,7 +319,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
         throw std::runtime_error("FLM executable not found");
     }
 
-    // Prepare arguments
     std::vector<std::string> args = {"pull", checkpoint};
     if (!do_not_upgrade) {
         args.push_back("--force");
@@ -346,14 +336,11 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
     std::string current_filename;
     bool cancelled = false;
 
-    // Run flm pull command and parse output
     int exit_code = lemon::utils::ProcessManager::run_process_with_output(
         flm_path, args,
         [&](const std::string& line) -> bool {
-            // Always print the line to console
             LOG(INFO, "FLM") << line << std::endl;
 
-            // Parse FLM output to extract progress information
             // Pattern: "[FLM]  Downloading X/Y: filename"
             if (line.find("[FLM]  Downloading ") != std::string::npos &&
                 line.find("/") != std::string::npos &&
@@ -370,7 +357,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
                         total_files = std::stoi(line.substr(slash + 1, colon - slash - 1));
                         current_filename = line.substr(colon + 2);  // Skip ": "
 
-                        // Send progress update
                         if (progress_callback) {
                             DownloadProgress progress;
                             progress.file = current_filename;
@@ -395,7 +381,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
             else if (line.find("[FLM]  Downloading: ") != std::string::npos &&
                      line.find("%") != std::string::npos) {
 
-                // Extract percentage and bytes
                 size_t start = line.find("Downloading: ") + 13;
                 size_t pct_end = line.find("%", start);
 
@@ -440,7 +425,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
                             bytes_total = parse_size(total_str);
                         }
 
-                        // Send progress update with byte-level info
                         if (progress_callback) {
                             DownloadProgress progress;
                             progress.file = current_filename;
@@ -518,7 +502,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade,
         throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code));
     }
 
-    // Send completion event
     if (progress_callback) {
         DownloadProgress progress;
         progress.complete = true;
@@ -541,7 +524,6 @@ std::string flm_version() {
         return cached_version;
     }
 
-    // Find the flm executable using shared utility
     std::string flm_path = find_flm_executable();
     if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) {
         return "unknown";
@@ -588,7 +570,6 @@ std::string flm_version() {
         size_t pos = output.find("FLM v");
         // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34").
         std::string version = output.substr(pos + 4);
-        // Trim whitespace and newlines
         size_t end = version.find_first_of(" \t\n\r");
         if (end != std::string::npos) {
             version = version.substr(0, end);
@@ -664,7 +645,6 @@ bool run_flm_validate(const std::string& flm_path, std::string& error_message) {
         if (!output.empty()) {
             json j = lemon::utils::JsonUtils::parse(output);
             if (j.is_object()) {
-                // Check for overall status
                 bool validation_ok = false;
                 if (j.contains("ready")) {
                     validation_ok = j["ready"].get<bool>();
diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
index 050b5a961..94916d5d2 100644
--- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
+++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp
@@ -32,7 +32,6 @@ namespace fs = std::filesystem;
 namespace lemon {
 namespace backends {
 
-// URL to direct users to for driver updates
 static const std::string DRIVER_INSTALL_URL = "https://lemonade-server.ai/driver_install.html";
 
 
@@ -78,7 +77,6 @@ FastFlowLMServer::~FastFlowLMServer() {
 std::string FastFlowLMServer::download_model(const std::string& checkpoint, bool do_not_upgrade) {
     LOG(INFO, "FastFlowLM") << "Pulling model with FLM: " << checkpoint << std::endl;
 
-    // Use flm pull command to download the model
     std::string flm_path = get_flm_path();
     if (flm_path.empty()) {
         throw std::runtime_error("FLM not found");
@@ -95,7 +93,6 @@ std::string FastFlowLMServer::download_model(const std::string& checkpoint, bool
     }
     LOG(INFO, "ProcessManager") << std::endl;
 
-    // Run flm pull command (with debug output if enabled)
     auto handle = utils::ProcessManager::start_process(flm_path, args, "", is_debug());
 
     // Wait for process to complete (handles both fast exits and long downloads).
@@ -157,7 +154,6 @@ void FastFlowLMServer::load(const std::string& model_name,
                            bool do_not_upgrade) {
     LOG(INFO, "FastFlowLM") << "Loading model: " << model_name << std::endl;
 
-    // Get FLM-specific options from RecipeOptions
     int ctx_size = options.get_option("ctx_size");
 
     std::cout << "[FastFlowLM] Options: ctx_size=" << ctx_size << std::endl;
@@ -165,11 +161,9 @@ void FastFlowLMServer::load(const std::string& model_name,
     // We use checkpoint_ (base class field) for FLM API calls
 
 #ifdef _WIN32
-    // On Windows, auto-install FLM binary if needed (downloads zip and extracts)
     backend_manager_->install_backend(fastflowlm::spec()->recipe, "npu");
 #endif
 
-    // Validate NPU hardware/drivers
     std::string flm_path = get_flm_path();
     std::string validate_error;
     if (!fastflowlm::run_flm_validate(flm_path, validate_error)) {
@@ -177,17 +171,13 @@ void FastFlowLMServer::load(const std::string& model_name,
             "\nVisit " + DRIVER_INSTALL_URL + " for driver installation instructions.");
     }
 
-    // Download model if needed
     download_model(model_info.checkpoint(), do_not_upgrade);
 
-    // Choose a port
     port_ = choose_port();
 
-    // Construct flm serve command based on model type
     // Bind to localhost only for security
     std::vector<std::string> args;
     if (model_type_ == ModelType::TRANSCRIPTION) {
-        // ASR mode: flm serve --asr 1
         args = {
             "serve",
             "--asr", "1",
@@ -196,7 +186,6 @@ void FastFlowLMServer::load(const std::string& model_name,
             "--quiet"
         };
     } else if (model_type_ == ModelType::EMBEDDING) {
-        // Embedding mode: flm serve --embed 1
         args = {
             "serve",
             "--embed", "1",
@@ -205,7 +194,6 @@ void FastFlowLMServer::load(const std::string& model_name,
             "--quiet"
         };
     } else {
-        // LLM mode (default): flm serve <checkpoint> --ctx-len N
         args = {
             "serve",
             model_info.checkpoint(),
@@ -226,7 +214,6 @@ void FastFlowLMServer::load(const std::string& model_name,
     set_process_handle(utils::ProcessManager::start_process(flm_path, args, "", is_debug(), true));
     LOG(INFO, "ProcessManager") << "Process started successfully" << std::endl;
 
-    // Wait for flm-server to be ready
     bool ready = wait_for_ready();
     if (!ready) {
         const ProcessHandle handle = consume_process_handle_for_cleanup();
@@ -277,14 +264,12 @@ bool FastFlowLMServer::wait_for_ready() {
             return false;
         }
 
-        // Try to reach the /api/tags endpoint
         if (utils::HttpClient::is_reachable(tags_url, 1)) {
             LOG(INFO, "FastFlowLM") << server_name_ + " is ready!" << std::endl;
             start_backend_watchdog("/api/tags");
             return true;
         }
 
-        // Sleep 1 second between attempts
         std::this_thread::sleep_for(std::chrono::seconds(1));
     }
 
@@ -303,7 +288,7 @@ json FastFlowLMServer::chat_completion(const json& request) {
     // FLM requires the checkpoint name in the request (e.g., "gemma3:4b")
     // (whereas llama-server ignores the model name field)
     json modified_request = request;
-    modified_request["model"] = checkpoint_;  // Use base class checkpoint field
+    modified_request["model"] = checkpoint_;
 
     return forward_request("/v1/chat/completions", modified_request);
 }
@@ -318,7 +303,7 @@ json FastFlowLMServer::completion(const json& request) {
     // FLM requires the checkpoint name in the request (e.g., "lfm2:1.2b")
     // (whereas llama-server ignores the model name field)
     json modified_request = request;
-    modified_request["model"] = checkpoint_;  // Use base class checkpoint field
+    modified_request["model"] = checkpoint_;
 
     return forward_request("/v1/completions", modified_request);
 }
@@ -349,7 +334,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) {
     }
 
     try {
-        // Extract audio data from request (same format as WhisperServer)
         if (!request.contains("file_data")) {
             throw std::runtime_error("Missing 'file_data' in request");
         }
@@ -357,7 +341,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) {
         std::string audio_data = request["file_data"].get<std::string>();
         std::string filename = request.value("filename", "audio.wav");
 
-        // Determine content type from filename extension
         std::filesystem::path filepath(filename);
         std::string ext = filepath.extension().string();
         std::string content_type = "audio/wav";
@@ -367,10 +350,8 @@ json FastFlowLMServer::audio_transcriptions(const json& request) {
         else if (ext == ".flac") content_type = "audio/flac";
         else if (ext == ".webm") content_type = "audio/webm";
 
-        // Build multipart fields for FLM's /v1/audio/transcriptions endpoint
         std::vector<utils::MultipartField> fields;
 
-        // Audio file field
         fields.push_back({
             "file",
             audio_data,
@@ -381,7 +362,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) {
         // Model field (required by OpenAI API format)
         fields.push_back({"model", checkpoint_, "", ""});
 
-        // Optional parameters
         if (request.contains("language")) {
             fields.push_back({"language", request["language"].get<std::string>(), "", ""});
         }
@@ -408,7 +388,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) {
 }
 
 json FastFlowLMServer::responses(const json& request) {
-    // Responses API is not supported for FLM backend
     return ErrorResponse::from_exception(
         UnsupportedOperationException("Responses API", "flm")
     );
@@ -420,7 +399,6 @@ void FastFlowLMServer::forward_streaming_request(const std::string& endpoint,
                                                   bool sse,
                                                   long timeout_seconds,
                                                   TelemetryCallback telemetry_callback) {
-    // Streaming is only supported for LLM models
     if (model_type_ == ModelType::TRANSCRIPTION || model_type_ == ModelType::EMBEDDING) {
         std::string error_msg = "data: {\"error\":{\"message\":\"Streaming not supported for FLM "
             + model_type_to_string(model_type_) + " model\",\"type\":\"unsupported_operation\"}}\n\n";
@@ -433,10 +411,9 @@ void FastFlowLMServer::forward_streaming_request(const std::string& endpoint,
     // not the Lemonade model name (e.g., "Gemma3-4b-it-FLM")
     try {
         json request = json::parse(request_body);
-        request["model"] = checkpoint_;  // Use base class checkpoint field
+        request["model"] = checkpoint_;
         std::string modified_body = request.dump();
 
-        // Call base class with modified request
         WrappedServer::forward_streaming_request(endpoint, modified_body, sink, sse,
                                                  timeout_seconds, telemetry_callback);
     } catch (const json::exception& e) {
diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp
index 95d46de6a..a78e0a954 100644
--- a/src/cpp/server/backends/kokoro/kokoro_server.cpp
+++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp
@@ -73,11 +73,9 @@ KokoroServer::~KokoroServer() {
 void KokoroServer::load(const std::string& model_name, const ModelInfo& model_info, const RecipeOptions& options, bool do_not_upgrade) {
     LOG(INFO, "KokoroServer") << "Loading model: " << model_name << std::endl;
 
-    // Install kokoros if needed
     const std::string backend = default_kokoro_backend();
     backend_manager_->install_backend(kokoro::spec()->recipe, backend);
 
-    // Use pre-resolved model path
     fs::path model_path = fs::path(model_info.resolved_path());
     if (model_path.empty() || !fs::exists(model_path)) {
         throw std::runtime_error("Model file not found for checkpoint: " + model_info.checkpoint());
@@ -94,10 +92,8 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
 
     LOG(INFO, "KokoroServer") << "Using model: " << model_index["model"] << std::endl;
 
-    // Get koko executable path
     std::string exe_path = BackendUtils::get_backend_binary_path(*kokoro::spec(), backend);
 
-    // Choose a port
     port_ = choose_port();
     if (port_ == 0) {
         throw std::runtime_error("Failed to find an available port");
@@ -110,7 +106,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
     env_vars.push_back({"ESPEAK_DATA_PATH", (exe_dir / "espeak-ng-data").string()});
 #ifndef _WIN32
     std::string lib_path = exe_dir.string();
-    // Preserve existing LD_LIBRARY_PATH if it exists
     const char* existing_ld_path = std::getenv("LD_LIBRARY_PATH");
     if (existing_ld_path && strlen(existing_ld_path) > 0) {
         lib_path = lib_path + ":" + std::string(existing_ld_path);
@@ -120,7 +115,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
     LOG(INFO, "KokoroServer") << "Setting LD_LIBRARY_PATH=" << lib_path << std::endl;
 #endif
 
-    // Build command line arguments
     // Note: Don't include exe_path here - ProcessManager::start_process already handles it
     fs::path model_dir = model_path.parent_path();
     std::vector<std::string> args = {
@@ -131,7 +125,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
         "--port", std::to_string(port_)
     };
 
-    // Launch the subprocess
     ProcessHandle started_handle = utils::ProcessManager::start_process(
         exe_path,
         args,
@@ -148,7 +141,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in
 
     LOG(INFO, "KokoroServer") << "Process started with PID: " << started_handle.pid << std::endl;
 
-    // Wait for server to be ready
     if (!wait_for_ready("/")) {
         unload();
         throw std::runtime_error("koko failed to start or become ready");
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
index 81cc1c555..f41c2bd08 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp
@@ -30,7 +30,7 @@ std::string to_lower(std::string s) {
 std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant) {
     fs::path model_cache_path_fs = path_from_utf8(model_cache_path);
     if (!hf_cache::exists(model_cache_path_fs)) {
-        return model_cache_path;  // Return directory path even if not found
+        return model_cache_path;
     }
 
     // Collect the (sorted, mmproj-excluded) GGUF files under a search root.
@@ -64,11 +64,8 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st
 
     const std::string variant_lower = to_lower(variant);
 
-    // Resolve the requested GGUF variant within a candidate list of files.
-    // Returns the matched absolute path, or "" if this candidate set does not
-    // contain the variant. Factored into a lambda so the search can be retried
-    // against a broader set of snapshots (see #2300 below) without duplicating
-    // the matching logic.
+    // Factored into a lambda so the search can be retried against a broader set
+    // of snapshots (see #2300 below) without duplicating the matching logic.
     auto resolve_gguf_variant = [&](const std::vector<std::string>& gguf_files) -> std::string {
         if (gguf_files.empty()) {
             return "";
@@ -115,26 +112,14 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st
             }
         }
 
-        // Case 5: Local quant-token fallback.
+        // Case 5: Local quant-token fallback. Some repos put the quant token in
+        // the middle of the filename (e.g. ...-IQ4_XS-Q8nextn.gguf for variant
+        // IQ4_XS), so the suffix cases above miss it; mirror the downloader's
+        // variant enumeration over the local cache instead.
         //
-        // Keep the existing resolver cases above as the primary logic: exact
-        // filenames, suffix matches, and folder-based sharding are more
-        // specific and preserve the CHECKPOINT:VARIANT contract.
-        //
-        // Some GGUF repositories name files with the quant token in the middle,
-        // for example:
-        //   Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf
-        // for variant:
-        //   IQ4_XS
-        // That file does not end with IQ4_XS.gguf, so mirror the downloader's
-        // GGUF variant enumeration over the files that are already present in
-        // the local HF cache before declaring the model missing.
-        //
-        // HF cache paths have an extra snapshots/<revision>/ prefix that is not
-        // part of the repository-relative filename. Strip it before calling
-        // enumerate_gguf_variants(); otherwise the enumerator treats
-        // "snapshots" as a top-level sharded-folder variant and never extracts
-        // the quant token from the actual GGUF filename.
+        // Strip the HF cache snapshots/<revision>/ prefix before calling
+        // enumerate_gguf_variants(), otherwise it treats "snapshots" as a
+        // sharded-folder variant and never extracts the quant token.
         std::vector<std::string> relative_gguf_files;
         std::map<std::string, std::string> absolute_by_relative;
         auto repo_relative_from_cache_relative = [](std::string rel) {
@@ -191,9 +176,8 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st
             return "";
         }
 
-        // No match in this candidate set. Do not fall back to another
-        // quantization in the same Hugging Face repo; otherwise a custom
-        // download with a different quant can make a built-in model appear
+        // Don't fall back to another quantization in the same HF repo; a custom
+        // download with a different quant could make a built-in model appear
         // downloaded and allow deleting the wrong file.
         return "";
     };
@@ -215,25 +199,19 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st
     };
 
     if (active_gguf_files.empty() && whole_cache_gguf_files().empty()) {
-        return model_cache_path;  // Return directory if no GGUF found anywhere
+        return model_cache_path;
     }
 
     std::string resolved_path = resolve_gguf_variant(active_gguf_files);
 
-    // #2300: a sibling variant that shares this HF repo can live in a snapshot
-    // other than the one refs/main points at. refs/main advances to the
-    // snapshot of whichever variant was pulled or updated last, leaving the
-    // other variants' symlinks behind in earlier snapshots; after a restart the
-    // refs/main-only search above then reports them as missing. If the active
-    // snapshot did not contain the requested variant, broaden the search to
-    // every snapshot in this repo's cache before declaring it missing. Blobs are
-    // content-addressed and shared, so reading an older snapshot is safe, and
-    // resolving against the active snapshot first preserves the CHECKPOINT:VARIANT
-    // contract (a different quant is never substituted while the exact one exists).
-    //
-    // The whole-cache set is a superset of the active set, so the two are equal
-    // only when refs/main's snapshot is the sole snapshot holding GGUFs — in
-    // which case the broader search is identical and skipped.
+    // #2300: a requested variant can live in a snapshot other than the one
+    // refs/main points at (refs/main advances to whichever variant was pulled
+    // last, stranding the others in earlier snapshots), so the active-only
+    // search above can report it missing. Broaden to every snapshot before
+    // giving up; blobs are content-addressed so reading an older snapshot is
+    // safe, and searching the active snapshot first preserves CHECKPOINT:VARIANT.
+    // The whole-cache set is a superset of the active set; when the two are equal
+    // the broader search would be identical, so it is skipped.
     if (resolved_path.empty()) {
         const std::vector<std::string>& all_files = whole_cache_gguf_files();
         if (all_files != active_gguf_files) {
@@ -247,4 +225,3 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st
 } // namespace llamacpp
 } // namespace backends
 } // namespace lemon
-
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
index eb766e798..ef1a67aa6 100644
--- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
+++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp
@@ -48,7 +48,6 @@ using namespace lemon::utils;
 namespace lemon {
 namespace backends {
 
-// Embedding model batch configuration set to 8192 as default
 static const int EMBEDDING_BATCH_SIZE = 8192;
 static const int EMBEDDING_UBATCH_SIZE = 8192;
 
@@ -272,7 +271,6 @@ void LlamaCppServer::load(const std::string& model_name,
                          bool do_not_upgrade) {
     LOG(INFO, "LlamaCpp") << "Loading model: " << model_name << std::endl;
 
-    // Llamacpp Backend logging
     LOG(DEBUG, "LlamaCpp") << "Per-model settings: " << options.to_log_string() << std::endl;
 
     int ctx_size = options.get_option("ctx_size");
@@ -309,13 +307,10 @@ void LlamaCppServer::load(const std::string& model_name,
     std::string mmproj_path = model_info.resolved_path("mmproj");
     std::string draft_path = model_info.resolved_path("draft");
 
-    // Choose port
     port_ = choose_port();
 
-    // Get executable path
     std::string executable = BackendUtils::get_backend_binary_path(*llamacpp::spec(), llamacpp_backend);
 
-    // Check for embeddings and reranking support based on model type
     bool supports_embeddings = (model_info.type == ModelType::EMBEDDING);
     bool supports_reranking = (model_info.type == ModelType::RERANKING);
 
@@ -365,7 +360,6 @@ void LlamaCppServer::load(const std::string& model_name,
     }
     push_reserved(reserved_flags, "--mmproj", std::vector<std::string>{"-mm", "-mmu", "--mmproj-url", "--no-mmproj", "--mmproj-auto", "--no-mmproj-auto", "--mmproj-offload", "--no-mmproj-offload"});
 
-    // Add draft model if present
     if (!draft_path.empty()) {
         push_arg(args, reserved_flags, "--model-draft", draft_path);
     }
@@ -571,7 +565,6 @@ void LlamaCppServer::load(const std::string& model_name,
     set_process_handle(ProcessManager::start_process(
         process_executable, args, working_dir, inherit_llama_output, true, env_vars));
 
-    // Wait for server to be ready
     if (!wait_for_ready("/health")) {
         const ProcessHandle handle = consume_process_handle_for_cleanup();
         if (has_process_handle(handle)) {
@@ -744,7 +737,6 @@ bool is_ggml_hip_plugin_available() {
         "/usr/lib64/ggml/backends0/libggml-hip.so"
     };
 
-    // Check all possible paths
     for (const auto& path : possible_paths) {
         if (fs::exists(path)) {
             return true;
diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp
index bcf263d67..f6f6c644c 100644
--- a/src/cpp/server/backends/moonshine/moonshine_server.cpp
+++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp
@@ -74,10 +74,8 @@ void MoonshineServer::load(const std::string& model_name,
 
     device_type_ = DEVICE_CPU;
 
-    // Install moonshine-server if needed
     backend_manager_->install_backend(moonshine::spec()->recipe, "cpu");
 
-    // Resolve model path from ModelManager (standard HF cache)
     std::string model_path = model_info.resolved_path();
     if (model_path.empty() || !fs::exists(model_path)) {
         throw std::runtime_error("Model directory not found for checkpoint: " + model_info.checkpoint());
@@ -100,7 +98,6 @@ void MoonshineServer::load(const std::string& model_name,
         }
     }
 
-    // Get executable path
     std::string executable = BackendUtils::get_backend_binary_path(*moonshine::spec(), "cpu");
     LOG(INFO, "MoonshineServer") << "Using executable: " << executable << std::endl;
 
@@ -159,12 +156,10 @@ void MoonshineServer::load(const std::string& model_name,
         args.insert(args.end(), custom_args_vec.begin(), custom_args_vec.end());
     }
 
-    // Set environment variables
     std::vector<std::pair<std::string, std::string>> env_vars;
     // Prevent system/user Python packages from leaking into the bundled environment
     env_vars.push_back({"PYTHONNOUSERSITE", "1"});
 
-    // Launch the subprocess
     bool inherit_output = (log_level_ == "info") || is_debug();
     ProcessHandle started_handle = utils::ProcessManager::start_process(
         executable,
@@ -182,7 +177,6 @@ void MoonshineServer::load(const std::string& model_name,
 
     LOG(INFO, "MoonshineServer") << "Process started with PID: " << started_handle.pid << std::endl;
 
-    // Wait for server to be ready
     if (!wait_for_ready("/health")) {
         unload();
         throw std::runtime_error("moonshine-server failed to start or become ready");
diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
index 69e1eed16..9b21edfca 100644
--- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
+++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp
@@ -57,10 +57,8 @@ void RyzenAIServer::load(const std::string& model_name,
     LOG(DEBUG, "RyzenAI") << "Loading model: " << model_name << std::endl;
     int ctx_size = options.get_option("ctx_size");
 
-    // Install/check RyzenAI-Server (will download if not found)
     backend_manager_->install_backend("ryzenai-llm", "npu");
 
-    // Get the path to ryzenai-server
     std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu");
     if (ryzenai_server_path.empty()) {
         throw std::runtime_error("RyzenAI-Server executable not found even after installation attempt");
@@ -68,7 +66,6 @@ void RyzenAIServer::load(const std::string& model_name,
 
     LOG(DEBUG, "RyzenAI") << "Found ryzenai-server at: " << ryzenai_server_path << std::endl;
 
-    // Model path should have been set via set_model_path() before calling load()
     if (model_path_.empty()) {
         throw std::runtime_error("Model path is required for RyzenAI-Server. Call set_model_path() before load()");
     }
@@ -81,10 +78,8 @@ void RyzenAIServer::load(const std::string& model_name,
 
     LOG(DEBUG, "RyzenAI") << "Model path: " << model_path_ << std::endl;
 
-    // Find available port
     port_ = choose_port();
 
-    // Build command line arguments
     std::vector<std::string> args = {
         "-m", model_path_,
         "--port", std::to_string(port_),
@@ -95,7 +90,6 @@ void RyzenAIServer::load(const std::string& model_name,
         args.push_back("--verbose");
     }
 
-    // Log the full command line
     LOG(DEBUG, "RyzenAI") << "Starting: \"" << ryzenai_server_path << "\"";
     for (const auto& arg : args) {
         LOG(DEBUG, "RyzenAI") << " \"" << arg << "\"";
@@ -119,7 +113,6 @@ void RyzenAIServer::load(const std::string& model_name,
     LOG(DEBUG, "ProcessManager") << "Process started successfully, PID: "
                 << started_handle.pid << std::endl;
 
-    // Wait for server to be ready
     if (!wait_for_ready("/health")) {
         const ProcessHandle handle = consume_process_handle_for_cleanup();
         if (has_process_handle(handle)) {
@@ -150,7 +143,6 @@ json RyzenAIServer::chat_completion(const json& request) {
         throw ModelNotLoadedException("RyzenAI-Server");
     }
 
-    // Forward to /v1/chat/completions endpoint
     return forward_request("/v1/chat/completions", request);
 }
 
@@ -159,7 +151,6 @@ json RyzenAIServer::completion(const json& request) {
         throw ModelNotLoadedException("RyzenAI-Server");
     }
 
-    // Forward to /v1/completions endpoint
     return forward_request("/v1/completions", request);
 }
 
@@ -168,7 +159,6 @@ json RyzenAIServer::responses(const json& request) {
         throw ModelNotLoadedException("RyzenAI-Server");
     }
 
-    // Forward to /v1/responses endpoint
     return forward_request("/v1/responses", request);
 }
 
@@ -179,8 +169,7 @@ namespace backends {
 namespace ryzenai {
 
 std::unique_ptr<WrappedServer> create(const BackendContext& ctx) {
-    // RyzenAI resolves its model path before load (set_model_path), matching the
-    // original router factory's special-casing.
+    // RyzenAI needs its model path set via set_model_path() before load() is called.
     auto server = std::make_unique<::lemon::RyzenAIServer>(
         ctx.model_info->model_name, ctx.log_level == "debug",
         ctx.model_manager, ctx.backend_manager);
diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
index a4b1787f9..9dfec350b 100644
--- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
+++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp
@@ -211,10 +211,8 @@ void SDServer::load(const std::string& model_name,
         device_type_ = DEVICE_CPU;
     }
 
-    // Install sd-server if needed
     backend_manager_->install_backend(sdcpp::spec()->recipe, backend);
 
-    // Get model path
     std::string model_path = model_info.resolved_path("main");
     std::string llm_path = model_info.resolved_path("text_encoder");
     std::string vae_path = model_info.resolved_path("vae");
@@ -233,10 +231,8 @@ void SDServer::load(const std::string& model_name,
 
     LOG(DEBUG, "SDServer") << "Using model: " << model_path << std::endl;
 
-    // Get sd-server executable path
     std::string exe_path = BackendUtils::get_backend_binary_path(*sdcpp::spec(), backend);
 
-    // Choose a port
     port_ = choose_port();
     if (port_ == 0) {
         throw std::runtime_error("Failed to find an available port");
@@ -244,7 +240,6 @@ void SDServer::load(const std::string& model_name,
 
     LOG(INFO, "SDServer") << "Starting server on port " << port_ << " (backend: " << backend << ")" << std::endl;
 
-    // Build command line arguments
     std::vector<std::string> args = {
         "--listen-port", std::to_string(port_)
     };
@@ -295,7 +290,6 @@ void SDServer::load(const std::string& model_name,
         args.insert(args.end(), custom_args_vec.begin(), custom_args_vec.end());
     }
 
-    // Set up environment variables
     std::vector<std::pair<std::string, std::string>> env_vars;
     fs::path exe_dir = fs::path(exe_path).parent_path();
 #ifdef _WIN32
@@ -304,7 +298,6 @@ void SDServer::load(const std::string& model_name,
 #endif
 
 #ifndef _WIN32
-    // For Linux, always set LD_LIBRARY_PATH to include executable directory
     std::string lib_path = exe_dir.string();
 
     if (resolved_backend == "rocm-stable") {
@@ -328,8 +321,6 @@ void SDServer::load(const std::string& model_name,
     // ROCm builds on Windows require hipblaslt.dll, rocblas.dll, amdhip64.dll, etc.
     // These DLLs are distributed alongside sd-server.exe but need PATH to be set for loading
     if (is_rocm_backend(resolved_backend)) {
-        // Add executable directory to PATH for ROCm runtime DLLs
-        // This allows the sd-server.exe to find required HIP/ROCm libraries at runtime
         std::string new_path = path_to_utf8(exe_dir);
 
         if (resolved_backend == "rocm-stable") {
@@ -368,7 +359,6 @@ void SDServer::load(const std::string& model_name,
         BackendUtils::apply_cuda_env_vars(env_vars, "SDServer");
     }
 
-    // Launch the server process
     std::string process_exe_path = exe_path;
     std::string working_dir;
 #ifdef _WIN32
@@ -392,7 +382,6 @@ void SDServer::load(const std::string& model_name,
 
     LOG(INFO, "SDServer") << "Process started with PID: " << started_handle.pid << std::endl;
 
-    // Wait for server to be ready
     if (!wait_for_ready("/")) {
         unload();
         throw std::runtime_error("sd-server failed to start or become ready");
@@ -546,7 +535,7 @@ json SDServer::responses(const json& /* request */) {
 }
 
 json SDServer::image_generations(const json& request) {
-    // Build request - sd-server uses OpenAI-compatible format.
+    // sd-server accepts an OpenAI-compatible request format.
     //
     // See PR #1173: https://github.com/leejet/stable-diffusion.cpp/pull/1173
     // for the <sd_cpp_extra_args> convention.
@@ -596,7 +585,6 @@ json SDServer::image_edits(const json& request) {
         fields.push_back({"size", size, "", ""});
     }
 
-    // Decode base64 image data back to binary for multipart upload
     if (request.contains("image_data")) {
         std::string image_binary = JsonUtils::base64_decode(
             request["image_data"].get<std::string>());
@@ -634,7 +622,6 @@ json SDServer::image_variations(const json& request) {
         fields.push_back({"size", size, "", ""});
     }
 
-    // Decode base64 image data back to binary for multipart upload
     if (request.contains("image_data")) {
         std::string image_binary = JsonUtils::base64_decode(
             request["image_data"].get<std::string>());
diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp
index 60a79c95f..335f0660a 100644
--- a/src/cpp/server/backends/vllm/vllm_server.cpp
+++ b/src/cpp/server/backends/vllm/vllm_server.cpp
@@ -123,11 +123,9 @@ void VLLMServer::load(const std::string& model_name,
 
     RuntimeConfig::validate_backend_choice("vllm", vllm_backend);
 
-    // Install vllm-server if needed
     backend_manager_->install_backend(vllm::spec()->recipe, vllm_backend);
 
-    // vLLM uses HuggingFace model names, not local file paths.
-    // The checkpoint field in server_models.json is the HF model ID.
+    // vLLM uses HuggingFace model IDs, not local file paths.
     std::string model_id = model_info.checkpoint();
     if (model_id.empty()) {
         throw std::runtime_error("Model checkpoint (HuggingFace ID) not found for: " + model_name);
@@ -135,13 +133,10 @@ void VLLMServer::load(const std::string& model_name,
 
     LOG(DEBUG, "vLLM") << "Using model: " << model_id << std::endl;
 
-    // Choose port
     port_ = choose_port();
 
-    // Get executable path
     std::string executable = BackendUtils::get_backend_binary_path(*vllm::spec(), vllm_backend);
 
-    // Build command line arguments
     std::vector<std::string> args;
     args.push_back("--model");
     args.push_back(model_id);
@@ -175,7 +170,6 @@ void VLLMServer::load(const std::string& model_name,
                            << "'; letting vLLM auto-select kernel" << std::endl;
     }
 
-    // enable prompt caching
     args.push_back("--enable-prefix-caching");
 
     // Avoid vLLM's default gpu_memory_utilization=0.92 on shared-memory systems.
@@ -186,7 +180,6 @@ void VLLMServer::load(const std::string& model_name,
         args.push_back("4G");
     }
 
-    // Append custom vllm_args if provided
     if (!vllm_args.empty()) {
         LOG(DEBUG, "vLLM") << "Adding custom arguments: " << vllm_args << std::endl;
         std::istringstream iss(vllm_args);
@@ -198,16 +191,13 @@ void VLLMServer::load(const std::string& model_name,
 
     LOG(INFO, "vLLM") << "Starting vllm-server on port " << get_backend_port() << "..." << std::endl;
 
-    // Set environment variables
     std::vector<std::pair<std::string, std::string>> env_vars;
 
-    // The vllm-server launcher script handles LD_LIBRARY_PATH for ROCm libs.
-    // Set FLASH_ATTENTION_TRITON_AMD_ENABLE for ROCm flash attention.
+    // Enable ROCm flash attention (the launcher script handles LD_LIBRARY_PATH).
     env_vars.push_back({"FLASH_ATTENTION_TRITON_AMD_ENABLE", "TRUE"});
     // Prevent system/user Python packages from leaking into the bundled vLLM environment
     env_vars.push_back({"PYTHONNOUSERSITE", "1"});
 
-    // Start process
     bool inherit_output = (log_level_ == "info") || is_debug();
     set_process_handle(ProcessManager::start_process(executable, args, "", inherit_output, true, env_vars));
 
diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
index d1222e551..2d9a683b0 100644
--- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
+++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp
@@ -68,7 +68,6 @@ WhisperServer::WhisperServer(const std::string& log_level, ModelManager* model_m
 WhisperServer::~WhisperServer() {
     unload();
 
-    // Clean up temp directory
     try {
         if (fs::exists(temp_dir_)) {
             fs::remove_all(temp_dir_);
@@ -127,7 +126,6 @@ InstallParams WhisperServer::get_install_params(const std::string& backend, cons
     return params;
 }
 
-// Helper to determine NPU compiled cache info based on model info from server_models.json
 static std::pair<std::string, std::string> get_npu_cache_info(const ModelInfo& model_info) {
     std::string npu_cache = model_info.checkpoint("npu_cache");
     std::string npu_cache_repo = "";
@@ -147,7 +145,6 @@ static std::pair<std::string, std::string> get_npu_cache_info(const ModelInfo& m
         return {npu_cache_repo, npu_cache_filename};
     }
 
-    // No NPU cache configured for this model in server_models.json
     LOG(INFO, "WhisperServer") << "No NPU cache configured for model: " << model_info.model_name << std::endl;
     return {"", ""};
 }
@@ -185,19 +182,16 @@ void WhisperServer::download_npu_compiled_cache(const std::string& model_path,
         throw std::runtime_error("npu_cache path escapes model directory");
     }
 
-    // Check if cache already exists
     if (fs::exists(cache_path) && !do_not_upgrade) {
         LOG(INFO, "WhisperServer") << "NPU cache already exists: " << cache_path << std::endl;
         return;
     }
 
     try {
-        // Download .rai file directly from HuggingFace using HttpClient
         std::string hf_url = "https://huggingface.co/" + cache_repo + "/resolve/main/" + cache_filename;
 
         LOG(INFO, "WhisperServer") << "Downloading from: " << hf_url << std::endl;
 
-        // Download directly to the target location
         auto download_result = utils::HttpClient::download_file(
             hf_url,
             cache_path.string(),
@@ -253,15 +247,13 @@ void WhisperServer::load(const std::string& model_name,
     LOG(INFO, "WhisperServer") << "Using model: " << model_path << std::endl;
     LOG(INFO, "WhisperServer") << "Using backend: " << whispercpp_backend << std::endl;
 
-    // For NPU backend, download the compiled cache (.rai file). This is a must-have for NPU backend.
+    // The NPU backend requires its compiled cache (.rai file); download it first.
     if (whispercpp_backend == "npu") {
         download_npu_compiled_cache(model_path, model_info, do_not_upgrade);
     }
 
-    // Get whisper-server executable path
     std::string exe_path = BackendUtils::get_backend_binary_path(*whispercpp::spec(), whispercpp_backend);
 
-    // Choose a port
     port_ = choose_port();
     if (port_ == 0) {
         throw std::runtime_error("Failed to find an available port");
@@ -269,7 +261,7 @@ void WhisperServer::load(const std::string& model_name,
 
     LOG(INFO, "WhisperServer") << "Starting server on port " << port_ << std::endl;
 
-    // Build command line arguments. Lemonade manages the model path and port;
+    // Lemonade manages the model path and port;
     // optional whisper-server flags like --convert come from whispercpp_args.
     // Note: Don't include exe_path here - ProcessManager::start_process already handles it
     std::vector<std::string> args = {
@@ -298,12 +290,10 @@ void WhisperServer::load(const std::string& model_name,
 
     // Note: whisper-server doesn't support --debug flag
 
-    // Set up environment variables for shared library loading
     std::vector<std::pair<std::string, std::string>> env_vars;
     fs::path exe_dir = fs::path(exe_path).parent_path();
 
 #ifndef _WIN32
-    // set LD_LIBRARY_PATH to include executable directory
     std::string lib_path = exe_dir.string();
 
     // ROCm whisper-server needs the TheRock ROCm libs (libamd_comgr.so.3, etc.)
@@ -330,7 +320,6 @@ void WhisperServer::load(const std::string& model_name,
     }
 #endif
 
-    // Launch the subprocess
     ProcessHandle started_handle = utils::ProcessManager::start_process(
         exe_path,
         args,
@@ -347,7 +336,6 @@ void WhisperServer::load(const std::string& model_name,
 
     LOG(INFO, "WhisperServer") << "Process started with PID: " << started_handle.pid << std::endl;
 
-    // Wait for server to be ready
     if (!wait_for_ready("/health")) {
         unload();
         throw std::runtime_error("whisper-server failed to start or become ready");
@@ -399,14 +387,13 @@ json WhisperServer::responses(const json& request) {
 // Audio file handling helpers
 std::string WhisperServer::save_audio_to_temp(const std::string& audio_data,
                                               const std::string& filename) {
-    // Generate unique filename
     std::random_device rd;
     std::mt19937 gen(rd());
     std::uniform_int_distribution<> dis(0, 999999);
 
     std::string ext = fs::path(filename).extension().string();
     if (ext.empty()) {
-        ext = ".audio";  // Default extension
+        ext = ".audio";
     }
 
     std::stringstream ss;
@@ -414,7 +401,6 @@ std::string WhisperServer::save_audio_to_temp(const std::string& audio_data,
 
     fs::path temp_file = temp_dir_ / ss.str();
 
-    // Write audio data to file
     std::ofstream outfile(temp_file, std::ios::binary);
     if (!outfile) {
         throw std::runtime_error("Failed to create temporary audio file: " + temp_file.string());
@@ -458,12 +444,10 @@ void WhisperServer::validate_audio_file(const std::string& path) {
 json WhisperServer::build_transcription_request(const json& request, bool translate) {
     json whisper_req;
 
-    // Required fields
     if (request.contains("file_path")) {
         whisper_req["file"] = request["file_path"];
     }
 
-    // Optional fields
     if (request.contains("language") && !translate) {
         // For transcription, respect language hint
         whisper_req["language"] = request["language"];
@@ -481,10 +465,9 @@ json WhisperServer::build_transcription_request(const json& request, bool transl
     if (request.contains("response_format")) {
         whisper_req["response_format"] = request["response_format"];
     } else {
-        whisper_req["response_format"] = "json";  // Default
+        whisper_req["response_format"] = "json";
     }
 
-    // Add translate flag if needed
     if (translate) {
         whisper_req["translate"] = true;
     }
@@ -492,11 +475,9 @@ json WhisperServer::build_transcription_request(const json& request, bool transl
     return whisper_req;
 }
 
-// Forward audio file to whisper-server using multipart form-data
 json WhisperServer::forward_multipart_audio_request(const std::string& file_path,
                                                     const json& params,
                                                     bool translate) {
-    // Read the audio file content
     std::ifstream file(file_path, std::ios::binary);
     if (!file) {
         throw std::runtime_error("Could not open audio file: " + file_path);
@@ -509,12 +490,10 @@ json WhisperServer::forward_multipart_audio_request(const std::string& file_path
 
     LOG(DEBUG, "WhisperServer") << "Audio file size: " << file_content.size() << " bytes" << std::endl;
 
-    // Determine content type based on file extension
     fs::path filepath(file_path);
     std::string ext = filepath.extension().string();
-    std::string content_type = "audio/wav";  // Default
+    std::string content_type = "audio/wav";
 
-    // Map common audio extensions to MIME types
     if (ext == ".mp3") content_type = "audio/mpeg";
     else if (ext == ".wav") content_type = "audio/wav";
     else if (ext == ".m4a") content_type = "audio/mp4";
@@ -531,7 +510,6 @@ json WhisperServer::forward_multipart_audio_request(const std::string& file_path
     audio_file.content_type = content_type;
     fields.push_back(audio_file);
 
-    // Add optional parameters as form fields
     std::string response_format = params.value("response_format", "json");
     utils::MultipartField fmt_field;
     fmt_field.name = "response_format";
@@ -585,7 +563,6 @@ json WhisperServer::forward_multipart_audio_request(const std::string& file_path
                                 std::to_string(res.status_code) + ": " + res.body);
     }
 
-    // Try to parse as JSON
     try {
         return json::parse(res.body);
     } catch (const json::parse_error&) {
@@ -604,10 +581,9 @@ json WhisperServer::forward_multipart_audio_data(const std::string& audio_data,
 
     LOG(DEBUG, "WhisperServer") << "Audio data size: " << audio_data.size() << " bytes (no file I/O)" << std::endl;
 
-    // Determine content type based on filename extension
     fs::path filepath(filename);
     std::string ext = filepath.extension().string();
-    std::string content_type = "audio/wav";  // Default
+    std::string content_type = "audio/wav";
 
     if (ext == ".mp3") content_type = "audio/mpeg";
     else if (ext == ".wav") content_type = "audio/wav";
@@ -683,7 +659,6 @@ json WhisperServer::forward_multipart_audio_data(const std::string& audio_data,
 // ITranscriptionServer implementation
 json WhisperServer::audio_transcriptions(const json& request) {
     try {
-        // Extract audio data from request
         if (!request.contains("file_data")) {
             throw std::runtime_error("Missing 'file_data' in request");
         }
@@ -691,7 +666,6 @@ json WhisperServer::audio_transcriptions(const json& request) {
         std::string audio_data = request["file_data"].get<std::string>();
         std::string filename = request.value("filename", "audio.wav");
 
-        // Send directly to whisper-server without file I/O
         return forward_multipart_audio_data(audio_data, filename, request, false);
 
     } catch (const std::exception& e) {
diff --git a/src/cpp/server/config_file.cpp b/src/cpp/server/config_file.cpp
index 2787c0167..dce9d17d9 100644
--- a/src/cpp/server/config_file.cpp
+++ b/src/cpp/server/config_file.cpp
@@ -32,11 +32,9 @@ json ConfigFile::base_defaults() {
     json defaults = load_json_file(utils::path_from_utf8(
         utils::get_resource_path("resources/defaults.json")));
 
-    // Seed each backend's config.json section from its descriptor. The per-recipe
-    // defaults are authored in the backend's descriptor; resources/defaults.json
-    // is the generated, committed mirror (see GET /internal/config/defaults and
-    // docs/tools/gen_backend_boilerplate.py). Re-seeding here keeps the descriptor
-    // authoritative even if the committed file lags. Empty result = no section.
+    // Seed each backend's config.json section from its descriptor.
+    // resources/defaults.json is the generated, committed mirror; re-seeding here
+    // keeps the descriptor authoritative even if that file lags.
     for (const auto* d : backends::all_descriptors()) {
         json block = d->config_defaults();
         if (!block.empty()) {
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp
index 02679e803..276449b6f 100644
--- a/src/cpp/server/model_manager.cpp
+++ b/src/cpp/server/model_manager.cpp
@@ -110,8 +110,6 @@ namespace lemon {
 // Properties which are defined by the user for model registration.
 static const std::vector<std::string> USER_DEFINED_MODEL_PROPS = std::vector<std::string>{"checkpoints", "checkpoint", "recipe", "mmproj", "size", "image_defaults", "components", "recipe_options"};
 
-// Helper functions for string operations — use shared implementations from gguf_reader_detail
-
 static constexpr const char USER_MODEL_PREFIX[] = "user.";
 static constexpr size_t USER_MODEL_PREFIX_LEN = sizeof(USER_MODEL_PREFIX) - 1;
 static constexpr const char EXTRA_MODEL_PREFIX[] = "extra.";
@@ -1084,7 +1082,7 @@ std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::s
 
     // Compute the HF cache location for this checkpoint's repo, then let the
     // backend's ops find its artifact within (a .gguf file, a genai_config.json
-    // directory, a .bin, …) — no per-recipe switchboard here.
+    // directory, a .bin, …).
     backends::CheckpointResolveContext ctx;
     ctx.hf_cache = hf_cache;
     ctx.repo_id = checkpoint_to_repo_id(checkpoint);
@@ -2160,7 +2158,7 @@ void ModelManager::register_user_model(const std::string& model_name,
     std::string recipe = model_data.value("recipe", "");
 
     // Inject the backend's default labels for models that omit them (e.g. sd-cpp
-    // -> image, whispercpp/moonshine -> transcription). Sourced from the descriptor.
+    // -> image, whispercpp/moonshine -> transcription).
     if (const auto* desc = lemon::backends::descriptor_for(recipe)) {
         for (const auto& label : desc->default_labels) {
             labels.insert(label);
diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp
index 70c188e34..65d1474fa 100644
--- a/src/cpp/server/recipe_options.cpp
+++ b/src/cpp/server/recipe_options.cpp
@@ -240,9 +240,9 @@ void RecipeOptions::set_option(const std::string& opt, const json& value) {
 #ifdef LEMONADE_CLI
 // CLI_OPTIONS used only by the lemonade CLI client for add_cli_options.
 // ctx_size/merge_args are the common flags; everything else is derived from
-// descriptor options that declare a CLI flag, so the CLI never needs editing
-// when a backend is added. Image-gen params (steps/cfg_scale/width/height) have
-// no cli_flag in their descriptor, so they stay recipe-level only as before.
+// descriptor options that declare a CLI flag. Image-gen params
+// (steps/cfg_scale/width/height) have no cli_flag in their descriptor, so they
+// stay recipe-level only.
 static const json& get_cli_options() {
     static const json cli_options = [] {
         json o = json::object();
diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp
index 514a9773e..eb9c9d18f 100644
--- a/src/cpp/server/router.cpp
+++ b/src/cpp/server/router.cpp
@@ -145,8 +145,7 @@ bool Router::reload_model_after_watchdog_reset(const std::string& requested_mode
 }
 
 // Slot/eviction policy for a recipe, from its descriptor (default Standard).
-// This is the recipe-static policy used for pre-load slot decisions, mirroring
-// the historical use of get_device_type_from_recipe at load time.
+// This is the recipe-static policy used for pre-load slot decisions.
 static SlotPolicy slot_policy_for_recipe(const std::string& recipe) {
     if (const auto* desc = backends::descriptor_for(recipe)) {
         return desc->slot_policy;
@@ -324,8 +323,7 @@ std::unique_ptr<WrappedServer> Router::create_backend_server(const ModelInfo& mo
     ctx.cloud_registry = cloud_registry_;
     ctx.model_info = &model_info;
 
-    // The backend registry binds each recipe's descriptor to its create(). It is
-    // the single source of truth for backend construction (see LEMON_BACKENDS).
+    // The backend registry binds each recipe to its create() (see LEMON_BACKENDS).
     std::unique_ptr<WrappedServer> new_server = backends::create_server(model_info.recipe, ctx);
     if (new_server) {
         LOG(DEBUG, "Router") << "Created backend for recipe '" << model_info.recipe
diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp
index cc9bd6189..a6f7ffa7f 100644
--- a/src/cpp/server/runtime_config.cpp
+++ b/src/cpp/server/runtime_config.cpp
@@ -32,7 +32,7 @@ RuntimeConfig* RuntimeConfig::global() {
 
 // A valid config.json backend section is the config_section of any descriptor
 // that runs a local subprocess (binary != ""). Cloud has no binary, so it is not
-// a backend section. Derived from descriptors — no hand-maintained list.
+// a backend section.
 static bool is_backend_name(const std::string& key) {
     for (const auto* desc : lemon::backends::all_descriptors()) {
         if (!desc->binary.empty() && desc->effective_config_section() == key) {
@@ -291,8 +291,7 @@ std::string RuntimeConfig::rocm_channel_for_recipe(const std::string& recipe) co
     std::string channel = rocm_channel();
     // Clamp to a channel the backend actually publishes. A backend that lists
     // only {"stable"} (e.g. sd-cpp, which has no nightly artifacts) falls back to
-    // its first channel when "nightly" is requested. Driven by the descriptor's
-    // rocm_channels, so no per-recipe special case lives here.
+    // its first channel when "nightly" is requested.
     const auto* desc = lemon::backends::descriptor_for(recipe);
     if (desc && !desc->rocm_channels.empty()) {
         const auto& channels = desc->rocm_channels;
@@ -365,9 +364,9 @@ json RuntimeConfig::recipe_options(const std::string& backend) const {
     const std::string backend_args = backend + "_args";
 
     // Translate each backend's nested config.json section into the flat
-    // recipe_options format, driven by the descriptor's option list — no
-    // per-recipe block. The flat key is the descriptor option name; the
-    // config.json key is derived from the option's role (its name suffix):
+    // recipe_options format, driven by the descriptor's option list. The flat
+    // key is the descriptor option name; the config.json key is derived from the
+    // option's role (its name suffix):
     //   *_backend -> "backend"   *_args -> variant "<backend>_args" then "args"
     //   *_device  -> "device"    everything else -> the option name verbatim
     //                            (sd-cpp's steps/cfg_scale/width/height/…)
diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp
index 511aa080c..d4f7ad1ee 100644
--- a/src/cpp/server/server.cpp
+++ b/src/cpp/server/server.cpp
@@ -4475,7 +4475,6 @@ void Server::handle_config_defaults_get(const httplib::Request& /*req*/, httplib
     try {
         // The canonical default config (global keys + descriptor-derived per-recipe
         // sections), independent of this host's config.json or deployment overrides.
-        // gen_backend_boilerplate.py reads this to regenerate resources/defaults.json.
         res.set_content(ConfigFile::base_defaults().dump(2), "application/json");
     } catch (const std::exception& e) {
         LOG(ERROR, "Server") << "ERROR in handle_config_defaults_get: " << e.what() << std::endl;
diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp
index f7cccc162..22ef1c749 100644
--- a/src/cpp/server/system_info.cpp
+++ b/src/cpp/server/system_info.cpp
@@ -419,15 +419,13 @@ std::vector<GPUInfo> query_dxg_amd_gpus(const std::string& gpu_type) {
 //
 // Empty family set {} means "all families of that device type"
 // The recipe/backend support matrix is assembled from every backend descriptor's
-// `support` rows (see lemon/backends/*_descriptor.cpp). Concatenated in registry
-// order; within a recipe, row order is the backend preference order. This is the
-// single source of truth — there is no separate hand-maintained table.
+// `support` rows. Concatenated in registry order; within a recipe, row order is
+// the backend preference order.
 static const std::vector<RecipeBackendDef>& recipe_defs() {
     static const std::vector<RecipeBackendDef> defs = [] {
         std::vector<RecipeBackendDef> v;
         for (const auto* desc : lemon::backends::all_descriptors()) {
             for (const auto& row : desc->support) {
-                // Fill in the recipe (the owning descriptor's) per support row.
                 v.push_back({desc->recipe, row.backend, row.supported_os, row.devices, row.device_summary});
             }
         }
@@ -579,7 +577,7 @@ static bool device_matches_constraint(const std::string& device_family,
 // Generic installation check
 static bool is_recipe_installed(const std::string& recipe, const std::string& backend, std::string& error_message) {
     // Special handling for ROCm backends on gfx1151 (Strix Halo) if the kernel
-    // CWSR fix is missing — which backends' rocm build needs it is a descriptor flag.
+    // CWSR fix is missing; the descriptor's rocm_requires_cwsr_fix flag opts a backend in.
     const auto* cwsr_desc = backends::descriptor_for(recipe);
     if (backend == "rocm" && cwsr_desc && cwsr_desc->rocm_requires_cwsr_fix &&
         needs_gfx1151_cwsr_fix()) {
@@ -607,7 +605,7 @@ static bool is_recipe_installed(const std::string& recipe, const std::string& ba
 static std::string get_recipe_version(const std::string& recipe, const std::string& backend) {
     // Read the on-disk version.txt generically, then let the backend's ops
     // override (llamacpp "system" runs llama-server --version; flm queries the
-    // CLI when no file is present). No per-recipe branches here.
+    // CLI when no file is present).
     auto* spec = try_get_spec_for_recipe(recipe);
     std::string file_version;
     if (spec) {
@@ -1471,8 +1469,7 @@ json SystemInfo::build_recipes_info(const json& devices) {
 
     // Enrich each recipe entry with descriptor metadata so clients (the desktop
     // app, the docs generator) can render display names and per-recipe option
-    // schemas without hardcoding them. This is the single source the frontend
-    // reads instead of its own per-recipe TypeScript tables.
+    // schemas without hardcoding them.
     int recipe_order = 0;
     for (const auto* desc : lemon::backends::all_descriptors()) {
         auto it = recipes.find(desc->recipe);
@@ -1488,11 +1485,9 @@ json SystemInfo::build_recipes_info(const json& devices) {
         entry["modality"] = desc->modality;
         entry["experimental"] = desc->experimental;
         entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name;
-        entry["web_priority"] = desc->web_priority;
         entry["slot_policy"] = slot_policy_to_string(desc->slot_policy);
         // Machine-independent support matrix (OS + device families + friendly
-        // device summary per backend), straight from the descriptor — used by the
-        // docs generator to render the README support matrix etc.
+        // device summary per backend), straight from the descriptor.
         json support = json::array();
         for (const auto& row : desc->support) {
             json devices = json::array();
@@ -1586,8 +1581,7 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std
 std::string SystemInfo::check_recipe_supported(const std::string& recipe) {
     // A backend whose descriptor declares no support rows has no local
     // hardware/OS gating (e.g. cloud offload): availability is determined at
-    // runtime (provider creds via the CloudProviderRegistry / API key), checked
-    // elsewhere in filter_models_by_backend / CloudServer::load.
+    // runtime (provider creds via the CloudProviderRegistry / API key).
     const auto* desc = lemon::backends::descriptor_for(recipe);
     if (desc && desc->support.empty()) {
         return "";
diff --git a/test/cpp/test_auto_tune.cpp b/test/cpp/test_auto_tune.cpp
index 75dbeba74..c1f976f83 100644
--- a/test/cpp/test_auto_tune.cpp
+++ b/test/cpp/test_auto_tune.cpp
@@ -1,12 +1,5 @@
 // Standalone test for GGUF array storage and weighted KV cache computation.
 //
-// Covers:
-//  - GgufMetadata raw array fields (head_count_kv_per_layer, sliding_window_pattern)
-//  - Post-loop derivation of scalar convenience fields
-//  - compute_weighted_kv_cache_bytes_per_token() with per-layer arrays
-//  - full_attention_interval exact count (floor((n-1)/interval) + 1)
-//  - SWA precise weighted sum vs proportional approximation
-//
 // Compile: g++ -std=c++17 -I src/cpp/include test/cpp/test_auto_tune.cpp -o test_auto_tune
 
 #include "lemon/gguf_reader.h"
@@ -23,17 +16,13 @@ static void check(const char* name, bool ok) {
     if (!ok) ++g_failures;
 }
 
-// Floating-point equality with tolerance
 static bool approx_eq(double a, double b, double tol = 0.001) {
     return std::fabs(a - b) < tol;
 }
 
-// ── Helpers to simulate post-loop derivation ─────────────────────────
-
 /// Simulate what read_gguf_metadata does after the KV loop:
 /// derive head_count_kv and swa_layer_count from raw arrays/scalars.
 static void derive_scalars(GgufMetadata& m) {
-    // head_count_kv derivation
     if (!m.head_count_kv_per_layer.empty()) {
         for (int64_t v : m.head_count_kv_per_layer)
             m.head_count_kv += v;
@@ -42,19 +31,16 @@ static void derive_scalars(GgufMetadata& m) {
         m.head_count_kv_per_layer.assign(m.block_count, m.head_count_kv_scalar);
     }
 
-    // swa_layer_count derivation
     if (!m.sliding_window_pattern.empty()) {
         for (bool v : m.sliding_window_pattern)
             if (v) m.swa_layer_count++;
     }
 }
 
-// ── Test: scalar head_count_kv derivation ─────────────────────────────
-
 static void test_scalar_head_count_kv() {
     GgufMetadata m;
     m.block_count = 32;
-    m.head_count_kv_scalar = 4;  // 4 KV heads per block
+    m.head_count_kv_scalar = 4;
 
     derive_scalars(m);
 
@@ -68,8 +54,6 @@ static void test_scalar_head_count_kv() {
                       [](int64_t v) { return v == 4; }));
 }
 
-// ── Test: array head_count_kv derivation ──────────────────────────────
-
 static void test_array_head_count_kv() {
     GgufMetadata m;
     m.block_count = 4;
@@ -83,8 +67,6 @@ static void test_array_head_count_kv() {
           m.head_count_kv == 48);
 }
 
-// ── Test: sliding_window_pattern derivation ───────────────────────────
-
 static void test_swa_pattern_derivation() {
     GgufMetadata m;
     m.block_count = 8;
@@ -96,13 +78,11 @@ static void test_swa_pattern_derivation() {
           m.swa_layer_count == 4);
 }
 
-// ── Test: standard MHA/GQA (no scaling) ──────────────────────────────
-
 static void test_standard_mha() {
     GgufMetadata m;
     m.block_count = 32;
     m.key_length = 128;
-    m.head_count_kv_per_layer.assign(32, 4);  // 4 KV heads per block
+    m.head_count_kv_per_layer.assign(32, 4);
 
     derive_scalars(m);
     // Expected: 128 total heads * 128 key_len * 2[F16] * 2[K+V] = 65536
@@ -112,8 +92,6 @@ static void test_standard_mha() {
           approx_eq(bytes, 128.0 * 128.0 * 4.0));
 }
 
-// ── Test: SWA with per-layer arrays (precise weighted sum) ───────────
-
 static void test_swa_precise() {
     GgufMetadata m;
     m.block_count = 4;
@@ -140,10 +118,8 @@ static void test_swa_precise() {
 
     double scale = 0;
     lemon::compute_weighted_kv_cache_bytes_per_token(m, &scale);
-    // Unweighted: (8+4+8+4) * 256 = 4608
-    // scale = 5120 / 4608 ≈ 1.1111... wait, that's > 1 which is wrong
-    // Actually: weighted = 8*256 + 4*128 + 8*256 + 4*128 = 2048+512+2048+512 = 5120
-    // unweighted = (8+4+8+4) * 256 = 24 * 256 = 6144
+    // weighted = 8*256 + 4*128 + 8*256 + 4*128 = 5120
+    // unweighted = (8+4+8+4) * 256 = 6144
     // scale = 5120 / 6144 ≈ 0.8333
     check("swa-precise: scale factor < 1.0",
           scale > 0.0 && scale < 1.0);
@@ -151,8 +127,6 @@ static void test_swa_precise() {
           approx_eq(scale, 5120.0 / 6144.0));
 }
 
-// ── Test: SWA with scalar fallback (proportional approximation) ──────
-
 static void test_swa_scalar_fallback() {
     GgufMetadata m;
     m.block_count = 4;
@@ -160,7 +134,7 @@ static void test_swa_scalar_fallback() {
     m.key_length_swa = 128;
 
     // Scalar case: no per-layer array, no sliding_window_pattern
-    m.head_count_kv_scalar = 8;  // uniform 8 heads per block
+    m.head_count_kv_scalar = 8;
 
     derive_scalars(m);
     // After derivation, per_layer IS populated from scalar, and swa_pattern is empty.
@@ -175,14 +149,12 @@ static void test_swa_scalar_fallback() {
           approx_eq(bytes, 32.0 * 256.0 * 4.0));
 }
 
-// ── Test: SWA with scalar + manually set swa_layer_count ─────────────
-
 static void test_swa_scalar_with_count() {
     GgufMetadata m;
     m.block_count = 4;
     m.key_length = 256;
     m.key_length_swa = 128;
-    m.swa_layer_count = 2;  // 2 out of 4 layers are SWA
+    m.swa_layer_count = 2;
 
     m.head_count_kv_scalar = 8;
 
@@ -196,8 +168,6 @@ static void test_swa_scalar_with_count() {
           approx_eq(bytes, 32.0 * 256.0 * 4.0 * 0.75));
 }
 
-// ── Test: full_attention_interval exact count ─────────────────────────
-
 static void test_full_attention_interval() {
     // For each (blocks, interval), verify the exact count:
     // floor((blocks - 1) / interval) + 1
@@ -251,8 +221,6 @@ static void test_full_attention_interval() {
     }
 }
 
-// ── Test: full_attention_interval formula vs old approximation ────────
-
 static void test_fai_improvement() {
     // Demonstrate that the exact formula differs meaningfully from 1/interval
     // for non-divisible block counts.
@@ -278,8 +246,6 @@ static void test_fai_improvement() {
           scale > old_approx);
 }
 
-// ── Test: missing metadata returns 0 ──────────────────────────────────
-
 static void test_missing_metadata() {
     GgufMetadata m_empty;
     double bytes = lemon::compute_weighted_kv_cache_bytes_per_token(m_empty);
@@ -298,11 +264,8 @@ static void test_missing_metadata() {
     check("missing: no key_length returns 0", bytes == 0.0);
 }
 
-// ── Test: varying head counts with SWA ────────────────────────────────
-
 static void test_varying_heads_swa() {
     // Model where SWA layers have FEWER heads than full layers.
-    // This is where the precise weighted sum matters most.
     GgufMetadata m;
     m.block_count = 6;
     m.key_length = 256;
@@ -326,14 +289,11 @@ static void test_varying_heads_swa() {
     // Old proportional approximation (with uniform head count = total/6 = 6):
     //   factor = 1 - 3/6 + 3/6 * 64/256 = 1 - 0.5 + 0.125 = 0.625
     //   bytes = 60 * 256 * 4 * 0.625 = 38400
-    // The precise value (52224) is significantly different!
     double old_approx = 60.0 * 256.0 * 4.0 * 0.625;
     check("varying-heads-swa: precise differs from proportional",
           !approx_eq(bytes, old_approx, 1000.0));
 }
 
-// ── Main ──────────────────────────────────────────────────────────────
-
 int main() {
     test_scalar_head_count_kv();
     test_array_head_count_kv();