From 118f54a632ac1d0cdbf93754da795d5d38e75d7b Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 19 Jun 2026 12:48:14 -0400 Subject: [PATCH 01/39] refactor(backends): self-describing WrappedServer backends (#2287) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make each inference backend describe itself with a plain-data descriptor plus a server class, and rewrite the scattered `if (recipe == "...")` sites to read a registry built from those descriptors. Adding a backend becomes one LEMON_BACKENDS line plus a descriptor + factory file — no router, CLI, docs, or support-matrix edits. - Descriptor types (BackendDescriptor/BackendOption/SlotPolicy) + a CLI-safe data registry and a server-only factory registry, generated from the LEMON_BACKENDS list at CMake configure time. - All 9 backends carry a descriptor (device, slot policy, options, support matrix, labels, binary) and a create(). - Descriptor-driven: router creation, NPU/slot eviction, device type, recipe options/CLI flags, config-section identity, support matrix, recipe labels, cloud availability. - /system-info recipes enriched with display_name/selectable_backend/options/ support; the app reads recipe display names from it instead of hardcoded TS. - docs/tools/gen_backend_docs.py generates docs/dev/backends-reference.md from /system-info; a CI step fails on drift. Authoring guide in docs/dev/adding-a-backend.md. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/docs_and_style.yml | 17 + CMakeLists.txt | 69 ++++ docs/dev/adding-a-backend.md | 145 ++++++++ docs/dev/backends-reference.md | 325 ++++++++++++++++++ docs/dev/contribute.md | 4 + docs/tools/gen_backend_docs.py | 309 +++++++++++++++++ src/app/src/renderer/utils/recipeNames.ts | 29 +- src/app/src/renderer/utils/systemData.ts | 20 ++ src/cpp/cli/CMakeLists.txt | 4 + .../lemon/backends/backend_descriptor.h | 58 ++++ .../backends/backend_descriptor_registry.h | 25 ++ .../include/lemon/backends/backend_registry.h | 47 +++ .../include/lemon/backends/cloud_descriptor.h | 13 + .../include/lemon/backends/cloud_factory.h | 14 + .../lemon/backends/fastflowlm_descriptor.h | 13 + .../lemon/backends/fastflowlm_factory.h | 14 + .../lemon/backends/kokoro_descriptor.h | 13 + .../include/lemon/backends/kokoro_factory.h | 14 + .../lemon/backends/llamacpp_descriptor.h | 13 + .../include/lemon/backends/llamacpp_factory.h | 14 + .../lemon/backends/moonshine_descriptor.h | 13 + .../lemon/backends/moonshine_factory.h | 14 + .../lemon/backends/ryzenai_descriptor.h | 13 + .../include/lemon/backends/ryzenai_factory.h | 14 + .../include/lemon/backends/sdcpp_descriptor.h | 13 + .../include/lemon/backends/sdcpp_factory.h | 14 + .../include/lemon/backends/vllm_descriptor.h | 13 + src/cpp/include/lemon/backends/vllm_factory.h | 14 + .../lemon/backends/whispercpp_descriptor.h | 13 + .../lemon/backends/whispercpp_factory.h | 14 + src/cpp/include/lemon/model_manager.h | 13 + src/cpp/include/lemon/recipe_backend_def.h | 26 ++ src/cpp/include/lemon/wrapped_server.h | 57 ++- .../backends/backend_descriptor_registry.cpp | 29 ++ .../backend_descriptors_generated.h.in | 19 + .../backends/backend_factories_generated.h.in | 21 ++ src/cpp/server/backends/backend_registry.cpp | 31 ++ src/cpp/server/backends/cloud_descriptor.cpp | 23 ++ src/cpp/server/backends/cloud_factory.cpp | 16 + .../server/backends/fastflowlm_descriptor.cpp | 29 ++ 
.../server/backends/fastflowlm_factory.cpp | 13 + src/cpp/server/backends/kokoro_descriptor.cpp | 30 ++ src/cpp/server/backends/kokoro_factory.cpp | 13 + .../server/backends/llamacpp_descriptor.cpp | 43 +++ src/cpp/server/backends/llamacpp_factory.cpp | 13 + .../server/backends/moonshine_descriptor.cpp | 30 ++ src/cpp/server/backends/moonshine_factory.cpp | 13 + .../server/backends/ryzenai_descriptor.cpp | 29 ++ src/cpp/server/backends/ryzenai_factory.cpp | 20 ++ src/cpp/server/backends/sdcpp_descriptor.cpp | 47 +++ src/cpp/server/backends/sdcpp_factory.cpp | 13 + src/cpp/server/backends/vllm_descriptor.cpp | 30 ++ src/cpp/server/backends/vllm_factory.cpp | 13 + .../server/backends/whispercpp_descriptor.cpp | 39 +++ .../server/backends/whispercpp_factory.cpp | 13 + src/cpp/server/model_manager.cpp | 58 +++- src/cpp/server/recipe_options.cpp | 182 +++++----- src/cpp/server/router.cpp | 143 ++++---- src/cpp/server/runtime_config.cpp | 39 ++- src/cpp/server/server.cpp | 5 +- src/cpp/server/system_info.cpp | 205 ++++------- 61 files changed, 2216 insertions(+), 334 deletions(-) create mode 100644 docs/dev/adding-a-backend.md create mode 100644 docs/dev/backends-reference.md create mode 100644 docs/tools/gen_backend_docs.py create mode 100644 src/cpp/include/lemon/backends/backend_descriptor.h create mode 100644 src/cpp/include/lemon/backends/backend_descriptor_registry.h create mode 100644 src/cpp/include/lemon/backends/backend_registry.h create mode 100644 src/cpp/include/lemon/backends/cloud_descriptor.h create mode 100644 src/cpp/include/lemon/backends/cloud_factory.h create mode 100644 src/cpp/include/lemon/backends/fastflowlm_descriptor.h create mode 100644 src/cpp/include/lemon/backends/fastflowlm_factory.h create mode 100644 src/cpp/include/lemon/backends/kokoro_descriptor.h create mode 100644 src/cpp/include/lemon/backends/kokoro_factory.h create mode 100644 src/cpp/include/lemon/backends/llamacpp_descriptor.h create mode 100644 
src/cpp/include/lemon/backends/llamacpp_factory.h create mode 100644 src/cpp/include/lemon/backends/moonshine_descriptor.h create mode 100644 src/cpp/include/lemon/backends/moonshine_factory.h create mode 100644 src/cpp/include/lemon/backends/ryzenai_descriptor.h create mode 100644 src/cpp/include/lemon/backends/ryzenai_factory.h create mode 100644 src/cpp/include/lemon/backends/sdcpp_descriptor.h create mode 100644 src/cpp/include/lemon/backends/sdcpp_factory.h create mode 100644 src/cpp/include/lemon/backends/vllm_descriptor.h create mode 100644 src/cpp/include/lemon/backends/vllm_factory.h create mode 100644 src/cpp/include/lemon/backends/whispercpp_descriptor.h create mode 100644 src/cpp/include/lemon/backends/whispercpp_factory.h create mode 100644 src/cpp/include/lemon/recipe_backend_def.h create mode 100644 src/cpp/server/backends/backend_descriptor_registry.cpp create mode 100644 src/cpp/server/backends/backend_descriptors_generated.h.in create mode 100644 src/cpp/server/backends/backend_factories_generated.h.in create mode 100644 src/cpp/server/backends/backend_registry.cpp create mode 100644 src/cpp/server/backends/cloud_descriptor.cpp create mode 100644 src/cpp/server/backends/cloud_factory.cpp create mode 100644 src/cpp/server/backends/fastflowlm_descriptor.cpp create mode 100644 src/cpp/server/backends/fastflowlm_factory.cpp create mode 100644 src/cpp/server/backends/kokoro_descriptor.cpp create mode 100644 src/cpp/server/backends/kokoro_factory.cpp create mode 100644 src/cpp/server/backends/llamacpp_descriptor.cpp create mode 100644 src/cpp/server/backends/llamacpp_factory.cpp create mode 100644 src/cpp/server/backends/moonshine_descriptor.cpp create mode 100644 src/cpp/server/backends/moonshine_factory.cpp create mode 100644 src/cpp/server/backends/ryzenai_descriptor.cpp create mode 100644 src/cpp/server/backends/ryzenai_factory.cpp create mode 100644 src/cpp/server/backends/sdcpp_descriptor.cpp create mode 100644 
src/cpp/server/backends/sdcpp_factory.cpp create mode 100644 src/cpp/server/backends/vllm_descriptor.cpp create mode 100644 src/cpp/server/backends/vllm_factory.cpp create mode 100644 src/cpp/server/backends/whispercpp_descriptor.cpp create mode 100644 src/cpp/server/backends/whispercpp_factory.cpp diff --git a/.github/workflows/docs_and_style.yml b/.github/workflows/docs_and_style.yml index a64e4d7f2..35aa4cf50 100644 --- a/.github/workflows/docs_and_style.yml +++ b/.github/workflows/docs_and_style.yml @@ -24,6 +24,23 @@ jobs: - name: Run app regression tests run: node test/app/run-app-regression-tests.cjs + backend-docs-drift: + # The backend reference doc (docs/dev/backends-reference.md) is generated from + # the self-describing backend descriptors. Build lemond, regenerate, and fail + # if the committed doc is stale — the same guarantee a lint provides. + runs-on: ubuntu-latest + concurrency: + group: ${{ github.workflow }}-backend-docs-${{ github.ref }} + cancel-in-progress: true + steps: + - uses: actions/checkout@v5 + - name: Configure and install build dependencies + run: ./setup.sh + - name: Build lemond + run: cmake --build --preset default --target lemond + - name: Check backend reference docs are up to date + run: python3 docs/tools/gen_backend_docs.py --check + markdown-link-check: runs-on: ubuntu-latest concurrency: diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e125642f..3220e6c42 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -647,6 +647,75 @@ elseif(UNIX) list(APPEND SOURCES_CORE src/cpp/server/utils/platform/process_unix.cpp) endif() +# ============================================================ +# Self-describing backends registry +# ============================================================ +# The authoritative backend list. Each entry is "<recipe>|<stem>": +# recipe - the recipe string used in server_models.json (may contain dashes) +# stem - identifier-safe name.
The backend ships two files: +# src/cpp/server/backends/<stem>_descriptor.cpp (plain data; CLI-safe) +# src/cpp/server/backends/<stem>_factory.cpp (create(); server-only) +# declaring lemon::backends::<stem>_descriptor and <stem>_create. +# +# Adding a backend is one line here plus those two files. The foreach below +# compiles the sources and regenerates the registry headers, which bind each +# descriptor to its create(). Because this list is a tracked input, editing it +# forces regeneration on the next build (a file(GLOB) would silently miss a +# newly added backend). Descriptor DATA links into both the lemonade CLI and +# lemond; only lemond links the factories (which pull in server classes). +set(LEMON_BACKENDS + # "<recipe>|<stem>" + "llamacpp|llamacpp" + "whispercpp|whispercpp" + "moonshine|moonshine" + "kokoro|kokoro" + "sd-cpp|sdcpp" + "flm|fastflowlm" + "ryzenai-llm|ryzenai" + "vllm|vllm" + "cloud|cloud" +) + +set(LEMON_DESCRIPTOR_INCLUDES "") +set(LEMON_DESCRIPTOR_ENTRIES "") +set(LEMON_FACTORY_INCLUDES "") +set(LEMON_FACTORY_ENTRIES "") +# Descriptor sources are CLI-safe (data only); factory sources are server-only. +# Absolute paths so the CLI subdirectory can reuse LEMON_BACKEND_DESCRIPTOR_SOURCES.
+set(LEMON_BACKEND_DESCRIPTOR_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp) +set(LEMON_BACKEND_FACTORY_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp) +foreach(_backend_entry ${LEMON_BACKENDS}) + string(REPLACE "|" ";" _backend_parts "${_backend_entry}") + list(GET _backend_parts 1 _backend_stem) + list(APPEND LEMON_BACKEND_DESCRIPTOR_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_descriptor.cpp) + list(APPEND LEMON_BACKEND_FACTORY_SOURCES + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_factory.cpp) + string(APPEND LEMON_DESCRIPTOR_INCLUDES + "#include \"lemon/backends/${_backend_stem}_descriptor.h\"\n") + string(APPEND LEMON_DESCRIPTOR_ENTRIES + " &lemon::backends::${_backend_stem}_descriptor,\n") + string(APPEND LEMON_FACTORY_INCLUDES + "#include \"lemon/backends/${_backend_stem}_factory.h\"\n") + string(APPEND LEMON_FACTORY_ENTRIES + " { &lemon::backends::${_backend_stem}_descriptor, &lemon::backends::${_backend_stem}_create },\n") +endforeach() + +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptors_generated.h.in + ${CMAKE_CURRENT_BINARY_DIR}/include/backend_descriptors_generated.h + @ONLY) +configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_factories_generated.h.in + ${CMAKE_CURRENT_BINARY_DIR}/include/backend_factories_generated.h + @ONLY) + +# lemond gets both descriptor data and factories; the CLI gets only the data +# (see src/cpp/cli/CMakeLists.txt, which reuses LEMON_BACKEND_DESCRIPTOR_SOURCES). 
+list(APPEND SOURCES_CORE ${LEMON_BACKEND_DESCRIPTOR_SOURCES} ${LEMON_BACKEND_FACTORY_SOURCES}) + # ============================================================ # Server core OBJECT library (shared by lemond and Lemonade.exe) # ============================================================ diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md new file mode 100644 index 000000000..512770e73 --- /dev/null +++ b/docs/dev/adding-a-backend.md @@ -0,0 +1,145 @@ +# Adding a backend + +Lemonade backends are **self-describing**. A backend declares *what it is* in a +plain-data **descriptor** and implements *how it runs* in a **server class**. A +registry collects every descriptor, and the router, the CLI, `/system-info`, and +the generated docs all read it — so there are no scattered `if (recipe == "...")` +sites to update. + +Adding a backend is **one folder's worth of files plus three small appends**: + +| You edit | What goes there | +|----------|-----------------| +| `CMakeLists.txt` → `LEMON_BACKENDS` | **one line**: `"<recipe>\|<stem>"` | +| `src/cpp/server/backends/<stem>_descriptor.cpp` + `.h` | the descriptor (plain data) | +| `src/cpp/server/backends/<stem>_factory.cpp` + `.h` | `create()` + the `WrappedServer` subclass | +| `src/cpp/resources/backend_versions.json` | version pin(s) — skip if there's no downloaded binary (e.g. cloud) | +| `src/cpp/resources/server_models.json` | the models | + +No router edits, no CLI edits, no doc edits, no support-matrix edits. + +## The descriptor (plain data — CLI-safe) + +The descriptor is the single object every consumer reads. It links into **both** +the `lemonade` CLI and `lemond`, so it must not reference server classes.
+ +`src/cpp/include/lemon/backends/<stem>_descriptor.h`: + +```cpp +#pragma once +#include "lemon/backends/backend_descriptor.h" +namespace lemon { namespace backends { +extern const BackendDescriptor <stem>_descriptor; +} } +``` + +`src/cpp/server/backends/<stem>_descriptor.cpp`: + +```cpp +#include "lemon/backends/<stem>_descriptor.h" +namespace lemon { namespace backends { +const BackendDescriptor <stem>_descriptor = { + /*recipe*/ "myrecipe", + /*display_name*/ "My Backend", + /*binary*/ "my-server", // "" = no subprocess (e.g. cloud) + /*config_section*/ "myrecipe", // defaults to recipe + /*default_device*/ DEVICE_GPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ false, // true auto-exposes "_backend" + "--" + /*uses_ctx_size*/ true, // opt in to the shared ctx_size option + /*dynamic_models*/ false, // true = models discovered at runtime (cloud) + /*options*/ { // backend-specific knobs (common ones are automatic) + {"myrecipe_args", "--myrecipe-args", "", "ARGS", "Custom args to pass", "My Options"}, + }, + /*support*/ { // OS / device families ({} = no local gating) + {"myrecipe", "cpu", {"linux", "windows"}, {{"cpu", {"x86_64"}}}}, + }, + /*default_labels*/ {}, // labels injected when a model omits them + /*required_checkpoints*/ {"main"}, // unconditional files; conditional ones checked in load() +}; +} } +``` + +`SlotPolicy` controls accelerator sharing: `Standard` (counts toward LRU slots), +`ExclusiveNpu` (evicts all NPU servers first), `CoexistByType` (one per model +type), `Unmetered` (never counted, never auto-evicted — cloud). + +## The factory + server class (server-only) + +The factory builds the `WrappedServer` subclass. It is compiled into `lemond` +only (it references server classes), which keeps the `lemonade` CLI link clean.
+ +`src/cpp/include/lemon/backends/<stem>_factory.h`: + +```cpp +#pragma once +#include <memory> +#include "lemon/backends/backend_registry.h" +namespace lemon { namespace backends { +std::unique_ptr<WrappedServer> <stem>_create(const BackendContext& ctx); +} } +``` + +`src/cpp/server/backends/<stem>_factory.cpp`: + +```cpp +#include "lemon/backends/<stem>_factory.h" +#include "lemon/backends/<stem>_server.h" +#include "lemon/wrapped_server.h" +namespace lemon { namespace backends { +std::unique_ptr<WrappedServer> <stem>_create(const BackendContext& ctx) { + return std::make_unique<MyServer>(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} +} } +``` + +The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`, +and only the capability interfaces you actually serve (`ITranscriptionServer`, +`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default +"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend +does not stub them. + +## Register it: one line + +```cmake +set(LEMON_BACKENDS + ... + "myrecipe|myrecipe" # "<recipe>|<stem>" +) +``` + +The `foreach` in `CMakeLists.txt` compiles your two sources and regenerates the +registry headers, binding the descriptor to its `create()`. + +## What you get for free + +- **Standard options:** `merge_args`, `auto_evict`, `evict_idle_timeout`, + `downsize_idle_timeout`, `evict_weight_factor`, `pinned`. `ctx_size` is opt-in + via `uses_ctx_size`. +- **Generated CLI flags** for every descriptor option with a `cli_flag`, plus + `--` when `selectable_backend = true`. +- **Install/download** via the backend's `BackendSpec` (binary + install params). +- **`/system-info`** `recipes` entry (display name, options schema, support matrix). +- **Generated docs** — your backend appears in + [`backends-reference.md`](backends-reference.md) automatically.
+ +## Escape hatches + +| Need | Hook | +|------|------| +| Device depends on the chosen backend variant (whisper npu vs cpu) | override `WrappedServer::effective_device(opts)` | +| Eviction rule depends on the variant | override `WrappedServer::effective_slot_policy(opts)` | +| Availability decided at runtime (cloud creds) | override `WrappedServer::availability()` | +| Conditional / grouped checkpoints (sd-cpp flux, whisper npu_cache) | validate in `load()`; list only unconditional files in `required_checkpoints` | +| Custom per-model fields without editing `ModelInfo` | read `model_info.extra("my_field", fallback)` (populated from unknown `server_models.json` keys) | +| Models supplied at runtime, not from `server_models.json` | set `dynamic_models = true` and provide them in the class (see cloud's `discover_models()`) | +| Per-create setup before load (ryzenai `set_model_path`) | do it in `create()` | + +## The simplest end-to-end example + +**Moonshine** is the minimal case: a single descriptor option, no backend +selection, CPU-only, one capability interface. See +`src/cpp/server/backends/moonshine_descriptor.cpp` and `moonshine_factory.cpp`. + +> Note: collections (`collection.omni`) are orchestrator-driven, not +> `WrappedServer` subprocesses, and are the one explicit exception to this model. 
diff --git a/docs/dev/backends-reference.md b/docs/dev/backends-reference.md new file mode 100644 index 000000000..f5c8edebb --- /dev/null +++ b/docs/dev/backends-reference.md @@ -0,0 +1,325 @@ +# Backend reference + + + +## Backends + + +| Recipe | Name | Selectable backend | Uses ctx_size | Backends | +|--------|------|--------------------|---------------|----------| +| `flm` | FastFlowLM NPU | no | yes | npu | +| `kokoro` | Kokoro | no | no | cpu, metal | +| `llamacpp` | Llama.cpp GPU | yes | yes | cpu, cuda, metal, rocm, system, vulkan | +| `moonshine` | Moonshine | no | no | cpu | +| `ryzenai-llm` | Ryzen AI LLM | no | yes | npu | +| `sd-cpp` | StableDiffusion.cpp | yes | no | cpu, cuda, metal, rocm, vulkan | +| `vllm` | vLLM ROCm (experimental) | yes | yes | rocm | +| `whispercpp` | Whisper.cpp | yes | no | cpu, metal, npu, rocm, vulkan | + + +## Support matrix + + +| Recipe | Backend | OS | Device families | +|--------|---------|----|-----------------| +| `flm` | npu | linux, windows | amd_npu (XDNA2) | +| `kokoro` | cpu | linux, windows | cpu (x86_64) | +| `kokoro` | metal | macos | metal | +| `llamacpp` | system | linux | cpu (arm64, x86_64) | +| `llamacpp` | metal | macos | metal | +| `llamacpp` | cuda | linux, windows | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) | +| `llamacpp` | vulkan | linux, windows | amd_gpu; cpu (arm64, x86_64) | +| `llamacpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) | +| `llamacpp` | cpu | linux, windows | cpu (arm64, x86_64) | +| `moonshine` | cpu | windows | cpu (x86_64) | +| `moonshine` | cpu | linux | cpu (arm64, x86_64) | +| `moonshine` | cpu | macos | cpu (arm64) | +| `ryzenai-llm` | npu | windows | amd_npu (XDNA2) | +| `sd-cpp` | rocm | linux, windows | amd_gpu (gfx103X, gfx110X, gfx1150, gfx1151, gfx1152, gfx120X) | +| `sd-cpp` | cuda | linux | nvidia_gpu (sm_100, sm_120, sm_121, sm_75, sm_80, sm_86, sm_89, sm_90) | +| `sd-cpp` | vulkan | linux, 
windows | amd_gpu; cpu (x86_64); nvidia_gpu | +| `sd-cpp` | cpu | linux, windows | cpu (x86_64) | +| `sd-cpp` | metal | macos | metal | +| `vllm` | rocm | linux | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) | +| `whispercpp` | npu | windows | amd_npu (XDNA2) | +| `whispercpp` | rocm | linux, windows | amd_gpu (gfx110X, gfx1150, gfx1151, gfx120X) | +| `whispercpp` | vulkan | linux, windows | amd_gpu; cpu (x86_64) | +| `whispercpp` | cpu | linux, windows | cpu (x86_64) | +| `whispercpp` | metal | macos | metal | + + +## Recipe options + + +#### `llamacpp` — Llama.cpp GPU + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model | +| `llamacpp_backend` | `--llamacpp` | BACKEND | "" | LlamaCpp backend to use | +| `llamacpp_device` | `--llamacpp-device` | DEVICES | "" | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | +| `llamacpp_args` | `--llamacpp-args` | ARGS | "" | Custom arguments to pass to llama-server | + +#### `moonshine` — Moonshine + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `moonshine_args` | `--moonshine-args` | ARGS | "" | Custom arguments to pass to moonshine-server | + +#### `sd-cpp` — StableDiffusion.cpp + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `sd-cpp_backend` | `--sdcpp` | BACKEND | "" | SD.cpp backend to use | +| `sdcpp_args` | `--sdcpp-args` | ARGS | "" | Custom arguments to pass to sd-server (must not conflict with managed args) | +| `steps` | — | SIZE | 20 | Number of diffusion steps | +| `cfg_scale` | — | SIZE | 7.0 | Classifier-free guidance scale | +| `width` | — | SIZE | 512 | Output image width | +| `height` | — | SIZE | 512 | Output image height | +| `sampling_method` | — | ARGS | "" | Sampling method | +| `flow_shift` | — | SIZE | 0.0 | Flow shift | + +#### 
`vllm` — vLLM ROCm (experimental) + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model | +| `vllm_backend` | `--vllm` | BACKEND | "" | vLLM backend to use | +| `vllm_args` | `--vllm-args` | ARGS | "" | Custom arguments to pass to vllm-server | + +#### `whispercpp` — Whisper.cpp + +| Option | CLI flag | Type | Default | Description | +|--------|----------|------|---------|-------------| +| `whispercpp_backend` | `--whispercpp` | BACKEND | "" | WhisperCpp backend to use | +| `whispercpp_args` | `--whispercpp-args` | ARGS | "" | Custom arguments to pass to whisper-server | + + +## Models + + +#### `collection.omni` — collection.omni (4 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `LMX-Omni-5.5B-Lite` | 9.3 | — | +| `LMX-Omni-52B-Halo` | 44.77 | — | +| `Lite Collection` | | — | +| `Ultra Collection` | | — | + +#### `kokoro` — Kokoro (1 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `kokoro-v1` | 0.354 | tts | + +#### `llamacpp` — Llama.cpp GPU (74 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Bonsai-1.7B-gguf` | 0.25 | llamacpp | +| `Bonsai-4B-gguf` | 0.572 | llamacpp | +| `Bonsai-8B-gguf` | 1.16 | llamacpp | +| `Cogito-v2-llama-109B-MoE-GGUF` | 65.4 | vision | +| `DeepSeek-Qwen3-8B-GGUF` | 5.25 | reasoning | +| `Devstral-Small-2507-GGUF` | 14.3 | coding, tool-calling | +| `GLM-4.5-Air-UD-Q4K-XL-GGUF` | 67.7 | reasoning | +| `GLM-4.7-Flash-GGUF` | 17.5 | tool-calling | +| `Gemma-3-4b-it-GGUF` | 3.34 | vision | +| `Gemma-4-12B-it-GGUF` | 7.12 | tool-calling, llamacpp | +| `Gemma-4-26B-A4B-it-GGUF` | 18.1 | hot, tool-calling, vision, llamacpp | +| `Gemma-4-31B-it-GGUF` | 19.5 | hot, tool-calling, vision, llamacpp | +| `Gemma-4-E2B-it-GGUF` | 4.09 | tool-calling, vision, llamacpp | +| `Gemma-4-E4B-it-GGUF` | 5.97 | tool-calling, vision, llamacpp | +| 
`Jan-nano-128k-GGUF` | 2.5 | — | +| `Jan-v1-4B-GGUF` | 2.5 | — | +| `LFM2-1.2B-GGUF` | 0.731 | — | +| `LFM2-24B-A2B-GGUF` | 14.4 | — | +| `LFM2-8B-A1B-GGUF` | 5.04 | — | +| `LFM2.5-1.2B-Instruct-GGUF` | 0.731 | — | +| `LFM2.5-8B-A1B` | 5.16 | — | +| `Llama-3.2-1B-Instruct-GGUF` | 0.834 | — | +| `Llama-3.2-3B-Instruct-GGUF` | 2.06 | — | +| `Llama-4-Scout-17B-16E-Instruct-GGUF` | 63.2 | vision | +| `Ministral-3-3B-Instruct-2512-GGUF` | 2.99 | vision | +| `Nemotron-3-Nano-30B-A3B-GGUF` | 22.8 | — | +| `Phi-4-mini-instruct-GGUF` | 2.49 | — | +| `Playable1-GGUF` | 4.68 | coding | +| `PromptBridge-0.6b-Alpha-GGUF` | 0.397 | — | +| `Qwen2.5-Coder-32B-Instruct-GGUF` | 19.9 | coding | +| `Qwen2.5-Omni-3B-GGUF` | 4.73 | vision, chat-transcription | +| `Qwen2.5-Omni-7B-GGUF` | 7.33 | vision, chat-transcription | +| `Qwen2.5-VL-3B-Instruct-GGUF` | 3.27 | vision | +| `Qwen2.5-VL-7B-Instruct-GGUF` | 6.04 | vision | +| `Qwen3-0.6B-GGUF` | 0.38 | reasoning | +| `Qwen3-1.7B-GGUF` | 1.06 | reasoning | +| `Qwen3-14B-GGUF` | 8.54 | reasoning | +| `Qwen3-30B-A3B-GGUF` | 17.4 | reasoning | +| `Qwen3-30B-A3B-Instruct-2507-GGUF` | 17.4 | tool-calling | +| `Qwen3-4B-GGUF` | 2.38 | reasoning | +| `Qwen3-4B-Instruct-2507-GGUF` | 2.5 | tool-calling | +| `Qwen3-8B-GGUF` | 5.25 | reasoning | +| `Qwen3-Coder-30B-A3B-Instruct-GGUF` | 18.6 | coding, tool-calling, hot | +| `Qwen3-Coder-Next-GGUF` | 48.0 | coding, tool-calling, hot | +| `Qwen3-Embedding-0.6B-GGUF` | 0.64 | embeddings | +| `Qwen3-Embedding-4B-GGUF` | 4.28 | embeddings | +| `Qwen3-Embedding-8B-GGUF` | 8.05 | embeddings | +| `Qwen3-Next-80B-A3B-Instruct-GGUF` | 46.1 | tool-calling | +| `Qwen3-VL-4B-Instruct-GGUF` | 3.33 | vision | +| `Qwen3-VL-8B-Instruct-GGUF` | 6.19 | vision | +| `Qwen3.5-0.8B-GGUF` | 0.764 | vision, tool-calling | +| `Qwen3.5-122B-A10B-GGUF` | 77.9 | vision, tool-calling | +| `Qwen3.5-122B-A10B-MTP-GGUF` | 79.6 | vision, tool-calling, mtp | +| `Qwen3.5-27B-GGUF` | 18.5 | vision, tool-calling | +| `Qwen3.5-2B-GGUF` | 
2.01 | vision, tool-calling | +| `Qwen3.5-35B-A3B-GGUF` | 23.1 | vision, tool-calling | +| `Qwen3.5-4B-GGUF` | 3.58 | vision, tool-calling, hot | +| `Qwen3.5-4B-MTP-GGUF` | 3.66 | vision, tool-calling, mtp | +| `Qwen3.5-9B-GGUF` | 6.88 | vision, tool-calling | +| `Qwen3.6-27B-GGUF` | 18.5 | vision, tool-calling | +| `Qwen3.6-27B-MTP-GGUF` | 18.8 | vision, tool-calling, mtp, hot | +| `Qwen3.6-35B-A3B-GGUF` | 23.3 | vision, tool-calling, hot | +| `Qwen3.6-35B-A3B-MTP-GGUF` | 23.8 | vision, tool-calling, mtp | +| `SmolLM3-3B-GGUF` | 1.94 | — | +| `Tiny-Test-Model-GGUF` | 0.18 | — | +| `bge-reranker-v2-m3-GGUF` | 0.636 | reranking | +| `gpt-oss-120b-GGUF` | 62.8 | reasoning, tool-calling | +| `gpt-oss-120b-mxfp-GGUF` | 63.4 | hot, reasoning, tool-calling | +| `gpt-oss-20b-GGUF` | 11.6 | reasoning, tool-calling | +| `gpt-oss-20b-mxfp4-GGUF` | 12.1 | hot, reasoning, tool-calling | +| `granite-4.0-h-tiny-GGUF` | 4.25 | tool-calling | +| `jina-reranker-v1-tiny-en-GGUF` | 0.0367 | reranking | +| `nomic-embed-text-v1-GGUF` | 0.0781 | embeddings | +| `nomic-embed-text-v2-moe-GGUF` | 0.51 | embeddings | + +#### `moonshine` — Moonshine (3 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Moonshine-Medium-Streaming` | 1.08 | transcription, realtime-transcription, hot | +| `Moonshine-Small-Streaming` | 0.431 | transcription, realtime-transcription | +| `Moonshine-Tiny-Streaming` | 0.202 | transcription, realtime-transcription | + +#### `ryzenai-llm` — Ryzen AI LLM (79 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `AMD-OLMo-1B-SFT-DPO-Hybrid` | 1.48 | — | +| `CodeLlama-7b-Instruct-hf-Hybrid` | 7.24 | coding | +| `CodeLlama-7b-Instruct-hf-NPU` | 7.54 | coding | +| `DeepSeek-R1-Distill-Llama-8B-CPU` | 6.2 | reasoning | +| `DeepSeek-R1-Distill-Llama-8B-Hybrid` | 9.09 | reasoning | +| `DeepSeek-R1-Distill-Llama-8B-NPU` | 9.3 | reasoning | +| `DeepSeek-R1-Distill-Qwen-1.5B-Hybrid` | 2.19 | reasoning | +| 
`DeepSeek-R1-Distill-Qwen-1.5B-NPU` | 2.3 | reasoning | +| `DeepSeek-R1-Distill-Qwen-7B-CPU` | 6.2 | reasoning | +| `DeepSeek-R1-Distill-Qwen-7B-Hybrid` | 8.67 | reasoning | +| `DeepSeek-R1-Distill-Qwen-7B-NPU` | 8.87 | reasoning | +| `Gemma-3-4b-it-mm-NPU` | 6.68 | vision | +| `Llama-2-7b-chat-hf-Hybrid` | 7.31 | — | +| `Llama-2-7b-chat-hf-NPU` | 7.47 | — | +| `Llama-2-7b-hf-Hybrid` | 7.31 | — | +| `Llama-2-7b-hf-NPU` | 7.47 | — | +| `Llama-3.1-8B-Hybrid` | 9.09 | — | +| `Llama-3.1-8B-NPU` | 9.3 | — | +| `Llama-3.2-1B-Hybrid` | 1.89 | — | +| `Llama-3.2-1B-Instruct-CPU` | 1.76 | — | +| `Llama-3.2-1B-Instruct-Hybrid` | 1.89 | — | +| `Llama-3.2-1B-Instruct-NPU` | 1.96 | — | +| `Llama-3.2-1B-NPU` | 1.96 | — | +| `Llama-3.2-3B-Hybrid` | 4.28 | — | +| `Llama-3.2-3B-Instruct-CPU` | 3.38 | — | +| `Llama-3.2-3B-Instruct-Hybrid` | 4.28 | — | +| `Meta-Llama-3-8B-Hybrid` | 9.06 | — | +| `Meta-Llama-3-8B-NPU` | 9.23 | — | +| `Meta-Llama-3.1-8B-Instruct-Hybrid` | 9.09 | — | +| `Meta-Llama-3.1-8B-Instruct-NPU` | 9.3 | — | +| `Mistral-7B-Instruct-v0.1-Hybrid` | 7.84 | — | +| `Mistral-7B-Instruct-v0.1-NPU` | 8.01 | — | +| `Mistral-7B-Instruct-v0.2-Hybrid` | 7.84 | — | +| `Mistral-7B-Instruct-v0.2-NPU` | 8.01 | — | +| `Mistral-7B-Instruct-v0.3-Hybrid` | 7.85 | — | +| `Mistral-7B-Instruct-v0.3-NPU` | 8.09 | — | +| `Mistral-7B-v0.3-Hybrid` | 7.85 | — | +| `Mistral-7B-v0.3-NPU` | 8.09 | — | +| `Phi-3-Mini-Instruct-CPU` | 2.39 | — | +| `Phi-3-mini-128k-instruct-Hybrid` | 4.21 | — | +| `Phi-3-mini-128k-instruct-NPU` | 4.35 | — | +| `Phi-3-mini-4k-instruct-Hybrid` | 4.19 | — | +| `Phi-3-mini-4k-instruct-NPU` | 4.3 | — | +| `Phi-3.5-mini-instruct-Hybrid` | 4.21 | — | +| `Phi-3.5-mini-instruct-NPU` | 4.35 | — | +| `Phi-4-mini-instruct-Hybrid` | 5.47 | — | +| `Phi-4-mini-instruct-NPU` | 5.59 | — | +| `Phi-4-mini-reasoning-Hybrid` | 5.47 | reasoning | +| `Qwen-1.5-7B-Chat-CPU` | 6.32 | — | +| `Qwen-2.5-1.5B-Instruct-Hybrid` | 2.17 | — | +| `Qwen-2.5-1.5B-Instruct-NPU` | 2.25 | — | +| 
`Qwen1.5-7B-Chat-Hybrid` | 8.83 | — | +| `Qwen1.5-7B-Chat-NPU` | 9.02 | — | +| `Qwen2-1.5B-Hybrid` | 2.19 | — | +| `Qwen2-1.5B-NPU` | 2.3 | — | +| `Qwen2-7B-Hybrid` | 8.68 | — | +| `Qwen2-7B-NPU` | 8.88 | — | +| `Qwen2.5-0.5B-Instruct-CPU` | 0.834 | — | +| `Qwen2.5-0.5B-Instruct-Hybrid` | 0.828 | — | +| `Qwen2.5-14B-instruct-Hybrid` | 16.5 | — | +| `Qwen2.5-3B-Instruct-Hybrid` | 3.97 | — | +| `Qwen2.5-3B-Instruct-NPU` | 4.1 | — | +| `Qwen2.5-7B-Instruct-Hybrid` | 8.65 | — | +| `Qwen2.5-7B-Instruct-NPU` | 8.83 | — | +| `Qwen2.5-Coder-0.5B-Instruct-Hybrid` | 0.828 | coding | +| `Qwen2.5-Coder-1.5B-Instruct-Hybrid` | 2.17 | coding | +| `Qwen2.5-Coder-1.5B-Instruct-NPU` | 2.25 | coding | +| `Qwen2.5-Coder-7B-Instruct-Hybrid` | 8.65 | coding | +| `Qwen2.5-Coder-7B-Instruct-NPU` | 8.83 | coding | +| `Qwen3-1.7B-Hybrid` | 2.55 | reasoning | +| `Qwen3-14B-Hybrid` | 16.5 | reasoning | +| `Qwen3-4B-Hybrid` | 5.17 | reasoning | +| `Qwen3-8B-Hybrid` | 9.42 | reasoning | +| `SmolLM-135M-Instruct-Hybrid` | 0.232 | — | +| `SmolLM2-135M-Instruct-Hybrid` | 0.233 | — | +| `chatglm3-6b-Hybrid` | 6.9 | — | +| `chatglm3-6b-NPU` | 7.04 | — | +| `gemma-2-2b-Hybrid` | 4.04 | — | +| `gpt-oss-20b-NPU` | 13.4 | — | + +#### `sd-cpp` — StableDiffusion.cpp (12 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Flux-2-Klein-4B` | 16.1 | image, edit | +| `Flux-2-Klein-9B-GGUF` | 19.0 | image, edit | +| `Qwen-Image-2512-GGUF` | 19.4 | image | +| `Qwen-Image-GGUF` | 18.2 | image | +| `RealESRGAN-x4plus` | 0.064 | upscaling, image | +| `RealESRGAN-x4plus-anime` | 0.017 | upscaling, image | +| `SD-1.5` | 7.7 | image | +| `SD-Turbo` | 5.21 | image | +| `SD-Turbo-GGUF` | 2.02 | image | +| `SDXL-Base-1.0` | 6.94 | image | +| `SDXL-Turbo` | 6.94 | image | +| `Z-Image-Turbo` | 20.7 | image | + +#### `vllm` — vLLM ROCm (experimental) (4 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Qwen3.5-0.8B-FP16-vLLM` | 1.77 | reasoning | +| 
`Qwen3.5-2B-FP16-vLLM` | 4.57 | reasoning, tool-calling | +| `Qwen3.5-4B-FP16-vLLM` | 9.34 | reasoning, hot, tool-calling | +| `Qwen3.5-9B-FP16-vLLM` | 19.3 | reasoning, tool-calling | + +#### `whispercpp` — Whisper.cpp (6 models) + +| Model | Size (GB) | Labels | +|-------|-----------|--------| +| `Whisper-Base` | 0.148 | transcription, realtime-transcription | +| `Whisper-Large-v3` | 3.1 | transcription, realtime-transcription | +| `Whisper-Large-v3-Turbo` | 1.62 | transcription, realtime-transcription, hot | +| `Whisper-Medium` | 1.53 | transcription, realtime-transcription | +| `Whisper-Small` | 0.488 | transcription, realtime-transcription | +| `Whisper-Tiny` | 0.075 | transcription, realtime-transcription | + diff --git a/docs/dev/contribute.md b/docs/dev/contribute.md index 36357658a..97b7ee4d6 100644 --- a/docs/dev/contribute.md +++ b/docs/dev/contribute.md @@ -23,6 +23,10 @@ Lemonade's roadmap is defined by a set of [working groups](./working-groups/READ Not sure what to work on? Come to the feature-requests and troubleshooting channels on the Discord and see what people need! +### Adding a Backend + +Inference backends are self-describing: a backend is a descriptor (plain data) plus a server class, and everything else (router, CLI, `/system-info`, docs) is derived from it. See [Adding a backend](./adding-a-backend.md) for the full contract and a minimal example. + ### Issues Issues are a great way to document a bug or feature request. However, Lemonade is a community-driven project and you still need to find someone to implement your issue. It is highly recommended that you bring your issue to the [Lemonade discord community](https://discord.gg/5xXzkMu8Zk) and connect with a contributor who wants to implement it. 
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py new file mode 100644 index 000000000..737715605 --- /dev/null +++ b/docs/tools/gen_backend_docs.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +"""Generate backend reference docs from the self-describing backend descriptors. + +The C++ backend descriptors (src/cpp/server/backends/*_descriptor.cpp) are the +single source of truth for what each backend is. This script boots a `lemond` +server, reads the descriptor-generated ``/system-info`` ``recipes`` object and +``server_models.json``, and rewrites the marker-delimited regions of the target +doc(s). A CI step runs it with ``--check`` and fails if the committed docs drift. + +Usage: + python docs/tools/gen_backend_docs.py [--lemond PATH] [--check] + +``--check`` regenerates in memory and exits non-zero if the on-disk docs differ, +without modifying them. + +Only the regions between:: + + + + +are rewritten; surrounding prose is left untouched. +""" + +import argparse +import json +import re +import socket +import subprocess +import sys +import tempfile +import time +import urllib.request +from pathlib import Path + +REPO_ROOT = Path(__file__).resolve().parents[2] +SERVER_MODELS = REPO_ROOT / "src" / "cpp" / "resources" / "server_models.json" +TARGET_DOC = REPO_ROOT / "docs" / "dev" / "backends-reference.md" + + +def free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind(("127.0.0.1", 0)) + return s.getsockname()[1] + + +def find_lemond(explicit: str | None) -> Path: + if explicit: + p = Path(explicit) + if not p.exists(): + sys.exit(f"lemond not found at {p}") + return p + for candidate in [ + REPO_ROOT / "build" / "lemond", + REPO_ROOT / "build" / "lemond.exe", + ]: + if candidate.exists(): + return candidate + sys.exit("Could not find a built lemond (looked in build/). 
Pass --lemond PATH.") + + +class Lemond: + """Boots a throwaway lemond on a free port with an isolated cache dir.""" + + def __init__(self, binary: Path): + self.binary = binary + self.port = free_port() + self._cache = tempfile.TemporaryDirectory(prefix="lemond-docs-") + self._proc: subprocess.Popen | None = None + + def __enter__(self): + self._proc = subprocess.Popen( + [str(self.binary), self._cache.name, "--port", str(self.port)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + ) + deadline = time.time() + 60 + while time.time() < deadline: + try: + self._get("/api/v1/health") + return self + except Exception: + if self._proc.poll() is not None: + sys.exit("lemond exited before becoming ready") + time.sleep(0.5) + self.__exit__(None, None, None) + sys.exit("lemond did not become ready within 60s") + + def __exit__(self, *exc): + if self._proc and self._proc.poll() is None: + try: + self._get("/internal/shutdown", timeout=2) + except Exception: + pass + try: + self._proc.wait(timeout=10) + except Exception: + self._proc.kill() + self._cache.cleanup() + + def _get(self, path: str, timeout: float = 5): + url = f"http://127.0.0.1:{self.port}{path}" + with urllib.request.urlopen(url, timeout=timeout) as r: + return r.read() + + def system_info(self) -> dict: + return json.loads(self._get("/api/v1/system-info", timeout=30)) + + +def md_escape(text: str) -> str: + return str(text).replace("|", "\\|") + + +def render_overview(recipes: dict) -> str: + rows = [ + "| Recipe | Name | Selectable backend | Uses ctx_size | Backends |", + "|--------|------|--------------------|---------------|----------|", + ] + for recipe in sorted(recipes): + info = recipes[recipe] + if "display_name" not in info: + continue # not a descriptor-backed recipe on this run + backends = sorted({b["backend"] for b in info.get("support", [])}) or sorted( + info.get("backends", {}) + ) + rows.append( + "| `{r}` | {n} | {s} | {c} | {b} |".format( + r=recipe, + 
n=md_escape(info.get("display_name", "")), + s="yes" if info.get("selectable_backend") else "no", + c="yes" if info.get("uses_ctx_size") else "no", + b=", ".join(backends) if backends else "—", + ) + ) + return "\n".join(rows) + + +def render_support_matrix(recipes: dict) -> str: + rows = [ + "| Recipe | Backend | OS | Device families |", + "|--------|---------|----|-----------------|", + ] + for recipe in sorted(recipes): + info = recipes[recipe] + for row in info.get("support", []): + fams = [] + for d in row.get("devices", []): + f = d.get("families") or [] + fams.append(d["device"] + (f" ({', '.join(f)})" if f else "")) + rows.append( + "| `{r}` | {b} | {o} | {d} |".format( + r=recipe, + b=row.get("backend", ""), + o=", ".join(sorted(row.get("os", []))), + d=md_escape("; ".join(fams)) if fams else "—", + ) + ) + return "\n".join(rows) + + +def render_options(recipes: dict) -> str: + blocks = [] + for recipe in sorted(recipes): + info = recipes[recipe] + opts = info.get("options") + if not opts: + continue + blocks.append(f"#### `{recipe}` — {info.get('display_name', recipe)}\n") + blocks.append("| Option | CLI flag | Type | Default | Description |") + blocks.append("|--------|----------|------|---------|-------------|") + if info.get("uses_ctx_size"): + blocks.append( + "| `ctx_size` | `--ctx-size` | SIZE | -1 | Context size for the model |" + ) + for o in opts: + blocks.append( + "| `{n}` | {f} | {t} | {d} | {h} |".format( + n=o["name"], + f=f"`{o['cli_flag']}`" if o.get("cli_flag") else "—", + t=o.get("type_name", ""), + d=md_escape( + json.dumps(o.get("default")) + if not isinstance(o.get("default"), str) + else o.get("default") or '""' + ), + h=md_escape(o.get("help", "")), + ) + ) + blocks.append("") + return "\n".join(blocks).rstrip() + + +def render_models(recipes: dict) -> str: + models = json.loads(SERVER_MODELS.read_text()) + by_recipe: dict[str, list] = {} + for name, data in models.items(): + if not isinstance(data, dict): + continue + 
by_recipe.setdefault(data.get("recipe", "(unspecified)"), []).append( + (name, data) + ) + blocks = [] + for recipe in sorted(by_recipe): + entries = sorted(by_recipe[recipe]) + display = recipes.get(recipe, {}).get("display_name", recipe) + blocks.append(f"#### `{recipe}` — {display} ({len(entries)} models)\n") + blocks.append("| Model | Size (GB) | Labels |") + blocks.append("|-------|-----------|--------|") + for name, data in entries: + blocks.append( + "| `{n}` | {s} | {l} |".format( + n=md_escape(name), + s=data.get("size", ""), + l=md_escape(", ".join(data.get("labels", []))) or "—", + ) + ) + blocks.append("") + return "\n".join(blocks).rstrip() + + +DEFAULT_TEMPLATE = """# Backend reference + + + +## Backends + + + + +## Support matrix + + + + +## Recipe options + + + + +## Models + + + +""" + + +def apply_sections(text: str, sections: dict[str, str]) -> str: + for marker_id, body in sections.items(): + pattern = re.compile( + r"().*?()", + re.DOTALL, + ) + if not pattern.search(text): + sys.exit(f"Marker region '{marker_id}' not found in target doc") + # Escape backslashes and group-ref markers in the body for re.sub. 
+ safe_body = body.replace("\\", "\\\\") + replacement = r"\1" + "\n" + safe_body + "\n" + r"\2" + text = pattern.sub(replacement, text) + return text + + +def main() -> int: + ap = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + ap.add_argument("--lemond", help="Path to the built lemond binary") + ap.add_argument( + "--check", action="store_true", help="Fail if docs are stale; do not write" + ) + args = ap.parse_args() + + binary = find_lemond(args.lemond) + with Lemond(binary) as server: + info = server.system_info() + recipes = info.get("recipes", {}) + if not recipes: + sys.exit("/system-info returned no recipes") + + sections = { + "backends-overview": render_overview(recipes), + "backends-matrix": render_support_matrix(recipes), + "backend-options": render_options(recipes), + "backend-models": render_models(recipes), + } + + current = TARGET_DOC.read_text() if TARGET_DOC.exists() else DEFAULT_TEMPLATE + updated = apply_sections(current, sections) + + if args.check: + if not TARGET_DOC.exists() or TARGET_DOC.read_text() != updated: + sys.exit( + f"{TARGET_DOC.relative_to(REPO_ROOT)} is stale. Run: python docs/tools/gen_backend_docs.py" + ) + print(f"{TARGET_DOC.relative_to(REPO_ROOT)} is up to date.") + return 0 + + TARGET_DOC.parent.mkdir(parents=True, exist_ok=True) + TARGET_DOC.write_text(updated) + print(f"Wrote {TARGET_DOC.relative_to(REPO_ROOT)}") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/src/app/src/renderer/utils/recipeNames.ts b/src/app/src/renderer/utils/recipeNames.ts index d654c635a..8f1fdbb1f 100644 --- a/src/app/src/renderer/utils/recipeNames.ts +++ b/src/app/src/renderer/utils/recipeNames.ts @@ -4,15 +4,28 @@ export const isCollectionRecipe = (recipe?: string): boolean => { return recipe === COLLECTION_OMNI_MODEL_RECIPE; }; +// Recipe display names. 
Hardware-backend names (llamacpp, whispercpp, sd-cpp, …) +// are populated at runtime from /system-info's `recipes[].display_name`, which is +// generated from the C++ backend descriptors — the single source of truth. Only +// recipes NOT surfaced by /system-info's hardware support matrix are seeded here: +// the collection orchestrator (not a backend) and cloud offload (a backend with +// no local support rows). export const RECIPE_DISPLAY_NAMES: Record = { [COLLECTION_OMNI_MODEL_RECIPE]: 'Lemonade', - 'flm': 'FastFlowLM NPU', - 'llamacpp': 'Llama.cpp GPU', - 'ryzenai-llm': 'Ryzen AI LLM', - 'whispercpp': 'Whisper.cpp', - 'moonshine': 'Moonshine', - 'sd-cpp': 'StableDiffusion.cpp', - 'kokoro': 'Kokoro', 'cloud': 'Cloud', - 'vllm': 'vLLM ROCm (experimental)', +}; + +// Merge display names from a /system-info `recipes` object into RECIPE_DISPLAY_NAMES. +// Called whenever system info is (re)fetched so the map reflects the descriptors. +export const updateRecipeDisplayNames = ( + recipes?: Record +): void => { + if (!recipes) { + return; + } + for (const [recipe, info] of Object.entries(recipes)) { + if (info && typeof info.display_name === 'string' && info.display_name) { + RECIPE_DISPLAY_NAMES[recipe] = info.display_name; + } + } }; diff --git a/src/app/src/renderer/utils/systemData.ts b/src/app/src/renderer/utils/systemData.ts index 63f1d9427..fcd3b8f92 100644 --- a/src/app/src/renderer/utils/systemData.ts +++ b/src/app/src/renderer/utils/systemData.ts @@ -39,8 +39,23 @@ export interface Recipes { [recipeName: string]: Recipe; } +// Per-recipe option schema, generated from the C++ backend descriptor. +export interface RecipeOptionSchema { + name: string; + cli_flag: string; + default: unknown; + type_name: string; + help: string; + group: string; +} + export interface Recipe { default_backend?: string; + // Descriptor metadata (generated from the C++ backend descriptors). 
+ display_name?: string; + selectable_backend?: boolean; + uses_ctx_size?: boolean; + options?: RecipeOptionSchema[]; backends: { [backendName: string]: BackendInfo; }; @@ -75,6 +90,11 @@ const fetchSystemInfoFromAPI = async (): Promise => { const data = await response.json(); const systemInfo: SystemInfo = { ...data }; + // Seed recipe display names from the descriptor-generated /system-info data + // so the UI doesn't hardcode per-recipe names. + const { updateRecipeDisplayNames } = await import('./recipeNames'); + updateRecipeDisplayNames(systemInfo.recipes); + return { info: systemInfo }; } catch (error) { console.error('Failed to fetch supported inference data from API:', error); diff --git a/src/cpp/cli/CMakeLists.txt b/src/cpp/cli/CMakeLists.txt index bd58c60ba..b6a0f26d6 100644 --- a/src/cpp/cli/CMakeLists.txt +++ b/src/cpp/cli/CMakeLists.txt @@ -97,6 +97,10 @@ set(COMMON_SOURCES agent_config_file.cpp opencode_profile.cpp pi_profile.cpp + # Self-describing backend descriptors (plain data; CLI-safe). Lets the CLI + # read recipe options/flags from descriptors without linking server classes. + # The matching factories (create()) are server-only and NOT listed here. + ${LEMON_BACKEND_DESCRIPTOR_SOURCES} ) # Add platform-specific sources diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h new file mode 100644 index 000000000..fc6c50bc2 --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -0,0 +1,58 @@ +#pragma once + +#include +#include +#include +#include "lemon/model_types.h" +#include "lemon/recipe_backend_def.h" + +namespace lemon { + +// A single declarative configuration knob a backend exposes. The same list +// drives config.json defaults, CLI flag registration, and load-time option +// resolution, so they can never drift apart. +struct BackendOption { + std::string name; // option key, e.g. "vllm_args" + std::string cli_flag; // CLI flag, e.g. 
"--vllm-args" ("" = not a CLI flag) + nlohmann::json default_value; // default value when the option is unset + std::string type_name; // "ARGS" | "SIZE" | "BACKEND" | "BOOL" + std::string help; // CLI help text + std::string group; // CLI help group, e.g. "General Options" +}; + +// How a backend shares the accelerator. Replaces the router's recipe-string +// checks for NPU exclusivity and LRU slot accounting. +enum class SlotPolicy { + Standard, // counts toward the LRU slots, no device exclusivity (llamacpp, sd-cpp) + ExclusiveNpu, // evict ALL npu servers before loading (ryzenai-llm, whispercpp-npu) + CoexistByType, // one per model type, evicts exclusive-npu peers (flm) + Unmetered // never counts toward slots, never auto-evicted (cloud) +}; + +// Plain data declaring *what a backend is*. This is the single object the +// registry, the CLI, /system-info, and the docs all read. Behavior lives in the +// paired WrappedServer subclass (see backend_registry.h for how they bind). +struct BackendDescriptor { + std::string recipe; // "vllm" + std::string display_name; // "vLLM ROCm (experimental)" + std::string binary; // subprocess to launch/install ("" = none, e.g. 
cloud) + std::string config_section; // config.json section; defaults to recipe (sd-cpp -> "sdcpp") + + DeviceType default_device = DEVICE_GPU; // default; override effective_device() if variant-dependent + SlotPolicy slot_policy = SlotPolicy::Standard; // default; override effective_slot_policy() if variant-dependent + bool selectable_backend = false; // auto-creates "_backend" option + "--" flag + bool uses_ctx_size = false; // opt in to the shared ctx_size option + bool dynamic_models = false; // true = class supplies models at runtime (cloud), not server_models.json + + std::vector options; // backend-specific knobs (common ones are automatic) + std::vector support; // which OS / GPU families it runs on ({} = no local gating) + std::vector default_labels; // labels injected when a model omits them + std::vector required_checkpoints{"main"}; // unconditional files; conditional ones checked in load() + + // The config.json section name for this backend, falling back to the recipe. + std::string effective_config_section() const { + return config_section.empty() ? recipe : config_section; + } +}; + +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_descriptor_registry.h b/src/cpp/include/lemon/backends/backend_descriptor_registry.h new file mode 100644 index 000000000..e3be93cda --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_descriptor_registry.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// Read-only view over every backend descriptor (plain data). This API is +// CLI-safe: it pulls in no server classes, so it links into both the lemonade +// CLI and lemond. The factory side (create_server) lives in backend_registry.h +// and is server-only. + +// All registered descriptors, in LEMON_BACKENDS order. +const std::vector& all_descriptors(); + +// Descriptor for a recipe, or nullptr if the recipe has no registered backend. 
+const BackendDescriptor* descriptor_for(const std::string& recipe); + +// True if the recipe is backed by a registered descriptor. +bool has_backend(const std::string& recipe); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h new file mode 100644 index 000000000..394f49145 --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -0,0 +1,47 @@ +#pragma once + +#include +#include +#include "lemon/backends/backend_descriptor.h" +#include "lemon/backends/backend_descriptor_registry.h" + +namespace lemon { + +class WrappedServer; +class ModelManager; +class BackendManager; +class CloudProviderRegistry; +struct ModelInfo; + +namespace backends { + +// Everything a backend's create() needs to build an instance. Mirrors the +// arguments the old router factory passed to each backend constructor. +struct BackendContext { + std::string log_level; + ModelManager* model_manager = nullptr; + BackendManager* backend_manager = nullptr; + CloudProviderRegistry* cloud_registry = nullptr; + const ModelInfo* model_info = nullptr; // for per-create setup (cloud provider, ryzenai model path) +}; + +using BackendCreateFn = std::unique_ptr (*)(const BackendContext&); + +// Binds a descriptor (what the backend is) to its server class's create() (how +// it runs). The generated factory registry supplies one per backend. This API is +// server-only: it references server classes via create(), so it is compiled into +// lemond but not the CLI. The CLI reads descriptors through backend_descriptor_registry.h. +struct BackendRegistration { + const BackendDescriptor* descriptor; + BackendCreateFn create; +}; + +// All registered (descriptor, create) pairs, in LEMON_BACKENDS order. +const std::vector& all_registrations(); + +// Construct a backend instance for a recipe and associate its descriptor, or +// nullptr if the recipe has no registered backend. 
+std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/cloud_descriptor.h b/src/cpp/include/lemon/backends/cloud_descriptor.h new file mode 100644 index 000000000..6e5f49bdb --- /dev/null +++ b/src/cpp/include/lemon/backends/cloud_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The cloud backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in cloud_descriptor.cpp. +extern const BackendDescriptor cloud_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/cloud_factory.h b/src/cpp/include/lemon/backends/cloud_factory.h new file mode 100644 index 000000000..889958bd1 --- /dev/null +++ b/src/cpp/include/lemon/backends/cloud_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The cloud backend's factory (constructs the server class — lemond only). +// Defined in cloud_factory.cpp. +std::unique_ptr cloud_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm_descriptor.h b/src/cpp/include/lemon/backends/fastflowlm_descriptor.h new file mode 100644 index 000000000..5e8f71467 --- /dev/null +++ b/src/cpp/include/lemon/backends/fastflowlm_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The fastflowlm backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in fastflowlm_descriptor.cpp. 
+extern const BackendDescriptor fastflowlm_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm_factory.h b/src/cpp/include/lemon/backends/fastflowlm_factory.h new file mode 100644 index 000000000..8581dbdf7 --- /dev/null +++ b/src/cpp/include/lemon/backends/fastflowlm_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The fastflowlm backend's factory (constructs the server class — lemond only). +// Defined in fastflowlm_factory.cpp. +std::unique_ptr fastflowlm_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro_descriptor.h b/src/cpp/include/lemon/backends/kokoro_descriptor.h new file mode 100644 index 000000000..1d3542f0a --- /dev/null +++ b/src/cpp/include/lemon/backends/kokoro_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The kokoro backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in kokoro_descriptor.cpp. +extern const BackendDescriptor kokoro_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro_factory.h b/src/cpp/include/lemon/backends/kokoro_factory.h new file mode 100644 index 000000000..0df3ec37b --- /dev/null +++ b/src/cpp/include/lemon/backends/kokoro_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The kokoro backend's factory (constructs the server class — lemond only). +// Defined in kokoro_factory.cpp. 
+std::unique_ptr kokoro_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp_descriptor.h b/src/cpp/include/lemon/backends/llamacpp_descriptor.h new file mode 100644 index 000000000..501e0854c --- /dev/null +++ b/src/cpp/include/lemon/backends/llamacpp_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The llamacpp backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in llamacpp_descriptor.cpp. +extern const BackendDescriptor llamacpp_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp_factory.h b/src/cpp/include/lemon/backends/llamacpp_factory.h new file mode 100644 index 000000000..853f5171b --- /dev/null +++ b/src/cpp/include/lemon/backends/llamacpp_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The llamacpp backend's factory (constructs the server class — lemond only). +// Defined in llamacpp_factory.cpp. +std::unique_ptr llamacpp_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine_descriptor.h b/src/cpp/include/lemon/backends/moonshine_descriptor.h new file mode 100644 index 000000000..d70083244 --- /dev/null +++ b/src/cpp/include/lemon/backends/moonshine_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The moonshine backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in moonshine_descriptor.cpp. 
+extern const BackendDescriptor moonshine_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine_factory.h b/src/cpp/include/lemon/backends/moonshine_factory.h new file mode 100644 index 000000000..67e6f7298 --- /dev/null +++ b/src/cpp/include/lemon/backends/moonshine_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The moonshine backend's factory (constructs the server class — lemond only). +// Defined in moonshine_factory.cpp. +std::unique_ptr moonshine_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai_descriptor.h b/src/cpp/include/lemon/backends/ryzenai_descriptor.h new file mode 100644 index 000000000..26aa0b21f --- /dev/null +++ b/src/cpp/include/lemon/backends/ryzenai_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The ryzenai backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in ryzenai_descriptor.cpp. +extern const BackendDescriptor ryzenai_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai_factory.h b/src/cpp/include/lemon/backends/ryzenai_factory.h new file mode 100644 index 000000000..9483d8d55 --- /dev/null +++ b/src/cpp/include/lemon/backends/ryzenai_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The ryzenai backend's factory (constructs the server class — lemond only). +// Defined in ryzenai_factory.cpp. 
+std::unique_ptr ryzenai_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp_descriptor.h b/src/cpp/include/lemon/backends/sdcpp_descriptor.h new file mode 100644 index 000000000..0bee2e552 --- /dev/null +++ b/src/cpp/include/lemon/backends/sdcpp_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The sdcpp backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in sdcpp_descriptor.cpp. +extern const BackendDescriptor sdcpp_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp_factory.h b/src/cpp/include/lemon/backends/sdcpp_factory.h new file mode 100644 index 000000000..f7da955e2 --- /dev/null +++ b/src/cpp/include/lemon/backends/sdcpp_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The sdcpp backend's factory (constructs the server class — lemond only). +// Defined in sdcpp_factory.cpp. +std::unique_ptr sdcpp_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm_descriptor.h b/src/cpp/include/lemon/backends/vllm_descriptor.h new file mode 100644 index 000000000..7119dff88 --- /dev/null +++ b/src/cpp/include/lemon/backends/vllm_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The vllm backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in vllm_descriptor.cpp. 
+extern const BackendDescriptor vllm_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm_factory.h b/src/cpp/include/lemon/backends/vllm_factory.h new file mode 100644 index 000000000..7bf398987 --- /dev/null +++ b/src/cpp/include/lemon/backends/vllm_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The vllm backend's factory (constructs the server class — lemond only). +// Defined in vllm_factory.cpp. +std::unique_ptr vllm_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp_descriptor.h b/src/cpp/include/lemon/backends/whispercpp_descriptor.h new file mode 100644 index 000000000..2c3c87f19 --- /dev/null +++ b/src/cpp/include/lemon/backends/whispercpp_descriptor.h @@ -0,0 +1,13 @@ +#pragma once + +#include "lemon/backends/backend_descriptor.h" + +namespace lemon { +namespace backends { + +// The whispercpp backend's descriptor (plain data — CLI-safe, links into both the +// lemonade CLI and lemond). Defined in whispercpp_descriptor.cpp. +extern const BackendDescriptor whispercpp_descriptor; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp_factory.h b/src/cpp/include/lemon/backends/whispercpp_factory.h new file mode 100644 index 000000000..d98c97b27 --- /dev/null +++ b/src/cpp/include/lemon/backends/whispercpp_factory.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include "lemon/backends/backend_registry.h" + +namespace lemon { +namespace backends { + +// The whispercpp backend's factory (constructs the server class — lemond only). +// Defined in whispercpp_factory.cpp. 
+std::unique_ptr whispercpp_create(const BackendContext& ctx); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h index cdcf844dc..be850b583 100644 --- a/src/cpp/include/lemon/model_manager.h +++ b/src/cpp/include/lemon/model_manager.h @@ -110,6 +110,19 @@ struct ModelInfo { // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING) int moonshine_arch = -1; + // Generic per-model fields a backend declares for itself. Any server_models.json + // key not consumed by a typed field above lands here, so a new backend can read + // custom per-model config in load() without editing this shared struct. + std::map extras; + + // Look up an extra field, returning a default when absent. + template + T extra(const std::string& key, const T& fallback) const { + auto it = extras.find(key); + if (it == extras.end() || it->second.is_null()) return fallback; + try { return it->second.get(); } catch (...) { return fallback; } + } + // Utility std::string checkpoint(const std::string& type = "main") const { return checkpoints.count(type) ? checkpoints.at(type) : ""; } std::string resolved_path(const std::string& type = "main") const { return resolved_paths.count(type) ? resolved_paths.at(type) : ""; } diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h new file mode 100644 index 000000000..1557db077 --- /dev/null +++ b/src/cpp/include/lemon/recipe_backend_def.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + +namespace lemon { + +// Device constraints: device_type -> set of allowed families (empty = all families) +using DeviceConstraints = std::map>; + +// A single recipe/backend support row: which OS and device families a given +// (recipe, backend) pair runs on. 
The canonical support matrix is assembled by +// collecting these rows from every backend descriptor (see BackendDescriptor::support). +// +// IMPORTANT: For recipes with multiple backends (e.g. llamacpp), the order in +// which these rows appear defines the preference order — first listed = most +// preferred. Empty family set {} means "all families of that device type". +struct RecipeBackendDef { + std::string recipe; + std::string backend; + std::set supported_os; + DeviceConstraints devices; +}; + +} // namespace lemon diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h index f3ec74da4..41e91595b 100644 --- a/src/cpp/include/lemon/wrapped_server.h +++ b/src/cpp/include/lemon/wrapped_server.h @@ -17,6 +17,7 @@ #include "model_manager.h" #include "backend_manager.h" #include "recipe_options.h" +#include "backends/backend_descriptor.h" namespace lemon { @@ -307,10 +308,46 @@ class WrappedServer : public ICompletionServer { // No-op by default } - // ICompletionServer implementation - forward requests to the wrapped server - virtual json chat_completion(const json& request) override = 0; - virtual json completion(const json& request) override = 0; - virtual json responses(const json& request) = 0; + // ICompletionServer implementation - forward requests to the wrapped server. + // Default to an "unsupported" error so non-chat backends (TTS, image, + // transcription) inherit a sensible response instead of stubbing each one. + virtual json chat_completion(const json& request) override { + return unsupported_capability_error("chat completion"); + } + virtual json completion(const json& request) override { + return unsupported_capability_error("text completion"); + } + virtual json responses(const json& request) { + return unsupported_capability_error("responses"); + } + + // Descriptor association (set by the backend registry at create() time). 
The + // effective_* hooks below default to the descriptor's declared values; a + // backend whose device or eviction rule depends on the chosen backend + // variant overrides them (e.g. whisper on npu vs cpu, llamacpp on cpu vs gpu). + void set_descriptor(const BackendDescriptor* descriptor) { descriptor_ = descriptor; } + const BackendDescriptor* get_descriptor() const { return descriptor_; } + + // Effective accelerator device for this load. The router calls this after it + // resolves the "_backend" option but before eviction. Defaults to the + // descriptor's default_device; variant-dependent backends override. + virtual DeviceType effective_device(const RecipeOptions& options) const { + (void)options; + return descriptor_ ? descriptor_->default_device : device_type_; + } + + // Effective slot/eviction policy for this load. The router switches on this + // value to enforce NPU exclusivity and LRU slot accounting. Defaults to the + // descriptor's slot_policy; variant-dependent backends override. + virtual SlotPolicy effective_slot_policy(const RecipeOptions& options) const { + (void)options; + return descriptor_ ? descriptor_->slot_policy : SlotPolicy::Standard; + } + + // Dynamic availability check. Returns "" if the backend can run on this + // system, or a user-facing reason why it cannot. Defaults to "available"; + // backends with runtime-dependent availability (cloud) override. + virtual std::string availability() const { return ""; } // Forward streaming requests to the wrapped server (public for Router access) // Virtual so backends can transform request (e.g., FLM needs checkpoint in model field) @@ -373,6 +410,17 @@ class WrappedServer : public ICompletionServer { BackendRequestKind kind_; }; + // Standard "this backend does not serve " error payload, matching the + // shape backends return from unsupported capability methods. 
+ json unsupported_capability_error(const std::string& what) const { + return json{{"error", { + {"message", server_name_ + " does not support " + what + + ". Use the appropriate endpoint for this model type instead."}, + {"type", "unsupported_operation"}, + {"code", "model_not_applicable"} + }}}; + } + static bool has_process_handle(const ProcessHandle& handle); ProcessHandle get_process_handle_snapshot() const; void set_process_handle(ProcessHandle handle); @@ -420,6 +468,7 @@ class WrappedServer : public ICompletionServer { std::string log_level_; ModelManager* model_manager_; // Non-owning pointer to ModelManager BackendManager* backend_manager_; // Non-owning pointer to BackendManager + const BackendDescriptor* descriptor_ = nullptr; // Non-owning; set by the backend registry at create() // Multi-model support fields std::string model_name_; diff --git a/src/cpp/server/backends/backend_descriptor_registry.cpp b/src/cpp/server/backends/backend_descriptor_registry.cpp new file mode 100644 index 000000000..5fd217909 --- /dev/null +++ b/src/cpp/server/backends/backend_descriptor_registry.cpp @@ -0,0 +1,29 @@ +#include "lemon/backends/backend_descriptor_registry.h" + +// Generated from LEMON_BACKENDS at configure time. Defines +// lemon::backends::all_generated_descriptors() (descriptor data only). 
+#include "backend_descriptors_generated.h" + +namespace lemon { +namespace backends { + +const std::vector& all_descriptors() { + static const std::vector kDescriptors = all_generated_descriptors(); + return kDescriptors; +} + +const BackendDescriptor* descriptor_for(const std::string& recipe) { + for (const BackendDescriptor* d : all_descriptors()) { + if (d->recipe == recipe) { + return d; + } + } + return nullptr; +} + +bool has_backend(const std::string& recipe) { + return descriptor_for(recipe) != nullptr; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_descriptors_generated.h.in b/src/cpp/server/backends/backend_descriptors_generated.h.in new file mode 100644 index 000000000..3f6d7ec2a --- /dev/null +++ b/src/cpp/server/backends/backend_descriptors_generated.h.in @@ -0,0 +1,19 @@ +#pragma once +// +// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt. +// Do not edit by hand. Descriptor DATA only (CLI-safe; no server classes). +// +#include +#include "lemon/backends/backend_descriptor.h" +@LEMON_DESCRIPTOR_INCLUDES@ +namespace lemon { +namespace backends { + +inline std::vector all_generated_descriptors() { + return { +@LEMON_DESCRIPTOR_ENTRIES@ + }; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_factories_generated.h.in b/src/cpp/server/backends/backend_factories_generated.h.in new file mode 100644 index 000000000..d488ce014 --- /dev/null +++ b/src/cpp/server/backends/backend_factories_generated.h.in @@ -0,0 +1,21 @@ +#pragma once +// +// AUTO-GENERATED at CMake configure time from LEMON_BACKENDS in CMakeLists.txt. +// Do not edit by hand. Binds each descriptor to its server class's create() +// (server-only: pulls in server classes, compiled into lemond not the CLI). 
+// +#include +#include "lemon/backends/backend_registry.h" +@LEMON_DESCRIPTOR_INCLUDES@ +@LEMON_FACTORY_INCLUDES@ +namespace lemon { +namespace backends { + +inline std::vector generated_registrations() { + return { +@LEMON_FACTORY_ENTRIES@ + }; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp new file mode 100644 index 000000000..5e0de071f --- /dev/null +++ b/src/cpp/server/backends/backend_registry.cpp @@ -0,0 +1,31 @@ +#include "lemon/backends/backend_registry.h" +#include "lemon/wrapped_server.h" + +// Generated from LEMON_BACKENDS at configure time. Defines +// lemon::backends::generated_registrations(), pairing each descriptor with its +// server class's create(). +#include "backend_factories_generated.h" + +namespace lemon { +namespace backends { + +const std::vector& all_registrations() { + static const std::vector kRegistrations = generated_registrations(); + return kRegistrations; +} + +std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx) { + for (const auto& reg : all_registrations()) { + if (reg.descriptor->recipe == recipe) { + std::unique_ptr server = reg.create(ctx); + if (server) { + server->set_descriptor(reg.descriptor); + } + return server; + } + } + return nullptr; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/cloud_descriptor.cpp b/src/cpp/server/backends/cloud_descriptor.cpp new file mode 100644 index 000000000..fe87a32a2 --- /dev/null +++ b/src/cpp/server/backends/cloud_descriptor.cpp @@ -0,0 +1,23 @@ +#include "lemon/backends/cloud_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor cloud_descriptor = { + /*recipe*/ "cloud", + /*display_name*/ "Cloud", + /*binary*/ "", // no subprocess: runs on a remote provider + /*config_section*/ "cloud", + /*default_device*/ DEVICE_NONE, + /*slot_policy*/ SlotPolicy::Unmetered, // never counts 
toward slots, never auto-evicted + /*selectable_backend*/ false, + /*uses_ctx_size*/ false, + /*dynamic_models*/ true, // models discovered at runtime from the provider + /*options*/ {}, + /*support*/ {}, // no local gating: install/support machinery skips cloud + /*default_labels*/ {}, + /*required_checkpoints*/ {}, // no downloaded files +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/cloud_factory.cpp b/src/cpp/server/backends/cloud_factory.cpp new file mode 100644 index 000000000..cee2c4ab5 --- /dev/null +++ b/src/cpp/server/backends/cloud_factory.cpp @@ -0,0 +1,16 @@ +#include "lemon/backends/cloud_factory.h" +#include "lemon/backends/cloud_server.h" +#include "lemon/model_manager.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr cloud_create(const BackendContext& ctx) { + return std::make_unique( + ctx.model_info->cloud_provider, ctx.log_level, + ctx.model_manager, ctx.backend_manager, ctx.cloud_registry); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm_descriptor.cpp b/src/cpp/server/backends/fastflowlm_descriptor.cpp new file mode 100644 index 000000000..7b67b8d42 --- /dev/null +++ b/src/cpp/server/backends/fastflowlm_descriptor.cpp @@ -0,0 +1,29 @@ +#include "lemon/backends/fastflowlm_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor fastflowlm_descriptor = { + /*recipe*/ "flm", + /*display_name*/ "FastFlowLM NPU", +#ifdef _WIN32 + /*binary*/ "flm.exe", +#else + /*binary*/ "flm", +#endif + /*config_section*/ "flm", + /*default_device*/ DEVICE_NPU, + /*slot_policy*/ SlotPolicy::CoexistByType, + /*selectable_backend*/ false, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ {}, + /*support*/ { + {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, +}; + +} // namespace backends +} // 
namespace lemon diff --git a/src/cpp/server/backends/fastflowlm_factory.cpp b/src/cpp/server/backends/fastflowlm_factory.cpp new file mode 100644 index 000000000..96eddd998 --- /dev/null +++ b/src/cpp/server/backends/fastflowlm_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/fastflowlm_factory.h" +#include "lemon/backends/fastflowlm_server.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr fastflowlm_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/kokoro_descriptor.cpp b/src/cpp/server/backends/kokoro_descriptor.cpp new file mode 100644 index 000000000..281f0e0f1 --- /dev/null +++ b/src/cpp/server/backends/kokoro_descriptor.cpp @@ -0,0 +1,30 @@ +#include "lemon/backends/kokoro_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor kokoro_descriptor = { + /*recipe*/ "kokoro", + /*display_name*/ "Kokoro", +#ifdef _WIN32 + /*binary*/ "koko.exe", +#else + /*binary*/ "koko", +#endif + /*config_section*/ "kokoro", + /*default_device*/ DEVICE_CPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ false, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ {}, + /*support*/ { + {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}}, + {"kokoro", "metal", {"macos"}, {{"metal", {}}}}, + }, + /*default_labels*/ {}, // kokoro models carry "tts" explicitly in server_models.json + /*required_checkpoints*/ {"main"}, +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/kokoro_factory.cpp b/src/cpp/server/backends/kokoro_factory.cpp new file mode 100644 index 000000000..a7d4f3be8 --- /dev/null +++ b/src/cpp/server/backends/kokoro_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/kokoro_factory.h" +#include "lemon/backends/kokoro_server.h" +#include 
"lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr kokoro_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp_descriptor.cpp b/src/cpp/server/backends/llamacpp_descriptor.cpp new file mode 100644 index 000000000..f426e9f20 --- /dev/null +++ b/src/cpp/server/backends/llamacpp_descriptor.cpp @@ -0,0 +1,43 @@ +#include "lemon/backends/llamacpp_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor llamacpp_descriptor = { + /*recipe*/ "llamacpp", + /*display_name*/ "Llama.cpp GPU", +#ifdef _WIN32 + /*binary*/ "llama-server.exe", +#else + /*binary*/ "llama-server", +#endif + /*config_section*/ "llamacpp", + /*default_device*/ DEVICE_GPU, // cpu/system variants resolve to CPU via effective_device() + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ { + {"llamacpp_backend", "--llamacpp", "", "BACKEND", + "LlamaCpp backend to use", "Llama.cpp Backend Options"}, + {"llamacpp_device", "--llamacpp-device", "", "DEVICES", + "Comma-separated list of accelerator devices to use (e.g. 
Vulkan0)", "Llama.cpp Backend Options"}, + {"llamacpp_args", "--llamacpp-args", "", "ARGS", + "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"}, + }, + /*support*/ { + {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}}, + {"llamacpp", "metal", {"macos"}, {{"metal", {}}}}, + {"llamacpp", "cuda", {"windows", "linux"}, + {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}}, + {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}}, + {"llamacpp", "rocm", {"windows", "linux"}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}}, + {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp_factory.cpp b/src/cpp/server/backends/llamacpp_factory.cpp new file mode 100644 index 000000000..cd34fab5a --- /dev/null +++ b/src/cpp/server/backends/llamacpp_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/llamacpp_factory.h" +#include "lemon/backends/llamacpp_server.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr llamacpp_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/moonshine_descriptor.cpp b/src/cpp/server/backends/moonshine_descriptor.cpp new file mode 100644 index 000000000..63277ad3c --- /dev/null +++ b/src/cpp/server/backends/moonshine_descriptor.cpp @@ -0,0 +1,30 @@ +#include "lemon/backends/moonshine_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor moonshine_descriptor = { + /*recipe*/ "moonshine", + /*display_name*/ "Moonshine", + /*binary*/ "moonshine-server", + /*config_section*/ "moonshine", + 
/*default_device*/ DEVICE_CPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ false, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ { + {"moonshine_args", "--moonshine-args", "", "ARGS", + "Custom arguments to pass to moonshine-server", ""}, + }, + /*support*/ { + {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}}, + {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}}, + {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}}, + }, + /*default_labels*/ {"transcription", "realtime-transcription"}, + /*required_checkpoints*/ {"main"}, +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/moonshine_factory.cpp b/src/cpp/server/backends/moonshine_factory.cpp new file mode 100644 index 000000000..859b37b30 --- /dev/null +++ b/src/cpp/server/backends/moonshine_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/moonshine_factory.h" +#include "lemon/backends/moonshine_server.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr moonshine_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/ryzenai_descriptor.cpp b/src/cpp/server/backends/ryzenai_descriptor.cpp new file mode 100644 index 000000000..23651ec94 --- /dev/null +++ b/src/cpp/server/backends/ryzenai_descriptor.cpp @@ -0,0 +1,29 @@ +#include "lemon/backends/ryzenai_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor ryzenai_descriptor = { + /*recipe*/ "ryzenai-llm", + /*display_name*/ "Ryzen AI LLM", +#ifdef _WIN32 + /*binary*/ "ryzenai-server.exe", +#else + /*binary*/ "ryzenai-server", +#endif + /*config_section*/ "ryzenai", + /*default_device*/ DEVICE_NPU, + /*slot_policy*/ SlotPolicy::ExclusiveNpu, + /*selectable_backend*/ false, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + 
/*options*/ {}, + /*support*/ { + {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/ryzenai_factory.cpp b/src/cpp/server/backends/ryzenai_factory.cpp new file mode 100644 index 000000000..4e013a30c --- /dev/null +++ b/src/cpp/server/backends/ryzenai_factory.cpp @@ -0,0 +1,20 @@ +#include "lemon/backends/ryzenai_factory.h" +#include "lemon/backends/ryzenaiserver.h" +#include "lemon/model_manager.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr ryzenai_create(const BackendContext& ctx) { + // RyzenAI resolves its model path before load (set_model_path), matching the + // original router factory's special-casing. + auto server = std::make_unique<::lemon::RyzenAIServer>( + ctx.model_info->model_name, ctx.log_level == "debug", + ctx.model_manager, ctx.backend_manager); + server->set_model_path(ctx.model_info->resolved_path()); + return server; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/sdcpp_descriptor.cpp b/src/cpp/server/backends/sdcpp_descriptor.cpp new file mode 100644 index 000000000..10ebfdd58 --- /dev/null +++ b/src/cpp/server/backends/sdcpp_descriptor.cpp @@ -0,0 +1,47 @@ +#include "lemon/backends/sdcpp_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor sdcpp_descriptor = { + /*recipe*/ "sd-cpp", + /*display_name*/ "StableDiffusion.cpp", +#ifdef _WIN32 + /*binary*/ "sd-server.exe", +#else + /*binary*/ "sd-server", +#endif + /*config_section*/ "sdcpp", + /*default_device*/ DEVICE_CPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ { + {"sd-cpp_backend", "--sdcpp", "", "BACKEND", + "SD.cpp backend to use", "Stable Diffusion Options"}, + {"sdcpp_args", "--sdcpp-args", "", "ARGS", 
+ "Custom arguments to pass to sd-server (must not conflict with managed args)", "Stable Diffusion Options"}, + // Image generation defaults (recipe-level only, not CLI flags). + {"steps", "", 20, "SIZE", "Number of diffusion steps", "Stable Diffusion Options"}, + {"cfg_scale", "", 7.0, "SIZE", "Classifier-free guidance scale", "Stable Diffusion Options"}, + {"width", "", 512, "SIZE", "Output image width", "Stable Diffusion Options"}, + {"height", "", 512, "SIZE", "Output image height", "Stable Diffusion Options"}, + {"sampling_method", "", "", "ARGS", "Sampling method", "Stable Diffusion Options"}, + {"flow_shift", "", 0.0, "SIZE", "Flow shift", "Stable Diffusion Options"}, + }, + /*support*/ { + {"sd-cpp", "rocm", {"windows", "linux"}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}}, + {"sd-cpp", "cuda", {"linux"}, + {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}}, + {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}}, + {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}}, + {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}}, + }, + /*default_labels*/ {"image"}, + /*required_checkpoints*/ {"main"}, // flux text_encoder+vae validated together in load() +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/sdcpp_factory.cpp b/src/cpp/server/backends/sdcpp_factory.cpp new file mode 100644 index 000000000..009fffd43 --- /dev/null +++ b/src/cpp/server/backends/sdcpp_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/sdcpp_factory.h" +#include "lemon/backends/sd_server.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr sdcpp_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git 
a/src/cpp/server/backends/vllm_descriptor.cpp b/src/cpp/server/backends/vllm_descriptor.cpp new file mode 100644 index 000000000..54451f365 --- /dev/null +++ b/src/cpp/server/backends/vllm_descriptor.cpp @@ -0,0 +1,30 @@ +#include "lemon/backends/vllm_descriptor.h" + +namespace lemon { +namespace backends { + +const BackendDescriptor vllm_descriptor = { + /*recipe*/ "vllm", + /*display_name*/ "vLLM ROCm (experimental)", + /*binary*/ "vllm-server", + /*config_section*/ "vllm", + /*default_device*/ DEVICE_GPU, + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ true, + /*dynamic_models*/ false, + /*options*/ { + {"vllm_backend", "--vllm", "", "BACKEND", + "vLLM backend to use", "vLLM Options"}, + {"vllm_args", "--vllm-args", "", "ARGS", + "Custom arguments to pass to vllm-server", "vLLM Options"}, + }, + /*support*/ { + {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}}, + }, + /*default_labels*/ {}, + /*required_checkpoints*/ {"main"}, +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/vllm_factory.cpp b/src/cpp/server/backends/vllm_factory.cpp new file mode 100644 index 000000000..20fd71851 --- /dev/null +++ b/src/cpp/server/backends/vllm_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/vllm_factory.h" +#include "lemon/backends/vllm_server.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr vllm_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/whispercpp_descriptor.cpp b/src/cpp/server/backends/whispercpp_descriptor.cpp new file mode 100644 index 000000000..6124e779e --- /dev/null +++ b/src/cpp/server/backends/whispercpp_descriptor.cpp @@ -0,0 +1,39 @@ +#include "lemon/backends/whispercpp_descriptor.h" + +namespace lemon { +namespace backends 
{ + +const BackendDescriptor whispercpp_descriptor = { + /*recipe*/ "whispercpp", + /*display_name*/ "Whisper.cpp", +#ifdef _WIN32 + /*binary*/ "whisper-server.exe", +#else + /*binary*/ "whisper-server", +#endif + /*config_section*/ "whispercpp", + /*default_device*/ DEVICE_CPU, // npu variant resolves to NPU + ExclusiveNpu via effective_*() + /*slot_policy*/ SlotPolicy::Standard, + /*selectable_backend*/ true, + /*uses_ctx_size*/ false, + /*dynamic_models*/ false, + /*options*/ { + {"whispercpp_backend", "--whispercpp", "", "BACKEND", + "WhisperCpp backend to use", "Whisper.cpp Options"}, + {"whispercpp_args", "--whispercpp-args", "", "ARGS", + "Custom arguments to pass to whisper-server", "Whisper.cpp Options"}, + }, + /*support*/ { + {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}}, + {"whispercpp", "rocm", {"windows", "linux"}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}}, + {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}}, + {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}}, + {"whispercpp", "metal", {"macos"}, {{"metal", {}}}}, + }, + /*default_labels*/ {"transcription", "realtime-transcription"}, + /*required_checkpoints*/ {"main"}, // npu_cache validated in load() (npu variant only) +}; + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/whispercpp_factory.cpp b/src/cpp/server/backends/whispercpp_factory.cpp new file mode 100644 index 000000000..3223804aa --- /dev/null +++ b/src/cpp/server/backends/whispercpp_factory.cpp @@ -0,0 +1,13 @@ +#include "lemon/backends/whispercpp_factory.h" +#include "lemon/backends/whisper_server.h" +#include "lemon/wrapped_server.h" + +namespace lemon { +namespace backends { + +std::unique_ptr whispercpp_create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace backends +} // namespace lemon diff --git 
a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 1036e6bb6..febdc4ec8 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -618,6 +619,35 @@ static void parse_image_defaults(ModelInfo& info, const json& model_json) { } } +// Populate ModelInfo::extras with any model-JSON key not consumed by a typed +// ModelInfo field. This lets a new backend read custom per-model fields in load() +// without editing the shared ModelInfo struct. Keep this set in sync with the +// keys read by the parse blocks in build_cache(). +static void parse_extras(ModelInfo& info, const json& model_json) { + static const std::set kKnownKeys = { + "checkpoint", "checkpoints", "components", "mmproj", "recipe", "suggested", + "hf_load", "source", "size", "cloud_provider", "moonshine_arch", + "labels", "image_defaults", "recipe_options" + }; + if (!model_json.is_object()) return; + for (auto& [key, value] : model_json.items()) { + if (kKnownKeys.count(key) == 0) { + info.extras[key] = value; + } + } +} + +// Default device for a recipe: the backend descriptor is authoritative for +// registered backends; collection/unknown recipes fall back to the recipe map. +// (A backend whose device depends on the chosen backend variant resolves the +// final device at load time via WrappedServer::effective_device.) +static DeviceType device_type_for_recipe(const std::string& recipe) { + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + return desc->default_device; + } + return get_device_type_from_recipe(recipe); +} + // Build merged recipe options: image_defaults -> JSON recipe_options -> user-saved overrides. // json_recipe_options: pre-extracted recipe_options for this model (from build_cache's // two-phase pattern). Pass a null json if the model JSON should be read directly instead. 
@@ -1276,7 +1306,7 @@ std::map ModelManager::discover_extra_models() const { info.downloaded = true; info.source = EXTRA_MODEL_SOURCE; info.labels.push_back("custom"); - info.device = get_device_type_from_recipe(EXTRA_MODEL_RECIPE); + info.device = device_type_for_recipe(EXTRA_MODEL_RECIPE); return info; }; @@ -2045,6 +2075,7 @@ void ModelManager::build_cache() { } parse_image_defaults(info, value); + parse_extras(info, value); // Parse recipe_options if present (for per-model runtime config like sdcpp_args) if (value.contains("recipe_options") && value["recipe_options"].is_object()) { @@ -2053,7 +2084,7 @@ void ModelManager::build_cache() { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); try { resolve_all_model_paths(info); @@ -2098,6 +2129,7 @@ void ModelManager::build_cache() { } parse_image_defaults(info, value); + parse_extras(info, value); // Parse recipe_options if present (for per-model runtime config like sdcpp_args) if (value.contains("recipe_options") && value["recipe_options"].is_object()) { @@ -2106,7 +2138,7 @@ void ModelManager::build_cache() { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); try { resolve_all_model_paths(info); @@ -2287,7 +2319,7 @@ void ModelManager::add_model_to_cache(const std::string& model_name) { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); resolve_all_model_paths(info); @@ -2862,16 +2894,12 @@ void ModelManager::register_user_model(const std::string& model_name, // loop above; this local is just for the label 
inference below. std::string recipe = model_data.value("recipe", ""); - if (recipe == "sd-cpp") { - labels.insert("image"); - } - if (recipe == "whispercpp") { - labels.insert("transcription"); - labels.insert("realtime-transcription"); - } - if (recipe == "moonshine") { - labels.insert("transcription"); - labels.insert("realtime-transcription"); + // Inject the backend's default labels for models that omit them (e.g. sd-cpp + // -> image, whispercpp/moonshine -> transcription). Sourced from the descriptor. + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + for (const auto& label : desc->default_labels) { + labels.insert(label); + } } model_entry["labels"] = labels; @@ -3100,7 +3128,7 @@ std::vector ModelManager::get_flm_available_models() { // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); - info.device = get_device_type_from_recipe(info.recipe); + info.device = device_type_for_recipe(info.recipe); flm_models.push_back(info); } diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp index 65d4bb676..70c188e34 100644 --- a/src/cpp/server/recipe_options.cpp +++ b/src/cpp/server/recipe_options.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -12,78 +13,68 @@ namespace lemon { using json = nlohmann::json; -static const json DEFAULTS = { - {"ctx_size", -1}, // -1 triggers auto-resolution (memory + arch metadata) - {"merge_args", true}, - {"llamacpp_device", ""}, - {"llamacpp_backend", ""}, // Will be overridden dynamically - {"llamacpp_args", ""}, - {"sd-cpp_backend", ""}, // "" means auto-detect (mapped from "auto" in config.json) - {"sdcpp_args", ""}, - {"whispercpp_backend", ""}, // "" means auto-detect (mapped from "auto" in config.json) - {"whispercpp_args", ""}, - {"moonshine_args", ""}, // Custom arguments to pass to moonshine-server - // Image generation defaults (for sd-cpp recipe) - // These are recipe-level defaults only, not CLI 
arguments — per reviewer guidance, - // there are too many image gen params for CLI flags, and no universal defaults. - {"steps", 20}, - {"cfg_scale", 7.0}, - {"width", 512}, - {"height", 512}, - {"sampling_method", ""}, - {"flow_shift", 0.0}, - // vLLM-specific options - {"vllm_backend", ""}, // "" means auto-detect - {"vllm_args", ""}, // Custom arguments to pass to vllm-server - // Cloud recipe has no backend variants (provider selection lives on the - // per-model cloud_provider field). The empty string satisfies Router's - // per-backend-args lookup; cloud reads no backend-specific config. - {"cloud_backend", ""}, - - // Auto-eviction options - {"auto_evict", nullptr}, // nullptr means fallback to global config - {"evict_idle_timeout", 300}, // Default hard idle timeout (5 mins) - {"downsize_idle_timeout", 60}, // Default soft idle timeout (1 min) - {"evict_weight_factor", 1.0}, // Eviction-protection weight (higher = more protected) - {"pinned", false} -}; - - -// Mapping from flat option names to CLI flags (used by to_cli_options) -// Note: Image generation params (steps, cfg_scale, width, height, sampling_method, -// flow_shift) are recipe-level defaults only — not exposed as CLI arguments. -// Runtime options (diffusion_fa, offload_to_cpu) go through --sdcpp-args. -static const std::map OPTION_TO_CLI_FLAG = { - {"ctx_size", "--ctx-size"}, - {"merge_args", "--merge-args"}, - {"llamacpp_backend", "--llamacpp"}, - {"llamacpp_device", "--llamacpp-device"}, - {"llamacpp_args", "--llamacpp-args"}, - {"sd-cpp_backend", "--sdcpp"}, - {"sdcpp_args", "--sdcpp-args"}, - {"whispercpp_backend", "--whispercpp"}, - {"whispercpp_args", "--whispercpp-args"}, - {"moonshine_args", "--moonshine-args"}, - {"vllm_backend", "--vllm"}, - {"vllm_args", "--vllm-args"} -}; +// Options shared by every backend. Per-backend options (and ctx_size opt-in) +// come from each backend's descriptor; these are the universal kit. 
+static const json& common_defaults() {
+    // Constructed on first use and shared by every later caller.
+    static const json kCommonDefaults = {
+        // -1 asks for auto-resolution of the context size (memory + arch metadata).
+        {"ctx_size", -1},
+        {"merge_args", true},
+        // Auto-eviction options (apply to every recipe).
+        // A null auto_evict means "fall back to the global config".
+        {"auto_evict", nullptr},
+        // Default hard idle timeout (5 mins).
+        {"evict_idle_timeout", 300},
+        // Default soft idle timeout (1 min).
+        {"downsize_idle_timeout", 60},
+        // Eviction-protection weight; a higher value means more protected.
+        {"evict_weight_factor", 1.0},
+        {"pinned", false},
+    };
+    return kCommonDefaults;
+}
+
+// Full default table: the common kit with every registered backend
+// descriptor's declared options layered on top. It is assembled once, from
+// the registry, so config defaults, CLI flags, and load-time resolution all
+// read the same descriptor-derived values.
+static const json& get_defaults() {
+    // Captureless builder; only invoked while kAllDefaults is initialized.
+    const auto build_merged_defaults = [] {
+        json merged = common_defaults();
+        for (const auto* descriptor : lemon::backends::all_descriptors()) {
+            for (const auto& option : descriptor->options) {
+                // Assignment replaces any existing entry with the same name.
+                merged[option.name] = option.default_value;
+            }
+        }
+        return merged;
+    };
+    static const json kAllDefaults = build_merged_defaults();
+    return kAllDefaults;
+}
+
+// Flat option name -> CLI flag, for to_cli_options(). ctx_size/merge_args are
+// the common flags; the rest come from descriptor options that declare a flag.
+static const std::map& get_option_to_cli_flag() { + static const std::map mapping = [] { + std::map m{ + {"ctx_size", "--ctx-size"}, + {"merge_args", "--merge-args"}, + }; + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& opt : desc->options) { + if (!opt.cli_flag.empty()) { + m[opt.name] = opt.cli_flag; + } + } + } + return m; + }(); + return mapping; +} static std::vector get_keys_for_recipe(const std::string& recipe) { std::vector keys; - if (recipe == "llamacpp") { - keys = {"ctx_size", "llamacpp_device", "llamacpp_backend", "llamacpp_args", "merge_args"}; - } else if (recipe == "whispercpp") { - keys = {"whispercpp_backend", "whispercpp_args", "merge_args"}; - } else if (recipe == "moonshine") { - keys = {"moonshine_args", "merge_args"}; - } else if (recipe == "flm") { - return {"ctx_size", "merge_args"}; - } else if (recipe == "ryzenai-llm") { - keys = {"ctx_size"}; - } else if (recipe == "sd-cpp") { - keys = {"sd-cpp_backend", "sdcpp_args", "steps", "cfg_scale", "width", "height", "sampling_method", "flow_shift", "merge_args"}; - } else if (recipe == "vllm") { - keys = {"ctx_size", "vllm_backend", "vllm_args", "merge_args"}; + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + if (desc->uses_ctx_size) { + keys.push_back("ctx_size"); + } + for (const auto& opt : desc->options) { + keys.push_back(opt.name); + } + keys.push_back("merge_args"); } // Add auto-eviction options for all recipes @@ -125,7 +116,7 @@ static bool try_get_backend_options(const std::string& opt_name, SystemInfo::Sup std::vector RecipeOptions::to_cli_options(const json& raw_options) { std::vector cli; - for (auto& [opt_name, cli_flag] : OPTION_TO_CLI_FLAG) { + for (auto& [opt_name, cli_flag] : get_option_to_cli_flag()) { if (raw_options.contains(opt_name)) { auto val = raw_options[opt_name]; if (!val.is_null() && val != "") { @@ -146,7 +137,7 @@ std::vector RecipeOptions::to_cli_options(const json& raw_options) std::vector 
RecipeOptions::known_keys() { std::vector keys; - for (auto& [key, value] : DEFAULTS.items()) { + for (auto& [key, value] : get_defaults().items()) { keys.push_back(key); } return keys; @@ -239,7 +230,7 @@ json RecipeOptions::get_option(const std::string& opt) const { } } #endif - return DEFAULTS.contains(opt) ? DEFAULTS[opt] : json(); + return get_defaults().contains(opt) ? get_defaults()[opt] : json(); } void RecipeOptions::set_option(const std::string& opt, const json& value) { @@ -247,29 +238,38 @@ void RecipeOptions::set_option(const std::string& opt, const json& value) { } #ifdef LEMONADE_CLI -// CLI_OPTIONS used only by the lemonade CLI client for add_cli_options -static const json CLI_OPTIONS = { - {"--ctx-size", {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}}}, - {"--merge-args", {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}}}, - {"--llamacpp", {{"option_name", "llamacpp_backend"}, {"type_name", "BACKEND"}, {"help", "LlamaCpp backend to use"}, {"group", "Llama.cpp Backend Options"}}}, - {"--llamacpp-device", {{"option_name", "llamacpp_device"}, {"type_name", "DEVICES"}, {"help", "Comma-separated list of accelerator devices to use (e.g. 
Vulkan0)"}, {"group", "Llama.cpp Backend Options"}}}, - {"--llamacpp-args", {{"option_name", "llamacpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to llama-server"}, {"group", "Llama.cpp Backend Options"}}}, - {"--sdcpp", {{"option_name", "sd-cpp_backend"}, {"type_name", "BACKEND"}, {"help", "SD.cpp backend to use"}, {"group", "Stable Diffusion Options"}}}, - {"--sdcpp-args", {{"option_name", "sdcpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to sd-server (must not conflict with managed args)"}, {"group", "Stable Diffusion Options"}}}, - {"--whispercpp", {{"option_name", "whispercpp_backend"}, {"type_name", "BACKEND"}, {"help", "WhisperCpp backend to use"}, {"group", "Whisper.cpp Options"}}}, - {"--whispercpp-args", {{"option_name", "whispercpp_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to whisper-server"}, {"group", "Whisper.cpp Options"}}}, - {"--moonshine-args", {{"option_name", "moonshine_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to moonshine-server"}}}, - {"--vllm", {{"option_name", "vllm_backend"}, {"type_name", "BACKEND"}, {"help", "vLLM backend to use"}, {"group", "vLLM Options"}}}, - {"--vllm-args", {{"option_name", "vllm_args"}, {"type_name", "ARGS"}, {"help", "Custom arguments to pass to vllm-server"}, {"group", "vLLM Options"}}}, - // Note: Image gen params (--steps, --cfg-scale, --width, --height) removed — recipe-level only. - // Runtime options (--diffusion-fa, --offload-to-cpu) go through --sdcpp-args. -}; +// CLI_OPTIONS used only by the lemonade CLI client for add_cli_options. +// ctx_size/merge_args are the common flags; everything else is derived from +// descriptor options that declare a CLI flag, so the CLI never needs editing +// when a backend is added. Image-gen params (steps/cfg_scale/width/height) have +// no cli_flag in their descriptor, so they stay recipe-level only as before. 
+static const json& get_cli_options() { + static const json cli_options = [] { + json o = json::object(); + o["--ctx-size"] = {{"option_name", "ctx_size"}, {"type_name", "SIZE"}, {"help", "Context size for the model"}, {"group", "General Options"}}; + o["--merge-args"] = {{"option_name", "merge_args"}, {"type_name", "BOOL"}, {"help", "Merge global and model arguments when loading the model"}, {"group", "General Options"}}; + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& opt : desc->options) { + if (opt.cli_flag.empty()) { + continue; + } + json entry = {{"option_name", opt.name}, {"type_name", opt.type_name}, {"help", opt.help}}; + if (!opt.group.empty()) { + entry["group"] = opt.group; + } + o[opt.cli_flag] = entry; + } + } + return o; + }(); + return cli_options; +} void RecipeOptions::add_cli_options(CLI::App& app, json& storage) { - for (auto& [key, opt] : CLI_OPTIONS.items()) { + for (auto& [key, opt] : get_cli_options().items()) { const std::string opt_name = opt["option_name"]; CLI::Option* o; - json defval = DEFAULTS[opt_name]; + json defval = get_defaults()[opt_name]; if (defval.is_number_float()) { o = app.add_option_function(key, [opt_name, &storage = storage](double val) { storage[opt_name] = val; }, opt["help"]); diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp index b3ec22c3b..a3c4bec74 100644 --- a/src/cpp/server/router.cpp +++ b/src/cpp/server/router.cpp @@ -1,5 +1,6 @@ #include "lemon/router.h" #include "lemon/cloud_provider_registry.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/cloud_server.h" #include "lemon/backends/llamacpp_server.h" #include "lemon/backends/fastflowlm_server.h" @@ -143,12 +144,26 @@ bool Router::reload_model_after_watchdog_reset(const std::string& requested_mode } } +// Slot/eviction policy for a recipe, from its descriptor (default Standard). 
+// This is the recipe-static policy used for pre-load slot decisions, mirroring +// the historical use of get_device_type_from_recipe at load time. +static SlotPolicy slot_policy_for_recipe(const std::string& recipe) { + if (const auto* desc = backends::descriptor_for(recipe)) { + return desc->slot_policy; + } + return SlotPolicy::Standard; +} + +static bool is_unmetered_recipe(const std::string& recipe) { + return slot_policy_for_recipe(recipe) == SlotPolicy::Unmetered; +} + int Router::count_servers_by_type(ModelType type) const { int count = 0; for (const auto& server : loaded_servers_) { - // Cloud servers consume no local memory and stay loaded for free, so - // they are excluded from the slot accounting that drives LRU eviction. - if (server->get_recipe_options().get_recipe() == "cloud") { + // Unmetered backends (cloud) consume no local memory and stay loaded for + // free, so they are excluded from the slot accounting that drives LRU eviction. + if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) { continue; } if (server->is_backend_alive() && server->get_model_type() == type) { @@ -162,10 +177,10 @@ WrappedServer* Router::find_lru_server_by_type(ModelType type) const { WrappedServer* lru = nullptr; for (const auto& server : loaded_servers_) { - // Cloud servers are not eviction candidates; they have no memory cost - // and reloading them is essentially free, but evicting them throws - // away the cached api key/upstream-id binding for no benefit. - if (server->get_recipe_options().get_recipe() == "cloud") { + // Unmetered backends (cloud) are not eviction candidates; they have no + // memory cost and reloading them is essentially free, but evicting them + // throws away the cached api key/upstream-id binding for no benefit. 
+ if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) { continue; } if (server->is_backend_alive() && server->get_model_type() == type) { @@ -299,49 +314,28 @@ void Router::simulate_vram_pressure(double pct) { } std::unique_ptr Router::create_backend_server(const ModelInfo& model_info) { - std::unique_ptr new_server; std::string log_level = config_->log_level(); - if (model_info.recipe == "cloud") { - LOG(DEBUG, "Router") << "Creating CloudServer backend (provider: " - << model_info.cloud_provider << ")" << std::endl; - new_server = std::make_unique(model_info.cloud_provider, log_level, - model_manager_, backend_manager_, - cloud_registry_); - } else if (model_info.recipe == "whispercpp") { - LOG(DEBUG, "Router") << "Creating WhisperServer backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "moonshine") { - LOG(DEBUG, "Router") << "Creating MoonshineServer backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "kokoro") { - LOG(DEBUG, "Router") << "Creating Kokoro backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "sd-cpp") { - LOG(DEBUG, "Router") << "Creating SDServer backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "flm") { - LOG(DEBUG, "Router") << "Creating FastFlowLM backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else if (model_info.recipe == "ryzenai-llm") { - LOG(DEBUG, "Router") << "Creating RyzenAI-Server backend" << std::endl; - - std::string model_path = model_info.resolved_path(); - LOG(DEBUG, "Router") << "Using model path: " << model_path << std::endl; - - auto* ryzenai_server = new RyzenAIServer(model_info.model_name, - log_level == "debug", 
model_manager_, backend_manager_); - ryzenai_server->set_model_path(model_path); - new_server.reset(ryzenai_server); - } else if (model_info.recipe == "vllm") { - LOG(DEBUG, "Router") << "Creating vLLM backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); - } else { - LOG(DEBUG, "Router") << "Creating LlamaCpp backend" << std::endl; - new_server = std::make_unique(log_level, model_manager_, backend_manager_); + backends::BackendContext ctx; + ctx.log_level = log_level; + ctx.model_manager = model_manager_; + ctx.backend_manager = backend_manager_; + ctx.cloud_registry = cloud_registry_; + ctx.model_info = &model_info; + + // The backend registry binds each recipe's descriptor to its create(). It is + // the single source of truth for backend construction (see LEMON_BACKENDS). + std::unique_ptr new_server = backends::create_server(model_info.recipe, ctx); + if (new_server) { + LOG(DEBUG, "Router") << "Created backend for recipe '" << model_info.recipe + << "' via registry" << std::endl; + return new_server; } - return new_server; + // Unknown recipe: fall back to llamacpp, preserving the historical default. 
+ LOG(DEBUG, "Router") << "No registered backend for recipe '" << model_info.recipe + << "', defaulting to LlamaCpp" << std::endl; + return std::make_unique(log_level, model_manager_, backend_manager_); } void Router::load_model(const std::string& model_name, @@ -427,28 +421,39 @@ void Router::load_model(const std::string& model_name, // Get max models for this type (same limit for all types) int max_models = config_->max_loaded_models(); - // NPU EXCLUSIVITY CHECK (recipe-aware rules) - // FLM can run up to 3 concurrent NPU processes (1 LLM + 1 transcription + 1 embedding) - // RyzenAI and WhisperCpp lock the entire NPU exclusively - if (device_type & DEVICE_NPU) { - if (model_info.recipe == "ryzenai-llm" || model_info.recipe == "whispercpp") { - // Exclusive NPU recipes - evict ALL NPU servers + // NPU EXCLUSIVITY CHECK — driven by the backend's slot policy (descriptor). + // ExclusiveNpu (ryzenai-llm, whisper-on-npu): lock the entire NPU, + // evicting ALL NPU servers first. + // CoexistByType (flm): coexist with other FLM types (max 1 per type), + // but evict exclusive-NPU peers. + // Standard/Unmetered backends share no device exclusivity. + switch (slot_policy_for_recipe(model_info.recipe)) { + case SlotPolicy::ExclusiveNpu: { if (has_npu_server()) { LOG(INFO, "Router") << model_info.recipe << " requires exclusive NPU access, evicting all NPU servers..." << std::endl; evict_all_npu_servers(); } - } else if (model_info.recipe == "flm") { - // FLM can coexist with other FLM types, but not with exclusive-NPU recipes - // 1. 
Evict any exclusive-NPU server (mutually exclusive) - for (const std::string& exclusive_recipe : {"ryzenai-llm", "whispercpp"}) { - WrappedServer* exclusive_server = find_npu_server_by_recipe(exclusive_recipe); - if (exclusive_server) { - LOG(INFO, "Router") << "FLM cannot coexist with " << exclusive_recipe - << ", evicting: " << exclusive_server->get_model_name() << std::endl; - evict_server(exclusive_server); + break; + } + case SlotPolicy::CoexistByType: { + // 1. Evict every NPU holder that is not itself a coexisting (FLM) + // backend — i.e. exclusive-NPU peers like ryzenai-llm and + // whisper-on-npu. Collect first; evict_server mutates loaded_servers_. + std::vector exclusive_peers; + for (const auto& server : loaded_servers_) { + if (server->is_backend_alive() && (server->get_device_type() & DEVICE_NPU) && + slot_policy_for_recipe(server->get_recipe_options().get_recipe()) != + SlotPolicy::CoexistByType) { + exclusive_peers.push_back(server.get()); } } + for (auto* peer : exclusive_peers) { + LOG(INFO, "Router") << "FLM cannot coexist with " + << peer->get_recipe_options().get_recipe() + << ", evicting: " << peer->get_model_name() << std::endl; + evict_server(peer); + } // 2. Evict FLM of the SAME model type (max 1 per type: 1 LLM, 1 transcription, 1 embed) WrappedServer* same_type_flm = find_flm_server_by_type(model_type); if (same_type_flm) { @@ -457,22 +462,20 @@ void Router::load_model(const std::string& model_name, << ", evicting..." << std::endl; evict_server(same_type_flm); } - } else { - // Unknown NPU recipe - default to exclusive access - if (has_npu_server()) { - LOG(INFO, "Router") << "Unknown NPU recipe, evicting all NPU servers..." << std::endl; - evict_all_npu_servers(); - } + break; } + case SlotPolicy::Standard: + case SlotPolicy::Unmetered: + break; } // LRU EVICTION CHECK (from spec: Least Recently Used Cache) - // Skip eviction if unlimited (-1). Cloud-recipe loads also skip the + // Skip eviction if unlimited (-1). 
Unmetered (cloud) loads also skip the // check entirely: they consume no local resources, so they have no // business kicking a warm local model out of memory. - bool is_cloud_load = (model_info.recipe == "cloud"); + bool is_unmetered_load = is_unmetered_recipe(model_info.recipe); int current_count = count_servers_by_type(model_type); - if (!is_cloud_load && max_models != -1 && current_count >= max_models) { + if (!is_unmetered_load && max_models != -1 && current_count >= max_models) { WrappedServer* lru = find_lru_server_by_type(model_type); if (lru) { LOG(INFO, "Router") << "Slot limit reached for type " diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp index 5aa62457d..08aa41dc6 100644 --- a/src/cpp/server/runtime_config.cpp +++ b/src/cpp/server/runtime_config.cpp @@ -1,4 +1,5 @@ #include "lemon/runtime_config.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/system_info.h" #include "lemon/utils/aixlog.hpp" #include "lemon/utils/path_utils.h" @@ -29,22 +30,26 @@ RuntimeConfig* RuntimeConfig::global() { return s_global_instance.load(std::memory_order_acquire); } -static const std::vector s_backend_names = { - "llamacpp", "whispercpp", "moonshine", "sdcpp", "flm", "vllm", "ryzenai", "kokoro" -}; - +// A valid config.json backend section is the config_section of any descriptor +// that runs a local subprocess (binary != ""). Cloud has no binary, so it is not +// a backend section. Derived from descriptors — no hand-maintained list. 
static bool is_backend_name(const std::string& key) { - return std::find(s_backend_names.begin(), s_backend_names.end(), key) != s_backend_names.end(); + for (const auto* desc : lemon::backends::all_descriptors()) { + if (!desc->binary.empty() && desc->effective_config_section() == key) { + return true; + } + } + return false; } -// Backends that have a selectable "backend" key -static const std::vector s_selectable_backends = { - "llamacpp", "whispercpp", "sdcpp", "vllm" -}; - +// A config section has a selectable "backend" key iff its descriptor opts in. static bool has_backend_selection(const std::string& config_section) { - return std::find(s_selectable_backends.begin(), s_selectable_backends.end(), - config_section) != s_selectable_backends.end(); + for (const auto* desc : lemon::backends::all_descriptors()) { + if (desc->selectable_backend && desc->effective_config_section() == config_section) { + return true; + } + } + return false; } static std::pair normalize_config_set_changes(const json& changes) { @@ -71,12 +76,18 @@ static std::pair normalize_config_set_changes(const json& cha } std::string RuntimeConfig::config_section_to_recipe(const std::string& config_section) { - if (config_section == "sdcpp") return "sd-cpp"; + for (const auto* desc : lemon::backends::all_descriptors()) { + if (desc->effective_config_section() == config_section) { + return desc->recipe; + } + } return config_section; } std::string RuntimeConfig::recipe_to_config_section(const std::string& recipe) { - if (recipe == "sd-cpp") return "sdcpp"; + if (const auto* desc = lemon::backends::descriptor_for(recipe)) { + return desc->effective_config_section(); + } return recipe; } diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index b95176957..d0fea0504 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -4329,9 +4329,8 @@ void Server::handle_bin_change(const std::string& section, std::string backend = bin_key.substr(0, bin_key.size() - 4); // The 
"server_bin" key (as in ryzenai.server_bin) is not consumed by the - // current install flow — find_external_backend_binary uses recipe-based - // section lookup and there is no recipe whose section equals "ryzenai". - // Skip the hot-swap rather than attempt an install that won't help. + // current install flow, so skip the hot-swap rather than attempt an install + // that won't help. if (backend == "server") { LOG(WARNING, "Server") << section << "." << bin_key << " is not consumed by the install flow; " diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index d2d3f7d51..6a27a4fb2 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -7,6 +7,8 @@ #include "lemon/utils/json_utils.h" #include "lemon/utils/process_manager.h" #include "lemon/backends/backend_utils.h" +#include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/recipe_backend_def.h" #include #include #include @@ -404,15 +406,8 @@ std::vector query_dxg_amd_gpus(const std::string& gpu_type) { // Recipe/Backend definition table - single source of truth for support matrix // ============================================================================ -// Device constraints: device_type -> set of allowed families (empty = all families) -using DeviceConstraints = std::map>; - -struct RecipeBackendDef { - std::string recipe; - std::string backend; - std::set supported_os; - DeviceConstraints devices; -}; +// RecipeBackendDef and DeviceConstraints are declared in lemon/recipe_backend_def.h +// so backend descriptors can carry their own support rows. // Recipe definitions table - single source of truth for all recipe/backend support // Format: {recipe, backend, {supported_os}, {{device_type, {allowed_families}}}} @@ -422,115 +417,22 @@ struct RecipeBackendDef { // Example: metal is listed before vulkan on macOS, vulkan before cpu elsewhere. 
// // Empty family set {} means "all families of that device type" -static const std::vector RECIPE_DEFS = { - // llamacpp with multiple backends (order = preference) - {"llamacpp", "system", {"linux"}, { - {"cpu", {"x86_64", "arm64"}}, // Placeholder, actual check is PATH-based - }}, - {"llamacpp", "metal", {"macos"}, - { - {"metal", {}}, - }}, - {"llamacpp", "cuda", {"windows", "linux"}, { - {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}, - }}, - {"llamacpp", "vulkan", {"windows", "linux"}, { - {"cpu", {"x86_64", "arm64"}}, - {"amd_gpu", {}}, // all AMD GPU families - }}, - {"llamacpp", "rocm", {"windows", "linux"}, { - {"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}, // STX iGPUs + RDNA2/3/4 dGPUs - }}, - {"llamacpp", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64", "arm64"}}, - }}, - - // whisper.cpp - NPU, ROCm GPU, Vulkan, CPU, Metal - {"whispercpp", "npu", {"windows"}, { - {"amd_npu", {"XDNA2"}}, - }}, - {"whispercpp", "rocm", {"windows", "linux"}, { - // gfx103X omitted: lemonade-sdk/whisper.cpp-rocm publishes no gfx103X - // ROCm whisper build, so advertising it would yield a 404 on install. 
- {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}, - }}, - {"whispercpp", "vulkan", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - {"amd_gpu", {}}, - }}, - {"whispercpp", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - }}, - {"whispercpp", "metal", {"macos"}, { - {"metal", {}}, - }}, - - // kokoro - Windows/Linux x86_64; macOS arm64 (Metal) - {"kokoro", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - }}, - {"kokoro", "metal", {"macos"}, { - {"metal", {}}, - }}, - - // stable-diffusion.cpp - ROCm backend for AMD GPUs - {"sd-cpp", "rocm", {"windows", "linux"}, { - {"amd_gpu", { - "gfx1150", "gfx1151", "gfx1152", - "gfx103X", "gfx110X", "gfx120X" - }}, - }}, - - // stable-diffusion.cpp - CUDA backend for NVIDIA GPUs (Linux) - {"sd-cpp", "cuda", {"linux"}, { - {"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}, - }}, - - // stable-diffusion.cpp - Vulkan backend (Windows/Linux x86_64) - {"sd-cpp", "vulkan", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - {"amd_gpu", {}}, - {"nvidia_gpu", {}}, - }}, - - // stable-diffusion.cpp - CPU backend (Windows/Linux x86_64) - {"sd-cpp", "cpu", {"windows", "linux"}, { - {"cpu", {"x86_64"}}, - }}, - - // stable-diffusion.cpp - Metal backend (macOS arm64) - {"sd-cpp", "metal", {"macos"}, { - {"metal", {}}, - }}, - - // FLM - NPU (XDNA2) - {"flm", "npu", {"windows", "linux"}, { - {"amd_npu", {"XDNA2"}}, - }}, - - // RyzenAI LLM - Windows NPU (XDNA2) - {"ryzenai-llm", "npu", {"windows"}, { - {"amd_npu", {"XDNA2"}}, - }}, - - // vLLM - ROCm backend for AMD GPUs (Linux only) - {"vllm", "rocm", {"linux"}, { - {"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}, - }}, - - // Moonshine - CPU-only streaming STT. Platforms match the published - // moonshine-server-rocm bundles (moonshine-voice wheels): Windows x64, - // Linux x64/arm64, macOS arm64. No Intel macOS or Windows-arm64 wheel. 
- {"moonshine", "cpu", {"windows"}, { - {"cpu", {"x86_64"}}, - }}, - {"moonshine", "cpu", {"linux"}, { - {"cpu", {"x86_64", "arm64"}}, - }}, - {"moonshine", "cpu", {"macos"}, { - {"cpu", {"arm64"}}, - }}, -}; +// The recipe/backend support matrix is assembled from every backend descriptor's +// `support` rows (see lemon/backends/*_descriptor.cpp). Concatenated in registry +// order; within a recipe, row order is the backend preference order. This is the +// single source of truth — there is no separate hand-maintained table. +static const std::vector& recipe_defs() { + static const std::vector defs = [] { + std::vector v; + for (const auto* desc : lemon::backends::all_descriptors()) { + for (const auto& row : desc->support) { + v.push_back(row); + } + } + return v; + }(); + return defs; +} // ============================================================================ // Device family to human-readable name mapping @@ -592,7 +494,7 @@ std::string SystemInfo::get_unsupported_backend_error(const std::string& recipe, std::string error; // Find the recipe/backend in RECIPE_DEFS - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (def.recipe == recipe && def.backend == backend) { // Collect all required family names std::vector family_names; @@ -1203,12 +1105,12 @@ json SystemInfo::build_recipes_info(const json& devices) { std::map configured_default_backends; if (auto* cfg = RuntimeConfig::global()) { std::set processed_recipes; - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (!processed_recipes.insert(def.recipe).second) continue; std::string section = RuntimeConfig::recipe_to_config_section(def.recipe); std::string backend = cfg->backend_string(section, "backend"); if (backend.empty() || backend == "auto") continue; - bool known = std::any_of(RECIPE_DEFS.begin(), RECIPE_DEFS.end(), + bool known = std::any_of(recipe_defs().begin(), recipe_defs().end(), [&](const RecipeBackendDef& d) { return 
d.recipe == def.recipe && d.backend == backend; }); @@ -1268,7 +1170,7 @@ json SystemInfo::build_recipes_info(const json& devices) { } // Build recipes from the definition table - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { // Skip if not supported on current OS if (def.supported_os.count(current_os) == 0) { // Helper to format OS name nicely @@ -1599,6 +1501,50 @@ json SystemInfo::build_recipes_info(const json& devices) { } } + // Enrich each recipe entry with descriptor metadata so clients (the desktop + // app, the docs generator) can render display names and per-recipe option + // schemas without hardcoding them. This is the single source the frontend + // reads instead of its own per-recipe TypeScript tables. + for (const auto* desc : lemon::backends::all_descriptors()) { + auto it = recipes.find(desc->recipe); + if (it == recipes.end()) { + continue; // recipe not surfaced on this system (e.g. cloud has no support rows) + } + json& entry = it.value(); + entry["display_name"] = desc->display_name; + entry["selectable_backend"] = desc->selectable_backend; + entry["uses_ctx_size"] = desc->uses_ctx_size; + // Machine-independent support matrix (OS + device families per backend), + // straight from the descriptor — used by the docs generator. 
+ json support = json::array(); + for (const auto& row : desc->support) { + json devices = json::array(); + for (const auto& [device, families] : row.devices) { + devices.push_back({{"device", device}, + {"families", std::vector(families.begin(), families.end())}}); + } + support.push_back({ + {"backend", row.backend}, + {"os", std::vector(row.supported_os.begin(), row.supported_os.end())}, + {"devices", devices}, + }); + } + entry["support"] = support; + json options = json::array(); + for (const auto& opt : desc->options) { + json o = { + {"name", opt.name}, + {"cli_flag", opt.cli_flag}, + {"default", opt.default_value}, + {"type_name", opt.type_name}, + {"help", opt.help}, + {"group", opt.group}, + }; + options.push_back(o); + } + entry["options"] = options; + } + return recipes; } @@ -1631,7 +1577,7 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std } // Collect remaining supported backends and capture first error (in preference order from RECIPE_DEFS) - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (def.recipe == recipe) { // Skip the default_backend since we already added it if (def.backend == default_backend) { @@ -1660,11 +1606,12 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std } std::string SystemInfo::check_recipe_supported(const std::string& recipe) { - // Cloud offload has no local hardware/OS requirements; availability is - // gated by the CloudProviderRegistry (config.json "cloud_providers") and - // a resolvable API key (env var or runtime auth), checked elsewhere in - // filter_models_by_backend / CloudServer::load. - if (recipe == "cloud") { + // A backend whose descriptor declares no support rows has no local + // hardware/OS gating (e.g. cloud offload): availability is determined at + // runtime (provider creds via the CloudProviderRegistry / API key), checked + // elsewhere in filter_models_by_backend / CloudServer::load. 
+ const auto* desc = lemon::backends::descriptor_for(recipe); + if (desc && desc->support.empty()) { return ""; } auto result = get_supported_backends(recipe); @@ -1685,7 +1632,7 @@ std::vector SystemInfo::get_all_recipe_statuses() { if (recipe_info.contains("backends") && recipe_info["backends"].is_object()) { // Iterate in preference order (from RECIPE_DEFS table) - for (const auto& def : RECIPE_DEFS) { + for (const auto& def : recipe_defs()) { if (def.recipe != recipe_name) continue; if (!recipe_info["backends"].contains(def.backend)) continue; From 2ef9379e1aa16a9296639d1129dad612a510a403 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 19 Jun 2026 16:25:18 -0400 Subject: [PATCH 02/39] refactor(backends): move each backend into its own folder (per spec) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructure the self-describing backends to the layout the issue #2287 plan specified — one folder per backend — instead of the flat file layout I used before. This also folds the earlier _descriptor/_factory split into the spec's cleaner shape: the descriptor is a header-only `inline const` and create() lives with the server class. Each backend now lives in its own folder, in namespace lemon::backends::: include/lemon/backends//.h inline const descriptor (CLI-safe data) include/lemon/backends//_server.h WrappedServer subclass + create() decl server/backends//_server.cpp implementation + create() def Shared registry/util files stay at the top of backends/. The CMake foreach over LEMON_BACKENDS compiles each /_server.cpp and generates the registry headers from the folder paths. Removes the per-backend *_descriptor.{h,cpp} and *_factory.{h,cpp} files. Behavior is unchanged (same descriptors, same create()). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 42 ++++----- docs/dev/adding-a-backend.md | 92 ++++++++++--------- .../lemon/backends/cloud/cloud.h} | 14 ++- .../lemon/backends/{ => cloud}/cloud_server.h | 15 ++- .../include/lemon/backends/cloud_descriptor.h | 13 --- .../include/lemon/backends/cloud_factory.h | 14 --- .../lemon/backends/fastflowlm/fastflowlm.h} | 14 ++- .../{ => fastflowlm}/fastflowlm_server.h | 15 ++- .../lemon/backends/fastflowlm_descriptor.h | 13 --- .../lemon/backends/fastflowlm_factory.h | 14 --- .../lemon/backends/kokoro/kokoro.h} | 14 ++- .../backends/{ => kokoro}/kokoro_server.h | 17 +++- .../lemon/backends/kokoro_descriptor.h | 13 --- .../include/lemon/backends/kokoro_factory.h | 14 --- .../lemon/backends/llamacpp/llamacpp.h} | 14 ++- .../backends/{ => llamacpp}/llamacpp_server.h | 15 ++- .../lemon/backends/llamacpp_descriptor.h | 13 --- .../include/lemon/backends/llamacpp_factory.h | 14 --- .../lemon/backends/moonshine/moonshine.h} | 14 ++- .../{ => moonshine}/moonshine_server.h | 17 +++- .../lemon/backends/moonshine_descriptor.h | 13 --- .../lemon/backends/moonshine_factory.h | 14 --- .../lemon/backends/ryzenai/ryzenai.h} | 14 ++- .../ryzenai_server.h} | 11 +++ .../lemon/backends/ryzenai_descriptor.h | 13 --- .../include/lemon/backends/ryzenai_factory.h | 14 --- .../lemon/backends/sdcpp/sdcpp.h} | 14 ++- .../{sd_server.h => sdcpp/sdcpp_server.h} | 23 +++-- .../include/lemon/backends/sdcpp_descriptor.h | 13 --- .../include/lemon/backends/sdcpp_factory.h | 14 --- .../lemon/backends/vllm/vllm.h} | 14 ++- .../lemon/backends/{ => vllm}/vllm_server.h | 15 ++- .../include/lemon/backends/vllm_descriptor.h | 13 --- src/cpp/include/lemon/backends/vllm_factory.h | 14 --- .../lemon/backends/whispercpp/whispercpp.h} | 14 ++- .../whispercpp_server.h} | 17 +++- .../lemon/backends/whispercpp_descriptor.h | 13 --- .../lemon/backends/whispercpp_factory.h | 14 --- src/cpp/server/backends/backend_utils.cpp | 16 ++-- .../backends/{ => 
cloud}/cloud_server.cpp | 18 +++- src/cpp/server/backends/cloud_factory.cpp | 16 ---- .../{ => fastflowlm}/fastflowlm_server.cpp | 15 ++- .../server/backends/fastflowlm_factory.cpp | 13 --- .../backends/{ => kokoro}/kokoro_server.cpp | 15 ++- src/cpp/server/backends/kokoro_factory.cpp | 13 --- .../{ => llamacpp}/llamacpp_server.cpp | 15 ++- src/cpp/server/backends/llamacpp_factory.cpp | 13 --- .../{ => moonshine}/moonshine_server.cpp | 15 ++- src/cpp/server/backends/moonshine_factory.cpp | 13 --- .../ryzenai_server.cpp} | 22 ++++- src/cpp/server/backends/ryzenai_factory.cpp | 20 ---- .../{sd_server.cpp => sdcpp/sdcpp_server.cpp} | 15 ++- src/cpp/server/backends/sdcpp_factory.cpp | 13 --- .../backends/{ => vllm}/vllm_server.cpp | 15 ++- src/cpp/server/backends/vllm_factory.cpp | 13 --- .../whispercpp_server.cpp} | 15 ++- .../server/backends/whispercpp_factory.cpp | 13 --- src/cpp/server/model_manager.cpp | 4 +- src/cpp/server/router.cpp | 18 ++-- src/cpp/server/server.cpp | 4 +- 60 files changed, 433 insertions(+), 529 deletions(-) rename src/cpp/{server/backends/cloud_descriptor.cpp => include/lemon/backends/cloud/cloud.h} (66%) rename src/cpp/include/lemon/backends/{ => cloud}/cloud_server.h (92%) delete mode 100644 src/cpp/include/lemon/backends/cloud_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/cloud_factory.h rename src/cpp/{server/backends/fastflowlm_descriptor.cpp => include/lemon/backends/fastflowlm/fastflowlm.h} (62%) rename src/cpp/include/lemon/backends/{ => fastflowlm}/fastflowlm_server.h (84%) delete mode 100644 src/cpp/include/lemon/backends/fastflowlm_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/fastflowlm_factory.h rename src/cpp/{server/backends/kokoro_descriptor.cpp => include/lemon/backends/kokoro/kokoro.h} (67%) rename src/cpp/include/lemon/backends/{ => kokoro}/kokoro_server.h (74%) delete mode 100644 src/cpp/include/lemon/backends/kokoro_descriptor.h delete mode 100644 
src/cpp/include/lemon/backends/kokoro_factory.h rename src/cpp/{server/backends/llamacpp_descriptor.cpp => include/lemon/backends/llamacpp/llamacpp.h} (82%) rename src/cpp/include/lemon/backends/{ => llamacpp}/llamacpp_server.h (80%) delete mode 100644 src/cpp/include/lemon/backends/llamacpp_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/llamacpp_factory.h rename src/cpp/{server/backends/moonshine_descriptor.cpp => include/lemon/backends/moonshine/moonshine.h} (70%) rename src/cpp/include/lemon/backends/{ => moonshine}/moonshine_server.h (79%) delete mode 100644 src/cpp/include/lemon/backends/moonshine_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/moonshine_factory.h rename src/cpp/{server/backends/ryzenai_descriptor.cpp => include/lemon/backends/ryzenai/ryzenai.h} (64%) rename src/cpp/include/lemon/backends/{ryzenaiserver.h => ryzenai/ryzenai_server.h} (82%) delete mode 100644 src/cpp/include/lemon/backends/ryzenai_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/ryzenai_factory.h rename src/cpp/{server/backends/sdcpp_descriptor.cpp => include/lemon/backends/sdcpp/sdcpp.h} (85%) rename src/cpp/include/lemon/backends/{sd_server.h => sdcpp/sdcpp_server.h} (86%) delete mode 100644 src/cpp/include/lemon/backends/sdcpp_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/sdcpp_factory.h rename src/cpp/{server/backends/vllm_descriptor.cpp => include/lemon/backends/vllm/vllm.h} (69%) rename src/cpp/include/lemon/backends/{ => vllm}/vllm_server.h (78%) delete mode 100644 src/cpp/include/lemon/backends/vllm_descriptor.h delete mode 100644 src/cpp/include/lemon/backends/vllm_factory.h rename src/cpp/{server/backends/whispercpp_descriptor.cpp => include/lemon/backends/whispercpp/whispercpp.h} (79%) rename src/cpp/include/lemon/backends/{whisper_server.h => whispercpp/whispercpp_server.h} (85%) delete mode 100644 src/cpp/include/lemon/backends/whispercpp_descriptor.h delete mode 100644 
src/cpp/include/lemon/backends/whispercpp_factory.h rename src/cpp/server/backends/{ => cloud}/cloud_server.cpp (98%) delete mode 100644 src/cpp/server/backends/cloud_factory.cpp rename src/cpp/server/backends/{ => fastflowlm}/fastflowlm_server.cpp (97%) delete mode 100644 src/cpp/server/backends/fastflowlm_factory.cpp rename src/cpp/server/backends/{ => kokoro}/kokoro_server.cpp (94%) delete mode 100644 src/cpp/server/backends/kokoro_factory.cpp rename src/cpp/server/backends/{ => llamacpp}/llamacpp_server.cpp (98%) delete mode 100644 src/cpp/server/backends/llamacpp_factory.cpp rename src/cpp/server/backends/{ => moonshine}/moonshine_server.cpp (96%) delete mode 100644 src/cpp/server/backends/moonshine_factory.cpp rename src/cpp/server/backends/{ryzenaiserver.cpp => ryzenai/ryzenai_server.cpp} (87%) delete mode 100644 src/cpp/server/backends/ryzenai_factory.cpp rename src/cpp/server/backends/{sd_server.cpp => sdcpp/sdcpp_server.cpp} (98%) delete mode 100644 src/cpp/server/backends/sdcpp_factory.cpp rename src/cpp/server/backends/{ => vllm}/vllm_server.cpp (97%) delete mode 100644 src/cpp/server/backends/vllm_factory.cpp rename src/cpp/server/backends/{whisper_server.cpp => whispercpp/whispercpp_server.cpp} (98%) delete mode 100644 src/cpp/server/backends/whispercpp_factory.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 3220e6c42..0b59e883f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -607,15 +607,6 @@ set(SOURCES_CORE src/cpp/server/utils/wmi_helper.cpp src/cpp/server/utils/network_beacon.cpp src/cpp/server/utils/tcp_jsonl_client.cpp - src/cpp/server/backends/cloud_server.cpp - src/cpp/server/backends/llamacpp_server.cpp - src/cpp/server/backends/fastflowlm_server.cpp - src/cpp/server/backends/ryzenaiserver.cpp - src/cpp/server/backends/whisper_server.cpp - src/cpp/server/backends/moonshine_server.cpp - src/cpp/server/backends/kokoro_server.cpp - src/cpp/server/backends/sd_server.cpp - src/cpp/server/backends/vllm_server.cpp 
src/cpp/server/backends/backend_utils.cpp src/cpp/server/backend_manager.cpp src/cpp/server/ollama_api.cpp @@ -652,17 +643,18 @@ endif() # ============================================================ # The authoritative backend list. Each entry is "|": # recipe - the recipe string used in server_models.json (may contain dashes) -# stem - identifier-safe name. The backend ships two files: -# src/cpp/server/backends/_descriptor.cpp (plain data; CLI-safe) -# src/cpp/server/backends/_factory.cpp (create(); server-only) -# declaring lemon::backends::_descriptor and _create. +# stem - identifier-safe name and folder. Each backend lives in its own +# folder, shipping (in namespace lemon::backends::): +# include/lemon/backends//.h inline const descriptor (CLI-safe data) +# include/lemon/backends//_server.h WrappedServer subclass + create() decl +# server/backends//_server.cpp implementation + create() def # -# Adding a backend is one line here plus those two files. The foreach below -# compiles the sources and regenerates the registry headers, which bind each +# Adding a backend is one line here plus that folder. The foreach below compiles +# the server source and regenerates the registry headers, which bind each # descriptor to its create(). Because this list is a tracked input, editing it # forces regeneration on the next build (a file(GLOB) would silently miss a -# newly added backend). Descriptor DATA links into both the lemonade CLI and -# lemond; only lemond links the factories (which pull in server classes). +# newly added backend). The descriptor is a header-only inline const, so it links +# into both the lemonade CLI and lemond; only lemond links the server sources. set(LEMON_BACKENDS # "|" "llamacpp|llamacpp" @@ -680,7 +672,8 @@ set(LEMON_DESCRIPTOR_INCLUDES "") set(LEMON_DESCRIPTOR_ENTRIES "") set(LEMON_FACTORY_INCLUDES "") set(LEMON_FACTORY_ENTRIES "") -# Descriptor sources are CLI-safe (data only); factory sources are server-only. 
+# The data registry (descriptors, header-only) links into both binaries; the +# factory registry + per-backend server sources are server-only. # Absolute paths so the CLI subdirectory can reuse LEMON_BACKEND_DESCRIPTOR_SOURCES. set(LEMON_BACKEND_DESCRIPTOR_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp) @@ -689,18 +682,17 @@ set(LEMON_BACKEND_FACTORY_SOURCES foreach(_backend_entry ${LEMON_BACKENDS}) string(REPLACE "|" ";" _backend_parts "${_backend_entry}") list(GET _backend_parts 1 _backend_stem) - list(APPEND LEMON_BACKEND_DESCRIPTOR_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_descriptor.cpp) + # The descriptor is header-only (no source); only the server source compiles. list(APPEND LEMON_BACKEND_FACTORY_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}_factory.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/${_backend_stem}_server.cpp) string(APPEND LEMON_DESCRIPTOR_INCLUDES - "#include \"lemon/backends/${_backend_stem}_descriptor.h\"\n") + "#include \"lemon/backends/${_backend_stem}/${_backend_stem}.h\"\n") string(APPEND LEMON_DESCRIPTOR_ENTRIES - " &lemon::backends::${_backend_stem}_descriptor,\n") + " &lemon::backends::${_backend_stem}::descriptor,\n") string(APPEND LEMON_FACTORY_INCLUDES - "#include \"lemon/backends/${_backend_stem}_factory.h\"\n") + "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n") string(APPEND LEMON_FACTORY_ENTRIES - " { &lemon::backends::${_backend_stem}_descriptor, &lemon::backends::${_backend_stem}_create },\n") + " { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create },\n") endforeach() configure_file( diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md index 512770e73..ae5006a4f 100644 --- a/docs/dev/adding-a-backend.md +++ b/docs/dev/adding-a-backend.md @@ -1,44 +1,39 @@ # Adding a backend Lemonade backends are 
**self-describing**. A backend declares *what it is* in a -plain-data **descriptor** and implements *how it runs* in a **server class**. A -registry collects every descriptor, and the router, the CLI, `/system-info`, and -the generated docs all read it — so there are no scattered `if (recipe == "...")` -sites to update. +plain-data **descriptor** and implements *how it runs* in a **server class**, and +both live together in the backend's own folder. A registry collects every +descriptor, and the router, the CLI, `/system-info`, and the generated docs all +read it — so there are no scattered `if (recipe == "...")` sites to update. -Adding a backend is **one folder's worth of files plus three small appends**: +Adding a backend is **one folder plus three small appends**: | You edit | What goes there | |----------|-----------------| | `CMakeLists.txt` → `LEMON_BACKENDS` | **one line**: `"\|"` | -| `src/cpp/server/backends/_descriptor.cpp` + `.h` | the descriptor (plain data) | -| `src/cpp/server/backends/_factory.cpp` + `.h` | `create()` + the `WrappedServer` subclass | +| `src/cpp/include/lemon/backends//.h` | the descriptor (header-only `inline const`) | +| `src/cpp/include/lemon/backends//_server.h` | the `WrappedServer` subclass + `create()` declaration | +| `src/cpp/server/backends//_server.cpp` | the implementation + `create()` definition | | `src/cpp/resources/backend_versions.json` | version pin(s) — skip if there's no downloaded binary (e.g. cloud) | | `src/cpp/resources/server_models.json` | the models | No router edits, no CLI edits, no doc edits, no support-matrix edits. -## The descriptor (plain data — CLI-safe) +Everything for one backend lives in `lemon::backends::`. The descriptor is +header-only so it links into **both** the `lemonade` CLI and `lemond`; the server +class and `create()` are server-only (compiled into `lemond`). -The descriptor is the single object every consumer reads. 
It links into **both** -the `lemonade` CLI and `lemond`, so it must not reference server classes. +## The descriptor — `/.h` -`src/cpp/include/lemon/backends/_descriptor.h`: +Plain data. The single object the registry, CLI, `/system-info`, and docs all read. ```cpp #pragma once #include "lemon/backends/backend_descriptor.h" -namespace lemon { namespace backends { -extern const BackendDescriptor _descriptor; -} } -``` -`src/cpp/server/backends/_descriptor.cpp`: +namespace lemon { namespace backends { namespace myrecipe { -```cpp -#include "lemon/backends/_descriptor.h" -namespace lemon { namespace backends { -const BackendDescriptor _descriptor = { +inline const BackendDescriptor descriptor = { /*recipe*/ "myrecipe", /*display_name*/ "My Backend", /*binary*/ "my-server", // "" = no subprocess (e.g. cloud) @@ -57,48 +52,55 @@ const BackendDescriptor _descriptor = { /*default_labels*/ {}, // labels injected when a model omits them /*required_checkpoints*/ {"main"}, // unconditional files; conditional ones checked in load() }; -} } + +}}} // namespace lemon::backends::myrecipe ``` `SlotPolicy` controls accelerator sharing: `Standard` (counts toward LRU slots), `ExclusiveNpu` (evicts all NPU servers first), `CoexistByType` (one per model type), `Unmetered` (never counted, never auto-evicted — cloud). -## The factory + server class (server-only) +## The server class + factory — `/_server.{h,cpp}` -The factory builds the `WrappedServer` subclass. It is compiled into `lemond` -only (it references server classes), which keeps the `lemonade` CLI link clean. +The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`, +and only the capability interfaces you serve (`ITranscriptionServer`, +`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default +"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend +does not stub them. Alongside it, a free `create()` builds the instance. 
-`src/cpp/include/lemon/backends/_factory.h`: +`_server.h`: ```cpp #pragma once -#include -#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_registry.h" // BackendContext +#include "lemon/wrapped_server.h" + namespace lemon { namespace backends { -std::unique_ptr _create(const BackendContext& ctx); -} } + +class MyServer : public WrappedServer, public ICompletionServer { + // load(), unload(), the capability methods you serve … +}; + +namespace myrecipe { +std::unique_ptr create(const BackendContext& ctx); // server-only +} + +}} // namespace lemon::backends ``` -`src/cpp/server/backends/_factory.cpp`: +`_server.cpp`: ```cpp -#include "lemon/backends/_factory.h" -#include "lemon/backends/_server.h" -#include "lemon/wrapped_server.h" -namespace lemon { namespace backends { -std::unique_ptr _create(const BackendContext& ctx) { +#include "lemon/backends/myrecipe/myrecipe_server.h" +// … MyServer method definitions … + +namespace lemon { namespace backends { namespace myrecipe { +std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } -} } +}}} // namespace lemon::backends::myrecipe ``` -The server class is a `WrappedServer` subclass. Implement `load()`, `unload()`, -and only the capability interfaces you actually serve (`ITranscriptionServer`, -`IImageServer`, `ITextToSpeechServer`, …). `WrappedServer` provides default -"unsupported" `chat_completion`/`completion`/`responses`, so a non-chat backend -does not stub them. - ## Register it: one line ```cmake @@ -108,8 +110,8 @@ set(LEMON_BACKENDS ) ``` -The `foreach` in `CMakeLists.txt` compiles your two sources and regenerates the -registry headers, binding the descriptor to its `create()`. +The `foreach` in `CMakeLists.txt` compiles `/_server.cpp` and +regenerates the registry headers, binding `::descriptor` to `::create`. 
## What you get for free @@ -139,7 +141,7 @@ registry headers, binding the descriptor to its `create()`. **Moonshine** is the minimal case: a single descriptor option, no backend selection, CPU-only, one capability interface. See -`src/cpp/server/backends/moonshine_descriptor.cpp` and `moonshine_factory.cpp`. +`src/cpp/server/backends/moonshine/` and `include/lemon/backends/moonshine/`. > Note: collections (`collection.omni`) are orchestrator-driven, not > `WrappedServer` subprocesses, and are the one explicit exception to this model. diff --git a/src/cpp/server/backends/cloud_descriptor.cpp b/src/cpp/include/lemon/backends/cloud/cloud.h similarity index 66% rename from src/cpp/server/backends/cloud_descriptor.cpp rename to src/cpp/include/lemon/backends/cloud/cloud.h index fe87a32a2..2ad4f3186 100644 --- a/src/cpp/server/backends/cloud_descriptor.cpp +++ b/src/cpp/include/lemon/backends/cloud/cloud.h @@ -1,9 +1,14 @@ -#include "lemon/backends/cloud_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace cloud { -const BackendDescriptor cloud_descriptor = { +// The cloud backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { /*recipe*/ "cloud", /*display_name*/ "Cloud", /*binary*/ "", // no subprocess: runs on a remote provider @@ -19,5 +24,6 @@ const BackendDescriptor cloud_descriptor = { /*required_checkpoints*/ {}, // no downloaded files }; -} // namespace backends -} // namespace lemon +} // namespace cloud +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h similarity index 92% rename from src/cpp/include/lemon/backends/cloud_server.h rename to src/cpp/include/lemon/backends/cloud/cloud_server.h index efc765728..21e28512a 100644 --- a/src/cpp/include/lemon/backends/cloud_server.h +++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../model_manager.h" -#include "../wrapped_server.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/model_manager.h" +#include "lemon/wrapped_server.h" #include #include @@ -108,3 +110,12 @@ class CloudServer : public WrappedServer { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace cloud { +// Factory for the cloud backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace cloud +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/cloud_descriptor.h b/src/cpp/include/lemon/backends/cloud_descriptor.h deleted file mode 100644 index 6e5f49bdb..000000000 --- a/src/cpp/include/lemon/backends/cloud_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The cloud backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in cloud_descriptor.cpp. 
-extern const BackendDescriptor cloud_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/cloud_factory.h b/src/cpp/include/lemon/backends/cloud_factory.h deleted file mode 100644 index 889958bd1..000000000 --- a/src/cpp/include/lemon/backends/cloud_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The cloud backend's factory (constructs the server class — lemond only). -// Defined in cloud_factory.cpp. -std::unique_ptr cloud_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm_descriptor.cpp b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h similarity index 62% rename from src/cpp/server/backends/fastflowlm_descriptor.cpp rename to src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index 7b67b8d42..648d84c57 100644 --- a/src/cpp/server/backends/fastflowlm_descriptor.cpp +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -1,9 +1,14 @@ -#include "lemon/backends/fastflowlm_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace fastflowlm { -const BackendDescriptor fastflowlm_descriptor = { +// The fastflowlm backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { /*recipe*/ "flm", /*display_name*/ "FastFlowLM NPU", #ifdef _WIN32 @@ -25,5 +30,6 @@ const BackendDescriptor fastflowlm_descriptor = { /*required_checkpoints*/ {"main"}, }; -} // namespace backends -} // namespace lemon +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h similarity index 84% rename from src/cpp/include/lemon/backends/fastflowlm_server.h rename to src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h index bd9c554ac..58b99f1ba 100644 --- a/src/cpp/include/lemon/backends/fastflowlm_server.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../wrapped_server.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -72,3 +74,12 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace fastflowlm { +// Factory for the fastflowlm backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm_descriptor.h b/src/cpp/include/lemon/backends/fastflowlm_descriptor.h deleted file mode 100644 index 5e8f71467..000000000 --- a/src/cpp/include/lemon/backends/fastflowlm_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The fastflowlm backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in fastflowlm_descriptor.cpp. 
-extern const BackendDescriptor fastflowlm_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm_factory.h b/src/cpp/include/lemon/backends/fastflowlm_factory.h deleted file mode 100644 index 8581dbdf7..000000000 --- a/src/cpp/include/lemon/backends/fastflowlm_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The fastflowlm backend's factory (constructs the server class — lemond only). -// Defined in fastflowlm_factory.cpp. -std::unique_ptr fastflowlm_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/kokoro_descriptor.cpp b/src/cpp/include/lemon/backends/kokoro/kokoro.h similarity index 67% rename from src/cpp/server/backends/kokoro_descriptor.cpp rename to src/cpp/include/lemon/backends/kokoro/kokoro.h index 281f0e0f1..f0492576f 100644 --- a/src/cpp/server/backends/kokoro_descriptor.cpp +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -1,9 +1,14 @@ -#include "lemon/backends/kokoro_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace kokoro { -const BackendDescriptor kokoro_descriptor = { +// The kokoro backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { /*recipe*/ "kokoro", /*display_name*/ "Kokoro", #ifdef _WIN32 @@ -26,5 +31,6 @@ const BackendDescriptor kokoro_descriptor = { /*required_checkpoints*/ {"main"}, }; -} // namespace backends -} // namespace lemon +} // namespace kokoro +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h similarity index 74% rename from src/cpp/include/lemon/backends/kokoro_server.h rename to src/cpp/include/lemon/backends/kokoro/kokoro_server.h index 0b99bcb96..c1f170ca7 100644 --- a/src/cpp/include/lemon/backends/kokoro_server.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h @@ -1,8 +1,10 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/server_capabilities.h" +#include "lemon/backends/backend_utils.h" #include #include @@ -47,3 +49,12 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace kokoro { +// Factory for the kokoro backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace kokoro +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro_descriptor.h b/src/cpp/include/lemon/backends/kokoro_descriptor.h deleted file mode 100644 index 1d3542f0a..000000000 --- a/src/cpp/include/lemon/backends/kokoro_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The kokoro backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in kokoro_descriptor.cpp. 
-extern const BackendDescriptor kokoro_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro_factory.h b/src/cpp/include/lemon/backends/kokoro_factory.h deleted file mode 100644 index 0df3ec37b..000000000 --- a/src/cpp/include/lemon/backends/kokoro_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The kokoro backend's factory (constructs the server class — lemond only). -// Defined in kokoro_factory.cpp. -std::unique_ptr kokoro_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp_descriptor.cpp b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h similarity index 82% rename from src/cpp/server/backends/llamacpp_descriptor.cpp rename to src/cpp/include/lemon/backends/llamacpp/llamacpp.h index f426e9f20..8348f877e 100644 --- a/src/cpp/server/backends/llamacpp_descriptor.cpp +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -1,9 +1,14 @@ -#include "lemon/backends/llamacpp_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace llamacpp { -const BackendDescriptor llamacpp_descriptor = { +// The llamacpp backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { /*recipe*/ "llamacpp", /*display_name*/ "Llama.cpp GPU", #ifdef _WIN32 @@ -39,5 +44,6 @@ const BackendDescriptor llamacpp_descriptor = { /*required_checkpoints*/ {"main"}, }; -} // namespace backends -} // namespace lemon +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h similarity index 80% rename from src/cpp/include/lemon/backends/llamacpp_server.h rename to src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h index c9356f6b8..7ef4bb44b 100644 --- a/src/cpp/include/lemon/backends/llamacpp_server.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../wrapped_server.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -58,3 +60,12 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace llamacpp { +// Factory for the llamacpp backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp_descriptor.h b/src/cpp/include/lemon/backends/llamacpp_descriptor.h deleted file mode 100644 index 501e0854c..000000000 --- a/src/cpp/include/lemon/backends/llamacpp_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The llamacpp backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in llamacpp_descriptor.cpp. 
-extern const BackendDescriptor llamacpp_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp_factory.h b/src/cpp/include/lemon/backends/llamacpp_factory.h deleted file mode 100644 index 853f5171b..000000000 --- a/src/cpp/include/lemon/backends/llamacpp_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The llamacpp backend's factory (constructs the server class — lemond only). -// Defined in llamacpp_factory.cpp. -std::unique_ptr llamacpp_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/moonshine_descriptor.cpp b/src/cpp/include/lemon/backends/moonshine/moonshine.h similarity index 70% rename from src/cpp/server/backends/moonshine_descriptor.cpp rename to src/cpp/include/lemon/backends/moonshine/moonshine.h index 63277ad3c..28b3e3e58 100644 --- a/src/cpp/server/backends/moonshine_descriptor.cpp +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -1,9 +1,14 @@ -#include "lemon/backends/moonshine_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace moonshine { -const BackendDescriptor moonshine_descriptor = { +// The moonshine backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. 
+inline const BackendDescriptor descriptor = { /*recipe*/ "moonshine", /*display_name*/ "Moonshine", /*binary*/ "moonshine-server", @@ -26,5 +31,6 @@ const BackendDescriptor moonshine_descriptor = { /*required_checkpoints*/ {"main"}, }; -} // namespace backends -} // namespace lemon +} // namespace moonshine +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h similarity index 79% rename from src/cpp/include/lemon/backends/moonshine_server.h rename to src/cpp/include/lemon/backends/moonshine/moonshine_server.h index 6f13f216b..b98e52806 100644 --- a/src/cpp/include/lemon/backends/moonshine_server.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h @@ -1,8 +1,10 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/server_capabilities.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -53,3 +55,12 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace moonshine { +// Factory for the moonshine backend (constructs the server class — lemond only). 
+std::unique_ptr create(const BackendContext& ctx); +} // namespace moonshine +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine_descriptor.h b/src/cpp/include/lemon/backends/moonshine_descriptor.h deleted file mode 100644 index d70083244..000000000 --- a/src/cpp/include/lemon/backends/moonshine_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The moonshine backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in moonshine_descriptor.cpp. -extern const BackendDescriptor moonshine_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine_factory.h b/src/cpp/include/lemon/backends/moonshine_factory.h deleted file mode 100644 index 67e6f7298..000000000 --- a/src/cpp/include/lemon/backends/moonshine_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The moonshine backend's factory (constructs the server class — lemond only). -// Defined in moonshine_factory.cpp. 
-std::unique_ptr moonshine_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/ryzenai_descriptor.cpp b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h similarity index 64% rename from src/cpp/server/backends/ryzenai_descriptor.cpp rename to src/cpp/include/lemon/backends/ryzenai/ryzenai.h index 23651ec94..c1896ee7e 100644 --- a/src/cpp/server/backends/ryzenai_descriptor.cpp +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -1,9 +1,14 @@ -#include "lemon/backends/ryzenai_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace ryzenai { -const BackendDescriptor ryzenai_descriptor = { +// The ryzenai backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { /*recipe*/ "ryzenai-llm", /*display_name*/ "Ryzen AI LLM", #ifdef _WIN32 @@ -25,5 +30,6 @@ const BackendDescriptor ryzenai_descriptor = { /*required_checkpoints*/ {"main"}, }; -} // namespace backends -} // namespace lemon +} // namespace ryzenai +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenaiserver.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h similarity index 82% rename from src/cpp/include/lemon/backends/ryzenaiserver.h rename to src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h index 36e1ba98d..1420efae5 100644 --- a/src/cpp/include/lemon/backends/ryzenaiserver.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h @@ -1,5 +1,7 @@ #pragma once +#include "lemon/backends/backend_registry.h" + #include "lemon/wrapped_server.h" #include "lemon/server_capabilities.h" #include "lemon/backends/backend_utils.h" @@ -54,3 +56,12 @@ class RyzenAIServer : public WrappedServer { }; } // namespace lemon + +namespace lemon { +namespace backends { +namespace ryzenai { 
+// Factory for the ryzenai backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace ryzenai +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai_descriptor.h b/src/cpp/include/lemon/backends/ryzenai_descriptor.h deleted file mode 100644 index 26aa0b21f..000000000 --- a/src/cpp/include/lemon/backends/ryzenai_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The ryzenai backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in ryzenai_descriptor.cpp. -extern const BackendDescriptor ryzenai_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai_factory.h b/src/cpp/include/lemon/backends/ryzenai_factory.h deleted file mode 100644 index 9483d8d55..000000000 --- a/src/cpp/include/lemon/backends/ryzenai_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The ryzenai backend's factory (constructs the server class — lemond only). -// Defined in ryzenai_factory.cpp. 
-std::unique_ptr ryzenai_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/sdcpp_descriptor.cpp b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h similarity index 85% rename from src/cpp/server/backends/sdcpp_descriptor.cpp rename to src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 10ebfdd58..323ec11bc 100644 --- a/src/cpp/server/backends/sdcpp_descriptor.cpp +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -1,9 +1,14 @@ -#include "lemon/backends/sdcpp_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace sdcpp { -const BackendDescriptor sdcpp_descriptor = { +// The sdcpp backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { /*recipe*/ "sd-cpp", /*display_name*/ "StableDiffusion.cpp", #ifdef _WIN32 @@ -43,5 +48,6 @@ const BackendDescriptor sdcpp_descriptor = { /*required_checkpoints*/ {"main"}, // flux text_encoder+vae validated together in load() }; -} // namespace backends -} // namespace lemon +} // namespace sdcpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sd_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h similarity index 86% rename from src/cpp/include/lemon/backends/sd_server.h rename to src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index 857374951..999a1de72 100644 --- a/src/cpp/include/lemon/backends/sd_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -1,11 +1,13 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "../model_manager.h" -#include "../recipe_options.h" -#include "../utils/process_manager.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include 
"lemon/server_capabilities.h" +#include "lemon/model_manager.h" +#include "lemon/recipe_options.h" +#include "lemon/utils/process_manager.h" +#include "lemon/backends/backend_utils.h" #include #include @@ -95,3 +97,12 @@ class SDServer : public WrappedServer, public IImageServer { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace sdcpp { +// Factory for the sdcpp backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace sdcpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp_descriptor.h b/src/cpp/include/lemon/backends/sdcpp_descriptor.h deleted file mode 100644 index 0bee2e552..000000000 --- a/src/cpp/include/lemon/backends/sdcpp_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The sdcpp backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in sdcpp_descriptor.cpp. -extern const BackendDescriptor sdcpp_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp_factory.h b/src/cpp/include/lemon/backends/sdcpp_factory.h deleted file mode 100644 index f7da955e2..000000000 --- a/src/cpp/include/lemon/backends/sdcpp_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The sdcpp backend's factory (constructs the server class — lemond only). -// Defined in sdcpp_factory.cpp. 
-std::unique_ptr sdcpp_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/vllm_descriptor.cpp b/src/cpp/include/lemon/backends/vllm/vllm.h similarity index 69% rename from src/cpp/server/backends/vllm_descriptor.cpp rename to src/cpp/include/lemon/backends/vllm/vllm.h index 54451f365..5d0210a37 100644 --- a/src/cpp/server/backends/vllm_descriptor.cpp +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -1,9 +1,14 @@ -#include "lemon/backends/vllm_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace vllm { -const BackendDescriptor vllm_descriptor = { +// The vllm backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { /*recipe*/ "vllm", /*display_name*/ "vLLM ROCm (experimental)", /*binary*/ "vllm-server", @@ -26,5 +31,6 @@ const BackendDescriptor vllm_descriptor = { /*required_checkpoints*/ {"main"}, }; -} // namespace backends -} // namespace lemon +} // namespace vllm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h similarity index 78% rename from src/cpp/include/lemon/backends/vllm_server.h rename to src/cpp/include/lemon/backends/vllm/vllm_server.h index 62ec94af2..0eaf4e7d8 100644 --- a/src/cpp/include/lemon/backends/vllm_server.h +++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h @@ -1,7 +1,9 @@ #pragma once -#include "../wrapped_server.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include "lemon/backends/backend_utils.h" #include namespace lemon { @@ -47,3 +49,12 @@ class VLLMServer : public WrappedServer { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { 
+namespace vllm { +// Factory for the vllm backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace vllm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm_descriptor.h b/src/cpp/include/lemon/backends/vllm_descriptor.h deleted file mode 100644 index 7119dff88..000000000 --- a/src/cpp/include/lemon/backends/vllm_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The vllm backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in vllm_descriptor.cpp. -extern const BackendDescriptor vllm_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm_factory.h b/src/cpp/include/lemon/backends/vllm_factory.h deleted file mode 100644 index 7bf398987..000000000 --- a/src/cpp/include/lemon/backends/vllm_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The vllm backend's factory (constructs the server class — lemond only). -// Defined in vllm_factory.cpp. 
-std::unique_ptr vllm_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/whispercpp_descriptor.cpp b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h similarity index 79% rename from src/cpp/server/backends/whispercpp_descriptor.cpp rename to src/cpp/include/lemon/backends/whispercpp/whispercpp.h index 6124e779e..f49fca08f 100644 --- a/src/cpp/server/backends/whispercpp_descriptor.cpp +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -1,9 +1,14 @@ -#include "lemon/backends/whispercpp_descriptor.h" +#pragma once + +#include "lemon/backends/backend_descriptor.h" namespace lemon { namespace backends { +namespace whispercpp { -const BackendDescriptor whispercpp_descriptor = { +// The whispercpp backend descriptor (plain data). Header-only `inline const` so it +// links into both the lemonade CLI and lemond without a separate source file. +inline const BackendDescriptor descriptor = { /*recipe*/ "whispercpp", /*display_name*/ "Whisper.cpp", #ifdef _WIN32 @@ -35,5 +40,6 @@ const BackendDescriptor whispercpp_descriptor = { /*required_checkpoints*/ {"main"}, // npu_cache validated in load() (npu variant only) }; -} // namespace backends -} // namespace lemon +} // namespace whispercpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whisper_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h similarity index 85% rename from src/cpp/include/lemon/backends/whisper_server.h rename to src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h index 55a1734d9..90744875f 100644 --- a/src/cpp/include/lemon/backends/whisper_server.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h @@ -1,8 +1,10 @@ #pragma once -#include "../wrapped_server.h" -#include "../server_capabilities.h" -#include "backend_utils.h" +#include "lemon/backends/backend_registry.h" + +#include "lemon/wrapped_server.h" +#include 
"lemon/server_capabilities.h" +#include "lemon/backends/backend_utils.h" #include #include @@ -76,3 +78,12 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace whispercpp { +// Factory for the whispercpp backend (constructs the server class — lemond only). +std::unique_ptr create(const BackendContext& ctx); +} // namespace whispercpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp_descriptor.h b/src/cpp/include/lemon/backends/whispercpp_descriptor.h deleted file mode 100644 index 2c3c87f19..000000000 --- a/src/cpp/include/lemon/backends/whispercpp_descriptor.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -#include "lemon/backends/backend_descriptor.h" - -namespace lemon { -namespace backends { - -// The whispercpp backend's descriptor (plain data — CLI-safe, links into both the -// lemonade CLI and lemond). Defined in whispercpp_descriptor.cpp. -extern const BackendDescriptor whispercpp_descriptor; - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp_factory.h b/src/cpp/include/lemon/backends/whispercpp_factory.h deleted file mode 100644 index d98c97b27..000000000 --- a/src/cpp/include/lemon/backends/whispercpp_factory.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include -#include "lemon/backends/backend_registry.h" - -namespace lemon { -namespace backends { - -// The whispercpp backend's factory (constructs the server class — lemond only). -// Defined in whispercpp_factory.cpp. 
-std::unique_ptr whispercpp_create(const BackendContext& ctx); - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp index 28feccaab..b5b6680fb 100644 --- a/src/cpp/server/backends/backend_utils.cpp +++ b/src/cpp/server/backends/backend_utils.cpp @@ -1,14 +1,14 @@ #include "lemon/backends/backend_utils.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" -#include "lemon/backends/llamacpp_server.h" -#include "lemon/backends/whisper_server.h" -#include "lemon/backends/sd_server.h" -#include "lemon/backends/kokoro_server.h" -#include "lemon/backends/ryzenaiserver.h" -#include "lemon/backends/vllm_server.h" -#include "lemon/backends/fastflowlm_server.h" -#include "lemon/backends/moonshine_server.h" +#include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/whispercpp/whispercpp_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" +#include "lemon/backends/kokoro/kokoro_server.h" +#include "lemon/backends/ryzenai/ryzenai_server.h" +#include "lemon/backends/vllm/vllm_server.h" +#include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/moonshine/moonshine_server.h" #include "lemon/model_manager.h" // For DownloadProgress, DownloadProgressCallback #include "lemon/utils/path_utils.h" diff --git a/src/cpp/server/backends/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp similarity index 98% rename from src/cpp/server/backends/cloud_server.cpp rename to src/cpp/server/backends/cloud/cloud_server.cpp index 96bdcf4a3..64a940e4f 100644 --- a/src/cpp/server/backends/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -1,4 +1,6 @@ -#include "lemon/backends/cloud_server.h" +#include "lemon/backends/cloud/cloud_server.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/model_manager.h" #include "lemon/cloud_provider_registry.h" #include "lemon/error_types.h" #include 
"lemon/runtime_config.h" @@ -792,3 +794,17 @@ std::vector CloudServer::discover_models(const std::string& provider, } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace cloud { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique( + ctx.model_info->cloud_provider, ctx.log_level, + ctx.model_manager, ctx.backend_manager, ctx.cloud_registry); +} + +} // namespace cloud +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/cloud_factory.cpp b/src/cpp/server/backends/cloud_factory.cpp deleted file mode 100644 index cee2c4ab5..000000000 --- a/src/cpp/server/backends/cloud_factory.cpp +++ /dev/null @@ -1,16 +0,0 @@ -#include "lemon/backends/cloud_factory.h" -#include "lemon/backends/cloud_server.h" -#include "lemon/model_manager.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr cloud_create(const BackendContext& ctx) { - return std::make_unique( - ctx.model_info->cloud_provider, ctx.log_level, - ctx.model_manager, ctx.backend_manager, ctx.cloud_registry); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp similarity index 97% rename from src/cpp/server/backends/fastflowlm_server.cpp rename to src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index dc38928e3..81e40ba60 100644 --- a/src/cpp/server/backends/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/fastflowlm_server.h" +#include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/system_info.h" #include "lemon/error_types.h" @@ -465,3 +466,15 @@ std::string FastFlowLMServer::get_flm_path() { } // namespace backends } // namespace lemon + +namespace lemon { +namespace 
backends { +namespace fastflowlm { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm_factory.cpp b/src/cpp/server/backends/fastflowlm_factory.cpp deleted file mode 100644 index 96eddd998..000000000 --- a/src/cpp/server/backends/fastflowlm_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/fastflowlm_factory.h" -#include "lemon/backends/fastflowlm_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr fastflowlm_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp similarity index 94% rename from src/cpp/server/backends/kokoro_server.cpp rename to src/cpp/server/backends/kokoro/kokoro_server.cpp index 7a707cd7e..e0a2f7ada 100644 --- a/src/cpp/server/backends/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/kokoro_server.h" +#include "lemon/backends/kokoro/kokoro_server.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/utils/process_manager.h" @@ -203,3 +204,15 @@ void KokoroServer::audio_speech(const json& request, httplib::DataSink& sink) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace kokoro { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace kokoro +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/kokoro_factory.cpp 
b/src/cpp/server/backends/kokoro_factory.cpp deleted file mode 100644 index a7d4f3be8..000000000 --- a/src/cpp/server/backends/kokoro_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/kokoro_factory.h" -#include "lemon/backends/kokoro_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr kokoro_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp similarity index 98% rename from src/cpp/server/backends/llamacpp_server.cpp rename to src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 7f50022af..2c828f1c4 100644 --- a/src/cpp/server/backends/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/llamacpp_server.h" +#include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/auto_tune.h" #include "lemon/backend_manager.h" @@ -644,3 +645,15 @@ json LlamaCppServer::responses(const json& request) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace llamacpp { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp_factory.cpp b/src/cpp/server/backends/llamacpp_factory.cpp deleted file mode 100644 index cd34fab5a..000000000 --- a/src/cpp/server/backends/llamacpp_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/llamacpp_factory.h" -#include "lemon/backends/llamacpp_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace 
backends { - -std::unique_ptr llamacpp_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp similarity index 96% rename from src/cpp/server/backends/moonshine_server.cpp rename to src/cpp/server/backends/moonshine/moonshine_server.cpp index 3257c05ba..7cb338286 100644 --- a/src/cpp/server/backends/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/moonshine_server.h" +#include "lemon/backends/moonshine/moonshine_server.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -330,3 +331,15 @@ json MoonshineServer::audio_transcriptions(const json& request) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace moonshine { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace moonshine +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/moonshine_factory.cpp b/src/cpp/server/backends/moonshine_factory.cpp deleted file mode 100644 index 859b37b30..000000000 --- a/src/cpp/server/backends/moonshine_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/moonshine_factory.h" -#include "lemon/backends/moonshine_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr moonshine_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/ryzenaiserver.cpp 
b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp similarity index 87% rename from src/cpp/server/backends/ryzenaiserver.cpp rename to src/cpp/server/backends/ryzenai/ryzenai_server.cpp index 6e250fa35..925fece3f 100644 --- a/src/cpp/server/backends/ryzenaiserver.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -1,4 +1,6 @@ -#include "lemon/backends/ryzenaiserver.h" +#include "lemon/backends/ryzenai/ryzenai_server.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/model_manager.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/utils/process_manager.h" @@ -167,3 +169,21 @@ json RyzenAIServer::responses(const json& request) { } } // namespace lemon + +namespace lemon { +namespace backends { +namespace ryzenai { + +std::unique_ptr create(const BackendContext& ctx) { + // RyzenAI resolves its model path before load (set_model_path), matching the + // original router factory's special-casing. + auto server = std::make_unique<::lemon::RyzenAIServer>( + ctx.model_info->model_name, ctx.log_level == "debug", + ctx.model_manager, ctx.backend_manager); + server->set_model_path(ctx.model_info->resolved_path()); + return server; +} + +} // namespace ryzenai +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/ryzenai_factory.cpp b/src/cpp/server/backends/ryzenai_factory.cpp deleted file mode 100644 index 4e013a30c..000000000 --- a/src/cpp/server/backends/ryzenai_factory.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include "lemon/backends/ryzenai_factory.h" -#include "lemon/backends/ryzenaiserver.h" -#include "lemon/model_manager.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr ryzenai_create(const BackendContext& ctx) { - // RyzenAI resolves its model path before load (set_model_path), matching the - // original router factory's special-casing. 
- auto server = std::make_unique<::lemon::RyzenAIServer>( - ctx.model_info->model_name, ctx.log_level == "debug", - ctx.model_manager, ctx.backend_manager); - server->set_model_path(ctx.model_info->resolved_path()); - return server; -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/sd_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp similarity index 98% rename from src/cpp/server/backends/sd_server.cpp rename to src/cpp/server/backends/sdcpp/sdcpp_server.cpp index 734454c36..b561906bb 100644 --- a/src/cpp/server/backends/sd_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/sd_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -746,3 +747,15 @@ std::string SDServer::upscale_via_cli( } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace sdcpp { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace sdcpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/sdcpp_factory.cpp b/src/cpp/server/backends/sdcpp_factory.cpp deleted file mode 100644 index 009fffd43..000000000 --- a/src/cpp/server/backends/sdcpp_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/sdcpp_factory.h" -#include "lemon/backends/sd_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr sdcpp_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp similarity index 97% rename from 
src/cpp/server/backends/vllm_server.cpp rename to src/cpp/server/backends/vllm/vllm_server.cpp index 7584d56d9..dae6fb883 100644 --- a/src/cpp/server/backends/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/vllm_server.h" +#include "lemon/backends/vllm/vllm_server.h" +#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/model_manager.h" #include "lemon/runtime_config.h" @@ -311,3 +312,15 @@ void VLLMServer::forward_streaming_request(const std::string& endpoint, } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace vllm { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace vllm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/vllm_factory.cpp b/src/cpp/server/backends/vllm_factory.cpp deleted file mode 100644 index 20fd71851..000000000 --- a/src/cpp/server/backends/vllm_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/vllm_factory.h" -#include "lemon/backends/vllm_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr vllm_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/backends/whisper_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp similarity index 98% rename from src/cpp/server/backends/whisper_server.cpp rename to src/cpp/server/backends/whispercpp/whispercpp_server.cpp index cc37be36d..3c574f27a 100644 --- a/src/cpp/server/backends/whisper_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -1,4 +1,5 @@ -#include "lemon/backends/whisper_server.h" +#include "lemon/backends/whispercpp/whispercpp_server.h" 
+#include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -688,3 +689,15 @@ json WhisperServer::audio_transcriptions(const json& request) { } // namespace backends } // namespace lemon + +namespace lemon { +namespace backends { +namespace whispercpp { + +std::unique_ptr create(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + +} // namespace whispercpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/whispercpp_factory.cpp b/src/cpp/server/backends/whispercpp_factory.cpp deleted file mode 100644 index 3223804aa..000000000 --- a/src/cpp/server/backends/whispercpp_factory.cpp +++ /dev/null @@ -1,13 +0,0 @@ -#include "lemon/backends/whispercpp_factory.h" -#include "lemon/backends/whisper_server.h" -#include "lemon/wrapped_server.h" - -namespace lemon { -namespace backends { - -std::unique_ptr whispercpp_create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); -} - -} // namespace backends -} // namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index febdc4ec8..6695fbfc7 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -9,9 +9,9 @@ #include #include #include -#include +#include #include -#include +#include #include #include #include diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp index a3c4bec74..307c51294 100644 --- a/src/cpp/server/router.cpp +++ b/src/cpp/server/router.cpp @@ -1,15 +1,15 @@ #include "lemon/router.h" #include "lemon/cloud_provider_registry.h" #include "lemon/backends/backend_registry.h" -#include "lemon/backends/cloud_server.h" -#include "lemon/backends/llamacpp_server.h" -#include "lemon/backends/fastflowlm_server.h" -#include "lemon/backends/ryzenaiserver.h" -#include 
"lemon/backends/whisper_server.h" -#include "lemon/backends/moonshine_server.h" -#include "lemon/backends/kokoro_server.h" -#include "lemon/backends/sd_server.h" -#include "lemon/backends/vllm_server.h" +#include "lemon/backends/cloud/cloud_server.h" +#include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/ryzenai/ryzenai_server.h" +#include "lemon/backends/whispercpp/whispercpp_server.h" +#include "lemon/backends/moonshine/moonshine_server.h" +#include "lemon/backends/kokoro/kokoro_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" +#include "lemon/backends/vllm/vllm_server.h" #include "lemon/server_capabilities.h" #include "lemon/error_types.h" #include "lemon/recipe_options.h" diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index d0fea0504..384412753 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -5,8 +5,8 @@ #include "lemon/config_file.h" #include "lemon/mcp_server.h" #include "lemon/ollama_api.h" -#include "lemon/backends/cloud_server.h" -#include "lemon/backends/sd_server.h" +#include "lemon/backends/cloud/cloud_server.h" +#include "lemon/backends/sdcpp/sdcpp_server.h" #include "lemon/backends/backend_utils.h" #include #include "lemon/utils/json_utils.h" From 33b437b634b8982943dc2ae9cd1b27763117554b Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 19 Jun 2026 20:51:16 -0400 Subject: [PATCH 03/39] docs(backends): mechanize the README support matrix from descriptors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the existing curated docs generate from the backend descriptors instead of just shipping a separate reference file — closing appendix rows 14 and 22. - Expand the descriptor with the editorial fields the curated docs need: `modality`, `experimental`, `web_display_name`, and a per-support-row `device_summary` (RecipeBackendDef). 
These keep the descriptor the single source of truth. - /system-info exposes them plus a registry `order` index and `slot_policy`. - gen_backend_docs.py now targets multiple docs and renders: * README.md "Supported Configurations" HTML matrix (grouped by modality, merged rows, rowspans, experimental tag) — wrapped in GENERATED markers; * docs/guide/configuration/multi-model.md NPU-exclusivity list. The backend-docs-drift CI job's --check now covers all three docs. The generated README matrix is also more complete than the hand-written one (it now includes whispercpp rocm/metal, kokoro metal, sd-cpp metal). Footnotes and prose outside the markers are preserved. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 82 +++++--- docs/dev/adding-a-backend.md | 8 +- docs/guide/configuration/multi-model.md | 4 +- docs/tools/gen_backend_docs.py | 177 ++++++++++++++++-- .../lemon/backends/backend_descriptor.h | 15 ++ src/cpp/include/lemon/backends/cloud/cloud.h | 3 + .../lemon/backends/fastflowlm/fastflowlm.h | 5 +- .../include/lemon/backends/kokoro/kokoro.h | 7 +- .../lemon/backends/llamacpp/llamacpp.h | 15 +- .../lemon/backends/moonshine/moonshine.h | 9 +- .../include/lemon/backends/ryzenai/ryzenai.h | 5 +- src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 13 +- src/cpp/include/lemon/backends/vllm/vllm.h | 5 +- .../lemon/backends/whispercpp/whispercpp.h | 13 +- src/cpp/include/lemon/recipe_backend_def.h | 3 + src/cpp/server/system_info.cpp | 13 +- 16 files changed, 301 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 38d9db6fe..2175b846e 100644 --- a/README.md +++ b/README.md @@ -123,6 +123,7 @@ Use `lemonade pull` or the built-in **Model Manager** to download models. You ca Lemonade supports multiple inference engines for LLM, speech, TTS, and image generation, and each has its own backend and hardware requirements. 
+ @@ -137,14 +138,14 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen - - - + + + - - - + + + @@ -152,49 +153,54 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen - - + + - - - + + + - - - + + + - + - + - + - - + + + + + + + - + @@ -202,28 +208,33 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen - + + + + + + - - + + - - - - - + + + - - + + + + @@ -231,13 +242,24 @@ Lemonade supports multiple inference engines for LLM, speech, TTS, and image gen + + + + + + + + + +
Text generation llamacppvulkanx86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)Windows, Linuxsystemx86_64/ARM64 CPU, GPULinux
rocmSupported AMD ROCm iGPU/dGPU families*Windows, LinuxmetalApple Silicon GPUmacOS
cudaWindows, Linux
cpux86_64 CPU; ARM64 CPU (Linux)vulkanx86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux) Windows, Linux
metalApple Silicon GPUmacOSrocmSupported AMD ROCm iGPU/dGPU families*Windows, Linux
systemx86_64/ARM64 CPU, GPULinuxcpux86_64 CPU; ARM64 CPU (Linux)Windows, Linux
flmflm npu XDNA2 NPU Windows, Linux
ryzenai-llmryzenai-llm npu XDNA2 NPU Windows
vllm (experimental)vllm (experimental) rocm Strix Halo iGPU (gfx1151) Linux
Speech-to-textwhispercppSpeech-to-textwhispercpp npu XDNA2 NPU Windows
rocmSupported AMD ROCm iGPU/dGPU families*Windows, Linux
vulkan x86_64 CPULinuxWindows, Linux
cpuWindows, Linux
moonshinemetalApple Silicon GPUmacOS
moonshine cpu x86_64/arm64 CPU Windows, Linux, macOS
Text-to-speechkokoroText-to-speechkokoro cpu x86_64 CPU Windows, Linux
Image generationsd-cpprocmSupported AMD ROCm iGPU/dGPU families*Windows, LinuxmetalApple Silicon GPUmacOS
vulkanVulkan-capable GPUsImage generationsd-cpprocmSupported AMD ROCm iGPU/dGPU families* Windows, Linux
NVIDIA GPUs (Turing or newer)** Linux
vulkanVulkan-capable GPUsWindows, Linux
cpu x86_64 CPU Windows, Linux
metalApple Silicon GPUmacOS
+ To check exactly which recipes/backends are supported on your own machine, run: diff --git a/docs/dev/adding-a-backend.md b/docs/dev/adding-a-backend.md index ae5006a4f..7699f97e6 100644 --- a/docs/dev/adding-a-backend.md +++ b/docs/dev/adding-a-backend.md @@ -122,8 +122,12 @@ regenerates the registry headers, binding `::descriptor` to `::creat `--` when `selectable_backend = true`. - **Install/download** via the backend's `BackendSpec` (binary + install params). - **`/system-info`** `recipes` entry (display name, options schema, support matrix). -- **Generated docs** — your backend appears in - [`backends-reference.md`](backends-reference.md) automatically. +- **Generated docs** — your backend appears automatically in + [`backends-reference.md`](backends-reference.md), the README "Supported + Configurations" matrix, and the multi-model NPU-exclusivity list. A CI job + (`backend-docs-drift`) fails if the committed docs are stale. The descriptor's + `modality`, `experimental`, `web_display_name`, and each support row's + `device_summary` supply the editorial bits the matrix needs. ## Escape hatches diff --git a/docs/guide/configuration/multi-model.md b/docs/guide/configuration/multi-model.md index 30ed840d5..db9944ff9 100644 --- a/docs/guide/configuration/multi-model.md +++ b/docs/guide/configuration/multi-model.md @@ -22,7 +22,9 @@ Each type has its own independent LRU cache, all sharing the same slot limit set ## Device Constraints -- **NPU Exclusivity:** `flm`, `ryzenai-llm`, and `whispercpp` are mutually exclusive on the NPU. + +- **NPU Exclusivity:** `whispercpp`, `flm`, and `ryzenai-llm` are mutually exclusive on the NPU. + - Loading a model from one of these backends will automatically evict all NPU models from the other backends. - `flm` supports loading 1 ASR model, 1 LLM, and 1 embedding model on the NPU at the same time. - `ryzenai-llm` supports loading exactly 1 LLM, which uses the entire NPU. 
diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py index 737715605..895c0a318 100644 --- a/docs/tools/gen_backend_docs.py +++ b/docs/tools/gen_backend_docs.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 """Generate backend reference docs from the self-describing backend descriptors. -The C++ backend descriptors (src/cpp/server/backends/*_descriptor.cpp) are the -single source of truth for what each backend is. This script boots a `lemond` +The C++ backend descriptors (src/cpp/include/lemon/backends//.h) are +the single source of truth for what each backend is. This script boots a `lemond` server, reads the descriptor-generated ``/system-info`` ``recipes`` object and ``server_models.json``, and rewrites the marker-delimited regions of the target doc(s). A CI step runs it with ``--check`` and fails if the committed docs drift. @@ -110,6 +110,121 @@ def md_escape(text: str) -> str: return str(text).replace("|", "\\|") +MODALITY_ORDER = [ + "Text generation", + "Speech-to-text", + "Text-to-speech", + "Image generation", +] +OS_LABEL = {"windows": "Windows", "linux": "Linux", "macos": "macOS"} +OS_ORDER = ["windows", "linux", "macos"] + + +def _fmt_os(os_set) -> str: + return ", ".join(OS_LABEL.get(o, o) for o in OS_ORDER if o in os_set) + + +def _code_devices(summary: str) -> str: + # Light formatting: render bare arch tokens as , matching the README style. + summary = re.sub(r"\bx86_64\b", "x86_64", summary) + summary = re.sub(r"\barm64\b", "arm64", summary) + return summary + + +def _ordered(recipes: dict) -> list: + # Recipes in descriptor registry order (stable, deterministic doc rendering). + return sorted(recipes.items(), key=lambda kv: kv[1].get("order", 999)) + + +def render_readme_matrix(recipes: dict) -> str: + # Group descriptor-backed recipes by modality, in descriptor registry order. 
+ by_mod: dict[str, list] = {m: [] for m in MODALITY_ORDER} + for recipe, info in _ordered(recipes): + mod = info.get("modality") + if not mod or mod not in by_mod: + continue + # Merge support rows sharing a (backend, device summary); union their OS. + merged: list[dict] = [] + seen: dict[tuple, dict] = {} + for row in info.get("support", []): + key = (row["backend"], row.get("device_summary", "")) + if key in seen: + seen[key]["os"] |= set(row.get("os", [])) + else: + d = { + "backend": row["backend"], + "summary": row.get("device_summary", ""), + "os": set(row.get("os", [])), + } + seen[key] = d + merged.append(d) + if merged: + by_mod[mod].append((recipe, info, merged)) + + out = [ + "", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + ] + for mod in MODALITY_ORDER: + recipes_in = by_mod[mod] + if not recipes_in: + continue + mod_span = sum(len(m) for _, _, m in recipes_in) + first_mod = True + for recipe, info, merged in recipes_in: + engine = f"{recipe}" + ( + " (experimental)" if info.get("experimental") else "" + ) + first_recipe = True + for d in merged: + out.append(" ") + if first_mod: + out.append( + f' ' + ) + first_mod = False + if first_recipe: + out.append(f' ') + first_recipe = False + out.append(f' ') + out.append(f" ") + out.append(f" ") + out.append(" ") + out += [" ", "
ModalityEngineBackendDeviceOS
{mod}{engine}{d["backend"]}{_code_devices(d['summary'])}{_fmt_os(d['os'])}
"] + return "\n".join(out) + + +def _oxford(items: list) -> str: + items = [f"`{i}`" for i in items] + if len(items) <= 1: + return "".join(items) + if len(items) == 2: + return f"{items[0]} and {items[1]}" + return ", ".join(items[:-1]) + f", and {items[-1]}" + + +def render_npu_exclusivity(recipes: dict) -> str: + npu = [ + r + for r, info in _ordered(recipes) + if any( + row.get("backend") == "npu" + or any(d.get("device") == "amd_npu" for d in row.get("devices", [])) + for row in info.get("support", []) + ) + ] + return f"- **NPU Exclusivity:** {_oxford(npu)} are mutually exclusive on the NPU." + + def render_overview(recipes: dict) -> str: rows = [ "| Recipe | Name | Selectable backend | Uses ctx_size | Backends |", @@ -281,27 +396,55 @@ def main() -> int: if not recipes: sys.exit("/system-info returned no recipes") - sections = { - "backends-overview": render_overview(recipes), - "backends-matrix": render_support_matrix(recipes), - "backend-options": render_options(recipes), - "backend-models": render_models(recipes), + # Each target doc maps marker IDs -> generated content. backends-reference.md + # is created from a template if missing; the others must already contain their + # markers (the regions were added to the curated docs by hand once). 
+ targets: dict = { + TARGET_DOC: { + "sections": { + "backends-overview": render_overview(recipes), + "backends-matrix": render_support_matrix(recipes), + "backend-options": render_options(recipes), + "backend-models": render_models(recipes), + }, + "template": DEFAULT_TEMPLATE, + }, + REPO_ROOT + / "README.md": { + "sections": {"backends-matrix": render_readme_matrix(recipes)}, + }, + REPO_ROOT + / "docs" + / "guide" + / "configuration" + / "multi-model.md": { + "sections": {"npu-exclusivity": render_npu_exclusivity(recipes)}, + }, } - current = TARGET_DOC.read_text() if TARGET_DOC.exists() else DEFAULT_TEMPLATE - updated = apply_sections(current, sections) + stale = [] + for path, spec in targets.items(): + rel = path.relative_to(REPO_ROOT) + current = path.read_text() if path.exists() else spec.get("template", "") + if not current: + sys.exit(f"{rel} is missing and has no template") + updated = apply_sections(current, spec["sections"]) + if args.check: + if not path.exists() or path.read_text() != updated: + stale.append(str(rel)) + else: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(updated) + print(f"Wrote {rel}") if args.check: - if not TARGET_DOC.exists() or TARGET_DOC.read_text() != updated: + if stale: sys.exit( - f"{TARGET_DOC.relative_to(REPO_ROOT)} is stale. 
Run: python docs/tools/gen_backend_docs.py" + "Stale generated docs: " + + ", ".join(stale) + + "\nRun: python docs/tools/gen_backend_docs.py" ) - print(f"{TARGET_DOC.relative_to(REPO_ROOT)} is up to date.") - return 0 - - TARGET_DOC.parent.mkdir(parents=True, exist_ok=True) - TARGET_DOC.write_text(updated) - print(f"Wrote {TARGET_DOC.relative_to(REPO_ROOT)}") + print("All generated docs are up to date.") return 0 diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index fc6c50bc2..4b26246b6 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -29,6 +29,16 @@ enum class SlotPolicy { Unmetered // never counts toward slots, never auto-evicted (cloud) }; +inline const char* slot_policy_to_string(SlotPolicy p) { + switch (p) { + case SlotPolicy::Standard: return "standard"; + case SlotPolicy::ExclusiveNpu: return "exclusive_npu"; + case SlotPolicy::CoexistByType: return "coexist_by_type"; + case SlotPolicy::Unmetered: return "unmetered"; + } + return "standard"; +} + // Plain data declaring *what a backend is*. This is the single object the // registry, the CLI, /system-info, and the docs all read. Behavior lives in the // paired WrappedServer subclass (see backend_registry.h for how they bind). @@ -49,6 +59,11 @@ struct BackendDescriptor { std::vector default_labels; // labels injected when a model omits them std::vector required_checkpoints{"main"}; // unconditional files; conditional ones checked in load() + // Editorial metadata for the generated docs (README support matrix, website). 
+ std::string modality; // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation" + bool experimental = false; // true renders "(experimental)" next to the recipe in generated docs + std::string web_display_name; // name used on the docs website ("" = fall back to display_name) + // The config.json section name for this backend, falling back to the recipe. std::string effective_config_section() const { return config_section.empty() ? recipe : config_section; diff --git a/src/cpp/include/lemon/backends/cloud/cloud.h b/src/cpp/include/lemon/backends/cloud/cloud.h index 2ad4f3186..9d4f5559b 100644 --- a/src/cpp/include/lemon/backends/cloud/cloud.h +++ b/src/cpp/include/lemon/backends/cloud/cloud.h @@ -22,6 +22,9 @@ inline const BackendDescriptor descriptor = { /*support*/ {}, // no local gating: install/support machinery skips cloud /*default_labels*/ {}, /*required_checkpoints*/ {}, // no downloaded files + /*modality*/ "", + /*experimental*/ false, + /*web_display_name*/ "", }; } // namespace cloud diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index 648d84c57..b5b04b853 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -24,10 +24,13 @@ inline const BackendDescriptor descriptor = { /*dynamic_models*/ false, /*options*/ {}, /*support*/ { - {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}}, + {"flm", "npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ false, + /*web_display_name*/ "FastFlowLM NPU", }; } // namespace fastflowlm diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h index f0492576f..69cb17dc2 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro.h +++ 
b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -24,11 +24,14 @@ inline const BackendDescriptor descriptor = { /*dynamic_models*/ false, /*options*/ {}, /*support*/ { - {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}}, - {"kokoro", "metal", {"macos"}, {{"metal", {}}}}, + {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"kokoro", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, }, /*default_labels*/ {}, // kokoro models carry "tts" explicitly in server_models.json /*required_checkpoints*/ {"main"}, + /*modality*/ "Text-to-speech", + /*experimental*/ false, + /*web_display_name*/ "", }; } // namespace kokoro diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index 8348f877e..ec101dd3c 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -31,17 +31,20 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"}, }, /*support*/ { - {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}}, - {"llamacpp", "metal", {"macos"}, {{"metal", {}}}}, + {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"}, + {"llamacpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, {"llamacpp", "cuda", {"windows", "linux"}, - {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}}, - {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}}, + {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"}, + {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"}, {"llamacpp", "rocm", {"windows", "linux"}, - {{"amd_gpu", {"gfx1150", "gfx1151", 
"gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}}, - {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, + {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ false, + /*web_display_name*/ "llama.cpp GPU", }; } // namespace llamacpp diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h index 28b3e3e58..81f45dc25 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -23,12 +23,15 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to moonshine-server", ""}, }, /*support*/ { - {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}}, - {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}}, - {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}}, + {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"}, + {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"}, + {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"}, }, /*default_labels*/ {"transcription", "realtime-transcription"}, /*required_checkpoints*/ {"main"}, + /*modality*/ "Speech-to-text", + /*experimental*/ false, + /*web_display_name*/ "", }; } // namespace moonshine diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h index c1896ee7e..2df87cc2e 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -24,10 +24,13 @@ inline const BackendDescriptor descriptor = { /*dynamic_models*/ false, /*options*/ {}, /*support*/ { - 
{"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}}, + {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ false, + /*web_display_name*/ "Ryzen AI SW NPU", }; } // namespace ryzenai diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 323ec11bc..3b8f78e85 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -37,15 +37,18 @@ inline const BackendDescriptor descriptor = { }, /*support*/ { {"sd-cpp", "rocm", {"windows", "linux"}, - {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, {"sd-cpp", "cuda", {"linux"}, - {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}}, - {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}}, - {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}}, - {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}}, + {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"}, + {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"}, + {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, }, /*default_labels*/ {"image"}, /*required_checkpoints*/ {"main"}, // flux text_encoder+vae validated together in load() + /*modality*/ "Image generation", + /*experimental*/ false, + /*web_display_name*/ "stable-diffusion.cpp", }; } // namespace sdcpp diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h 
b/src/cpp/include/lemon/backends/vllm/vllm.h index 5d0210a37..6f468a1ed 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm.h +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -25,10 +25,13 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to vllm-server", "vLLM Options"}, }, /*support*/ { - {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}}, + {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, + /*modality*/ "Text generation", + /*experimental*/ true, + /*web_display_name*/ "", }; } // namespace vllm diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h index f49fca08f..1a031b6e3 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -29,15 +29,18 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to whisper-server", "Whisper.cpp Options"}, }, /*support*/ { - {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}}, + {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, {"whispercpp", "rocm", {"windows", "linux"}, - {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}}, - {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}}, - {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}}, - {"whispercpp", "metal", {"macos"}, {{"metal", {}}}}, + {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, + {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"}, + {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"whispercpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, }, /*default_labels*/ 
{"transcription", "realtime-transcription"}, /*required_checkpoints*/ {"main"}, // npu_cache validated in load() (npu variant only) + /*modality*/ "Speech-to-text", + /*experimental*/ false, + /*web_display_name*/ "whisper.cpp", }; } // namespace whispercpp diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h index 1557db077..829ff0f78 100644 --- a/src/cpp/include/lemon/recipe_backend_def.h +++ b/src/cpp/include/lemon/recipe_backend_def.h @@ -21,6 +21,9 @@ struct RecipeBackendDef { std::string backend; std::set supported_os; DeviceConstraints devices; + // Human-friendly device description for the generated support matrix (README). + // May contain footnote markers (e.g. "*") whose text lives as prose in the doc. + std::string device_summary = ""; }; } // namespace lemon diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index 6a27a4fb2..730d2c985 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -1505,17 +1505,25 @@ json SystemInfo::build_recipes_info(const json& devices) { // app, the docs generator) can render display names and per-recipe option // schemas without hardcoding them. This is the single source the frontend // reads instead of its own per-recipe TypeScript tables. + int recipe_order = 0; for (const auto* desc : lemon::backends::all_descriptors()) { auto it = recipes.find(desc->recipe); if (it == recipes.end()) { + ++recipe_order; continue; // recipe not surfaced on this system (e.g. cloud has no support rows) } json& entry = it.value(); + entry["order"] = recipe_order++; // descriptor registry order, for deterministic doc rendering entry["display_name"] = desc->display_name; entry["selectable_backend"] = desc->selectable_backend; entry["uses_ctx_size"] = desc->uses_ctx_size; - // Machine-independent support matrix (OS + device families per backend), - // straight from the descriptor — used by the docs generator. 
+ entry["modality"] = desc->modality; + entry["experimental"] = desc->experimental; + entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name; + entry["slot_policy"] = slot_policy_to_string(desc->slot_policy); + // Machine-independent support matrix (OS + device families + friendly + // device summary per backend), straight from the descriptor — used by the + // docs generator to render the README support matrix etc. json support = json::array(); for (const auto& row : desc->support) { json devices = json::array(); @@ -1527,6 +1535,7 @@ json SystemInfo::build_recipes_info(const json& devices) { {"backend", row.backend}, {"os", std::vector(row.supported_os.begin(), row.supported_os.end())}, {"devices", devices}, + {"device_summary", row.device_summary}, }); } entry["support"] = support; From 84616c42f73092be4db66fd677218bff4404b12f Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 19 Jun 2026 20:54:04 -0400 Subject: [PATCH 04/39] docs(cli): mechanize the per-recipe load-options tables from descriptors Wrap cli.md's "Recipe-Specific Options" tables in GENERATED markers and render them from the descriptor options. This also fixes pre-existing drift: the section documented `--steps`/`--cfg-scale`/`--width`/`--height` flags that the CLI no longer registers, and omitted the moonshine and vllm recipes. Now covered by the backend-docs-drift CI check. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/guide/cli.md | 44 +++++++++++++++++++++------------ docs/tools/gen_backend_docs.py | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/docs/guide/cli.md b/docs/guide/cli.md index 8749b1661..cad252dcb 100644 --- a/docs/guide/cli.md +++ b/docs/guide/cli.md @@ -325,44 +325,56 @@ The following options apply to all model loads: The following options are available depending on the recipe being used: -#### Llama.cpp (`llamacpp` recipe) +<!-- BEGIN GENERATED: cli-recipe-options --> +#### Llama.cpp GPU (`llamacpp` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--ctx-size SIZE` | Context size for the model | `4096` | +| `--ctx-size SIZE` | Context size for the model | auto | | `--llamacpp BACKEND` | LlamaCpp backend to use | Auto-detected | -| `--llamacpp-device DEVICE` | Comma-separated list of accelerator devices to use (e.g. Vulkan0) | (empty) | -| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server (must not conflict with managed args) | `""` | +| `--llamacpp-device DEVICES` | Comma-separated list of accelerator devices to use (e.g.
Vulkan0) | `""` | +| `--llamacpp-args ARGS` | Custom arguments to pass to llama-server | `""` | -#### FLM (`flm` recipe) +#### Whisper.cpp (`whispercpp` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--ctx-size SIZE` | Context size for the model | `4096` | +| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected | +| `--whispercpp-args ARGS` | Custom arguments to pass to whisper-server | `""` | -#### RyzenAI LLM (`ryzenai-llm` recipe) +#### Moonshine (`moonshine` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--ctx-size SIZE` | Context size for the model | `4096` | +| `--moonshine-args ARGS` | Custom arguments to pass to moonshine-server | `""` | -#### SD.cpp (`sd-cpp` recipe) +#### StableDiffusion.cpp (`sd-cpp` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--sdcpp BACKEND` | SD.cpp backend to use (`cpu` for CPU, `rocm` for AMD GPU) | Auto-detected | +| `--sdcpp BACKEND` | SD.cpp backend to use | Auto-detected | | `--sdcpp-args ARGS` | Custom arguments to pass to sd-server (must not conflict with managed args) | `""` | -| `--steps N` | Number of inference steps for image generation | `20` | -| `--cfg-scale SCALE` | Classifier-free guidance scale for image generation | `7.0` | -| `--width PX` | Image width in pixels | `512` | -| `--height PX` | Image height in pixels | `512` | -#### Whisper.cpp (`whispercpp` recipe) +#### FastFlowLM NPU (`flm` recipe) | Option | Description | Default | |--------|-------------|---------| -| `--whispercpp BACKEND` | WhisperCpp backend to use | Auto-detected | +| `--ctx-size SIZE` | Context size for the model | auto | + +#### Ryzen AI LLM (`ryzenai-llm` recipe) +| Option | Description | Default | +|--------|-------------|---------| +| `--ctx-size SIZE` | Context size for the model | auto | + +#### vLLM ROCm (experimental) (`vllm` recipe) + +| Option | Description | Default | +|--------|-------------|---------| +| 
`--ctx-size SIZE` | Context size for the model | auto | +| `--vllm BACKEND` | vLLM backend to use | Auto-detected | +| `--vllm-args ARGS` | Custom arguments to pass to vllm-server | `""` | +<!-- END GENERATED: cli-recipe-options --> **Notes:** - Unspecified options will use the backend's default values - Backend options (`--llamacpp`, `--sdcpp`, `--whispercpp`) are auto-detected based on system capabilities diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py index 895c0a318..49f9db2ba 100644 --- a/docs/tools/gen_backend_docs.py +++ b/docs/tools/gen_backend_docs.py @@ -203,6 +203,45 @@ def render_readme_matrix(recipes: dict) -> str: return "\n".join(out) +def _cli_default(opt: dict) -> str: + d = opt.get("default") + if opt.get("type_name") == "BACKEND" and d == "": + return "Auto-detected" + if isinstance(d, str): + return '`""`' if d == "" else f"`{d}`" + if isinstance(d, bool): + return f"`{str(d).lower()}`" + if d == -1: + return "auto" + return f"`{d}`" + + +def render_cli_recipe_options(recipes: dict) -> str: + # Per-recipe load options, exactly as the CLI registers them from descriptors. + # Recipes with no CLI options (kokoro, cloud) are omitted.
+ blocks: list[str] = [] + for recipe, info in _ordered(recipes): + cli_opts = [o for o in info.get("options", []) if o.get("cli_flag")] + if not info.get("uses_ctx_size") and not cli_opts: + continue + blocks.append(f"#### {info.get('display_name', recipe)} (`{recipe}` recipe)\n") + blocks.append("| Option | Description | Default |") + blocks.append("|--------|-------------|---------|") + if info.get("uses_ctx_size"): + blocks.append("| `--ctx-size SIZE` | Context size for the model | auto |") + for o in cli_opts: + blocks.append( + "| `{flag} {t}` | {h} | {d} |".format( + flag=o["cli_flag"], + t=o.get("type_name", ""), + h=md_escape(o.get("help", "")), + d=_cli_default(o), + ) + ) + blocks.append("") + return "\n".join(blocks).rstrip() + + def _oxford(items: list) -> str: items = [f"`{i}`" for i in items] if len(items) <= 1: @@ -420,6 +459,12 @@ def main() -> int: / "multi-model.md": { "sections": {"npu-exclusivity": render_npu_exclusivity(recipes)}, }, + REPO_ROOT + / "docs" + / "guide" + / "cli.md": { + "sections": {"cli-recipe-options": render_cli_recipe_options(recipes)}, + }, } stale = [] From 2d1fd36b14265ec8fc1b1e090b03300eb65f1a04 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 19 Jun 2026 20:56:08 -0400 Subject: [PATCH 05/39] docs(custom-models): mechanize the --recipe value list from descriptors Add inline-marker support to the generator and wrap the `--recipe` "Common values" list in custom-models.md so it renders from the descriptor recipe set (plus collection.omni). Now covered by the backend-docs-drift CI check. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/guide/configuration/custom-models.md | 2 +- docs/tools/gen_backend_docs.py | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/guide/configuration/custom-models.md b/docs/guide/configuration/custom-models.md index c3e770442..5a7dbd878 100644 --- a/docs/guide/configuration/custom-models.md +++ b/docs/guide/configuration/custom-models.md @@ -71,7 +71,7 @@ Supported registration flags: | Flag | Description | |------|-------------| | `--checkpoint TYPE CHECKPOINT` | Add a checkpoint entry. Repeat for multi-file models such as `main` + `mmproj` or `main` + `vae`. | -| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: `llamacpp`, `flm`, `ryzenai-llm`, `vllm`, `whispercpp`, `moonshine`, `sd-cpp`, `kokoro`, `collection.omni`. | +| `--recipe RECIPE` | Recipe to associate with the new `user.*` model. Common values: `llamacpp`, `whispercpp`, `moonshine`, `kokoro`, `sd-cpp`, `flm`, `ryzenai-llm`, `vllm`, `collection.omni`. | | `--label LABEL` | Add a label to the new model. Repeatable. Valid labels include `coding`, `embeddings`, `hot`, `mtp`, `reasoning`, `reranking`, `tool-calling`, `vision`. | | `--components MODEL [MODEL ...]` | Components for an omni collection (see below). Use with `--recipe collection.omni`. | diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py index 49f9db2ba..357d58aa6 100644 --- a/docs/tools/gen_backend_docs.py +++ b/docs/tools/gen_backend_docs.py @@ -251,6 +251,12 @@ def _oxford(items: list) -> str: return ", ".join(items[:-1]) + f", and {items[-1]}" +def render_recipe_values(recipes: dict) -> str: + # Inline list of recipe values for `--recipe`, plus the collection orchestrator. 
+ rs = [r for r, _ in _ordered(recipes)] + ["collection.omni"] + return ", ".join(f"`{r}`" for r in rs) + + def render_npu_exclusivity(recipes: dict) -> str: npu = [ r @@ -409,11 +415,16 @@ def apply_sections(text: str, sections: dict[str, str]) -> str: + r" -->)", re.DOTALL, ) - if not pattern.search(text): + m = pattern.search(text) + if not m: sys.exit(f"Marker region '{marker_id}' not found in target doc") + # Inline regions (markers mid-line, e.g. inside a table cell) get no + # surrounding newlines; block regions are wrapped on their own lines. + inline = m.start() > 0 and text[m.start() - 1] != "\n" # Escape backslashes and group-ref markers in the body for re.sub. safe_body = body.replace("\\", "\\\\") - replacement = r"\1" + "\n" + safe_body + "\n" + r"\2" + sep = "" if inline else "\n" + replacement = r"\1" + sep + safe_body + sep + r"\2" text = pattern.sub(replacement, text) return text @@ -465,6 +476,13 @@ def main() -> int: / "cli.md": { "sections": {"cli-recipe-options": render_cli_recipe_options(recipes)}, }, + REPO_ROOT + / "docs" + / "guide" + / "configuration" + / "custom-models.md": { + "sections": {"recipe-values": render_recipe_values(recipes)}, + }, } stale = [] From 9b8383cbc87aaa5db768684e120cd4c020686f74 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 13:44:42 -0400 Subject: [PATCH 06/39] docs: mechanize config.json example and models.js recipe metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Close the last two cleanly-derivable doc touchpoints (appendix rows 16 and 21). - configuration/README.md "Example config.json": generated from a fresh lemond's GET /internal/config (the real canonical config). This also fixes pre-existing drift — the hand-written block had `config_version: 1` (now 2), `prefer_system: false` (now true), a stray `device` key, and an invalid trailing comma. `port` is normalized to the documented default 13305. 
- docs/assets/models.js RECIPE_PRIORITY + RECIPE_DISPLAY_NAMES: generated from descriptors. A new `web_priority` editorial field preserves the curated website ordering (so the order is descriptor-sourced, not a silent reorder); legacy `oga-*` recipes are dropped as agreed. Adds the correct `vllm` display name. The generator now drives 7 docs and supports both `<!-- -->` (Markdown) and `/* */` (JS) GENERATED markers. backend-docs-drift --check covers all of them. Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/assets/models.js | 12 +-- docs/guide/configuration/README.md | 89 +++++++++++-------- docs/tools/gen_backend_docs.py | 89 +++++++++++++++++-- .../lemon/backends/backend_descriptor.h | 1 + .../lemon/backends/fastflowlm/fastflowlm.h | 1 + .../include/lemon/backends/kokoro/kokoro.h | 1 + .../lemon/backends/llamacpp/llamacpp.h | 1 + .../include/lemon/backends/ryzenai/ryzenai.h | 1 + src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 1 + .../lemon/backends/whispercpp/whispercpp.h | 1 + src/cpp/server/system_info.cpp | 1 + 11 files changed, 148 insertions(+), 50 deletions(-) diff --git a/docs/assets/models.js b/docs/assets/models.js index 5bb604006..d9814cccb 100644 --- a/docs/assets/models.js +++ b/docs/assets/models.js @@ -2,25 +2,25 @@ const GITHUB_REPO = 'lemonade-sdk/lemonade'; const TAGS_URL = `https://api.github.com/repos/${GITHUB_REPO}/tags?per_page=100`; const RAW_BASE = 'https://raw.githubusercontent.com/lemonade-sdk/lemonade'; +/* BEGIN GENERATED: models-js-recipes */ const RECIPE_PRIORITY = [ 'llamacpp', 'ryzenai-llm', 'flm', 'whispercpp', 'sd-cpp', - 'oga-hybrid', - 'oga-npu', - 'oga-cpu', 'kokoro' ]; const RECIPE_DISPLAY_NAMES = { llamacpp: 'llama.cpp GPU', - 'ryzenai-llm': 'Ryzen AI SW NPU', - flm: 'FastFlowLM NPU', whispercpp: 'whisper.cpp', - 'sd-cpp': 'stable-diffusion.cpp' + 'sd-cpp': 'stable-diffusion.cpp', + flm: 'FastFlowLM NPU', + 'ryzenai-llm': 'Ryzen AI SW NPU', + vllm: 'vLLM ROCm (experimental)' }; +/* END GENERATED: models-js-recipes */ const
state = { tag: null, diff --git a/docs/guide/configuration/README.md b/docs/guide/configuration/README.md index 93977148c..2a388dc8f 100644 --- a/docs/guide/configuration/README.md +++ b/docs/guide/configuration/README.md @@ -31,68 +31,81 @@ Values set in the user's `config.json` always take precedence over these seeded ### Example config.json +<!-- BEGIN GENERATED: config-example --> ```json { - "config_version": 1, - "port": 13305, - "host": "localhost", - "log_level": "info", - "global_timeout": 600, - "max_loaded_models": 1, - "no_broadcast": false, - "extra_models_dir": "", - "models_dir": "auto", + "cloud_providers": [], + "config_version": 2, "ctx_size": -1, - "offline": false, - "no_fetch_executables": false, "disable_model_filtering": false, "enable_dgpu_gtt": false, - "rocm_channel": "stable", + "extra_models_dir": "", + "flm": { + "args": "" + }, + "global_timeout": 600, + "host": "localhost", + "kokoro": { + "cpu_bin": "builtin" + }, "llamacpp": { - "backend": "auto", "args": "", - "vulkan_args": "", - "rocm_args": "", + "backend": "auto", "cpu_args": "", - "device": "", - "prefer_system": false, + "cpu_bin": "builtin", + "cuda_bin": "builtin", + "prefer_system": true, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin", - "cpu_bin": "builtin" + "vulkan_args": "", + "vulkan_bin": "builtin" }, - "whispercpp": { - "backend": "auto", + "log_level": "info", + "max_loaded_models": 1, + "models_dir": "auto", + "moonshine": { "args": "", "cpu_args": "", - "npu_args": "", - "cpu_bin": "builtin", - "npu_bin": "builtin" + "cpu_bin": "builtin" + }, + "no_broadcast": false, + "no_fetch_executables": false, + "offline": false, + "port": 13305, + "rocm_channel": "stable", + "ryzenai": { + "server_bin": "builtin" }, "sdcpp": { - "backend": "auto", "args": "", - "cpu_args": "", - "rocm_args": "", - "vulkan_args": "", - "steps": 20, + "backend": "auto", "cfg_scale": 7.0, - "width": 512, - "height": 512, + "cpu_args": "", "cpu_bin": "builtin", + "height": 512, + "rocm_args": "", "rocm_bin":
"builtin", - "vulkan_bin": "builtin" + "steps": 20, + "vulkan_args": "", + "vulkan_bin": "builtin", + "width": 512 }, - "flm": { + "vllm": { "args": "", + "backend": "auto" }, - "ryzenai": { - "server_bin": "builtin" - }, - "kokoro": { - "cpu_bin": "builtin" + "websocket_port": "auto", + "whispercpp": { + "args": "", + "backend": "auto", + "cpu_args": "", + "cpu_bin": "builtin", + "npu_args": "", + "npu_bin": "builtin" } } ``` +<!-- END GENERATED: config-example --> ### Settings Reference diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_docs.py index 357d58aa6..8e5bf3133 100644 --- a/docs/tools/gen_backend_docs.py +++ b/docs/tools/gen_backend_docs.py @@ -105,6 +105,9 @@ def _get(self, path: str, timeout: float = 5): def system_info(self) -> dict: return json.loads(self._get("/api/v1/system-info", timeout=30)) + def config(self) -> dict: + return json.loads(self._get("/internal/config", timeout=10)) + def md_escape(text: str) -> str: return str(text).replace("|", "\\|") @@ -251,6 +254,53 @@ def _oxford(items: list) -> str: return ", ".join(items[:-1]) + f", and {items[-1]}" +def _js_to_title(recipe: str) -> str: + # Mirror models.js toTitle(): the website's fallback for unlisted display names. + return re.sub( + r"\b\w", + lambda m: m.group(0).upper(), + recipe.replace("_", " ").replace("-", " "), + ) + + +def _js_key(recipe: str) -> str: + # Bare identifier if it's a valid JS key, else quoted (matches models.js style). + return recipe if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", recipe) else f"'{recipe}'" + + +def render_models_js(recipes: dict) -> str: + # RECIPE_PRIORITY: recipes with web_priority > 0, in that order (legacy oga-* + # recipes have no descriptor and are intentionally dropped).
+ prioritized = sorted( + (r for r, i in recipes.items() if i.get("web_priority", 0) > 0), + key=lambda r: recipes[r]["web_priority"], + ) + pri_lines = ",\n".join(f" '{r}'" for r in prioritized) + + # RECIPE_DISPLAY_NAMES: only recipes whose name differs from the JS toTitle() + # fallback (matching the curated map, which omits redundant entries). + name_lines = [] + for r, info in _ordered(recipes): + name = info.get("web_display_name") or info.get("display_name", r) + if name and name != _js_to_title(r): + name_lines.append(f" {_js_key(r)}: '{name}'") + names = ",\n".join(name_lines) + + return ( + f"const RECIPE_PRIORITY = [\n{pri_lines}\n];\n\n" + f"const RECIPE_DISPLAY_NAMES = {{\n{names}\n}};" + ) + + +def render_config_example(config: dict) -> str: + # The canonical config.json, straight from a fresh lemond's /internal/config. + # `port` is the only environment-dependent field (it reflects the launch port); + # normalize it to the documented default. + cfg = dict(config) + cfg["port"] = 13305 + return "```json\n" + json.dumps(cfg, indent=2) + "\n```" + + def render_recipe_values(recipes: dict) -> str: # Inline list of recipe values for `--recipe`, plus the collection orchestrator. rs = [r for r, _ in _ordered(recipes)] + ["collection.omni"] @@ -407,17 +457,28 @@ def render_models(recipes: dict) -> str: def apply_sections(text: str, sections: dict[str, str]) -> str: for marker_id, body in sections.items(): - pattern = re.compile( + # Accept HTML (``) markers for Markdown and block (`/* ... */`) + # markers for code files like .js, so the same generator drives both. + mid = re.escape(marker_id) + begin = ( r"().*?()", - re.DOTALL, + + mid + + r" -->|/\* BEGIN GENERATED: " + + mid + + r" \*/)" ) + end = ( + r"(|/\* END GENERATED: " + + mid + + r" \*/)" + ) + pattern = re.compile(begin + r".*?" + end, re.DOTALL) m = pattern.search(text) if not m: sys.exit(f"Marker region '{marker_id}' not found in target doc") + # Inline regions (markers mid-line, e.g. 
inside a table cell) get no # surrounding newlines; block regions are wrapped on their own lines. inline = m.start() > 0 and text[m.start() - 1] != "\n" @@ -442,9 +503,12 @@ def main() -> int: binary = find_lemond(args.lemond) with Lemond(binary) as server: info = server.system_info() + config = server.config() recipes = info.get("recipes", {}) if not recipes: sys.exit("/system-info returned no recipes") + if not config: + sys.exit("/internal/config returned nothing") # Each target doc maps marker IDs -> generated content. backends-reference.md # is created from a template if missing; the others must already contain their @@ -483,6 +547,19 @@ def main() -> int: / "custom-models.md": { "sections": {"recipe-values": render_recipe_values(recipes)}, }, + REPO_ROOT + / "docs" + / "guide" + / "configuration" + / "README.md": { + "sections": {"config-example": render_config_example(config)}, + }, + REPO_ROOT + / "docs" + / "assets" + / "models.js": { + "sections": {"models-js-recipes": render_models_js(recipes)}, + }, } stale = [] diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index 4b26246b6..3b9cdb2fb 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -63,6 +63,7 @@ struct BackendDescriptor { std::string modality; // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation" bool experimental = false; // true renders "(experimental)" next to the recipe in generated docs std::string web_display_name; // name used on the docs website ("" = fall back to display_name) + int web_priority = 0; // model-grouping order on the docs website (lower = higher; 0 = unlisted) // The config.json section name for this backend, falling back to the recipe. 
std::string effective_config_section() const { diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index b5b04b853..b9efb610b 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -31,6 +31,7 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ false, /*web_display_name*/ "FastFlowLM NPU", + /*web_priority*/ 3, }; } // namespace fastflowlm diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h index 69cb17dc2..4663d3ad3 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -32,6 +32,7 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text-to-speech", /*experimental*/ false, /*web_display_name*/ "", + /*web_priority*/ 6, }; } // namespace kokoro diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index ec101dd3c..19d63c370 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -45,6 +45,7 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ false, /*web_display_name*/ "llama.cpp GPU", + /*web_priority*/ 1, }; } // namespace llamacpp diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h index 2df87cc2e..4171dbe93 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -31,6 +31,7 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ false, /*web_display_name*/ "Ryzen AI SW NPU", + /*web_priority*/ 2, }; } // namespace ryzenai diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h 
b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 3b8f78e85..2e12af119 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -49,6 +49,7 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Image generation", /*experimental*/ false, /*web_display_name*/ "stable-diffusion.cpp", + /*web_priority*/ 5, }; } // namespace sdcpp diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h index 1a031b6e3..ce2014dec 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -41,6 +41,7 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Speech-to-text", /*experimental*/ false, /*web_display_name*/ "whisper.cpp", + /*web_priority*/ 4, }; } // namespace whispercpp diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index 730d2c985..cdf089843 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -1520,6 +1520,7 @@ json SystemInfo::build_recipes_info(const json& devices) { entry["modality"] = desc->modality; entry["experimental"] = desc->experimental; entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name; + entry["web_priority"] = desc->web_priority; entry["slot_policy"] = slot_policy_to_string(desc->slot_policy); // Machine-independent support matrix (OS + device families + friendly // device summary per backend), straight from the descriptor — used by the From 566ea83875d7eab50f754524185d64674cccb19e Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 14:14:47 -0400 Subject: [PATCH 07/39] refactor(backends): finish agreed touchpoints rows 4 & 5 (registry-drive spec; drop device map) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two agreed plan touchpoints were left incomplete; this finishes them. 
Row 4 — try_get_spec_for_recipe was still a hand-written 8-branch if-ladder in backend_utils.cpp, which also forced it to #include all 8 server headers. Each backend now exposes a uniform `spec()` accessor (alongside create()); the generated factory registry binds it, and `backends::spec_for(recipe)` / try_get_spec_for_recipe iterate the registry. backend_utils.cpp now includes ZERO server headers. Also reroute the two leaking `Server::SPEC` references (model_manager find_flm_binary) through the registry. Row 5 — get_device_type_from_recipe still carried the full recipe->device map, redundant with BackendDescriptor::default_device. Reduced to a DEVICE_NONE fallback for non-descriptor recipes (collections/unknown); the descriptor is the single source via ModelManager::device_type_for_recipe. Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 2 +- .../include/lemon/backends/backend_registry.h | 8 +++++- .../lemon/backends/cloud/cloud_server.h | 1 + .../backends/fastflowlm/fastflowlm_server.h | 1 + .../lemon/backends/kokoro/kokoro_server.h | 1 + .../lemon/backends/llamacpp/llamacpp_server.h | 1 + .../backends/moonshine/moonshine_server.h | 1 + .../lemon/backends/ryzenai/ryzenai_server.h | 1 + .../lemon/backends/sdcpp/sdcpp_server.h | 1 + .../include/lemon/backends/vllm/vllm_server.h | 1 + .../backends/whispercpp/whispercpp_server.h | 1 + src/cpp/include/lemon/model_types.h | 28 +++++-------------- src/cpp/server/backends/backend_registry.cpp | 9 ++++++ src/cpp/server/backends/backend_utils.cpp | 21 +++----------- .../server/backends/cloud/cloud_server.cpp | 2 ++ .../backends/fastflowlm/fastflowlm_server.cpp | 2 ++ .../server/backends/kokoro/kokoro_server.cpp | 2 ++ .../backends/llamacpp/llamacpp_server.cpp | 2 ++ .../backends/moonshine/moonshine_server.cpp | 2 ++ .../backends/ryzenai/ryzenai_server.cpp | 2 ++ .../server/backends/sdcpp/sdcpp_server.cpp | 4 ++- src/cpp/server/backends/vllm/vllm_server.cpp | 2 ++ 
.../backends/whispercpp/whispercpp_server.cpp | 4 ++- src/cpp/server/model_manager.cpp | 8 ++++-- 24 files changed, 62 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b59e883f..ca95c586a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -692,7 +692,7 @@ foreach(_backend_entry ${LEMON_BACKENDS}) string(APPEND LEMON_FACTORY_INCLUDES "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n") string(APPEND LEMON_FACTORY_ENTRIES - " { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create },\n") + " { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec() },\n") endforeach() configure_file( diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h index 394f49145..a5a116f14 100644 --- a/src/cpp/include/lemon/backends/backend_registry.h +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -15,6 +15,8 @@ struct ModelInfo; namespace backends { +struct BackendSpec; // install/download spec, defined in backend_utils.h + // Everything a backend's create() needs to build an instance. Mirrors the // arguments the old router factory passed to each backend constructor. struct BackendContext { @@ -34,11 +36,15 @@ using BackendCreateFn = std::unique_ptr (*)(const BackendContext& struct BackendRegistration { const BackendDescriptor* descriptor; BackendCreateFn create; + const BackendSpec* spec; // install/download spec, or nullptr (e.g. cloud has none) }; -// All registered (descriptor, create) pairs, in LEMON_BACKENDS order. +// All registered (descriptor, create, spec) entries, in LEMON_BACKENDS order. const std::vector& all_registrations(); +// Install/download spec for a recipe, or nullptr if the recipe has none. 
+const BackendSpec* spec_for(const std::string& recipe); + // Construct a backend instance for a recipe and associate its descriptor, or // nullptr if the recipe has no registered backend. std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/cloud/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h index 21e28512a..afddef3a7 100644 --- a/src/cpp/include/lemon/backends/cloud/cloud_server.h +++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h @@ -116,6 +116,7 @@ namespace backends { namespace cloud { // Factory for the cloud backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace cloud } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h index 58b99f1ba..cb6b7d73a 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h @@ -80,6 +80,7 @@ namespace backends { namespace fastflowlm { // Factory for the fastflowlm backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h index c1f170ca7..f2fd1746a 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h @@ -55,6 +55,7 @@ namespace backends { namespace kokoro { // Factory for the kokoro backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace kokoro } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h index 7ef4bb44b..a4086ac83 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h @@ -66,6 +66,7 @@ namespace backends { namespace llamacpp { // Factory for the llamacpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h index b98e52806..70a71bf2a 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h @@ -61,6 +61,7 @@ namespace backends { namespace moonshine { // Factory for the moonshine backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace moonshine } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h index 1420efae5..38152e478 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h @@ -62,6 +62,7 @@ namespace backends { namespace ryzenai { // Factory for the ryzenai backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace ryzenai } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index 999a1de72..f86b322ec 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -103,6 +103,7 @@ namespace backends { namespace sdcpp { // Factory for the sdcpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace sdcpp } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h index 0eaf4e7d8..1ce866118 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm_server.h +++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h @@ -55,6 +55,7 @@ namespace backends { namespace vllm { // Factory for the vllm backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace vllm } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h index 90744875f..21d0d3ad4 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h @@ -84,6 +84,7 @@ namespace backends { namespace whispercpp { // Factory for the whispercpp backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); +const BackendSpec* spec(); } // namespace whispercpp } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h index eb5d4e0b4..c92bedb37 100644 --- a/src/cpp/include/lemon/model_types.h +++ b/src/cpp/include/lemon/model_types.h @@ -139,28 +139,14 @@ inline ModelType get_model_type_from_labels(const std::vector& labe return ModelType::LLM; } -// Determine device type from recipe -// Default device from recipe — individual backends override based on their config +// Fallback device type for recipes with no registered backend descriptor +// (collections and unknown recipes). The authoritative per-backend default lives +// in BackendDescriptor::default_device; ModelManager::device_type_for_recipe +// consults the descriptor registry first and only falls back here. Kept in this +// low-level header (which must not depend on the backend registry) for that +// fallback alone — it intentionally carries no per-backend knowledge. 
inline DeviceType get_device_type_from_recipe(const std::string& recipe) { - if (recipe == "llamacpp") { - return DEVICE_GPU; - } else if (recipe == "ryzenai-llm") { - return DEVICE_NPU; - } else if (recipe == "flm") { - return DEVICE_NPU; - } else if (recipe == "whispercpp") { - return DEVICE_CPU; - } else if (recipe == "moonshine") { - return DEVICE_CPU; - } else if (recipe == "sd-cpp") { - return DEVICE_CPU; - } else if (recipe == "kokoro") { - return DEVICE_CPU; - } else if (is_collection_recipe(recipe)) { - return DEVICE_NONE; - } else if (recipe == "cloud") { - return DEVICE_NONE; // Cloud-offloaded models execute on a remote provider - } + (void)recipe; return DEVICE_NONE; } diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp index 5e0de071f..a7db3921a 100644 --- a/src/cpp/server/backends/backend_registry.cpp +++ b/src/cpp/server/backends/backend_registry.cpp @@ -14,6 +14,15 @@ const std::vector& all_registrations() { return kRegistrations; } +const BackendSpec* spec_for(const std::string& recipe) { + for (const auto& reg : all_registrations()) { + if (reg.descriptor->recipe == recipe) { + return reg.spec; + } + } + return nullptr; +} + std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx) { for (const auto& reg : all_registrations()) { if (reg.descriptor->recipe == recipe) { diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp index b5b6680fb..9a57a28db 100644 --- a/src/cpp/server/backends/backend_utils.cpp +++ b/src/cpp/server/backends/backend_utils.cpp @@ -1,14 +1,7 @@ #include "lemon/backends/backend_utils.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" -#include "lemon/backends/llamacpp/llamacpp_server.h" -#include "lemon/backends/whispercpp/whispercpp_server.h" -#include "lemon/backends/sdcpp/sdcpp_server.h" -#include "lemon/backends/kokoro/kokoro_server.h" -#include 
"lemon/backends/ryzenai/ryzenai_server.h" -#include "lemon/backends/vllm/vllm_server.h" -#include "lemon/backends/fastflowlm/fastflowlm_server.h" -#include "lemon/backends/moonshine/moonshine_server.h" +#include "lemon/backends/backend_registry.h" // spec_for() — descriptor->install spec, no server includes #include "lemon/model_manager.h" // For DownloadProgress, DownloadProgressCallback #include "lemon/utils/path_utils.h" @@ -39,15 +32,9 @@ using json = nlohmann::json; namespace lemon::backends { const BackendSpec* try_get_spec_for_recipe(const std::string& recipe) { - if (recipe == "llamacpp") return &LlamaCppServer::SPEC; - if (recipe == "whispercpp") return &WhisperServer::SPEC; - if (recipe == "sd-cpp") return &SDServer::SPEC; - if (recipe == "kokoro") return &KokoroServer::SPEC; - if (recipe == "ryzenai-llm") return &::lemon::RyzenAIServer::SPEC; - if (recipe == "vllm") return &VLLMServer::SPEC; - if (recipe == "flm") return &FastFlowLMServer::SPEC; - if (recipe == "moonshine") return &MoonshineServer::SPEC; - return nullptr; + // Each backend exposes its install/download spec through the registry + // (see ::spec()); no per-recipe branches or server includes here. 
+ return spec_for(recipe); } static std::string hash_string_from_json(const json& node) { diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp index 64a940e4f..3d06a3f90 100644 --- a/src/cpp/server/backends/cloud/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -805,6 +805,8 @@ std::unique_ptr create(const BackendContext& ctx) { ctx.model_manager, ctx.backend_manager, ctx.cloud_registry); } + +const BackendSpec* spec() { return nullptr; } } // namespace cloud } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 81e40ba60..f2b6885e6 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -475,6 +475,8 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &FastFlowLMServer::SPEC; } } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index e0a2f7ada..534225965 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -213,6 +213,8 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &KokoroServer::SPEC; } } // namespace kokoro } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 2c828f1c4..1c7980024 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -654,6 +654,8 @@ 
std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &LlamaCppServer::SPEC; } } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index 7cb338286..e03b9ac2b 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -340,6 +340,8 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &MoonshineServer::SPEC; } } // namespace moonshine } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index 925fece3f..70bedb84b 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -184,6 +184,8 @@ std::unique_ptr create(const BackendContext& ctx) { return server; } + +const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; } } // namespace ryzenai } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp index b561906bb..4749f0f0d 100644 --- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -203,7 +203,7 @@ void SDServer::load(const std::string& model_name, RuntimeConfig::validate_backend_choice("sdcpp", backend); // Update device type based on the actual backend selected. - // get_device_type_from_recipe() defaults sd-cpp to CPU, but rocm/vulkan/metal/cuda are GPU backends. + // The descriptor defaults sd-cpp to CPU; rocm/vulkan/metal/cuda variants are GPU backends. 
if (backend == "rocm" || backend == "vulkan" || backend == "metal" || backend == "cuda") { device_type_ = DEVICE_GPU; } else { @@ -756,6 +756,8 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &SDServer::SPEC; } } // namespace sdcpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp index dae6fb883..171b4cc0f 100644 --- a/src/cpp/server/backends/vllm/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -321,6 +321,8 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &VLLMServer::SPEC; } } // namespace vllm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index 3c574f27a..8fb454f09 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -230,7 +230,7 @@ void WhisperServer::load(const std::string& model_name, RuntimeConfig::validate_backend_choice("whispercpp", whispercpp_backend); // Update device type based on the actual backend selected. - // get_device_type_from_recipe() defaults whispercpp to CPU, but npu/vulkan use different devices. + // The descriptor defaults whispercpp to CPU; npu/vulkan variants use different devices. 
if (whispercpp_backend == "npu") { device_type_ = DEVICE_NPU; } else if (whispercpp_backend == "vulkan" || whispercpp_backend == "metal") { @@ -698,6 +698,8 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } + +const BackendSpec* spec() { return &WhisperServer::SPEC; } } // namespace whispercpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 6695fbfc7..5253a16ad 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -2964,8 +2963,11 @@ void ModelManager::unregister_user_model(const std::string& model_name) { // Returns empty string if not found. static std::string find_flm_binary() { try { - return backends::BackendUtils::get_backend_binary_path( - backends::FastFlowLMServer::SPEC, "npu"); + const backends::BackendSpec* spec = backends::try_get_spec_for_recipe("flm"); + if (!spec) { + return ""; + } + return backends::BackendUtils::get_backend_binary_path(*spec, "npu"); } catch (...) { #ifndef _WIN32 return utils::find_flm_executable(); From cfb6e3d47b48f03c63442c1fedf6953d502311e5 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 14:37:18 -0400 Subject: [PATCH 08/39] refactor(backends): add BackendOps infrastructure (Tier-2 foundation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a stateless per-backend behavior interface for model management that happens WITHOUT a running subprocess (checkpoint-path resolution, download, dynamic discovery, per-model metadata, version detection, availability) — the home for the recipe switchboards currently scattered through model_manager and system_info. 
- BackendOps base class (lemon/backends/backend_ops.h): shared default behavior; backends override only the policy points they need (inherit shared logic, don't copy it). Methods are added incrementally as switchboards migrate; each has a default so adding one never forces edits to backends that don't override it. - Each backend folder exposes a uniform ops() singleton (alongside create()/spec()), bound into BackendRegistration; backends::ops_for(recipe) returns it. - Purely additive: every backend uses the default base ops for now, so there is no behavior change yet. Migrations follow in subsequent commits. Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 5 +- src/cpp/include/lemon/backends/backend_ops.h | 46 +++++++++++++++++++ .../include/lemon/backends/backend_registry.h | 8 +++- .../lemon/backends/cloud/cloud_server.h | 1 + .../backends/fastflowlm/fastflowlm_server.h | 1 + .../lemon/backends/kokoro/kokoro_server.h | 1 + .../lemon/backends/llamacpp/llamacpp_server.h | 1 + .../backends/moonshine/moonshine_server.h | 1 + .../lemon/backends/ryzenai/ryzenai_server.h | 1 + .../lemon/backends/sdcpp/sdcpp_server.h | 1 + .../include/lemon/backends/vllm/vllm_server.h | 1 + .../backends/whispercpp/whispercpp_server.h | 1 + src/cpp/server/backends/backend_ops.cpp | 12 +++++ src/cpp/server/backends/backend_registry.cpp | 9 ++++ .../server/backends/cloud/cloud_server.cpp | 1 + .../backends/fastflowlm/fastflowlm_server.cpp | 1 + .../server/backends/kokoro/kokoro_server.cpp | 1 + .../backends/llamacpp/llamacpp_server.cpp | 1 + .../backends/moonshine/moonshine_server.cpp | 1 + .../backends/ryzenai/ryzenai_server.cpp | 1 + .../server/backends/sdcpp/sdcpp_server.cpp | 1 + src/cpp/server/backends/vllm/vllm_server.cpp | 1 + .../backends/whispercpp/whispercpp_server.cpp | 1 + 23 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 src/cpp/include/lemon/backends/backend_ops.h create mode 100644 src/cpp/server/backends/backend_ops.cpp diff --git 
a/CMakeLists.txt b/CMakeLists.txt index ca95c586a..6d6d980e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -678,7 +678,8 @@ set(LEMON_FACTORY_ENTRIES "") set(LEMON_BACKEND_DESCRIPTOR_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp) set(LEMON_BACKEND_FACTORY_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp) foreach(_backend_entry ${LEMON_BACKENDS}) string(REPLACE "|" ";" _backend_parts "${_backend_entry}") list(GET _backend_parts 1 _backend_stem) @@ -692,7 +693,7 @@ foreach(_backend_entry ${LEMON_BACKENDS}) string(APPEND LEMON_FACTORY_INCLUDES "#include \"lemon/backends/${_backend_stem}/${_backend_stem}_server.h\"\n") string(APPEND LEMON_FACTORY_ENTRIES - " { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec() },\n") + " { &lemon::backends::${_backend_stem}::descriptor, &lemon::backends::${_backend_stem}::create, lemon::backends::${_backend_stem}::spec(), lemon::backends::${_backend_stem}::ops() },\n") endforeach() configure_file( diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h new file mode 100644 index 000000000..53b046e84 --- /dev/null +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -0,0 +1,46 @@ +#pragma once + +#include +#include + +namespace lemon { + +struct ModelInfo; +class ModelManager; + +namespace backends { + +// Context handed to BackendOps methods — the bits of server state model +// management needs without a running subprocess. Grows as migrations require. 
+struct BackendOpsContext { + ModelManager* model_manager = nullptr; +}; + +// Stateless per-backend behavior for model management that happens WITHOUT a +// running subprocess: checkpoint-path resolution, download, dynamic discovery, +// per-model metadata, version detection, availability. One singleton per +// backend, exposed via lemon::backends::<backend>::ops() and bound in the registry +// (see BackendRegistration::ops). +// +// The base class is the shared default behavior (the common HF-backed case); +// each backend folder overrides ONLY the policy points it needs, so shared +// logic is inherited rather than copied. Methods are added here incrementally as +// switchboards in model_manager / system_info are migrated; every method has a +// default so adding one never forces edits to backends that don't override it. +class BackendOps { +public: + virtual ~BackendOps() = default; + + // Populate model-specific metadata (context window, capability labels, …) + // for a downloaded model. Default: nothing. + virtual void populate_metadata(ModelInfo& info, const BackendOpsContext& ctx) const { + (void)info; + (void)ctx; + } +}; + +// Shared default ops instance for backends that override nothing. +const BackendOps* default_backend_ops(); + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h index a5a116f14..75709781d 100644 --- a/src/cpp/include/lemon/backends/backend_registry.h +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -4,6 +4,7 @@ #include #include "lemon/backends/backend_descriptor.h" #include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/backends/backend_ops.h" namespace lemon { @@ -37,14 +38,19 @@ struct BackendRegistration { const BackendDescriptor* descriptor; BackendCreateFn create; const BackendSpec* spec; // install/download spec, or nullptr (e.g. 
cloud has none) + const BackendOps* ops; // stateless model-management behavior (never null) }; -// All registered (descriptor, create, spec) entries, in LEMON_BACKENDS order. +// All registered (descriptor, create, spec, ops) entries, in LEMON_BACKENDS order. const std::vector& all_registrations(); // Install/download spec for a recipe, or nullptr if the recipe has none. const BackendSpec* spec_for(const std::string& recipe); +// Stateless model-management ops for a recipe. Falls back to the shared default +// ops (base behavior) for recipes with no registered backend. +const BackendOps* ops_for(const std::string& recipe); + // Construct a backend instance for a recipe and associate its descriptor, or // nullptr if the recipe has no registered backend. std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/cloud/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h index afddef3a7..774c44300 100644 --- a/src/cpp/include/lemon/backends/cloud/cloud_server.h +++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h @@ -117,6 +117,7 @@ namespace cloud { // Factory for the cloud backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace cloud } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h index cb6b7d73a..c422f4a4d 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h @@ -81,6 +81,7 @@ namespace fastflowlm { // Factory for the fastflowlm backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h index f2fd1746a..9c628c076 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h @@ -56,6 +56,7 @@ namespace kokoro { // Factory for the kokoro backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace kokoro } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h index a4086ac83..8b28296c4 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h @@ -67,6 +67,7 @@ namespace llamacpp { // Factory for the llamacpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h index 70a71bf2a..611bfe51c 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h @@ -62,6 +62,7 @@ namespace moonshine { // Factory for the moonshine backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace moonshine } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h index 38152e478..f824cfde3 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h @@ -63,6 +63,7 @@ namespace ryzenai { // Factory for the ryzenai backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace ryzenai } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index f86b322ec..99be9e62c 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -104,6 +104,7 @@ namespace sdcpp { // Factory for the sdcpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace sdcpp } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h index 1ce866118..700296b97 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm_server.h +++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h @@ -56,6 +56,7 @@ namespace vllm { // Factory for the vllm backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace vllm } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h index 21d0d3ad4..8dc88bbb4 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h @@ -85,6 +85,7 @@ namespace whispercpp { // Factory for the whispercpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); const BackendSpec* spec(); +const BackendOps* ops(); } // namespace whispercpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp new file mode 100644 index 000000000..773e39494 --- /dev/null +++ b/src/cpp/server/backends/backend_ops.cpp @@ -0,0 +1,12 @@ +#include "lemon/backends/backend_ops.h" + +namespace lemon { +namespace backends { + +const BackendOps* default_backend_ops() { + static const BackendOps kDefault; + return &kDefault; +} + +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/backend_registry.cpp b/src/cpp/server/backends/backend_registry.cpp index a7db3921a..abbeaf998 100644 --- a/src/cpp/server/backends/backend_registry.cpp +++ b/src/cpp/server/backends/backend_registry.cpp @@ -23,6 +23,15 @@ const BackendSpec* spec_for(const std::string& recipe) { return nullptr; } +const BackendOps* ops_for(const std::string& recipe) { + for (const auto& reg : all_registrations()) { + if (reg.descriptor->recipe == recipe) { + return reg.ops; + } + } + return default_backend_ops(); +} + std::unique_ptr create_server(const std::string& recipe, const BackendContext& ctx) { for (const auto& reg : all_registrations()) { if (reg.descriptor->recipe == recipe) { diff --git 
a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp index 3d06a3f90..29dede2b0 100644 --- a/src/cpp/server/backends/cloud/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -807,6 +807,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return nullptr; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace cloud } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index f2b6885e6..424ea2e2c 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -477,6 +477,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &FastFlowLMServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index 534225965..13f1a3ffe 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -215,6 +215,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &KokoroServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace kokoro } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 1c7980024..05cd10a4d 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -656,6 +656,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &LlamaCppServer::SPEC; } +const BackendOps* ops() { return 
default_backend_ops(); } } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index e03b9ac2b..ed709990b 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -342,6 +342,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &MoonshineServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace moonshine } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index 70bedb84b..e965ea3b9 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -186,6 +186,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace ryzenai } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp index 4749f0f0d..718855d8f 100644 --- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -758,6 +758,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &SDServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace sdcpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp index 171b4cc0f..1ab4e22fc 100644 --- a/src/cpp/server/backends/vllm/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -323,6 +323,7 @@ std::unique_ptr create(const BackendContext& ctx) { const 
BackendSpec* spec() { return &VLLMServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace vllm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index 8fb454f09..c77d10669 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -700,6 +700,7 @@ std::unique_ptr create(const BackendContext& ctx) { const BackendSpec* spec() { return &WhisperServer::SPEC; } +const BackendOps* ops() { return default_backend_ops(); } } // namespace whispercpp } // namespace backends } // namespace lemon From 5a1d5349e5e3acc53d02f234c71ea657221e0076 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 14:47:19 -0400 Subject: [PATCH 09/39] refactor(backends): migrate per-model metadata to ops; move GGUF/FLM readers into folders Replace the populate_model_metadata recipe switchboard with ops_for(recipe)->populate_metadata(). The backend-specific readers move into their folders: - GGUF metadata reader (read_gguf_metadata + byte parsers) -> backends/llamacpp/ llamacpp_gguf.{h,cpp}; LlamaCppOps::populate_metadata reads arch + capability labels there. - FLM model-file helpers (config.json ctx window, model-dir discovery) -> backends/fastflowlm/fastflowlm_models.{h,cpp}; FlmOps::populate_metadata uses it. model_manager no longer knows how either backend stores or introspects models. CMake now globs each backend folder's *.cpp (CONFIGURE_DEPENDS) so backend-private helper files need no CMake edit; the backend LIST stays explicit. Verified: GGUF context windows still populate (131072/128000/32768 for sample models) and test_gguf_capabilities passes. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 11 +- .../backends/fastflowlm/fastflowlm_models.h | 29 ++ .../lemon/backends/llamacpp/llamacpp_gguf.h | 30 ++ .../backends/fastflowlm/fastflowlm_models.cpp | 119 ++++++ .../backends/fastflowlm/fastflowlm_server.cpp | 17 +- .../backends/llamacpp/llamacpp_gguf.cpp | 253 +++++++++++++ .../backends/llamacpp/llamacpp_server.cpp | 48 ++- src/cpp/server/model_manager.cpp | 347 +----------------- 8 files changed, 510 insertions(+), 344 deletions(-) create mode 100644 src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h create mode 100644 src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h create mode 100644 src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp create mode 100644 src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d6d980e3..758108b69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -683,9 +683,14 @@ set(LEMON_BACKEND_FACTORY_SOURCES foreach(_backend_entry ${LEMON_BACKENDS}) string(REPLACE "|" ";" _backend_parts "${_backend_entry}") list(GET _backend_parts 1 _backend_stem) - # The descriptor is header-only (no source); only the server source compiles. - list(APPEND LEMON_BACKEND_FACTORY_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/${_backend_stem}_server.cpp) + # The descriptor is header-only (no source). Compile every .cpp in the + # backend's folder (server class + any backend-private helpers like GGUF + # parsing) — CONFIGURE_DEPENDS re-globs when a file is added/removed so a new + # helper in a folder needs no CMake edit. (The backend LIST is still explicit + # above so a whole new backend is never silently missed.) 
+ file(GLOB _backend_srcs CONFIGURE_DEPENDS + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/${_backend_stem}/*.cpp) + list(APPEND LEMON_BACKEND_FACTORY_SOURCES ${_backend_srcs}) string(APPEND LEMON_DESCRIPTOR_INCLUDES "#include \"lemon/backends/${_backend_stem}/${_backend_stem}.h\"\n") string(APPEND LEMON_DESCRIPTOR_ENTRIES diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h new file mode 100644 index 000000000..3e99e3003 --- /dev/null +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include +#include + +namespace lemon { + +struct ModelInfo; + +namespace backends { +namespace fastflowlm { + +// FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH / +// platform-default roots and describes them with a config.json; this knowledge +// lives in the fastflowlm backend folder rather than in the shared model manager. + +// Derive the on-disk repo directory name from an FLM model URL. +std::string repo_dir_from_url(const std::string& url); + +// Locate config.json for an FLM repo dir across the candidate model roots. +std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo_dir); + +// Read the model's max context window from its FLM config.json (0 if unknown). +int64_t read_flm_max_context_window(const ModelInfo& info); + +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h new file mode 100644 index 000000000..2e431478b --- /dev/null +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include "lemon/gguf_capabilities.h" + +namespace lemon { +namespace backends { +namespace llamacpp { + +// GGUF metadata extracted in a single pass over the KV header. 
This is +// llama.cpp-specific model introspection; it lives in the llamacpp backend +// folder rather than in the shared model manager. +struct GgufMetadata { + std::string architecture; + int64_t context_length = 0; + int64_t block_count = 0; + int64_t embedding_length = 0; + int64_t head_count_kv = 0; + int64_t key_length = 0; + GgufCapabilities caps; +}; + +// Read GGUF metadata from a .gguf file. Returns false if the file is missing or +// not a valid GGUF container. +bool read_gguf_metadata(GgufMetadata& out, const std::string& path); + +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp new file mode 100644 index 000000000..0ac7f8caf --- /dev/null +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -0,0 +1,119 @@ +#include "lemon/backends/fastflowlm/fastflowlm_models.h" + +#include +#include +#include +#include "lemon/model_manager.h" +#include "lemon/utils/aixlog.hpp" +#include "lemon/utils/json_utils.h" +#include "lemon/utils/path_utils.h" + +namespace fs = std::filesystem; +using json = nlohmann::json; + +namespace lemon { +namespace backends { +namespace fastflowlm { +namespace { + +using lemon::utils::path_from_utf8; + +bool safe_exists(const fs::path& p) { + std::error_code ec; + return fs::exists(p, ec); +} + +// Candidate roots that FLM may use to store models. FLM resolves its model +// directory from the FLM_MODEL_PATH env var (set by the installer) and falls +// back to platform-default locations. 
+std::vector get_flm_models_dir_candidates() { + std::vector roots; + + const char* flm_model_path = std::getenv("FLM_MODEL_PATH"); + if (flm_model_path && *flm_model_path) { + roots.push_back(path_from_utf8(flm_model_path) / "models"); + } + +#ifdef _WIN32 + const char* userprofile = std::getenv("USERPROFILE"); + if (userprofile && *userprofile) { + fs::path home = path_from_utf8(userprofile); + roots.push_back(home / ".flm" / "models"); // current installer default + roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default + roots.push_back(home / "flm" / "models"); + } +#else + const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME"); + if (xdg_config_home && *xdg_config_home) { + roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models"); + } + const char* home = std::getenv("HOME"); + if (home && *home) { + fs::path home_path = path_from_utf8(home); + roots.push_back(home_path / ".flm" / "models"); + roots.push_back(home_path / ".config" / "flm" / "models"); + } +#endif + + return roots; +} + +} // namespace + +fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) { + if (repo_dir.empty()) return fs::path(); + + for (const auto& root : get_flm_models_dir_candidates()) { + fs::path candidate = root / repo_dir / "config.json"; + if (safe_exists(candidate)) return candidate; + } + return fs::path(); +} + +std::string repo_dir_from_url(const std::string& url) { + std::string clean = url; + while (!clean.empty() && clean.back() == '/') clean.pop_back(); + size_t query_pos = clean.find_first_of("?#"); + if (query_pos != std::string::npos) clean = clean.substr(0, query_pos); + + for (const std::string marker : {"/tree/", "/resolve/"}) { + size_t marker_pos = clean.find(marker); + if (marker_pos != std::string::npos) { + clean = clean.substr(0, marker_pos); + break; + } + } + + size_t slash = clean.find_last_of('/'); + return slash == std::string::npos ? 
clean : clean.substr(slash + 1); +} + +int64_t read_flm_max_context_window(const ModelInfo& info) { + if (info.type != ModelType::LLM) return 0; + + std::string config_path = info.resolved_path("config"); + if (config_path.empty()) return 0; + + try { + json config = lemon::utils::JsonUtils::load_from_file(config_path); + if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) { + int64_t value = config["max_position_embeddings"].get(); + return value > 0 ? value : 0; + } + if (config.contains("text_config") && config["text_config"].is_object()) { + const auto& text_config = config["text_config"]; + if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) { + int64_t value = text_config["max_position_embeddings"].get(); + return value > 0 ? value : 0; + } + } + } catch (const std::exception& e) { + LOG(DEBUG, "FastFlowLM") << "Could not read FLM config metadata for " + << info.model_name << ": " << e.what() << std::endl; + } + return 0; +} + +} // namespace fastflowlm +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 424ea2e2c..0b5e15934 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -1,6 +1,9 @@ #include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/fastflowlm/fastflowlm_models.h" #include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/model_manager.h" #include "lemon/system_info.h" #include "lemon/error_types.h" #include "lemon/utils/process_manager.h" @@ -475,9 +478,21 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } +namespace { +// FLM 
model-management behavior: max context window from the model's config.json. +class FlmOps : public BackendOps { +public: + void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override { + info.max_context_window = read_flm_max_context_window(info); + } +}; +} // namespace const BackendSpec* spec() { return &FastFlowLMServer::SPEC; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const FlmOps kOps; + return &kOps; +} } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp new file mode 100644 index 000000000..1e099d064 --- /dev/null +++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp @@ -0,0 +1,253 @@ +#include "lemon/backends/llamacpp/llamacpp_gguf.h" + +#include +#include +#include +#include +#include +#include +#include "lemon/utils/path_utils.h" + +namespace lemon { +namespace backends { +namespace llamacpp { +namespace { + +using lemon::utils::path_from_utf8; + +// Local copies of the tiny case-insensitive string helpers (kept out of a shared +// util to keep this GGUF reader self-contained). 
+bool ends_with_ignore_case(const std::string& str, const std::string& suffix) { + if (suffix.size() > str.size()) return false; + return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin(), + [](char a, char b) { return std::tolower(a) == std::tolower(b); }); +} + +bool contains_ignore_case(const std::string& str, const std::string& substr) { + auto it = std::search(str.begin(), str.end(), substr.begin(), substr.end(), + [](char a, char b) { return std::tolower(a) == std::tolower(b); }); + return it != str.end(); +} + +template +static bool read_le(std::istream& in, T& value) { + in.read(reinterpret_cast(&value), sizeof(T)); + return static_cast(in); +} + +static bool read_gguf_string(std::istream& in, std::string& value) { + uint64_t len = 0; + if (!read_le(in, len)) return false; + if (len > 1024 * 1024) return false; + value.assign(static_cast(len), '\0'); + if (len == 0) return true; + in.read(&value[0], static_cast(len)); + return static_cast(in); +} + +static bool skip_bytes(std::istream& in, uint64_t bytes) { + if (bytes > static_cast(std::numeric_limits::max())) return false; + in.seekg(static_cast(bytes), std::ios::cur); + return static_cast(in); +} + +static uint64_t gguf_scalar_size(uint32_t type) { + switch (type) { + case 0: // UINT8 + case 1: // INT8 + case 7: // BOOL + return 1; + case 2: // UINT16 + case 3: // INT16 + return 2; + case 4: // UINT32 + case 5: // INT32 + case 6: // FLOAT32 + return 4; + case 10: // UINT64 + case 11: // INT64 + case 12: // FLOAT64 + return 8; + default: + return 0; + } +} + +static bool skip_gguf_value(std::istream& in, uint32_t type); + +static bool read_gguf_integer_value(std::istream& in, uint32_t type, int64_t& value) { + switch (type) { + case 0: { uint8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + case 1: { int8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + case 2: { uint16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + case 3: { 
int16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + case 4: { uint32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + case 5: { int32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + case 10: { + uint64_t v = 0; + if (!read_le(in, v)) return false; + if (v > static_cast(std::numeric_limits::max())) return false; + value = static_cast(v); + return true; + } + case 11: { int64_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } + default: + return skip_gguf_value(in, type) && false; + } +} + +static bool skip_gguf_value(std::istream& in, uint32_t type) { + if (type == 8) { // STRING + std::string ignored; + return read_gguf_string(in, ignored); + } + + if (type == 9) { // ARRAY + uint32_t elem_type = 0; + uint64_t count = 0; + if (!read_le(in, elem_type) || !read_le(in, count)) return false; + + if (elem_type == 8) { + for (uint64_t i = 0; i < count; ++i) { + std::string ignored; + if (!read_gguf_string(in, ignored)) return false; + } + return true; + } + + if (elem_type == 9) return false; + uint64_t elem_size = gguf_scalar_size(elem_type); + if (elem_size == 0) return false; + if (count > std::numeric_limits::max() / elem_size) return false; + return skip_bytes(in, count * elem_size); + } + + uint64_t size = gguf_scalar_size(type); + return size > 0 && skip_bytes(in, size); +} + +} // namespace + +bool read_gguf_metadata(GgufMetadata& out, const std::string& path) { + std::ifstream in(path_from_utf8(path), std::ios::binary); + if (!in) return false; + + char magic[4] = {}; + in.read(magic, sizeof(magic)); + if (!in || std::memcmp(magic, "GGUF", 4) != 0) return false; + + uint32_t version = 0; + uint64_t tensor_count = 0; + uint64_t kv_count = 0; + if (!read_le(in, version) || !read_le(in, tensor_count) || !read_le(in, kv_count)) return false; + (void)version; + (void)tensor_count; + + int64_t pending_context_length = 0; + + for (uint64_t i = 0; i < kv_count; ++i) { + 
std::string key; + uint32_t type = 0; + if (!read_gguf_string(in, key) || !read_le(in, type)) return false; + + // Read architecture + if (key == "general.architecture" && type == 8) { + if (!read_gguf_string(in, out.architecture)) return false; + if (pending_context_length > 0) { + out.context_length = pending_context_length; + } + continue; + } + + // Context length + const bool context_key = !out.architecture.empty() && key == out.architecture + ".context_length"; + const bool possible_context_key = out.architecture.empty() && key.size() > std::strlen(".context_length") && + ends_with_ignore_case(key, ".context_length"); + if (context_key || possible_context_key) { + int64_t value = 0; + if (read_gguf_integer_value(in, type, value) && value > 0) { + if (context_key) { + out.context_length = value; + } else { + pending_context_length = value; + } + } + continue; + } + + // Architecture fields for KV cache estimation + if (!out.architecture.empty()) { + if (key == out.architecture + ".block_count") { + int64_t value = 0; + if (read_gguf_integer_value(in, type, value) && value > 0) + out.block_count = value; + continue; + } + if (key == out.architecture + ".embedding_length") { + int64_t value = 0; + if (read_gguf_integer_value(in, type, value) && value > 0) + out.embedding_length = value; + continue; + } + if (key == out.architecture + ".attention.head_count_kv") { + int64_t value = 0; + if (read_gguf_integer_value(in, type, value) && value > 0) + out.head_count_kv = value; + continue; + } + if (key == out.architecture + ".attention.key_length") { + int64_t value = 0; + if (read_gguf_integer_value(in, type, value) && value > 0) + out.key_length = value; + continue; + } + } + + // Capability detection (vision, tool-calling, MTP) + if (type == 4) { + uint32_t val = 0; + if (read_le(in, val)) { + if (contains_ignore_case(key, "nextn_predict_layers") && val > 0) + out.caps.mtp = true; + } + } else if (type == 8) { + std::string value; + if (read_gguf_string(in, 
value)) { + inspect_gguf_string(key, value, out.caps); + } + } else if (type == 9) { + // Array — check string elements for capability hints + uint32_t elem_type = 0; + uint64_t count = 0; + if (read_le(in, elem_type) && read_le(in, count)) { + if (elem_type == 8) { + for (uint64_t j = 0; j < count; ++j) { + std::string value; + if (!read_gguf_string(in, value)) return false; + inspect_gguf_string(key, value, out.caps); + } + } else if (elem_type != 9) { + uint64_t elem_size = gguf_scalar_size(elem_type); + if (elem_size == 0) return false; + if (!skip_bytes(in, count * elem_size)) return false; + } else { + return false; + } + } else { + return false; + } + } else { + if (!skip_gguf_value(in, type)) return false; + } + } + + if (out.context_length == 0 && pending_context_length > 0) { + out.context_length = pending_context_length; + } + return true; +} + + +} // namespace llamacpp +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 05cd10a4d..e295a835a 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -1,6 +1,13 @@ #include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/llamacpp/llamacpp_gguf.h" #include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/gguf_capabilities.h" +#include "lemon/model_manager.h" +#include +#include +#include #include "lemon/auto_tune.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" @@ -654,9 +661,48 @@ std::unique_ptr create(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } +namespace { +// llamacpp model-management behavior: GGUF metadata + capability labels. 
+class LlamaCppOps : public BackendOps { +public: + void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override { + const std::string gguf_path = info.resolved_path(); + if (gguf_path.size() < 5) { + return; + } + std::string ext = gguf_path.substr(gguf_path.size() - 5); + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + if (ext != ".gguf") { + return; + } + std::error_code ec; + if (!std::filesystem::exists(lemon::utils::path_from_utf8(gguf_path), ec)) { + return; + } + GgufMetadata meta; + if (!read_gguf_metadata(meta, gguf_path)) { + return; + } + info.max_context_window = meta.context_length; + info.gguf_block_count = meta.block_count; + info.gguf_embedding_length = meta.embedding_length; + info.gguf_head_count_kv = meta.head_count_kv; + info.gguf_key_length = meta.key_length; + // GGUF vision/tool metadata are LLM capabilities. Don't apply them to + // embedding/reranking models, or labels like tool-calling would + // reclassify the model away from its endpoint type. 
+ if (info.type == ModelType::LLM) { + apply_gguf_capability_labels(info.labels, meta.caps); + } + } +}; +} // namespace const BackendSpec* spec() { return &LlamaCppServer::SPEC; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const LlamaCppOps kOps; + return &kOps; +} } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 5253a16ad..81d0200b2 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -1,15 +1,16 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -153,354 +154,21 @@ static std::string cache_key_to_canonical_id(const std::string& cache_key) { return canonical_id(ModelSource::Builtin, cache_key); } -template -static bool read_le(std::istream& in, T& value) { - in.read(reinterpret_cast(&value), sizeof(T)); - return static_cast(in); -} - -static bool read_gguf_string(std::istream& in, std::string& value) { - uint64_t len = 0; - if (!read_le(in, len)) return false; - if (len > 1024 * 1024) return false; - value.assign(static_cast(len), '\0'); - if (len == 0) return true; - in.read(&value[0], static_cast(len)); - return static_cast(in); -} - -static bool skip_bytes(std::istream& in, uint64_t bytes) { - if (bytes > static_cast(std::numeric_limits::max())) return false; - in.seekg(static_cast(bytes), std::ios::cur); - return static_cast(in); -} - -static uint64_t gguf_scalar_size(uint32_t type) { - switch (type) { - case 0: // UINT8 - case 1: // INT8 - case 7: // BOOL - return 1; - case 2: // UINT16 - case 3: // INT16 - return 2; - case 4: // UINT32 - case 5: // INT32 - case 6: // FLOAT32 - return 4; - case 10: // UINT64 - case 11: // INT64 - case 12: // FLOAT64 - return 8; - default: - return 0; - } -} - -static bool skip_gguf_value(std::istream& in, uint32_t type); - 
-static bool read_gguf_integer_value(std::istream& in, uint32_t type, int64_t& value) { - switch (type) { - case 0: { uint8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - case 1: { int8_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - case 2: { uint16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - case 3: { int16_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - case 4: { uint32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - case 5: { int32_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - case 10: { - uint64_t v = 0; - if (!read_le(in, v)) return false; - if (v > static_cast(std::numeric_limits::max())) return false; - value = static_cast(v); - return true; - } - case 11: { int64_t v = 0; if (!read_le(in, v)) return false; value = v; return true; } - default: - return skip_gguf_value(in, type) && false; - } -} - -static bool skip_gguf_value(std::istream& in, uint32_t type) { - if (type == 8) { // STRING - std::string ignored; - return read_gguf_string(in, ignored); - } - - if (type == 9) { // ARRAY - uint32_t elem_type = 0; - uint64_t count = 0; - if (!read_le(in, elem_type) || !read_le(in, count)) return false; - - if (elem_type == 8) { - for (uint64_t i = 0; i < count; ++i) { - std::string ignored; - if (!read_gguf_string(in, ignored)) return false; - } - return true; - } - - if (elem_type == 9) return false; - uint64_t elem_size = gguf_scalar_size(elem_type); - if (elem_size == 0) return false; - if (count > std::numeric_limits::max() / elem_size) return false; - return skip_bytes(in, count * elem_size); - } - - uint64_t size = gguf_scalar_size(type); - return size > 0 && skip_bytes(in, size); -} - -// All GGUF metadata extracted in a single pass over the KV header. -// Replaces the previous three separate readers (context_length, arch_info, capabilities) -// that each opened the file independently. 
-struct GgufMetadata { - std::string architecture; - int64_t context_length = 0; - int64_t block_count = 0; - int64_t embedding_length = 0; - int64_t head_count_kv = 0; - int64_t key_length = 0; - GgufCapabilities caps; -}; - -static bool read_gguf_metadata(GgufMetadata& out, const std::string& path) { - std::ifstream in(path_from_utf8(path), std::ios::binary); - if (!in) return false; - - char magic[4] = {}; - in.read(magic, sizeof(magic)); - if (!in || std::memcmp(magic, "GGUF", 4) != 0) return false; - - uint32_t version = 0; - uint64_t tensor_count = 0; - uint64_t kv_count = 0; - if (!read_le(in, version) || !read_le(in, tensor_count) || !read_le(in, kv_count)) return false; - (void)version; - (void)tensor_count; - - int64_t pending_context_length = 0; - - for (uint64_t i = 0; i < kv_count; ++i) { - std::string key; - uint32_t type = 0; - if (!read_gguf_string(in, key) || !read_le(in, type)) return false; - - // Read architecture - if (key == "general.architecture" && type == 8) { - if (!read_gguf_string(in, out.architecture)) return false; - if (pending_context_length > 0) { - out.context_length = pending_context_length; - } - continue; - } - - // Context length - const bool context_key = !out.architecture.empty() && key == out.architecture + ".context_length"; - const bool possible_context_key = out.architecture.empty() && key.size() > std::strlen(".context_length") && - ends_with_ignore_case(key, ".context_length"); - if (context_key || possible_context_key) { - int64_t value = 0; - if (read_gguf_integer_value(in, type, value) && value > 0) { - if (context_key) { - out.context_length = value; - } else { - pending_context_length = value; - } - } - continue; - } - - // Architecture fields for KV cache estimation - if (!out.architecture.empty()) { - if (key == out.architecture + ".block_count") { - int64_t value = 0; - if (read_gguf_integer_value(in, type, value) && value > 0) - out.block_count = value; - continue; - } - if (key == out.architecture + 
".embedding_length") { - int64_t value = 0; - if (read_gguf_integer_value(in, type, value) && value > 0) - out.embedding_length = value; - continue; - } - if (key == out.architecture + ".attention.head_count_kv") { - int64_t value = 0; - if (read_gguf_integer_value(in, type, value) && value > 0) - out.head_count_kv = value; - continue; - } - if (key == out.architecture + ".attention.key_length") { - int64_t value = 0; - if (read_gguf_integer_value(in, type, value) && value > 0) - out.key_length = value; - continue; - } - } - - // Capability detection (vision, tool-calling, MTP) - if (type == 4) { - uint32_t val = 0; - if (read_le(in, val)) { - if (contains_ignore_case(key, "nextn_predict_layers") && val > 0) - out.caps.mtp = true; - } - } else if (type == 8) { - std::string value; - if (read_gguf_string(in, value)) { - inspect_gguf_string(key, value, out.caps); - } - } else if (type == 9) { - // Array — check string elements for capability hints - uint32_t elem_type = 0; - uint64_t count = 0; - if (read_le(in, elem_type) && read_le(in, count)) { - if (elem_type == 8) { - for (uint64_t j = 0; j < count; ++j) { - std::string value; - if (!read_gguf_string(in, value)) return false; - inspect_gguf_string(key, value, out.caps); - } - } else if (elem_type != 9) { - uint64_t elem_size = gguf_scalar_size(elem_type); - if (elem_size == 0) return false; - if (!skip_bytes(in, count * elem_size)) return false; - } else { - return false; - } - } else { - return false; - } - } else { - if (!skip_gguf_value(in, type)) return false; - } - } - - if (out.context_length == 0 && pending_context_length > 0) { - out.context_length = pending_context_length; - } - return true; -} - // Candidate roots that FLM may use to store models. FLM resolves its model // directory from the FLM_MODEL_PATH env var (set by the installer) and falls // back to a built-in default that has changed across releases. 
lemond is often // launched from a parent process that predates the FLM install and therefore // doesn't see FLM_MODEL_PATH, so we also probe every documented default. // Order is most-specific to most-historical. -static std::vector get_flm_models_dir_candidates() { - std::vector roots; - - const char* flm_model_path = std::getenv("FLM_MODEL_PATH"); - if (flm_model_path && *flm_model_path) { - roots.push_back(path_from_utf8(flm_model_path) / "models"); - } - -#ifdef _WIN32 - const char* userprofile = std::getenv("USERPROFILE"); - if (userprofile && *userprofile) { - fs::path home = path_from_utf8(userprofile); - roots.push_back(home / ".flm" / "models"); // current installer default - roots.push_back(home / "Documents" / "flm" / "models"); // legacy installer default - roots.push_back(home / "flm" / "models"); - } -#else - const char* xdg_config_home = std::getenv("XDG_CONFIG_HOME"); - if (xdg_config_home && *xdg_config_home) { - roots.push_back(path_from_utf8(xdg_config_home) / "flm" / "models"); - } - const char* home = std::getenv("HOME"); - if (home && *home) { - fs::path home_path = path_from_utf8(home); - roots.push_back(home_path / ".flm" / "models"); - roots.push_back(home_path / ".config" / "flm" / "models"); - } -#endif - - return roots; -} - -static fs::path find_flm_config_path_from_repo_dir(const std::string& repo_dir) { - if (repo_dir.empty()) return fs::path(); - - for (const auto& root : get_flm_models_dir_candidates()) { - fs::path candidate = root / repo_dir / "config.json"; - if (safe_exists(candidate)) return candidate; - } - return fs::path(); -} - -static std::string repo_dir_from_url(const std::string& url) { - std::string clean = url; - while (!clean.empty() && clean.back() == '/') clean.pop_back(); - size_t query_pos = clean.find_first_of("?#"); - if (query_pos != std::string::npos) clean = clean.substr(0, query_pos); - - for (const std::string marker : {"/tree/", "/resolve/"}) { - size_t marker_pos = clean.find(marker); - if (marker_pos 
!= std::string::npos) { - clean = clean.substr(0, marker_pos); - break; - } - } - - size_t slash = clean.find_last_of('/'); - return slash == std::string::npos ? clean : clean.substr(slash + 1); -} - -static int64_t read_flm_max_context_window(const ModelInfo& info) { - if (info.type != ModelType::LLM) return 0; - - std::string config_path = info.resolved_path("config"); - if (config_path.empty()) return 0; - - try { - json config = JsonUtils::load_from_file(config_path); - if (config.contains("max_position_embeddings") && config["max_position_embeddings"].is_number_integer()) { - int64_t value = config["max_position_embeddings"].get(); - return value > 0 ? value : 0; - } - if (config.contains("text_config") && config["text_config"].is_object()) { - const auto& text_config = config["text_config"]; - if (text_config.contains("max_position_embeddings") && text_config["max_position_embeddings"].is_number_integer()) { - int64_t value = text_config["max_position_embeddings"].get(); - return value > 0 ? value : 0; - } - } - } catch (const std::exception& e) { - LOG(DEBUG, "ModelManager") << "Could not read FLM config metadata for " - << info.model_name << ": " << e.what() << std::endl; - } - return 0; -} static void populate_model_metadata(ModelInfo& info) { info.max_context_window = 0; if (!info.downloaded) return; - if (info.recipe == "llamacpp") { - std::string gguf_path = info.resolved_path(); - if (!gguf_path.empty() && ends_with_ignore_case(gguf_path, ".gguf") && safe_exists(path_from_utf8(gguf_path))) { - GgufMetadata meta; - if (read_gguf_metadata(meta, gguf_path)) { - info.max_context_window = meta.context_length; - info.gguf_block_count = meta.block_count; - info.gguf_embedding_length = meta.embedding_length; - info.gguf_head_count_kv = meta.head_count_kv; - info.gguf_key_length = meta.key_length; - - // GGUF vision/tool metadata are LLM capabilities. 
Do not apply - // them to embedding/reranking models, otherwise labels such as - // tool-calling would reclassify the model away from its endpoint - // type and break /embeddings or /rerank. - if (info.type == ModelType::LLM) { - apply_gguf_capability_labels(info.labels, meta.caps); - } - } - } - } else if (info.recipe == "flm") { - info.max_context_window = read_flm_max_context_window(info); - } + // Per-backend metadata (GGUF arch/labels for llamacpp, config.json ctx for + // flm, …) is read by the backend's ops, not a recipe switchboard here. + backends::BackendOpsContext ctx; + backends::ops_for(info.recipe)->populate_metadata(info, ctx); } static bool is_user_model_name(const std::string& model_name) { @@ -3108,7 +2776,8 @@ std::vector ModelManager::get_flm_available_models() { info.suggested = true; // All official FLM models are suggested if (JsonUtils::get_or_default(m, "installed", false) && m.contains("url") && m["url"].is_string()) { - fs::path config_path = find_flm_config_path_from_repo_dir(repo_dir_from_url(m["url"].get())); + fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir( + backends::fastflowlm::repo_dir_from_url(m["url"].get())); if (!config_path.empty()) { info.resolved_paths["config"] = path_to_utf8(config_path); } From 7933852f4009426ecfa7b955f63a34c157ed95a8 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 14:53:40 -0400 Subject: [PATCH 10/39] refactor(backends): descriptor-drive ROCm channels (kill duplicated (llamacpp||sd-cpp)&&rocm) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `rocm_channels` descriptor field (llamacpp {"stable","nightly"}, sd-cpp {"stable"}) and a recipe_has_rocm_channels() registry helper. Replace the hardcoded `(recipe=="llamacpp"||recipe=="sd-cpp") && rocm` predicate — copied across backend_utils.cpp (3×), backend_manager.cpp (2×), and system_info.cpp — with the descriptor check. 
rocm_channel_for_recipe() now clamps a requested channel to one the backend publishes (so sd-cpp's missing "nightly" -> "stable" falls out of the data instead of a per-recipe special case). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_descriptor.h | 6 ++++++ .../lemon/backends/backend_descriptor_registry.h | 5 +++++ src/cpp/include/lemon/backends/llamacpp/llamacpp.h | 1 + src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 1 + src/cpp/server/backend_manager.cpp | 5 +++-- .../server/backends/backend_descriptor_registry.cpp | 5 +++++ src/cpp/server/backends/backend_utils.cpp | 10 +++++----- src/cpp/server/runtime_config.cpp | 13 ++++++++++--- src/cpp/server/system_info.cpp | 2 +- 9 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index 3b9cdb2fb..29ea2e0ea 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -65,6 +65,12 @@ struct BackendDescriptor { std::string web_display_name; // name used on the docs website ("" = fall back to display_name) int web_priority = 0; // model-grouping order on the docs website (lower = higher; 0 = unlisted) + // ROCm release channels this backend publishes (e.g. {"stable","nightly"}). + // Empty = the backend has no ROCm channels (its "rocm" build is a single + // artifact). Drives the rocm-stable/rocm-nightly bin-key collapse and the + // channel clamp (a requested channel not listed here falls back to the first). + std::vector rocm_channels; + // The config.json section name for this backend, falling back to the recipe. std::string effective_config_section() const { return config_section.empty() ? 
recipe : config_section; diff --git a/src/cpp/include/lemon/backends/backend_descriptor_registry.h b/src/cpp/include/lemon/backends/backend_descriptor_registry.h index e3be93cda..44ec7e15d 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor_registry.h +++ b/src/cpp/include/lemon/backends/backend_descriptor_registry.h @@ -21,5 +21,10 @@ const BackendDescriptor* descriptor_for(const std::string& recipe); // True if the recipe is backed by a registered descriptor. bool has_backend(const std::string& recipe); +// True if the recipe publishes ROCm release channels (stable/nightly) — i.e. its +// "rocm" backend resolves to a channel-specific artifact. False for recipes whose +// rocm build is a single artifact (or that have no rocm build at all). +bool recipe_has_rocm_channels(const std::string& recipe); + } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index 19d63c370..fc43c4515 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -46,6 +46,7 @@ inline const BackendDescriptor descriptor = { /*experimental*/ false, /*web_display_name*/ "llama.cpp GPU", /*web_priority*/ 1, + /*rocm_channels*/ {"stable", "nightly"}, }; } // namespace llamacpp diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 2e12af119..b65fe4fd6 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -50,6 +50,7 @@ inline const BackendDescriptor descriptor = { /*experimental*/ false, /*web_display_name*/ "stable-diffusion.cpp", /*web_priority*/ 5, + /*rocm_channels*/ {"stable"}, }; } // namespace sdcpp diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp index 1b61f3407..2983d49ca 100644 --- a/src/cpp/server/backend_manager.cpp +++ b/src/cpp/server/backend_manager.cpp 
@@ -1,4 +1,5 @@ #include "lemon/backend_manager.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" @@ -35,7 +36,7 @@ std::string get_current_os() { } std::string normalize_backend_name(const std::string& recipe, const std::string& backend) { - if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") { + if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") { // Map "rocm" to the appropriate channel based on config std::string channel = "stable"; // default to stable for now if (auto* cfg = RuntimeConfig::global()) { @@ -467,7 +468,7 @@ void BackendManager::install_backend(const std::string& recipe, const std::strin // Do that here before inflating the install to a multi-file UX flow. const std::string os = get_current_os(); const bool is_rocm_stable_backend = - (recipe == "llamacpp" || recipe == "sd-cpp") && + backends::recipe_has_rocm_channels(recipe) && resolved_backend == "rocm-stable"; const bool therock_applicable = is_rocm_stable_backend && will_install_therock(os, backend_versions_); diff --git a/src/cpp/server/backends/backend_descriptor_registry.cpp b/src/cpp/server/backends/backend_descriptor_registry.cpp index 5fd217909..6d1741d87 100644 --- a/src/cpp/server/backends/backend_descriptor_registry.cpp +++ b/src/cpp/server/backends/backend_descriptor_registry.cpp @@ -25,5 +25,10 @@ bool has_backend(const std::string& recipe) { return descriptor_for(recipe) != nullptr; } +bool recipe_has_rocm_channels(const std::string& recipe) { + const BackendDescriptor* d = descriptor_for(recipe); + return d != nullptr && !d->rocm_channels.empty(); +} + } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp index 9a57a28db..4c3e505d2 100644 --- a/src/cpp/server/backends/backend_utils.cpp +++ b/src/cpp/server/backends/backend_utils.cpp @@ 
-207,8 +207,8 @@ namespace lemon::backends { std::string& out_section, std::string& out_bin_key) { std::string config_backend = backend; - if ((recipe == "llamacpp" || recipe == "sd-cpp") && - (backend == "rocm-stable" || backend == "rocm-nightly")) { + if ((recipe_has_rocm_channels(recipe) && + (backend == "rocm-stable" || backend == "rocm-nightly"))) { config_backend = "rocm"; } out_section = RuntimeConfig::recipe_to_config_section(recipe); @@ -279,7 +279,7 @@ namespace lemon::backends { // Resolve "rocm" to actual channel for backends that support ROCm channels std::string resolved_backend = backend; - if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") { + if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") { std::string channel = "stable"; // default to stable if (auto* cfg = RuntimeConfig::global()) { channel = cfg->rocm_channel_for_recipe(spec.recipe); @@ -319,7 +319,7 @@ namespace lemon::backends { // directory or ROCm backends remain stuck in update_required after a // successful install. 
std::string resolved_backend = backend; - if ((spec.recipe == "llamacpp" || spec.recipe == "sd-cpp") && backend == "rocm") { + if (recipe_has_rocm_channels(spec.recipe) && backend == "rocm") { std::string channel = "stable"; if (auto* cfg = RuntimeConfig::global()) { channel = cfg->rocm_channel_for_recipe(spec.recipe); @@ -333,7 +333,7 @@ namespace lemon::backends { std::string BackendUtils::get_backend_version(const std::string& recipe, const std::string& backend) { std::string resolved_backend = backend; - if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") { + if (recipe_has_rocm_channels(recipe) && backend == "rocm") { // Map "rocm" to the appropriate channel based on config std::string channel = "stable"; // default to stable for now if (auto* cfg = RuntimeConfig::global()) { diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp index 08aa41dc6..0a14f006b 100644 --- a/src/cpp/server/runtime_config.cpp +++ b/src/cpp/server/runtime_config.cpp @@ -254,9 +254,16 @@ std::string RuntimeConfig::rocm_channel() const { std::string RuntimeConfig::rocm_channel_for_recipe(const std::string& recipe) const { std::string channel = rocm_channel(); - // sd-cpp currently has no nightly artifacts; use stable builds. - if (recipe == "sd-cpp" && channel == "nightly") { - return "stable"; + // Clamp to a channel the backend actually publishes. A backend that lists + // only {"stable"} (e.g. sd-cpp, which has no nightly artifacts) falls back to + // its first channel when "nightly" is requested. Driven by the descriptor's + // rocm_channels, so no per-recipe special case lives here. 
+ const auto* desc = lemon::backends::descriptor_for(recipe); + if (desc && !desc->rocm_channels.empty()) { + const auto& channels = desc->rocm_channels; + if (std::find(channels.begin(), channels.end(), channel) == channels.end()) { + return channels.front(); + } } return channel; } diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index cdf089843..45335912d 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -718,7 +718,7 @@ static std::string get_expected_backend_version(const std::string& recipe, const // version pins ("rocm-stable", "rocm-nightly") in backend_versions.json. // Mirror the resolution done by BackendUtils::get_backend_version(). std::string resolved_backend = backend; - if ((recipe == "llamacpp" || recipe == "sd-cpp") && backend == "rocm") { + if (backends::recipe_has_rocm_channels(recipe) && backend == "rocm") { std::string channel = "stable"; if (auto* cfg = RuntimeConfig::global()) { channel = cfg->rocm_channel_for_recipe(recipe); From 2cc963e499d8da1725fd3dd7620183476766a166 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 15:25:10 -0400 Subject: [PATCH 11/39] refactor(backends): migrate resolve_model_path switchboard to ops (worst leak) Replace the ~290-line recipe switchboard in ModelManager::resolve_model_path with ops_for(recipe)->resolve_checkpoint_path(). The model manager now only does the generic prefix (collections, local_path/local_upload, HF cache-dir computation) and hands off to the backend. - New BackendOps::resolve_checkpoint_path; base = the shared HF behavior (active-snapshot variant/aux resolution, main-repo fallback, directory fallback). Backends override only their artifact layout: * llamacpp -> GGUF resolver (sharding/folder/quant-token), moved into backends/llamacpp/llamacpp_gguf (resolve_gguf_path). * ryzenai -> genai_config.json directory; kokoro -> index.json; whispercpp -> first .bin; cloud -> ""; flm -> checkpoint passthrough. 
- New shared backends/hf_cache_util (exists/dir_options/active_snapshot_path/ repo_id_to_cache_dir_name) so ops reuse the same HF-cache mechanics. model_manager.cpp -362 lines; resolve_model_path 365 -> 34. Verified all recipes still resolve as downloaded (llamacpp variants, whisper .bin, kokoro index, sd-cpp, ryzenai, flm) via /models. Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 3 +- src/cpp/include/lemon/backends/backend_ops.h | 20 + .../include/lemon/backends/hf_cache_util.h | 30 ++ .../lemon/backends/llamacpp/llamacpp_gguf.h | 6 + src/cpp/server/backends/backend_ops.cpp | 99 +++++ .../server/backends/cloud/cloud_server.cpp | 17 +- .../backends/fastflowlm/fastflowlm_server.cpp | 6 + src/cpp/server/backends/hf_cache_util.cpp | 72 ++++ .../server/backends/kokoro/kokoro_server.cpp | 30 +- .../backends/llamacpp/llamacpp_gguf.cpp | 204 ++++++++++ .../backends/llamacpp/llamacpp_server.cpp | 10 + .../backends/ryzenai/ryzenai_server.cpp | 28 +- .../backends/whispercpp/whispercpp_server.cpp | 39 +- src/cpp/server/model_manager.cpp | 362 +----------------- 14 files changed, 574 insertions(+), 352 deletions(-) create mode 100644 src/cpp/include/lemon/backends/hf_cache_util.h create mode 100644 src/cpp/server/backends/hf_cache_util.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 758108b69..8c19edd87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -679,7 +679,8 @@ set(LEMON_BACKEND_DESCRIPTOR_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_descriptor_registry.cpp) set(LEMON_BACKEND_FACTORY_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_registry.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/backend_ops.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/cpp/server/backends/hf_cache_util.cpp) foreach(_backend_entry ${LEMON_BACKENDS}) string(REPLACE "|" ";" _backend_parts "${_backend_entry}") list(GET _backend_parts 1 _backend_stem) diff 
--git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index 53b046e84..c973a7de7 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -16,6 +16,19 @@ struct BackendOpsContext { ModelManager* model_manager = nullptr; }; +// Inputs for resolving a checkpoint's on-disk path. The model manager computes +// the HF-cache locations generically; each backend's ops decide how to find its +// artifact within (a .gguf file, a genai_config.json directory, a .bin, …). +struct CheckpointResolveContext { + std::string hf_cache; // HF cache root dir + std::string model_cache_path; // hf_cache/ + std::string repo_id; // checkpoint's repo id + std::string main_repo_id; // the model's "main" checkpoint repo id (fallback) + std::string variant; // checkpoint variant after ':' ("" if none) + std::string type; // checkpoint type ("main", "mmproj", "npu_cache", …) + std::string checkpoint; // the raw checkpoint string +}; + // Stateless per-backend behavior for model management that happens WITHOUT a // running subprocess: checkpoint-path resolution, download, dynamic discovery, // per-model metadata, version detection, availability. One singleton per @@ -37,6 +50,13 @@ class BackendOps { (void)info; (void)ctx; } + + // Resolve a checkpoint to its absolute on-disk path (file or directory). + // Default: the shared HF behavior — locate the variant/aux file in the active + // snapshot, else fall back to the model cache directory. Backends with a + // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override. + virtual std::string resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const; }; // Shared default ops instance for backends that override nothing. 
diff --git a/src/cpp/include/lemon/backends/hf_cache_util.h b/src/cpp/include/lemon/backends/hf_cache_util.h new file mode 100644 index 000000000..91c64278e --- /dev/null +++ b/src/cpp/include/lemon/backends/hf_cache_util.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include + +namespace lemon { +namespace backends { +namespace hf_cache { + +// Shared Hugging Face cache mechanics used by backend ops to locate model +// artifacts on disk (the same logic model_manager uses for its own cache work). + +// Exists check that tolerates the symlinks HF uses for dedup (Win32 on Windows, +// where MSVC's std::filesystem refuses untrusted reparse points). +bool exists(const std::filesystem::path& p); + +// Directory-iteration options that skip inaccessible/symlinked entries instead +// of throwing. +std::filesystem::directory_options dir_options(); + +// The active HF snapshot directory (snapshots/) for a model cache +// dir, or an empty path if there is no recorded ref / it doesn't exist. +std::filesystem::path active_snapshot_path(const std::filesystem::path& model_cache_path); + +// HF cache directory name for a repo id ("org/repo" -> "models--org--repo"). +std::string repo_id_to_cache_dir_name(const std::string& repo_id); + +} // namespace hf_cache +} // namespace backends +} // namespace lemon diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h index 2e431478b..ccf79ae57 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_gguf.h @@ -25,6 +25,12 @@ struct GgufMetadata { // not a valid GGUF container. bool read_gguf_metadata(GgufMetadata& out, const std::string& path); +// Resolve the on-disk path of the GGUF file for a model cache directory and +// variant (handles sharding, folder variants, and quant-token fallback). Returns +// the cache directory if no GGUF is present, or "" if the requested variant +// can't be resolved. 
+std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant); + } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp index 773e39494..af21249c5 100644 --- a/src/cpp/server/backends/backend_ops.cpp +++ b/src/cpp/server/backends/backend_ops.cpp @@ -1,8 +1,107 @@ #include "lemon/backends/backend_ops.h" +#include +#include +#include "lemon/backends/hf_cache_util.h" +#include "lemon/utils/path_utils.h" + +namespace fs = std::filesystem; + namespace lemon { namespace backends { +using lemon::utils::path_from_utf8; +using lemon::utils::path_to_utf8; + +// Default checkpoint resolution: the shared Hugging Face behavior. Locate the +// requested variant (or auxiliary file like mmproj) within the active snapshot, +// falling back to the main repo and finally the model cache directory. Backends +// with bespoke layouts override resolve_checkpoint_path(). +std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const { + (void)info; + + // NPU side-cache checkpoints have no resolvable local file here (the backend + // that uses them resolves them itself at load time). + if (ctx.type == "npu_cache") { + return ""; + } + + fs::path model_cache_path_fs = path_from_utf8(ctx.model_cache_path); + + if (!ctx.variant.empty()) { + // Prefer refs/main for auxiliary checkpoints too (e.g. mmproj) so + // companion files stay on the active snapshot as the main model. 
+ fs::path active_snapshot = hf_cache::active_snapshot_path(model_cache_path_fs); + if (!active_snapshot.empty()) { + fs::path direct_variant_path = active_snapshot / path_from_utf8(ctx.variant); + if (hf_cache::exists(direct_variant_path)) { + return path_to_utf8(direct_variant_path); + } + std::error_code ec; + for (const auto& entry : + fs::recursive_directory_iterator(active_snapshot, hf_cache::dir_options(), ec)) { + if (ec) break; + if (entry.is_regular_file(ec)) { + if (entry.path().filename().string() == ctx.variant) { + return path_to_utf8(entry.path()); + } + } else if (entry.is_directory(ec)) { + fs::path variant_path = entry.path() / path_from_utf8(ctx.variant); + if (hf_cache::exists(variant_path)) { + return path_to_utf8(variant_path); + } + } + ec.clear(); + } + } + + // Try to find the exact variant in the cache directory's subtree. + if (hf_cache::exists(model_cache_path_fs)) { + for (const auto& entry : + fs::recursive_directory_iterator(model_cache_path_fs, hf_cache::dir_options())) { + if (entry.is_regular_file()) { + if (entry.path().filename().string() == ctx.variant) { + return path_to_utf8(entry.path()); + } + } else if (entry.is_directory()) { + fs::path variant_path = entry.path() / path_from_utf8(ctx.variant); + if (hf_cache::exists(variant_path)) { + return path_to_utf8(variant_path); + } + } + } + } + + // Backward-compat: older downloads placed all files in the main repo dir. 
+ if (ctx.repo_id != ctx.main_repo_id) { + std::string main_cache_path = + ctx.hf_cache + "/" + hf_cache::repo_id_to_cache_dir_name(ctx.main_repo_id); + fs::path main_cache_path_fs = path_from_utf8(main_cache_path); + if (fs::exists(main_cache_path_fs)) { + for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) { + if (entry.is_regular_file()) { + if (entry.path().filename().string() == ctx.variant) { + return path_to_utf8(entry.path()); + } + } else if (entry.is_directory()) { + fs::path variant_path = entry.path() / path_from_utf8(ctx.variant); + if (fs::exists(variant_path)) { + return path_to_utf8(variant_path); + } + } + } + } + } + + // Variant not found — signal not downloaded. + return ""; + } + + // No variant: return the cache directory. + return ctx.model_cache_path; +} + const BackendOps* default_backend_ops() { static const BackendOps kDefault; return &kDefault; diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp index 29dede2b0..b20aae05c 100644 --- a/src/cpp/server/backends/cloud/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -806,8 +806,23 @@ std::unique_ptr create(const BackendContext& ctx) { } +namespace { +class CloudOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo&, + const CheckpointResolveContext&) const override { + // Cloud-offloaded models have no local artifacts; the checkpoint is the + // upstream provider's model id, used directly when forwarding requests. 
+ return ""; + } +}; +} // namespace + const BackendSpec* spec() { return nullptr; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const CloudOps kOps; + return &kOps; +} } // namespace cloud } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 0b5e15934..648cd9ff5 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -485,6 +485,12 @@ class FlmOps : public BackendOps { void populate_metadata(ModelInfo& info, const BackendOpsContext&) const override { info.max_context_window = read_flm_max_context_window(info); } + + std::string resolve_checkpoint_path(const ModelInfo&, + const CheckpointResolveContext& ctx) const override { + // FLM uses the checkpoint string as-is (e.g. "gemma3:4b"); no local file. + return ctx.checkpoint; + } }; } // namespace diff --git a/src/cpp/server/backends/hf_cache_util.cpp b/src/cpp/server/backends/hf_cache_util.cpp new file mode 100644 index 000000000..028b25ee4 --- /dev/null +++ b/src/cpp/server/backends/hf_cache_util.cpp @@ -0,0 +1,72 @@ +#include "lemon/backends/hf_cache_util.h" + +#include + +#ifdef _WIN32 +#include +#endif + +namespace fs = std::filesystem; + +namespace lemon { +namespace backends { +namespace hf_cache { + +bool exists(const fs::path& p) { +#ifdef _WIN32 + // The HF cache uses symlinks for dedup; MSVC's std::filesystem refuses + // "untrusted" reparse points when the token lacks symlink privilege, so use + // the Win32 API which has no such restriction. 
+ return GetFileAttributesW(p.c_str()) != INVALID_FILE_ATTRIBUTES; +#else + std::error_code ec; + return fs::exists(p, ec); +#endif +} + +fs::directory_options dir_options() { +#ifdef _WIN32 + return fs::directory_options::skip_permission_denied; +#else + return fs::directory_options::none; +#endif +} + +namespace { +std::string read_ref_main(const fs::path& model_cache_path) { + std::ifstream refs_file(model_cache_path / "refs" / "main"); + if (!refs_file.is_open()) { + return ""; + } + std::string ref; + std::getline(refs_file, ref); + ref.erase(0, ref.find_first_not_of(" \t\r\n")); + size_t last = ref.find_last_not_of(" \t\r\n"); + if (last == std::string::npos) { + return ""; + } + ref.erase(last + 1); + return ref; +} +} // namespace + +fs::path active_snapshot_path(const fs::path& model_cache_path) { + std::string ref = read_ref_main(model_cache_path); + if (ref.empty()) { + return fs::path(); + } + fs::path snapshot_path = model_cache_path / "snapshots" / ref; + return lemon::backends::hf_cache::exists(snapshot_path) ? snapshot_path : fs::path(); +} + +std::string repo_id_to_cache_dir_name(const std::string& repo_id) { + std::string cache_dir_name = "models--"; + for (char c : repo_id) { + cache_dir_name += (c == '/') ? 
"--" : std::string(1, c); + } + return cache_dir_name; +} + +} // namespace hf_cache +} // namespace backends +} // namespace lemon diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index 13f1a3ffe..80d502ead 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -1,6 +1,11 @@ #include "lemon/backends/kokoro/kokoro_server.h" #include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/backends/hf_cache_util.h" +#include "lemon/model_manager.h" +#include "lemon/utils/path_utils.h" +#include #include "lemon/backend_manager.h" #include "lemon/utils/process_manager.h" #include "lemon/utils/json_utils.h" @@ -214,8 +219,31 @@ std::unique_ptr create(const BackendContext& ctx) { } +namespace { +class KokoroOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo&, + const CheckpointResolveContext& ctx) const override { + // Kokoro models are a directory; resolve to the index.json file inside. 
+ std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path); + if (hf_cache::exists(dir)) { + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && entry.path().filename() == "index.json") { + return lemon::utils::path_to_utf8(entry.path()); + } + } + } + return ctx.model_cache_path; // directory even if index not found + } +}; +} // namespace + const BackendSpec* spec() { return &KokoroServer::SPEC; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const KokoroOps kOps; + return &kOps; +} } // namespace kokoro } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp index 1e099d064..e23e3c2a4 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp @@ -3,17 +3,31 @@ #include #include #include +#include #include #include #include +#include +#include +#include "lemon/backends/hf_cache_util.h" +#include "lemon/hf_variants.h" +#include "lemon/utils/aixlog.hpp" #include "lemon/utils/path_utils.h" +namespace fs = std::filesystem; + namespace lemon { namespace backends { namespace llamacpp { namespace { using lemon::utils::path_from_utf8; +using lemon::utils::path_to_utf8; + +std::string to_lower(std::string s) { + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); }); + return s; +} // Local copies of the tiny case-insensitive string helpers (kept out of a shared // util to keep this GGUF reader self-contained). 
@@ -248,6 +262,196 @@ bool read_gguf_metadata(GgufMetadata& out, const std::string& path) { } +std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant) { + fs::path model_cache_path_fs = path_from_utf8(model_cache_path); + if (!hf_cache::exists(model_cache_path_fs)) { + return model_cache_path; // Return directory path even if not found + } + + // Prefer the active HF snapshot recorded in refs/main. This lets + // Lemonade keep using the previous snapshot when upstream only changed + // README/metadata and the requested model artifacts are unchanged. + auto collect_gguf_files = [](const fs::path& search_root) { + std::vector files; + if (search_root.empty() || !hf_cache::exists(search_root)) { + return files; + } + + std::error_code ec; + for (const auto& entry : fs::recursive_directory_iterator(search_root, hf_cache::dir_options(), ec)) { + if (ec) break; + if (!entry.is_regular_file(ec)) { + ec.clear(); + continue; + } + + std::string filename = entry.path().filename().string(); + std::string filename_lower = filename; + std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); + + if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) { + files.push_back(path_to_utf8(entry.path())); + } + } + return files; + }; + + std::vector all_gguf_files = collect_gguf_files(hf_cache::active_snapshot_path(model_cache_path_fs)); + if (all_gguf_files.empty()) { + // Backward-compatible fallback for caches without refs/main and for + // partially migrated/manual HF cache layouts. 
+ all_gguf_files = collect_gguf_files(model_cache_path_fs); + } + + if (all_gguf_files.empty()) { + return model_cache_path; // Return directory if no GGUF found + } + + // Sort files for consistent ordering (important for sharded models) + std::sort(all_gguf_files.begin(), all_gguf_files.end()); + + // Case 0: Wildcard (*) - return first file (llama-server will auto-load shards) + if (variant == "*") { + return all_gguf_files[0]; + } + + // Case 1: Empty variant - return first file + if (variant.empty()) { + return all_gguf_files[0]; + } + + // Case 2: Exact filename match (variant ends with .gguf) + if (variant.find(".gguf") != std::string::npos) { + for (const auto& filepath : all_gguf_files) { + std::string filename = path_from_utf8(filepath).filename().string(); + if (filename == variant) { + return filepath; + } + } + return ""; // Exact variant not found — signal not downloaded + } + + // Case 3: Files ending with {variant}.gguf (case insensitive) + std::string variant_lower = variant; + std::transform(variant_lower.begin(), variant_lower.end(), variant_lower.begin(), ::tolower); + std::string suffix = variant_lower + ".gguf"; + + std::vector matching_files; + for (const auto& filepath : all_gguf_files) { + std::string filename = path_from_utf8(filepath).filename().string(); + std::string filename_lower = filename; + std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); + + if (filename_lower.size() >= suffix.size() && + filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) { + matching_files.push_back(filepath); + } + } + + if (!matching_files.empty()) { + return matching_files[0]; + } + + // Case 4: Folder-based sharding (files in variant/ folder) + std::string folder_prefix_lower = variant_lower + "/"; + + for (const auto& filepath : all_gguf_files) { + // Get relative path from model cache path + std::string relative_path = path_to_utf8( + 
path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); + std::string relative_lower = relative_path; + // Normalize path separators and case so folder-variant matching works cross-platform. + std::transform(relative_lower.begin(), relative_lower.end(), relative_lower.begin(), ::tolower); + std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/'); + + if (relative_lower.find(folder_prefix_lower) != std::string::npos) { + return filepath; + } + } + + // Case 5: Local quant-token fallback. + // + // Keep the existing resolver cases above as the primary logic: exact + // filenames, suffix matches, and folder-based sharding are more + // specific and preserve the CHECKPOINT:VARIANT contract. + // + // Some GGUF repositories name files with the quant token in the middle, + // for example: + // Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf + // for variant: + // IQ4_XS + // That file does not end with IQ4_XS.gguf, so mirror the downloader's + // GGUF variant enumeration over the files that are already present in + // the local HF cache before declaring the model missing. + // + // HF cache paths have an extra snapshots// prefix that is not + // part of the repository-relative filename. Strip it before calling + // enumerate_gguf_variants(); otherwise the enumerator treats + // "snapshots" as a top-level sharded-folder variant and never extracts + // the quant token from the actual GGUF filename. 
+ std::vector relative_gguf_files; + std::map absolute_by_relative; + auto repo_relative_from_cache_relative = [](std::string rel) { + std::replace(rel.begin(), rel.end(), '\\', '/'); + + static const std::string snapshots_prefix = "snapshots/"; + if (rel.rfind(snapshots_prefix, 0) == 0) { + size_t revision_end = rel.find('/', snapshots_prefix.size()); + if (revision_end != std::string::npos && revision_end + 1 < rel.size()) { + rel = rel.substr(revision_end + 1); + } + } + + return rel; + }; + + for (const auto& filepath : all_gguf_files) { + std::string relative_path = path_to_utf8( + path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); + relative_path = repo_relative_from_cache_relative(relative_path); + + // Multiple HF snapshots can contain the same repo-relative file. + // Keep the first absolute path from the sorted all_gguf_files list + // so duplicates do not create false ambiguity. + if (absolute_by_relative.emplace(relative_path, filepath).second) { + relative_gguf_files.push_back(relative_path); + } + } + + std::vector enumerated_matches; + auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files); + for (const auto& local_variant : local_variants.variants) { + if (to_lower(local_variant.name) != variant_lower) { + continue; + } + + auto it = absolute_by_relative.find(local_variant.primary_file); + if (it != absolute_by_relative.end()) { + enumerated_matches.push_back(it->second); + } + } + + if (enumerated_matches.size() == 1) { + LOG(INFO, "ModelManager") + << "Resolved local GGUF variant '" << variant + << "' via quant-token fallback: " << enumerated_matches[0] << std::endl; + return enumerated_matches[0]; + } + + if (enumerated_matches.size() > 1) { + LOG(WARNING, "ModelManager") + << "Multiple local GGUF files matched variant '" << variant + << "' via quant-token fallback; refusing to guess" << std::endl; + return ""; + } + + // No match found for the requested GGUF variant. 
Do not fall back to + // another quantization in the same Hugging Face repo; otherwise a + // custom download with a different quant can make a built-in model + // appear downloaded and allow deleting the wrong file. + return ""; +} + } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index e295a835a..a9af9359f 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -695,6 +695,16 @@ class LlamaCppOps : public BackendOps { apply_gguf_capability_labels(info.labels, meta.caps); } } + + std::string resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const override { + // The main checkpoint is a GGUF file (with sharding/variant resolution); + // auxiliary checkpoints (mmproj, …) use the shared default. + if (ctx.type == "main") { + return resolve_gguf_path(ctx.model_cache_path, ctx.variant); + } + return BackendOps::resolve_checkpoint_path(info, ctx); + } }; } // namespace diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index e965ea3b9..f6ba8f457 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -1,6 +1,9 @@ #include "lemon/backends/ryzenai/ryzenai_server.h" #include "lemon/backends/backend_registry.h" #include "lemon/model_manager.h" +#include "lemon/backends/backend_ops.h" +#include "lemon/backends/hf_cache_util.h" +#include "lemon/utils/path_utils.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" #include "lemon/utils/process_manager.h" @@ -185,8 +188,31 @@ std::unique_ptr create(const BackendContext& ctx) { } +namespace { +class RyzenAiOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo&, + const 
CheckpointResolveContext& ctx) const override { + // RyzenAI models are a directory containing genai_config.json. + std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path); + if (hf_cache::exists(dir)) { + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") { + return lemon::utils::path_to_utf8(entry.path().parent_path()); + } + } + } + return ctx.model_cache_path; // directory even if genai_config not found + } +}; +} // namespace + const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const RyzenAiOps kOps; + return &kOps; +} } // namespace ryzenai } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index c77d10669..dfa0ebea9 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -1,6 +1,9 @@ #include "lemon/backends/whispercpp/whispercpp_server.h" #include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" +#include "lemon/backends/hf_cache_util.h" +#include "lemon/model_manager.h" #include "lemon/backend_manager.h" #include "lemon/runtime_config.h" #include "lemon/system_info.h" @@ -699,8 +702,42 @@ std::unique_ptr create(const BackendContext& ctx) { } +namespace { +class WhisperOps : public BackendOps { +public: + std::string resolve_checkpoint_path(const ModelInfo& info, + const CheckpointResolveContext& ctx) const override { + // With no variant, find any .bin model file; otherwise use the shared + // default (variant/aux resolution). 
+ if (ctx.variant.empty()) { + std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path); + if (!hf_cache::exists(dir)) { + return ctx.model_cache_path; + } + std::vector bin_files; + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && + entry.path().filename().string().find(".bin") != std::string::npos) { + bin_files.push_back(lemon::utils::path_to_utf8(entry.path())); + } + } + if (bin_files.empty()) { + return ctx.model_cache_path; + } + std::sort(bin_files.begin(), bin_files.end()); + return bin_files[0]; + } + return BackendOps::resolve_checkpoint_path(info, ctx); + } +}; +} // namespace + const BackendSpec* spec() { return &WhisperServer::SPEC; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const WhisperOps kOps; + return &kOps; +} } // namespace whispercpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 81d0200b2..d1295ff92 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -1091,370 +1091,38 @@ std::map ModelManager::discover_extra_models() const { } std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::string& type, const std::string& checkpoint) const { - // Collections are virtual entries with no direct checkpoint to resolve + // Collections are virtual entries with no direct checkpoint to resolve. if (is_collection_recipe(info.recipe)) { return ""; } - // Cloud-offloaded models have no local artifacts; checkpoint is the - // upstream provider's model id, used directly when forwarding requests. 
- if (info.recipe == "cloud") { - return ""; - } - - // FLM models use checkpoint as-is (e.g., "gemma3:4b") - if (info.recipe == "flm") { - return checkpoint; - } - - // Local path models use checkpoint as-is (absolute path to file) + // Local-path models use the checkpoint as-is (absolute path to a file). if (info.source == "local_path") { return checkpoint; } std::string hf_cache = get_hf_cache_dir(); - // Local uploads: checkpoint is relative path from HF cache + // Local uploads: checkpoint is a relative path from the HF cache. if (info.source == "local_upload") { std::string normalized = checkpoint; std::replace(normalized.begin(), normalized.end(), '\\', '/'); return hf_cache + "/" + normalized; } - // For now, NPU cache is handled directly in whisper.cpp - if (type == "npu_cache") { - return ""; - } - - // HuggingFace models: need to find the GGUF file in cache - // Parse checkpoint to get repo_id and variant - // Use the checkpoint's own repo, falling back to main repo for backward compatibility - std::string checkpoint_repo_id = checkpoint_to_repo_id(checkpoint); - std::string main_repo_id = checkpoint_to_repo_id(info.checkpoint("main")); - std::string repo_id = checkpoint_repo_id; - std::string variant = checkpoint_to_variant(checkpoint); - - std::string model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(repo_id); - fs::path model_cache_path_fs = path_from_utf8(model_cache_path); - - // For RyzenAI LLM models, look for genai_config.json directory - if (info.recipe == "ryzenai-llm") { - if (safe_exists(model_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") { - return path_to_utf8(entry.path().parent_path()); - } - } - } - return model_cache_path; // Return directory even if genai_config not found - } - - // For kokoro models, look for index.json directory - if (info.recipe == "kokoro") { - if 
(safe_exists(model_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file() && entry.path().filename() == "index.json") { - return path_to_utf8(entry.path()); - } - } - } - - return model_cache_path; // Return directory even if index not found - } - - // For whispercpp, find the .bin model file - if (info.recipe == "whispercpp" && variant.empty()) { - // No variant specified - use fallback logic to find any .bin file - if (!safe_exists(model_cache_path_fs)) { - return model_cache_path; // Return directory path even if not found - } - - // Collect all .bin files - std::vector all_bin_files; - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename.find(".bin") != std::string::npos) { - all_bin_files.push_back(path_to_utf8(entry.path())); - } - } - } - - if (all_bin_files.empty()) { - return model_cache_path; // Return directory if no .bin found - } - - // Sort files for consistent ordering - std::sort(all_bin_files.begin(), all_bin_files.end()); - - // Return first .bin file as fallback (only when no variant specified) - return all_bin_files[0]; - } - - // For llamacpp, find the GGUF file with advanced sharded model support - if (info.recipe == "llamacpp" && type == "main") { - if (!safe_exists(model_cache_path_fs)) { - return model_cache_path; // Return directory path even if not found - } - - // Prefer the active HF snapshot recorded in refs/main. This lets - // Lemonade keep using the previous snapshot when upstream only changed - // README/metadata and the requested model artifacts are unchanged. 
- auto collect_gguf_files = [](const fs::path& search_root) { - std::vector files; - if (search_root.empty() || !safe_exists(search_root)) { - return files; - } - - std::error_code ec; - for (const auto& entry : fs::recursive_directory_iterator(search_root, safe_dir_options, ec)) { - if (ec) break; - if (!entry.is_regular_file(ec)) { - ec.clear(); - continue; - } - - std::string filename = entry.path().filename().string(); - std::string filename_lower = filename; - std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); - - if (filename.find(".gguf") != std::string::npos && filename_lower.find("mmproj") == std::string::npos) { - files.push_back(path_to_utf8(entry.path())); - } - } - return files; - }; - - std::vector all_gguf_files = collect_gguf_files(active_hf_snapshot_path(model_cache_path_fs)); - if (all_gguf_files.empty()) { - // Backward-compatible fallback for caches without refs/main and for - // partially migrated/manual HF cache layouts. 
- all_gguf_files = collect_gguf_files(model_cache_path_fs); - } - - if (all_gguf_files.empty()) { - return model_cache_path; // Return directory if no GGUF found - } - - // Sort files for consistent ordering (important for sharded models) - std::sort(all_gguf_files.begin(), all_gguf_files.end()); - - // Case 0: Wildcard (*) - return first file (llama-server will auto-load shards) - if (variant == "*") { - return all_gguf_files[0]; - } - - // Case 1: Empty variant - return first file - if (variant.empty()) { - return all_gguf_files[0]; - } - - // Case 2: Exact filename match (variant ends with .gguf) - if (variant.find(".gguf") != std::string::npos) { - for (const auto& filepath : all_gguf_files) { - std::string filename = path_from_utf8(filepath).filename().string(); - if (filename == variant) { - return filepath; - } - } - return ""; // Exact variant not found — signal not downloaded - } - - // Case 3: Files ending with {variant}.gguf (case insensitive) - std::string variant_lower = variant; - std::transform(variant_lower.begin(), variant_lower.end(), variant_lower.begin(), ::tolower); - std::string suffix = variant_lower + ".gguf"; - - std::vector matching_files; - for (const auto& filepath : all_gguf_files) { - std::string filename = path_from_utf8(filepath).filename().string(); - std::string filename_lower = filename; - std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); - - if (filename_lower.size() >= suffix.size() && - filename_lower.substr(filename_lower.size() - suffix.size()) == suffix) { - matching_files.push_back(filepath); - } - } - - if (!matching_files.empty()) { - return matching_files[0]; - } - - // Case 4: Folder-based sharding (files in variant/ folder) - std::string folder_prefix_lower = variant_lower + "/"; - - for (const auto& filepath : all_gguf_files) { - // Get relative path from model cache path - std::string relative_path = path_to_utf8( - 
path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); - std::string relative_lower = relative_path; - // Normalize path separators and case so folder-variant matching works cross-platform. - std::transform(relative_lower.begin(), relative_lower.end(), relative_lower.begin(), ::tolower); - std::replace(relative_lower.begin(), relative_lower.end(), '\\', '/'); - - if (relative_lower.find(folder_prefix_lower) != std::string::npos) { - return filepath; - } - } - - // Case 5: Local quant-token fallback. - // - // Keep the existing resolver cases above as the primary logic: exact - // filenames, suffix matches, and folder-based sharding are more - // specific and preserve the CHECKPOINT:VARIANT contract. - // - // Some GGUF repositories name files with the quant token in the middle, - // for example: - // Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf - // for variant: - // IQ4_XS - // That file does not end with IQ4_XS.gguf, so mirror the downloader's - // GGUF variant enumeration over the files that are already present in - // the local HF cache before declaring the model missing. - // - // HF cache paths have an extra snapshots// prefix that is not - // part of the repository-relative filename. Strip it before calling - // enumerate_gguf_variants(); otherwise the enumerator treats - // "snapshots" as a top-level sharded-folder variant and never extracts - // the quant token from the actual GGUF filename. 
- std::vector relative_gguf_files; - std::map absolute_by_relative; - auto repo_relative_from_cache_relative = [](std::string rel) { - std::replace(rel.begin(), rel.end(), '\\', '/'); - - static const std::string snapshots_prefix = "snapshots/"; - if (rel.rfind(snapshots_prefix, 0) == 0) { - size_t revision_end = rel.find('/', snapshots_prefix.size()); - if (revision_end != std::string::npos && revision_end + 1 < rel.size()) { - rel = rel.substr(revision_end + 1); - } - } - - return rel; - }; - - for (const auto& filepath : all_gguf_files) { - std::string relative_path = path_to_utf8( - path_from_utf8(filepath).lexically_relative(model_cache_path_fs)); - relative_path = repo_relative_from_cache_relative(relative_path); - - // Multiple HF snapshots can contain the same repo-relative file. - // Keep the first absolute path from the sorted all_gguf_files list - // so duplicates do not create false ambiguity. - if (absolute_by_relative.emplace(relative_path, filepath).second) { - relative_gguf_files.push_back(relative_path); - } - } - - std::vector enumerated_matches; - auto local_variants = lemon::enumerate_gguf_variants(relative_gguf_files); - for (const auto& local_variant : local_variants.variants) { - if (to_lower(local_variant.name) != variant_lower) { - continue; - } - - auto it = absolute_by_relative.find(local_variant.primary_file); - if (it != absolute_by_relative.end()) { - enumerated_matches.push_back(it->second); - } - } - - if (enumerated_matches.size() == 1) { - LOG(INFO, "ModelManager") - << "Resolved local GGUF variant '" << variant - << "' via quant-token fallback: " << enumerated_matches[0] << std::endl; - return enumerated_matches[0]; - } - - if (enumerated_matches.size() > 1) { - LOG(WARNING, "ModelManager") - << "Multiple local GGUF files matched variant '" << variant - << "' via quant-token fallback; refusing to guess" << std::endl; - return ""; - } - - // No match found for the requested GGUF variant. 
Do not fall back to - // another quantization in the same Hugging Face repo; otherwise a - // custom download with a different quant can make a built-in model - // appear downloaded and allow deleting the wrong file. - return ""; - } - - // Everything else - if (!variant.empty()) { - // Prefer refs/main for auxiliary checkpoints too (for example mmproj), - // so companion files stay on the same active snapshot as the main model - // when unchanged artifacts are reused across README-only commits. - fs::path active_snapshot = active_hf_snapshot_path(model_cache_path_fs); - if (!active_snapshot.empty()) { - fs::path direct_variant_path = active_snapshot / path_from_utf8(variant); - if (safe_exists(direct_variant_path)) { - return path_to_utf8(direct_variant_path); - } - - std::error_code ec; - for (const auto& entry : fs::recursive_directory_iterator(active_snapshot, safe_dir_options, ec)) { - if (ec) break; - if (entry.is_regular_file(ec)) { - std::string filename = entry.path().filename().string(); - if (filename == variant) { - return path_to_utf8(entry.path()); - } - } else if (entry.is_directory(ec)) { - fs::path variant_path = entry.path() / path_from_utf8(variant); - if (safe_exists(variant_path)) { - return path_to_utf8(variant_path); - } - } - ec.clear(); - } - } - - // Try to find the exact variant in snapshots subdirectories - if (safe_exists(model_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(model_cache_path_fs, safe_dir_options)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename == variant) { - return path_to_utf8(entry.path()); - } - } else if (entry.is_directory()) { - fs::path variant_path = entry.path() / path_from_utf8(variant); - if (safe_exists(variant_path)) { - return path_to_utf8(variant_path); - } - } - } - } - // Variant not found in checkpoint's own repo - try main repo as fallback - // (backward compat: older downloads placed all files in the main 
repo dir) - if (checkpoint_repo_id != main_repo_id) { - std::string main_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(main_repo_id); - fs::path main_cache_path_fs = path_from_utf8(main_cache_path); - if (fs::exists(main_cache_path_fs)) { - for (const auto& entry : fs::recursive_directory_iterator(main_cache_path_fs)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename == variant) { - return path_to_utf8(entry.path()); - } - } else if (entry.is_directory()) { - fs::path variant_path = entry.path() / path_from_utf8(variant); - if (fs::exists(variant_path)) { - return path_to_utf8(variant_path); - } - } - } - } - } - - // Variant not found - return empty string to indicate model not downloaded - return ""; - } + // Compute the HF cache location for this checkpoint's repo, then let the + // backend's ops find its artifact within (a .gguf file, a genai_config.json + // directory, a .bin, …) — no per-recipe switchboard here. + backends::CheckpointResolveContext ctx; + ctx.hf_cache = hf_cache; + ctx.repo_id = checkpoint_to_repo_id(checkpoint); + ctx.main_repo_id = checkpoint_to_repo_id(info.checkpoint("main")); + ctx.variant = checkpoint_to_variant(checkpoint); + ctx.model_cache_path = hf_cache + "/" + repo_id_to_cache_dir_name(ctx.repo_id); + ctx.type = type; + ctx.checkpoint = checkpoint; - // Fallback: return directory path - return model_cache_path; + return backends::ops_for(info.recipe)->resolve_checkpoint_path(info, ctx); } void ModelManager::resolve_all_model_paths(ModelInfo& info) { From 2feae8471d7ca229925e664dc2012df64683c9ac Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:00:52 -0400 Subject: [PATCH 12/39] =?UTF-8?q?refactor(backends):=20migrate=20download/?= =?UTF-8?q?discovery/is=5Fdownloaded=20to=20ops;=20FLM=20cluster=20?= =?UTF-8?q?=E2=86=92=20folder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dynamic discovery, 
download status, and downloading now flow through BackendOps instead of recipe switchboards in model_manager: - discover_models: build_cache loops descriptors with dynamic_models=true and merges ops->discover_models(). FLM (`flm list`) and cloud (per-provider) both implement it — the two bespoke discovery blocks collapse to one generic loop. - is_downloaded: base = shared HF completeness (ModelManager::checkpoints_complete); CloudOps → true; FlmOps → installed-set membership. Replaces the flm_set/cloud/ else branches in build_cache and add_model_to_cache. - validate_checkpoint_file: LlamaCppOps does the GGUF-magic check (was an inline llamacpp branch in are_required_checkpoints_complete). - download_model: base = shared HF engine (download_from_huggingface_engine); FlmOps → flm pull; CloudOps → no-op. download_registered_model just dispatches. invalidates_cache_after_download() replaces the recipe=="flm" cache-reset. The whole FLM cluster (find_flm_binary, flm_installed_checkpoints, flm_discover_models, flm_download) moves into backends/fastflowlm/fastflowlm_models. model_manager keeps only the generic HF engine. Verified: server_endpoints 69 pass; download status correct for every recipe. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 35 +- .../backends/fastflowlm/fastflowlm_models.h | 18 +- src/cpp/include/lemon/model_manager.h | 17 +- src/cpp/server/backends/backend_ops.cpp | 14 + .../server/backends/cloud/cloud_server.cpp | 41 ++ .../backends/fastflowlm/fastflowlm_models.cpp | 416 ++++++++++++++ .../backends/fastflowlm/fastflowlm_server.cpp | 16 + .../backends/llamacpp/llamacpp_server.cpp | 21 + src/cpp/server/model_manager.cpp | 544 ++---------------- 9 files changed, 613 insertions(+), 509 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index c973a7de7..35f434df3 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -2,11 +2,11 @@ #include #include +#include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback (server-side only) namespace lemon { -struct ModelInfo; -class ModelManager; +class CloudProviderRegistry; namespace backends { @@ -14,6 +14,7 @@ namespace backends { // management needs without a running subprocess. Grows as migrations require. struct BackendOpsContext { ModelManager* model_manager = nullptr; + CloudProviderRegistry* cloud_registry = nullptr; // for dynamic cloud discovery }; // Inputs for resolving a checkpoint's on-disk path. The model manager computes @@ -57,6 +58,36 @@ class BackendOps { // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override. virtual std::string resolve_checkpoint_path(const ModelInfo& info, const CheckpointResolveContext& ctx) const; + + // Models supplied at runtime rather than from server_models.json (descriptor + // dynamic_models = true). Default: none. cloud/flm override. + virtual std::vector discover_models(const BackendOpsContext& ctx) const { + (void)ctx; + return {}; + } + + // Whether a model's local artifacts are present. 
Default: the shared HF + // checkpoint-completeness check (ModelManager::checkpoints_complete). cloud + // (always true) and flm (installed-set membership) override. + virtual bool is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const; + + // Validate a resolved checkpoint file for the cache. Returns "" if valid, or + // a reason it should be treated as not-downloaded. Default: always valid; + // llamacpp checks GGUF magic. + virtual std::string validate_checkpoint_file(const std::string& resolved_path) const { + (void)resolved_path; + return ""; + } + + // Download a model's artifacts. Default: the shared Hugging Face download. + // cloud (no-op) and flm (flm pull) override. + virtual void download_model(const ModelInfo& info, bool do_not_upgrade, + DownloadProgressCallback progress, + const BackendOpsContext& ctx) const; + + // Whether the model cache must be rebuilt after this backend downloads a + // model (e.g. flm, whose model list changes). Default: false. + virtual bool invalidates_cache_after_download() const { return false; } }; // Shared default ops instance for backends that override nothing. diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h index 3e99e3003..f5d0f269d 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -3,14 +3,24 @@ #include #include #include +#include +#include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback namespace lemon { -struct ModelInfo; - namespace backends { namespace fastflowlm { +// Locate the FLM executable (install dir on Windows, system PATH on Linux). +std::string find_flm_binary(); + +// Installed FLM model checkpoints (from `flm list --filter installed`). +std::vector flm_installed_checkpoints(); + +// Discover all available FLM models (from `flm list --json`), each with its +// downloaded status set. 
Returns empty if FLM is not ready. +std::vector flm_discover_models(); + // FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH / // platform-default roots and describes them with a config.json; this knowledge // lives in the fastflowlm backend folder rather than in the shared model manager. @@ -24,6 +34,10 @@ std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo // Read the model's max context window from its FLM config.json (0 if unknown). int64_t read_flm_max_context_window(const ModelInfo& info); +// Download (pull) an FLM model by checkpoint via the `flm` CLI. +void flm_download(const std::string& checkpoint, bool do_not_upgrade, + DownloadProgressCallback progress_callback); + } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h index be850b583..967d6d044 100644 --- a/src/cpp/include/lemon/model_manager.h +++ b/src/cpp/include/lemon/model_manager.h @@ -224,11 +224,15 @@ class ModelManager { // Check if model is downloaded bool is_model_downloaded(const std::string& model_name); - // Get list of installed FLM models (for caching) - std::vector get_flm_installed_models(); + // Shared Hugging Face completeness check: true if all required checkpoints + // are present and complete (per-backend file validation runs via ops). The + // default BackendOps::is_downloaded delegates here for HF-backed backends. + bool checkpoints_complete(const ModelInfo& info) const; - // Get list of all available FLM models from 'flm list --json' - std::vector get_flm_available_models(); + // Shared Hugging Face download engine. The default BackendOps::download_model + // delegates here; flm/cloud override with their own download. 
+ void download_from_huggingface_engine(const ModelInfo& info, + DownloadProgressCallback progress_callback = nullptr); // Get HuggingFace cache directory (respects HF_HUB_CACHE, HF_HOME, and platform defaults) std::string get_hf_cache_dir() const; @@ -310,11 +314,6 @@ class ModelManager { void download_from_huggingface(const ModelInfo& info, DownloadProgressCallback progress_callback = nullptr); - // Download from FLM - void download_from_flm(const std::string& checkpoint, - bool do_not_upgrade = true, - DownloadProgressCallback progress_callback = nullptr); - // Discover GGUF models from extra_models_dir std::map discover_extra_models() const; diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp index af21249c5..2f4cdf48c 100644 --- a/src/cpp/server/backends/backend_ops.cpp +++ b/src/cpp/server/backends/backend_ops.cpp @@ -102,6 +102,20 @@ std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info, return ctx.model_cache_path; } +bool BackendOps::is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const { + // Default: the shared HF checkpoint-completeness check. + return ctx.model_manager != nullptr && ctx.model_manager->checkpoints_complete(info); +} + +void BackendOps::download_model(const ModelInfo& info, bool do_not_upgrade, + DownloadProgressCallback progress, const BackendOpsContext& ctx) const { + // Default: the shared Hugging Face download engine. 
+ (void)do_not_upgrade; + if (ctx.model_manager != nullptr) { + ctx.model_manager->download_from_huggingface_engine(info, progress); + } +} + const BackendOps* default_backend_ops() { static const BackendOps kDefault; return &kDefault; diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp index b20aae05c..03e5c794e 100644 --- a/src/cpp/server/backends/cloud/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -815,6 +815,47 @@ class CloudOps : public BackendOps { // upstream provider's model id, used directly when forwarding requests. return ""; } + + // Cloud models have no local artifacts — always "downloaded". + bool is_downloaded(const ModelInfo&, const BackendOpsContext&) const override { + return true; + } + + // "Downloading" a cloud model is a no-op. + void download_model(const ModelInfo&, bool, DownloadProgressCallback, + const BackendOpsContext&) const override {} + + // Discover models from each installed cloud provider with a resolvable + // credential. Per AGENTS.md invariant #11 the registry persists only + // {provider, base_url}; keys come from env vars / process memory. Failures + // are logged, never propagated, so one offline provider can't block discovery. 
+ std::vector discover_models(const BackendOpsContext& ctx) const override { + std::vector out; + if (ctx.cloud_registry == nullptr) { + return out; + } + for (const auto& rec : ctx.cloud_registry->list_installed()) { + const std::string api_key = ctx.cloud_registry->resolve_key(rec.name); + if (api_key.empty() || rec.base_url.empty()) { + LOG(INFO, "CloudOps") << "Skipping cloud discovery for '" << rec.name + << "': no API key resolvable (set " + << CloudProviderRegistry::env_var_name(rec.name) + << " or POST /v1/cloud/auth)" << std::endl; + continue; + } + try { + for (auto& m : CloudServer::discover_models(rec.name, api_key, rec.base_url)) { + if (m.recipe == "cloud" && !m.model_name.empty()) { + out.push_back(std::move(m)); + } + } + } catch (const std::exception& e) { + LOG(WARNING, "CloudOps") << "Cloud discovery threw for '" << rec.name + << "': " << e.what() << std::endl; + } + } + return out; + } }; } // namespace diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp index 0ac7f8caf..2f2bb36b2 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -7,6 +7,12 @@ #include "lemon/utils/aixlog.hpp" #include "lemon/utils/json_utils.h" #include "lemon/utils/path_utils.h" +#include +#include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/backends/backend_registry.h" +#include "lemon/backends/backend_utils.h" +#include "lemon/system_info.h" +#include "lemon/utils/process_manager.h" namespace fs = std::filesystem; using json = nlohmann::json; @@ -17,6 +23,7 @@ namespace fastflowlm { namespace { using lemon::utils::path_from_utf8; +using lemon::utils::path_to_utf8; bool safe_exists(const fs::path& p) { std::error_code ec; @@ -114,6 +121,415 @@ int64_t read_flm_max_context_window(const ModelInfo& info) { return 0; } +std::string find_flm_binary() { + try { + const backends::BackendSpec* spec = 
try_get_spec_for_recipe("flm"); + if (!spec) { + return ""; + } + return BackendUtils::get_backend_binary_path(*spec, "npu"); + } catch (...) { +#ifndef _WIN32 + return lemon::utils::find_flm_executable(); +#else + return ""; +#endif + } +} + +std::vector flm_installed_checkpoints() { + std::vector installed_models; + + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) return installed_models; + + // Run 'flm list --filter installed --quiet --json' to get only installed models + std::string output; +#ifdef _WIN32 + std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL"; + int rc = lemon::utils::ProcessManager::run_command(command, output); +#else + std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return installed_models; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + pclose(pipe); +#endif + + // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] } + try { + json j = lemon::utils::JsonUtils::parse(output); + if (j.contains("models") && j["models"].is_array()) { + for (const auto& model : j["models"]) { + if (model.contains("name") && model["name"].is_string()) { + installed_models.push_back(model["name"].get()); + } + } + return installed_models; + } + } catch (...) 
{ + // Fallback to legacy parsing if JSON parsing fails + } + + // Legacy parsing - cleaner format without emojis + // Expected format: + // Models: + // - modelname:tag + // - another:model + std::istringstream stream(output); + std::string line; + while (std::getline(stream, line)) { + // Trim whitespace + line.erase(0, line.find_first_not_of(" \t\r\n")); + line.erase(line.find_last_not_of(" \t\r\n") + 1); + + // Skip the "Models:" header line or empty lines + if (line == "Models:" || line.empty()) { + continue; + } + + // Parse model checkpoint (format: " - modelname:tag") + if (line.find("- ") == 0) { + std::string checkpoint = line.substr(2); + // Trim any remaining whitespace + checkpoint.erase(0, checkpoint.find_first_not_of(" \t")); + checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1); + if (!checkpoint.empty()) { + installed_models.push_back(checkpoint); + } + } + } + + return installed_models; +} + +std::vector flm_discover_models() { + std::vector flm_models; + if (!SystemInfoCache::get_flm_status().is_ready()) { + return flm_models; + } + + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) return flm_models; + + LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl; + + // Run 'flm list --json' to get all available models + std::string output; +#ifdef _WIN32 + std::string command = "\"" + flm_path + "\" list --json"; + int rc = lemon::utils::ProcessManager::run_command(command, output); + LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc + << ", output length: " << output.size() << std::endl; + if (rc != 0 || output.empty()) { + LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. 
" + << "Output: " << output.substr(0, 200) << std::endl; + } +#else + std::string command = "\"" + flm_path + "\" list --json 2>/dev/null"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return flm_models; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + pclose(pipe); +#endif + + // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] } + try { + json j = lemon::utils::JsonUtils::parse(output); + if (j.contains("models") && j["models"].is_array()) { + for (const auto& m : j["models"]) { + if (m.contains("name") && m["name"].is_string()) { + std::string checkpoint = m["name"].get(); + + // Format display name: replace : with -, append -FLM + // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM" + std::string display_name = checkpoint; + // Replace : with - + std::replace(display_name.begin(), display_name.end(), ':', '-'); + + std::string model_name = display_name + "-FLM"; + + ModelInfo info; + info.model_name = model_name; + info.checkpoints["main"] = checkpoint; + info.recipe = "flm"; + info.suggested = true; // All official FLM models are suggested + info.downloaded = lemon::utils::JsonUtils::get_or_default(m, "installed", false); + + if (lemon::utils::JsonUtils::get_or_default(m, "installed", false) && m.contains("url") && m["url"].is_string()) { + fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir( + backends::fastflowlm::repo_dir_from_url(m["url"].get())); + if (!config_path.empty()) { + info.resolved_paths["config"] = path_to_utf8(config_path); + } + } + + // Size in GB (footprint field contains disk size in GB) + if (m.contains("footprint") && m["footprint"].is_number()) { + info.size = m["footprint"].get(); + } + + // Labels from FLM metadata + if (m.contains("label") && m["label"].is_array()) { + for (const auto& l : m["label"]) { + if (l.is_string()) { + info.labels.push_back(l.get()); + } + } + } + + // Populate type and 
device fields (multi-model support) + info.type = get_model_type_from_labels(info.labels); + const BackendDescriptor* flm_desc = descriptor_for("flm"); + info.device = flm_desc ? flm_desc->default_device : DEVICE_NPU; + + flm_models.push_back(info); + } + } + } + } catch (const std::exception& e) { + LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl; + } catch (...) { + LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl; + } + + return flm_models; +} + + +void flm_download(const std::string& checkpoint, bool do_not_upgrade, + DownloadProgressCallback progress_callback) { + LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl; + + // Ensure FLM is ready (single source of truth) + auto status = SystemInfoCache::get_flm_status(); + if (!status.is_ready()) { + throw std::runtime_error(status.error_string()); + } + + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) { + throw std::runtime_error("FLM executable not found"); + } + + // Prepare arguments + std::vector args = {"pull", checkpoint}; + if (!do_not_upgrade) { + args.push_back("--force"); + } + + LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\""; + for (const auto& arg : args) { + LOG(INFO, "ProcessManager") << " \"" << arg << "\""; + } + LOG(INFO, "ProcessManager") << std::endl; + + // State for parsing FLM output + int total_files = 0; + int current_file_index = 0; + std::string current_filename; + bool cancelled = false; + + // Run flm pull command and parse output + int exit_code = lemon::utils::ProcessManager::run_process_with_output( + flm_path, args, + [&](const std::string& line) -> bool { + // Always print the line to console + LOG(INFO, "FLM") << line << std::endl; + + // Parse FLM output to extract progress information + // Pattern: "[FLM] Downloading X/Y: filename" + if (line.find("[FLM] Downloading ") != std::string::npos && + line.find("/") != 
std::string::npos && + line.find(":") != std::string::npos) { + + // Extract "X/Y: filename" from "[FLM] Downloading X/Y: filename" + size_t start = line.find("Downloading ") + 12; + size_t slash = line.find("/", start); + size_t colon = line.find(":", slash); + + if (slash != std::string::npos && colon != std::string::npos) { + try { + current_file_index = std::stoi(line.substr(start, slash - start)); + total_files = std::stoi(line.substr(slash + 1, colon - slash - 1)); + current_filename = line.substr(colon + 2); // Skip ": " + + // Send progress update + if (progress_callback) { + DownloadProgress progress; + progress.file = current_filename; + progress.file_index = current_file_index; + progress.total_files = total_files; + progress.bytes_downloaded = 0; + progress.bytes_total = 0; + progress.percent = (total_files > 0) ? + ((current_file_index - 1) * 100 / total_files) : 0; + + if (!progress_callback(progress)) { + cancelled = true; + return false; // Kill the process + } + } + } catch (...) 
{ + // Ignore parse errors + } + } + } + // Pattern: "[FLM] Downloading: XX.X% (XXX.XMB / XXX.XMB)" + else if (line.find("[FLM] Downloading: ") != std::string::npos && + line.find("%") != std::string::npos) { + + // Extract percentage and bytes + size_t start = line.find("Downloading: ") + 13; + size_t pct_end = line.find("%", start); + + if (pct_end != std::string::npos) { + try { + std::string pct_str = line.substr(start, pct_end - start); + double file_percent = std::stod(pct_str); + + // Try to extract bytes (XXX.XMB / XXX.XMB) + size_t open_paren = line.find("(", pct_end); + size_t slash = line.find("/", open_paren); + size_t close_paren = line.find(")", slash); + + size_t bytes_downloaded = 0; + size_t bytes_total = 0; + + if (open_paren != std::string::npos && slash != std::string::npos) { + std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1); + std::string total_str = line.substr(slash + 1, close_paren - slash - 1); + + // Parse "XXX.XMB" format + auto parse_size = [](const std::string& s) -> size_t { + double val = 0; + size_t mb_pos = s.find("MB"); + size_t gb_pos = s.find("GB"); + size_t kb_pos = s.find("KB"); + + if (mb_pos != std::string::npos) { + val = std::stod(s.substr(0, mb_pos)); + return static_cast(val * 1024 * 1024); + } else if (gb_pos != std::string::npos) { + val = std::stod(s.substr(0, gb_pos)); + return static_cast(val * 1024 * 1024 * 1024); + } else if (kb_pos != std::string::npos) { + val = std::stod(s.substr(0, kb_pos)); + return static_cast(val * 1024); + } + return 0; + }; + + bytes_downloaded = parse_size(downloaded_str); + bytes_total = parse_size(total_str); + } + + // Send progress update with byte-level info + if (progress_callback) { + DownloadProgress progress; + progress.file = current_filename; + progress.file_index = current_file_index; + progress.total_files = total_files; + progress.bytes_downloaded = bytes_downloaded; + progress.bytes_total = bytes_total; + // Use intra-file percent when we 
have byte-level progress + progress.percent = static_cast(file_percent); + + if (!progress_callback(progress)) { + cancelled = true; + return false; // Kill the process + } + } + } catch (...) { + // Ignore parse errors + } + } + } + // Pattern: "[FLM] Overall progress: XX.X% (X/Y files)" + else if (line.find("[FLM] Overall progress: ") != std::string::npos) { + size_t start = line.find("progress: ") + 10; + size_t pct_end = line.find("%", start); + + if (pct_end != std::string::npos) { + try { + int overall_percent = static_cast(std::stod(line.substr(start, pct_end - start))); + + if (progress_callback) { + DownloadProgress progress; + progress.file = current_filename; + progress.file_index = current_file_index; + progress.total_files = total_files; + progress.bytes_downloaded = 0; // Not available for overall progress + progress.bytes_total = 0; + progress.percent = overall_percent; + + if (!progress_callback(progress)) { + cancelled = true; + return false; // Kill the process + } + } + } catch (...) { + // Ignore parse errors + } + } + } + // Pattern: "[FLM] Missing files (N):" + else if (line.find("[FLM] Missing files (") != std::string::npos) { + size_t start = line.find("(") + 1; + size_t end = line.find(")", start); + if (end != std::string::npos) { + try { + total_files = std::stoi(line.substr(start, end - start)); + } catch (...) 
{ + // Ignore parse errors + } + } + } + + return true; // Continue + }, + "", // Working directory + 3600 // 1 hour timeout for large model downloads + ); + + if (cancelled) { + LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl; + throw std::runtime_error("Download cancelled"); + } + + if (exit_code != 0) { + LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl; + throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code)); + } + + // Send completion event + if (progress_callback) { + DownloadProgress progress; + progress.complete = true; + progress.file_index = total_files; + progress.total_files = total_files; + progress.percent = 100; + (void)progress_callback(progress); // Ignore return - download already complete + } + + LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl; +} + + } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 648cd9ff5..256fe339b 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -491,6 +491,22 @@ class FlmOps : public BackendOps { // FLM uses the checkpoint string as-is (e.g. "gemma3:4b"); no local file. 
return ctx.checkpoint; } + + std::vector discover_models(const BackendOpsContext&) const override { + return flm_discover_models(); + } + + bool is_downloaded(const ModelInfo& info, const BackendOpsContext&) const override { + const auto installed = flm_installed_checkpoints(); + return std::find(installed.begin(), installed.end(), info.checkpoint()) != installed.end(); + } + + void download_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress, + const BackendOpsContext&) const override { + flm_download(info.checkpoint(), do_not_upgrade, progress); + } + + bool invalidates_cache_after_download() const override { return true; } }; } // namespace diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index a9af9359f..f1b265fb3 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -705,6 +705,27 @@ class LlamaCppOps : public BackendOps { } return BackendOps::resolve_checkpoint_path(info, ctx); } + + std::string validate_checkpoint_file(const std::string& resolved_path) const override { + // A .gguf file in the cache must start with the GGUF magic, else it's a + // truncated/corrupt download and the model is not really present. + std::error_code ec; + std::filesystem::path p = lemon::utils::path_from_utf8(resolved_path); + if (std::filesystem::is_directory(p, ec)) { + return ""; + } + std::string ext = resolved_path.size() >= 5 ? resolved_path.substr(resolved_path.size() - 5) : ""; + std::transform(ext.begin(), ext.end(), ext.begin(), ::tolower); + if (ext != ".gguf") { + return ""; + } + std::ifstream in(p, std::ios::binary); + char magic[4] = {}; + in.read(magic, sizeof(magic)); + bool ok = in.gcount() == static_cast(sizeof(magic)) && + magic[0] == 'G' && magic[1] == 'G' && magic[2] == 'U' && magic[3] == 'F'; + return ok ? 
"" : "Invalid GGUF cache file"; + } }; } // namespace diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index d1295ff92..76a4a57a8 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -1297,21 +1297,6 @@ static bool has_partial_files(const fs::path& dir) { return false; } -static bool is_valid_gguf_file_for_cache(const std::string& path) { - std::ifstream in(path_from_utf8(path), std::ios::binary); - if (!in.is_open()) { - return false; - } - - char magic[4] = {}; - in.read(magic, sizeof(magic)); - return in.gcount() == static_cast(sizeof(magic)) && - magic[0] == 'G' && - magic[1] == 'G' && - magic[2] == 'U' && - magic[3] == 'F'; -} - static bool is_checkpoint_path_complete(const std::string& path_str) { if (path_str.empty()) return false; @@ -1347,20 +1332,26 @@ static bool are_required_checkpoints_complete(const ModelInfo& info) { return false; } - fs::path resolved = path_from_utf8(resolved_path); - if (info.recipe == "llamacpp" && - !safe_is_directory(resolved) && - ends_with_ignore_case(resolved_path, ".gguf") && - !is_valid_gguf_file_for_cache(resolved_path)) { + // Per-backend file validation (e.g. llamacpp checks GGUF magic). 
+ std::string invalid = backends::ops_for(info.recipe)->validate_checkpoint_file(resolved_path); + if (!invalid.empty()) { LOG(WARNING, "ModelManager") - << "Invalid GGUF cache file; marking model as not downloaded: " - << resolved_path << std::endl; + << invalid << "; marking model as not downloaded: " << resolved_path << std::endl; return false; } } return true; } +bool ModelManager::checkpoints_complete(const ModelInfo& info) const { + return are_required_checkpoints_complete(info); +} + +void ModelManager::download_from_huggingface_engine(const ModelInfo& info, + DownloadProgressCallback progress_callback) { + download_from_huggingface(info, progress_callback); +} + void ModelManager::build_cache() { std::lock_guard lock(models_cache_mutex_); @@ -1498,48 +1489,20 @@ void ModelManager::build_cache() { all_models[name] = info; } - // Step 1.6: Discover FLM models from 'flm list --json' - // Only discover FLM models if FLM is fully installed - // Precedence: server_models.json > user_models.json > extra_models > flm_list - auto flm_status = SystemInfoCache::get_flm_status(); - if (flm_status.is_ready()) { - auto flm_available = get_flm_available_models(); - for (const auto& info : flm_available) { - // Use emplace to only add if key doesn't exist (respect precedence) - all_models.emplace(info.model_name, info); - } - } - - // Cloud-offload discovery is server-side and automatic. For each - // installed cloud provider with a resolvable credential (env var or - // runtime-auth POST), call discover_models and merge the results into - // all_models. Per AGENTS.md invariant #11, the registry persists only - // {provider, base_url} pairs — API keys live in env vars or process - // memory, never on disk. Failures are logged, never propagated, so a - // single offline provider can't block the rest of cache build. 
- if (cloud_registry_ != nullptr) { - auto installed = cloud_registry_->list_installed(); - for (const auto& rec : installed) { - const std::string api_key = cloud_registry_->resolve_key(rec.name); - if (api_key.empty() || rec.base_url.empty()) { - LOG(INFO, "ModelManager") << "Skipping cloud discovery for '" - << rec.name << "': no API key resolvable" - << " (set " << CloudProviderRegistry::env_var_name(rec.name) - << " or POST /v1/cloud/auth)" << std::endl; - continue; - } - std::vector discovered; - try { - discovered = backends::CloudServer::discover_models(rec.name, api_key, rec.base_url); - } catch (const std::exception& e) { - LOG(WARNING, "ModelManager") << "Cloud discovery threw for '" - << rec.name << "': " << e.what() - << std::endl; + // Step 1.6: Dynamic discovery. Backends whose models are supplied at runtime + // (descriptor dynamic_models = true — flm from `flm list`, cloud from each + // provider) contribute their models via ops->discover_models(). Each carries + // its own downloaded status. Precedence: server/user/extra models win, so we + // emplace (don't overwrite). Failures are handled inside each backend's ops. + { + backends::BackendOpsContext octx; + octx.model_manager = this; + octx.cloud_registry = cloud_registry_; + for (const auto* desc : backends::all_descriptors()) { + if (!desc->dynamic_models) { continue; } - for (auto& m : discovered) { - if (m.recipe != "cloud" || m.model_name.empty()) continue; - // Same merge precedence as FLM: emplace, don't overwrite. 
+ for (auto& m : backends::ops_for(desc->recipe)->discover_models(octx)) { all_models.emplace(m.model_name, std::move(m)); } } @@ -1556,21 +1519,21 @@ void ModelManager::build_cache() { // Step 2: Filter by backend availability all_models = filter_models_by_backend(all_models); - // Step 3: Check download status ONCE for all models - auto flm_models = get_flm_installed_models(); - std::unordered_set flm_set(flm_models.begin(), flm_models.end()); + // Step 3: Check download status for all models. Dynamic-discovery backends + // (flm, cloud) already set downloaded during discovery; everyone else asks + // its backend ops (default = shared HF completeness check). + backends::BackendOpsContext status_ctx; + status_ctx.model_manager = this; int downloaded_count = 0; // First pass: determine download status for non-collection models for (auto& [name, info] : all_models) { if (is_collection_recipe(info.recipe)) { continue; // Handled in second pass after components are resolved - } else if (info.recipe == "flm") { - info.downloaded = flm_set.count(info.checkpoint()) > 0; - } else if (info.recipe == "cloud") { - info.downloaded = true; // Cloud-offloaded models have no local artifacts - } else { - info.downloaded = are_required_checkpoints_complete(info); + } + const auto* desc = backends::descriptor_for(info.recipe); + if (!(desc && desc->dynamic_models)) { + info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, status_ctx); } if (info.downloaded) { @@ -1667,16 +1630,14 @@ void ModelManager::add_model_to_cache(const std::string& model_name) { return; // Backend not available, don't add to cache } - // Check download status + // Check download status (collections aggregate their components; everyone + // else asks its backend ops). 
if (is_collection_recipe(info.recipe)) { info.downloaded = check_component_downloaded(info, models_cache_); - } else if (info.recipe == "flm") { - auto flm_models = get_flm_installed_models(); - info.downloaded = std::find(flm_models.begin(), flm_models.end(), info.checkpoint()) != flm_models.end(); - } else if (info.recipe == "cloud") { - info.downloaded = true; // Cloud-offloaded models have no local artifacts } else { - info.downloaded = are_required_checkpoints_complete(info); + backends::BackendOpsContext octx; + octx.model_manager = this; + info.downloaded = backends::ops_for(info.recipe)->is_downloaded(info, octx); } populate_model_metadata(info); @@ -1715,10 +1676,10 @@ void ModelManager::update_model_in_cache(const std::string& model_name, bool dow // The path changes now that files exist on disk if (downloaded) { resolve_all_model_paths(it->second); - if (it->second.recipe == "flm") { + if (backends::ops_for(it->second.recipe)->invalidates_cache_after_download()) { cache_valid_ = false; - LOG(INFO, "ModelManager") << "Invalidated model cache after FLM download for '" - << model_name << "'" << std::endl; + LOG(INFO, "ModelManager") << "Invalidated model cache after download for '" + << model_name << "' (backend rebuilds its model list)" << std::endl; return; } populate_model_metadata(it->second); @@ -2295,192 +2256,8 @@ void ModelManager::unregister_user_model(const std::string& model_name) { cache_valid_ = false; } -// Find the FLM executable: install dir on Windows, system PATH on Linux. -// Returns empty string if not found. -static std::string find_flm_binary() { - try { - const backends::BackendSpec* spec = backends::try_get_spec_for_recipe("flm"); - if (!spec) { - return ""; - } - return backends::BackendUtils::get_backend_binary_path(*spec, "npu"); - } catch (...) 
{ -#ifndef _WIN32 - return utils::find_flm_executable(); -#else - return ""; -#endif - } -} - -// Helper function to get FLM installed models by calling 'flm list --filter installed --quiet' -std::vector ModelManager::get_flm_installed_models() { - std::vector installed_models; - - std::string flm_path = find_flm_binary(); - if (flm_path.empty()) return installed_models; - - // Run 'flm list --filter installed --quiet --json' to get only installed models - std::string output; -#ifdef _WIN32 - std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL"; - int rc = lemon::utils::ProcessManager::run_command(command, output); -#else - std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>/dev/null"; - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - return installed_models; - } - - char buffer[256]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - pclose(pipe); -#endif - - // Parse output: { "models": [ { "name": "modelname:tag", ... }, ... ] } - try { - json j = JsonUtils::parse(output); - if (j.contains("models") && j["models"].is_array()) { - for (const auto& model : j["models"]) { - if (model.contains("name") && model["name"].is_string()) { - installed_models.push_back(model["name"].get()); - } - } - return installed_models; - } - } catch (...) 
{ - // Fallback to legacy parsing if JSON parsing fails - } - - // Legacy parsing - cleaner format without emojis - // Expected format: - // Models: - // - modelname:tag - // - another:model - std::istringstream stream(output); - std::string line; - while (std::getline(stream, line)) { - // Trim whitespace - line.erase(0, line.find_first_not_of(" \t\r\n")); - line.erase(line.find_last_not_of(" \t\r\n") + 1); - - // Skip the "Models:" header line or empty lines - if (line == "Models:" || line.empty()) { - continue; - } - - // Parse model checkpoint (format: " - modelname:tag") - if (line.find("- ") == 0) { - std::string checkpoint = line.substr(2); - // Trim any remaining whitespace - checkpoint.erase(0, checkpoint.find_first_not_of(" \t")); - checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1); - if (!checkpoint.empty()) { - installed_models.push_back(checkpoint); - } - } - } - - return installed_models; -} - -std::vector ModelManager::get_flm_available_models() { - std::vector flm_models; - - std::string flm_path = find_flm_binary(); - if (flm_path.empty()) return flm_models; - - LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl; - - // Run 'flm list --json' to get all available models - std::string output; -#ifdef _WIN32 - std::string command = "\"" + flm_path + "\" list --json"; - int rc = lemon::utils::ProcessManager::run_command(command, output); - LOG(INFO, "ModelManager") << "flm list --json exit code: " << rc - << ", output length: " << output.size() << std::endl; - if (rc != 0 || output.empty()) { - LOG(WARNING, "ModelManager") << "flm list --json failed or returned empty. 
" - << "Output: " << output.substr(0, 200) << std::endl; - } -#else - std::string command = "\"" + flm_path + "\" list --json 2>/dev/null"; - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - return flm_models; - } - - char buffer[256]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - pclose(pipe); -#endif - - // Parse output: { "models": [ { "name": "modelname:tag", "footprint": 1.23, ... }, ... ] } - try { - json j = JsonUtils::parse(output); - if (j.contains("models") && j["models"].is_array()) { - for (const auto& m : j["models"]) { - if (m.contains("name") && m["name"].is_string()) { - std::string checkpoint = m["name"].get(); - - // Format display name: replace : with -, append -FLM - // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM" - std::string display_name = checkpoint; - // Replace : with - - std::replace(display_name.begin(), display_name.end(), ':', '-'); - - std::string model_name = display_name + "-FLM"; - - ModelInfo info; - info.model_name = model_name; - info.checkpoints["main"] = checkpoint; - info.recipe = "flm"; - info.suggested = true; // All official FLM models are suggested - - if (JsonUtils::get_or_default(m, "installed", false) && m.contains("url") && m["url"].is_string()) { - fs::path config_path = backends::fastflowlm::find_flm_config_path_from_repo_dir( - backends::fastflowlm::repo_dir_from_url(m["url"].get())); - if (!config_path.empty()) { - info.resolved_paths["config"] = path_to_utf8(config_path); - } - } - - // Size in GB (footprint field contains disk size in GB) - if (m.contains("footprint") && m["footprint"].is_number()) { - info.size = m["footprint"].get(); - } - - // Labels from FLM metadata - if (m.contains("label") && m["label"].is_array()) { - for (const auto& l : m["label"]) { - if (l.is_string()) { - info.labels.push_back(l.get()); - } - } - } - - // Populate type and device fields (multi-model support) - info.type = get_model_type_from_labels(info.labels); - info.device = 
device_type_for_recipe(info.recipe); - flm_models.push_back(info); - } - } - } - } catch (const std::exception& e) { - LOG(WARNING, "ModelManager") << "FLM model discovery failed: " << e.what() << std::endl; - } catch (...) { - LOG(WARNING, "ModelManager") << "FLM model discovery failed with unknown error" << std::endl; - } - return flm_models; -} bool ModelManager::is_model_downloaded(const std::string& model_name) { // Build cache if needed @@ -2505,18 +2282,11 @@ bool ModelManager::is_model_downloaded(const std::string& model_name) { } void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) { - // Cloud models have no local artifacts; "downloading" is a no-op. - if (info.recipe == "cloud") { - update_model_in_cache(info.model_name, true); - return; - } - - // Use recipe-specific download paths - if (info.recipe == "flm") { - download_from_flm(info.checkpoint(), do_not_upgrade, progress_callback); - } else { - download_from_huggingface(info, progress_callback); - } + // The backend's ops own the download (shared HF engine by default; flm pulls + // via the flm CLI; cloud is a no-op). 
+ backends::BackendOpsContext octx; + octx.model_manager = this; + backends::ops_for(info.recipe)->download_model(info, do_not_upgrade, progress_callback, octx); // Update cache after successful download update_model_in_cache(info.model_name, true); @@ -4019,224 +3789,6 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, LOG(INFO, "ModelManager") << "Download location: " << reported_download_path << std::endl; } -void ModelManager::download_from_flm(const std::string& checkpoint, - bool do_not_upgrade, - DownloadProgressCallback progress_callback) { - LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl; - - // Ensure FLM is ready (single source of truth) - auto status = SystemInfoCache::get_flm_status(); - if (!status.is_ready()) { - throw std::runtime_error(status.error_string()); - } - - std::string flm_path = find_flm_binary(); - if (flm_path.empty()) { - throw std::runtime_error("FLM executable not found"); - } - - // Prepare arguments - std::vector args = {"pull", checkpoint}; - if (!do_not_upgrade) { - args.push_back("--force"); - } - - LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\""; - for (const auto& arg : args) { - LOG(INFO, "ProcessManager") << " \"" << arg << "\""; - } - LOG(INFO, "ProcessManager") << std::endl; - - // State for parsing FLM output - int total_files = 0; - int current_file_index = 0; - std::string current_filename; - bool cancelled = false; - - // Run flm pull command and parse output - int exit_code = utils::ProcessManager::run_process_with_output( - flm_path, args, - [&](const std::string& line) -> bool { - // Always print the line to console - LOG(INFO, "FLM") << line << std::endl; - - // Parse FLM output to extract progress information - // Pattern: "[FLM] Downloading X/Y: filename" - if (line.find("[FLM] Downloading ") != std::string::npos && - line.find("/") != std::string::npos && - line.find(":") != std::string::npos) { - - // Extract "X/Y: filename" from 
"[FLM] Downloading X/Y: filename" - size_t start = line.find("Downloading ") + 12; - size_t slash = line.find("/", start); - size_t colon = line.find(":", slash); - - if (slash != std::string::npos && colon != std::string::npos) { - try { - current_file_index = std::stoi(line.substr(start, slash - start)); - total_files = std::stoi(line.substr(slash + 1, colon - slash - 1)); - current_filename = line.substr(colon + 2); // Skip ": " - - // Send progress update - if (progress_callback) { - DownloadProgress progress; - progress.file = current_filename; - progress.file_index = current_file_index; - progress.total_files = total_files; - progress.bytes_downloaded = 0; - progress.bytes_total = 0; - progress.percent = (total_files > 0) ? - ((current_file_index - 1) * 100 / total_files) : 0; - - if (!progress_callback(progress)) { - cancelled = true; - return false; // Kill the process - } - } - } catch (...) { - // Ignore parse errors - } - } - } - // Pattern: "[FLM] Downloading: XX.X% (XXX.XMB / XXX.XMB)" - else if (line.find("[FLM] Downloading: ") != std::string::npos && - line.find("%") != std::string::npos) { - - // Extract percentage and bytes - size_t start = line.find("Downloading: ") + 13; - size_t pct_end = line.find("%", start); - - if (pct_end != std::string::npos) { - try { - std::string pct_str = line.substr(start, pct_end - start); - double file_percent = std::stod(pct_str); - - // Try to extract bytes (XXX.XMB / XXX.XMB) - size_t open_paren = line.find("(", pct_end); - size_t slash = line.find("/", open_paren); - size_t close_paren = line.find(")", slash); - - size_t bytes_downloaded = 0; - size_t bytes_total = 0; - - if (open_paren != std::string::npos && slash != std::string::npos) { - std::string downloaded_str = line.substr(open_paren + 1, slash - open_paren - 1); - std::string total_str = line.substr(slash + 1, close_paren - slash - 1); - - // Parse "XXX.XMB" format - auto parse_size = [](const std::string& s) -> size_t { - double val = 0; - size_t 
mb_pos = s.find("MB"); - size_t gb_pos = s.find("GB"); - size_t kb_pos = s.find("KB"); - - if (mb_pos != std::string::npos) { - val = std::stod(s.substr(0, mb_pos)); - return static_cast(val * 1024 * 1024); - } else if (gb_pos != std::string::npos) { - val = std::stod(s.substr(0, gb_pos)); - return static_cast(val * 1024 * 1024 * 1024); - } else if (kb_pos != std::string::npos) { - val = std::stod(s.substr(0, kb_pos)); - return static_cast(val * 1024); - } - return 0; - }; - - bytes_downloaded = parse_size(downloaded_str); - bytes_total = parse_size(total_str); - } - - // Send progress update with byte-level info - if (progress_callback) { - DownloadProgress progress; - progress.file = current_filename; - progress.file_index = current_file_index; - progress.total_files = total_files; - progress.bytes_downloaded = bytes_downloaded; - progress.bytes_total = bytes_total; - // Use intra-file percent when we have byte-level progress - progress.percent = static_cast(file_percent); - - if (!progress_callback(progress)) { - cancelled = true; - return false; // Kill the process - } - } - } catch (...) { - // Ignore parse errors - } - } - } - // Pattern: "[FLM] Overall progress: XX.X% (X/Y files)" - else if (line.find("[FLM] Overall progress: ") != std::string::npos) { - size_t start = line.find("progress: ") + 10; - size_t pct_end = line.find("%", start); - - if (pct_end != std::string::npos) { - try { - int overall_percent = static_cast(std::stod(line.substr(start, pct_end - start))); - - if (progress_callback) { - DownloadProgress progress; - progress.file = current_filename; - progress.file_index = current_file_index; - progress.total_files = total_files; - progress.bytes_downloaded = 0; // Not available for overall progress - progress.bytes_total = 0; - progress.percent = overall_percent; - - if (!progress_callback(progress)) { - cancelled = true; - return false; // Kill the process - } - } - } catch (...) 
{ - // Ignore parse errors - } - } - } - // Pattern: "[FLM] Missing files (N):" - else if (line.find("[FLM] Missing files (") != std::string::npos) { - size_t start = line.find("(") + 1; - size_t end = line.find(")", start); - if (end != std::string::npos) { - try { - total_files = std::stoi(line.substr(start, end - start)); - } catch (...) { - // Ignore parse errors - } - } - } - - return true; // Continue - }, - "", // Working directory - 3600 // 1 hour timeout for large model downloads - ); - - if (cancelled) { - LOG(INFO, "ModelManager") << "FLM download cancelled by client" << std::endl; - throw std::runtime_error("Download cancelled"); - } - - if (exit_code != 0) { - LOG(ERROR, "ModelManager") << "FLM pull failed with exit code: " << exit_code << std::endl; - throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code)); - } - - // Send completion event - if (progress_callback) { - DownloadProgress progress; - progress.complete = true; - progress.file_index = total_files; - progress.total_files = total_files; - progress.percent = 100; - (void)progress_callback(progress); // Ignore return - download already complete - } - - LOG(INFO, "ModelManager") << "FLM model pull completed successfully" << std::endl; -} void ModelManager::delete_model(const std::string& model_name) { auto info = get_model_info(model_name); @@ -4263,7 +3815,7 @@ void ModelManager::delete_model(const std::string& model_name) { // Find flm executable — on Windows flm.exe lives under the lemonade // cache dir, not on PATH, so we must resolve the full path. 
- std::string flm_path = find_flm_binary(); + std::string flm_path = backends::fastflowlm::find_flm_binary(); if (flm_path.empty()) { throw std::runtime_error("FLM executable not found"); } From 435426052fa1a9fd3ba044feebe18b4ade574f27 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:08:50 -0400 Subject: [PATCH 13/39] refactor(backends): migrate version detection to a resolve_version ops hook get_recipe_version now reads version.txt generically and lets the backend ops override, instead of branching on recipe. The per-backend version commands move into their folders: - system llama-server version (`llama-server --version` + regex) -> backends/llamacpp; LlamaCppOps::resolve_version returns it for the "system" backend. - flm version (`flm version --json`) -> backends/fastflowlm (flm_version()); FlmOps::resolve_version returns it when no version.txt is present. Removes SystemInfo::get_system_llamacpp_version / get_flm_version and the llamacpp-system / flm branches from system_info. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 11 ++ .../backends/fastflowlm/fastflowlm_models.h | 3 + src/cpp/include/lemon/system_info.h | 3 - .../backends/fastflowlm/fastflowlm_models.cpp | 69 +++++++++ .../backends/fastflowlm/fastflowlm_server.cpp | 8 + .../backends/llamacpp/llamacpp_server.cpp | 48 ++++++ src/cpp/server/system_info.cpp | 137 ++---------------- 7 files changed, 151 insertions(+), 128 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index 35f434df3..854065f24 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -88,6 +88,17 @@ class BackendOps { // Whether the model cache must be rebuilt after this backend downloads a // model (e.g. flm, whose model list changes). 
virtual bool invalidates_cache_after_download() const { return false; } + + // Resolve a backend's installed version for a given backend variant. The + // caller passes the version read from the on-disk version.txt (or "" if + // absent); the default returns it unchanged. Backends that detect their + // version another way override: llamacpp's "system" build runs + // `llama-server --version`; flm queries `flm version` when no file is present. + virtual std::string resolve_version(const std::string& backend, + const std::string& file_version) const { + (void)backend; + return file_version; + } }; // Shared default ops instance for backends that override nothing. diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h index f5d0f269d..20c7a96b8 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -34,6 +34,9 @@ std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo // Read the model's max context window from its FLM config.json (0 if unknown). int64_t read_flm_max_context_window(const ModelInfo& info); +// Detect the installed FLM version via `flm version` ("unknown" if unavailable). +std::string flm_version(); + // Download (pull) an FLM model by checkpoint via the `flm` CLI. 
void flm_download(const std::string& checkpoint, bool do_not_upgrade, DownloadProgressCallback progress_callback); diff --git a/src/cpp/include/lemon/system_info.h b/src/cpp/include/lemon/system_info.h index 9b143ae47..a67c744b6 100644 --- a/src/cpp/include/lemon/system_info.h +++ b/src/cpp/include/lemon/system_info.h @@ -104,9 +104,6 @@ class SystemInfo { }; static std::vector get_all_recipe_statuses(); - static std::string get_flm_version(); - static std::string get_system_llamacpp_version(); - // Device support detection static std::string get_rocm_arch(); static std::string get_cuda_arch(); diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp index 2f2bb36b2..0331cc895 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -530,6 +530,75 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, } +std::string flm_version() { + // Cache real version strings to avoid spawning the subprocess twice per + // build_recipes_info() pass. "unknown" is NOT cached so that post-install + // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed. 
+ static std::string cached_version; + if (!cached_version.empty()) { + return cached_version; + } + + // Find the flm executable using shared utility + std::string flm_path = lemon::utils::find_flm_executable(); + if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) { + return "unknown"; + } + + std::string output; + #ifdef _WIN32 + std::string command = "\"" + flm_path + "\" version --json 2>NUL"; + int rc = lemon::utils::ProcessManager::run_command(command, output); + #else + std::string command = "\"" + flm_path + "\" version --json 2>/dev/null"; + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + return "unknown"; + } + + char buffer[256]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + pclose(pipe); + #endif + + // Parse JSON output: { "version": "0.9.34" } + try { + json j = lemon::utils::JsonUtils::parse(output); + if (j.contains("version") && j["version"].is_string()) { + std::string version = j["version"].get(); + // If the version doesn't start with 'v', prepend it + // for backend_versions.json compatibility (e.g. "v0.9.34"). + if (!version.empty() && version[0] != 'v') { + version = "v" + version; + } + cached_version = version; + return cached_version; + } + } catch (...) { + // Fallback to legacy parsing if JSON parsing fails + } + + // Legacy parsing from output like "FLM v0.9.4" + if (output.find("FLM v") != std::string::npos) { + size_t pos = output.find("FLM v"); + // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34"). 
+ std::string version = output.substr(pos + 4); + // Trim whitespace and newlines + size_t end = version.find_first_of(" \t\n\r"); + if (end != std::string::npos) { + version = version.substr(0, end); + } + cached_version = version; + return cached_version; + } + + return "unknown"; +} + + } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 256fe339b..772fac2d3 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -507,6 +507,14 @@ class FlmOps : public BackendOps { } bool invalidates_cache_after_download() const override { return true; } + + std::string resolve_version(const std::string&, const std::string& file_version) const override { + // On Linux FLM is a system package with no version.txt; query the CLI. + if (file_version.empty() || file_version == "unknown") { + return flm_version(); + } + return file_version; + } }; } // namespace diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index f1b265fb3..63254f155 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -7,6 +7,7 @@ #include "lemon/model_manager.h" #include #include +#include #include #include "lemon/auto_tune.h" #include "lemon/backend_manager.h" @@ -662,6 +663,44 @@ std::unique_ptr create(const BackendContext& ctx) { } namespace { +std::string system_llamacpp_version() { + std::string output; + #ifdef _WIN32 + std::string command = "llama-server --version 2>NUL"; + int rc = lemon::utils::ProcessManager::run_command(command, output); + #else + FILE* pipe = popen("llama-server --version 2>/dev/null", "r"); + if (!pipe) { + return "unknown"; + } + + char buffer[256]; + if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + 
output = buffer; + } + + pclose(pipe); + #endif + + // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432" + if (!output.empty()) { + // Try to find a version number + std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))"); + std::smatch match; + if (std::regex_search(output, match, version_regex)) { + for (size_t i = 1; i < match.size(); ++i) { + if (match[i].matched) { + return "b" + match[i].str(); + } + } + } + return "detected"; + } + + return "unknown"; +} + + // llamacpp model-management behavior: GGUF metadata + capability labels. class LlamaCppOps : public BackendOps { public: @@ -726,6 +765,15 @@ class LlamaCppOps : public BackendOps { magic[0] == 'G' && magic[1] == 'G' && magic[2] == 'U' && magic[3] == 'F'; return ok ? "" : "Invalid GGUF cache file"; } + + std::string resolve_version(const std::string& backend, + const std::string& file_version) const override { + // The PATH-installed "system" llama-server has no version.txt; query it. 
+ if (backend == "system") { + return system_llamacpp_version(); + } + return file_version; + } }; } // namespace diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index 45335912d..6ae2ef03f 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -8,6 +8,7 @@ #include "lemon/utils/process_manager.h" #include "lemon/backends/backend_utils.h" #include "lemon/backends/backend_descriptor_registry.h" +#include "lemon/backends/backend_registry.h" #include "lemon/recipe_backend_def.h" #include #include @@ -609,31 +610,22 @@ static bool is_recipe_installed(const std::string& recipe, const std::string& ba } static std::string get_recipe_version(const std::string& recipe, const std::string& backend) { - if (recipe == "llamacpp" && backend == "system") { - return SystemInfo::get_system_llamacpp_version(); - } + // Read the on-disk version.txt generically, then let the backend's ops + // override (llamacpp "system" runs llama-server --version; flm queries the + // CLI when no file is present). No per-recipe branches here. 
auto* spec = try_get_spec_for_recipe(recipe); + std::string file_version; if (spec) { std::string version_file = BackendUtils::get_installed_version_file(*spec, backend); - if (version_file.empty()) { -#ifndef _WIN32 - // On Linux, FLM is a system package with no version.txt - query directly - if (recipe == "flm") { - return SystemInfo::get_flm_version(); - } -#endif - return "unknown"; + if (!version_file.empty()) { + file_version = read_version_file(version_file); } - std::string version = read_version_file(version_file); -#ifndef _WIN32 - // On Linux, version.txt may not exist on disk for system-installed FLM - if (recipe == "flm" && (version.empty() || version == "unknown")) { - return SystemInfo::get_flm_version(); - } -#endif - return version; } - return ""; + std::string resolved = backends::ops_for(recipe)->resolve_version(backend, file_version); + if (!spec && resolved.empty()) { + return ""; + } + return resolved.empty() ? "unknown" : resolved; } static std::string get_install_command(const std::string& recipe, const std::string& backend) { @@ -1681,43 +1673,6 @@ static std::string read_version_file(const fs::path& version_file) { return "unknown"; } -std::string SystemInfo::get_system_llamacpp_version() { - std::string output; - #ifdef _WIN32 - std::string command = "llama-server --version 2>NUL"; - int rc = lemon::utils::ProcessManager::run_command(command, output); - #else - FILE* pipe = popen("llama-server --version 2>/dev/null", "r"); - if (!pipe) { - return "unknown"; - } - - char buffer[256]; - if (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output = buffer; - } - - pclose(pipe); - #endif - - // Parse version from output like "version: 3432 (e2b2a632)" or "llama.cpp version b3432" - if (!output.empty()) { - // Try to find a version number - std::regex version_regex(R"(version:\s*(\d+)|version\s+b?(\d+))"); - std::smatch match; - if (std::regex_search(output, match, version_regex)) { - for (size_t i = 1; i < match.size(); ++i) { - if 
(match[i].matched) { - return "b" + match[i].str(); - } - } - } - return "detected"; - } - - return "unknown"; -} - // Map a CUDA Compute Capability "MAJOR.MINOR" string (as reported by nvidia-smi // --query-gpu=compute_cap) to the sm_XX token used in llamacpp-cuda release filenames. // Returns empty if the value cannot be parsed. @@ -2266,74 +2221,6 @@ bool SystemInfo::get_has_igpu() { return false; // No iGPU detected } -std::string SystemInfo::get_flm_version() { - // Cache real version strings to avoid spawning the subprocess twice per - // build_recipes_info() pass. "unknown" is NOT cached so that post-install - // verification in fastflowlm_server.cpp gets a fresh result after FLM is installed. - static std::string cached_version; - if (!cached_version.empty()) { - return cached_version; - } - - // Find the flm executable using shared utility - std::string flm_path = utils::find_flm_executable(); - if (flm_path.empty() || !utils::is_safe_executable_path(flm_path)) { - return "unknown"; - } - - std::string output; - #ifdef _WIN32 - std::string command = "\"" + flm_path + "\" version --json 2>NUL"; - int rc = lemon::utils::ProcessManager::run_command(command, output); - #else - std::string command = "\"" + flm_path + "\" version --json 2>/dev/null"; - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - return "unknown"; - } - - char buffer[256]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - pclose(pipe); - #endif - - // Parse JSON output: { "version": "0.9.34" } - try { - json j = JsonUtils::parse(output); - if (j.contains("version") && j["version"].is_string()) { - std::string version = j["version"].get(); - // If the version doesn't start with 'v', prepend it - // for backend_versions.json compatibility (e.g. "v0.9.34"). - if (!version.empty() && version[0] != 'v') { - version = "v" + version; - } - cached_version = version; - return cached_version; - } - } catch (...) 
{ - // Fallback to legacy parsing if JSON parsing fails - } - - // Legacy parsing from output like "FLM v0.9.4" - if (output.find("FLM v") != std::string::npos) { - size_t pos = output.find("FLM v"); - // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34"). - std::string version = output.substr(pos + 4); - // Trim whitespace and newlines - size_t end = version.find_first_of(" \t\n\r"); - if (end != std::string::npos) { - version = version.substr(0, end); - } - cached_version = version; - return cached_version; - } - - return "unknown"; -} - // ============================================================================ // Factory function // ============================================================================ From 2a9b38e30cc3456e9d12306b1f0795b5c8dc172e Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:13:42 -0400 Subject: [PATCH 14/39] polish(backends): drop redundant config_section (defaults to recipe) config_section duplicated the recipe string in 7 descriptors; it defaults to the recipe via effective_config_section(), so set those to "". Only sd-cpp ("sdcpp") and ryzenai-llm ("ryzenai") keep an explicit section because theirs genuinely differ from the recipe. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/cloud/cloud.h | 2 +- src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 2 +- src/cpp/include/lemon/backends/kokoro/kokoro.h | 2 +- src/cpp/include/lemon/backends/llamacpp/llamacpp.h | 2 +- src/cpp/include/lemon/backends/moonshine/moonshine.h | 2 +- src/cpp/include/lemon/backends/ryzenai/ryzenai.h | 2 +- src/cpp/include/lemon/backends/vllm/vllm.h | 2 +- src/cpp/include/lemon/backends/whispercpp/whispercpp.h | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/cpp/include/lemon/backends/cloud/cloud.h b/src/cpp/include/lemon/backends/cloud/cloud.h index 9d4f5559b..976a84f70 100644 --- a/src/cpp/include/lemon/backends/cloud/cloud.h +++ b/src/cpp/include/lemon/backends/cloud/cloud.h @@ -12,7 +12,7 @@ inline const BackendDescriptor descriptor = { /*recipe*/ "cloud", /*display_name*/ "Cloud", /*binary*/ "", // no subprocess: runs on a remote provider - /*config_section*/ "cloud", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_NONE, /*slot_policy*/ SlotPolicy::Unmetered, // never counts toward slots, never auto-evicted /*selectable_backend*/ false, diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index b9efb610b..24fc07470 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = { #else /*binary*/ "flm", #endif - /*config_section*/ "flm", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_NPU, /*slot_policy*/ SlotPolicy::CoexistByType, /*selectable_backend*/ false, diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h index 4663d3ad3..3ebb9efbd 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ 
-16,7 +16,7 @@ inline const BackendDescriptor descriptor = { #else /*binary*/ "koko", #endif - /*config_section*/ "kokoro", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_CPU, /*slot_policy*/ SlotPolicy::Standard, /*selectable_backend*/ false, diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index fc43c4515..cbd6386fa 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = { #else /*binary*/ "llama-server", #endif - /*config_section*/ "llamacpp", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_GPU, // cpu/system variants resolve to CPU via effective_device() /*slot_policy*/ SlotPolicy::Standard, /*selectable_backend*/ true, diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h index 81f45dc25..5b8faafe2 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -12,7 +12,7 @@ inline const BackendDescriptor descriptor = { /*recipe*/ "moonshine", /*display_name*/ "Moonshine", /*binary*/ "moonshine-server", - /*config_section*/ "moonshine", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_CPU, /*slot_policy*/ SlotPolicy::Standard, /*selectable_backend*/ false, diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h index 4171dbe93..c290c4dd1 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = { #else /*binary*/ "ryzenai-server", #endif - /*config_section*/ "ryzenai", + /*config_section*/ "ryzenai", // differs from recipe "ryzenai-llm" /*default_device*/ DEVICE_NPU, /*slot_policy*/ 
SlotPolicy::ExclusiveNpu, /*selectable_backend*/ false, diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h index 6f468a1ed..4c35ad1ec 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm.h +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -12,7 +12,7 @@ inline const BackendDescriptor descriptor = { /*recipe*/ "vllm", /*display_name*/ "vLLM ROCm (experimental)", /*binary*/ "vllm-server", - /*config_section*/ "vllm", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_GPU, /*slot_policy*/ SlotPolicy::Standard, /*selectable_backend*/ true, diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h index ce2014dec..8c4a29815 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -16,7 +16,7 @@ inline const BackendDescriptor descriptor = { #else /*binary*/ "whisper-server", #endif - /*config_section*/ "whispercpp", + /*config_section*/ "", // defaults to recipe /*default_device*/ DEVICE_CPU, // npu variant resolves to NPU + ExclusiveNpu via effective_*() /*slot_policy*/ SlotPolicy::Standard, /*selectable_backend*/ true, From 623334c3bb35e0065948101c220fbc41a19fc341 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:18:37 -0400 Subject: [PATCH 15/39] refactor(backends): gate Prometheus scraping on an exposes_prometheus_metrics descriptor flag prometheus_metrics.cpp hardcoded `recipe == "llamacpp"` to decide whether to scrape a backend subprocess's /metrics. Replace with a descriptor flag (exposes_prometheus_metrics; llamacpp = true) so a new backend that exposes Prometheus metrics opts in via its descriptor, not by editing the metrics code. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_descriptor.h | 4 ++++ src/cpp/include/lemon/backends/llamacpp/llamacpp.h | 1 + src/cpp/server/prometheus_metrics.cpp | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index 29ea2e0ea..aad473eba 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -71,6 +71,10 @@ struct BackendDescriptor { // channel clamp (a requested channel not listed here falls back to the first). std::vector rocm_channels; + // True if the backend's subprocess exposes a Prometheus /metrics endpoint + // that lemond should scrape and re-export (llama-server does). + bool exposes_prometheus_metrics = false; + // The config.json section name for this backend, falling back to the recipe. std::string effective_config_section() const { return config_section.empty() ? 
recipe : config_section; diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index cbd6386fa..a17c24961 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -47,6 +47,7 @@ inline const BackendDescriptor descriptor = { /*web_display_name*/ "llama.cpp GPU", /*web_priority*/ 1, /*rocm_channels*/ {"stable", "nightly"}, + /*exposes_prometheus_metrics*/ true, }; } // namespace llamacpp diff --git a/src/cpp/server/prometheus_metrics.cpp b/src/cpp/server/prometheus_metrics.cpp index 8ecfdb288..88f7bdaf3 100644 --- a/src/cpp/server/prometheus_metrics.cpp +++ b/src/cpp/server/prometheus_metrics.cpp @@ -1,5 +1,6 @@ #include "lemon/prometheus_metrics.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/version.h" #include @@ -274,7 +275,8 @@ void append_llamacpp_backend_metrics(PrometheusBuilder& metrics, const json& model, const std::map& labels, std::set& described_backend_metrics) { - if (model.value("recipe", "") != "llamacpp") { + const auto* desc = backends::descriptor_for(model.value("recipe", "")); + if (desc == nullptr || !desc->exposes_prometheus_metrics) { return; } From ae8ca93330b296cfd351de564107274b3cfed1bb Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:23:44 -0400 Subject: [PATCH 16/39] refactor(backends): move hf_load and moonshine_arch to ModelInfo::extras These backend-specific per-model fields no longer sit on the shared ModelInfo struct: llamacpp reads info.extra("hf_load", false) and moonshine reads info.extra("moonshine_arch", -1). Removed the typed fields, their explicit parse sites, and their kKnownKeys entries; added parse_extras() to the two ModelInfo-building paths that lacked it (add_model_to_cache, get_model_info_unfiltered) so extras populate everywhere a model is built from JSON. Verified: llamacpp models still resolve/download (hf_load path intact). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/model_manager.h | 8 -------- .../server/backends/llamacpp/llamacpp_server.cpp | 6 +++--- .../server/backends/moonshine/moonshine_server.cpp | 2 +- src/cpp/server/model_manager.cpp | 14 +++----------- 4 files changed, 7 insertions(+), 23 deletions(-) diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h index 967d6d044..f12cd3e9a 100644 --- a/src/cpp/include/lemon/model_manager.h +++ b/src/cpp/include/lemon/model_manager.h @@ -76,11 +76,6 @@ struct ModelInfo { bool suggested = false; std::string source; // "local_upload" for locally uploaded models bool downloaded = false; // Whether model is downloaded and available - // When true, LlamaCppServer launches llama-server with `-hf ` - // instead of `-m [--mmproj ]`. Required for models like - // Qwen2.5-Omni where llama-server's manual-load path rejects audio content - // parts — the -hf path drives the dual-clip (vision+audio) context correctly. - bool hf_load = false; double size = 0.0; // Model size in GB int64_t max_context_window = 0; // Static model-supported text context, when known @@ -107,9 +102,6 @@ struct ModelInfo { double cost_input_per_million = -1.0; double cost_output_per_million = -1.0; - // Moonshine-specific model architecture (e.g., 2 = TINY_STREAMING, 4 = SMALL_STREAMING, 5 = MEDIUM_STREAMING) - int moonshine_arch = -1; - // Generic per-model fields a backend declares for itself. Any server_models.json // key not consumed by a typed field above lands here, so a new backend can read // custom per-model config in load() without editing this shared struct. 
diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 63254f155..0d8121a37 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -295,7 +295,7 @@ void LlamaCppServer::load(const std::string& model_name, // Use pre-resolved GGUF path. Skipped for hf_load models because llama-server // sources the weights itself via -hf; those models may not have local files. std::string gguf_path = model_info.resolved_path(); - if (gguf_path.empty() && !model_info.hf_load) { + if (gguf_path.empty() && !model_info.extra("hf_load", false)) { throw std::runtime_error("GGUF file not found for checkpoint: " + model_info.checkpoint()); } @@ -331,7 +331,7 @@ void LlamaCppServer::load(const std::string& model_name, // is required for models like Qwen2.5-Omni where the manual -m + --mmproj // path rejects audio content parts in /v1/chat/completions — the -hf path // drives the dual-clip (vision+audio) context correctly. - if (model_info.hf_load) { + if (model_info.extra("hf_load", false)) { push_arg(args, reserved_flags, "-hf", model_info.checkpoint(), std::vector{"--hf-repo", "-mr", "--hf-file", "-mf"}); } else { @@ -353,7 +353,7 @@ void LlamaCppServer::load(const std::string& model_name, // Add mmproj file if present (for vision models). Skip when hf_load is set — // llama-server resolves the mmproj companion itself from the HF repo. 
- if (!mmproj_path.empty() && !model_info.hf_load) { + if (!mmproj_path.empty() && !model_info.extra("hf_load", false)) { push_arg(args, reserved_flags, "--mmproj", mmproj_path); if (!use_gpu) { LOG(DEBUG, "LlamaCpp") << "Skipping mmproj argument since GPU mode is not enabled" << std::endl; diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index ed709990b..b294f46ee 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -84,7 +84,7 @@ void MoonshineServer::load(const std::string& model_name, // Resolve model architecture. Prefer the explicit registry field; fall back // to inferring from the checkpoint variant (onnx/tiny, onnx/small, etc.). - int model_arch = model_info.moonshine_arch; + int model_arch = model_info.extra("moonshine_arch", -1); if (model_arch < 0) { std::string variant = model_info.checkpoint(); std::transform(variant.begin(), variant.end(), variant.begin(), ::tolower); diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 76a4a57a8..9210afd02 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -293,7 +293,7 @@ static void parse_image_defaults(ModelInfo& info, const json& model_json) { static void parse_extras(ModelInfo& info, const json& model_json) { static const std::set kKnownKeys = { "checkpoint", "checkpoints", "components", "mmproj", "recipe", "suggested", - "hf_load", "source", "size", "cloud_provider", "moonshine_arch", + "source", "size", "cloud_provider", "labels", "image_defaults", "recipe_options" }; if (!model_json.is_object()) return; @@ -1375,11 +1375,9 @@ void ModelManager::build_cache() { parse_components(info, value); info.recipe = JsonUtils::get_or_default(value, "recipe", ""); info.suggested = JsonUtils::get_or_default(value, "suggested", false); - info.hf_load = JsonUtils::get_or_default(value, "hf_load", 
false); info.source = JsonUtils::get_or_default(value, "source", ""); info.size = JsonUtils::get_or_default(value, "size", 0.0); info.cloud_provider = JsonUtils::get_or_default(value, "cloud_provider", ""); - info.moonshine_arch = JsonUtils::get_or_default(value, "moonshine_arch", -1); // HF-backed collections store their components on Hugging Face — the // cached manifest is the single source of truth. Rebuild the component @@ -1430,11 +1428,9 @@ void ModelManager::build_cache() { parse_components(info, value); info.recipe = JsonUtils::get_or_default(value, "recipe", ""); info.suggested = JsonUtils::get_or_default(value, "suggested", true); - info.hf_load = JsonUtils::get_or_default(value, "hf_load", false); info.source = JsonUtils::get_or_default(value, "source", ""); info.size = JsonUtils::get_or_default(value, "size", 0.0); info.cloud_provider = JsonUtils::get_or_default(value, "cloud_provider", ""); - info.moonshine_arch = JsonUtils::get_or_default(value, "moonshine_arch", -1); // HF-backed user collections (created by `lemonade pull /`) // keep only a repo pointer in user_models.json; their components live in @@ -1601,12 +1597,12 @@ void ModelManager::add_model_to_cache(const std::string& model_name) { info.cloud_provider = JsonUtils::get_or_default(*model_json, "cloud_provider", ""); parse_image_defaults(info, *model_json); + parse_extras(info, *model_json); json jro = (model_json->contains("recipe_options") && (*model_json)["recipe_options"].is_object()) ? 
(*model_json)["recipe_options"] : json(nullptr); info.recipe_options = build_recipe_options(info, jro, cache_key_to_canonical_id(model_name), recipe_options_); info.suggested = JsonUtils::get_or_default(*model_json, "suggested", is_user_model); - info.hf_load = JsonUtils::get_or_default(*model_json, "hf_load", false); info.source = JsonUtils::get_or_default(*model_json, "source", ""); if (model_json->contains("labels") && (*model_json)["labels"].is_array()) { @@ -4345,7 +4341,6 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name) parse_components(info, *model_json); info.recipe = JsonUtils::get_or_default(*model_json, "recipe", ""); info.suggested = JsonUtils::get_or_default(*model_json, "suggested", false); - info.hf_load = JsonUtils::get_or_default(*model_json, "hf_load", false); info.source = JsonUtils::get_or_default(*model_json, "source", ""); // Parse labels array @@ -4364,10 +4359,7 @@ ModelInfo ModelManager::get_model_info_unfiltered(const std::string& model_name) } } - // Parse moonshine_arch - if (model_json->contains("moonshine_arch") && (*model_json)["moonshine_arch"].is_number_integer()) { - info.moonshine_arch = (*model_json)["moonshine_arch"].get(); - } + parse_extras(info, *model_json); return info; } From add34ed2af2a23e8e17c138f6c15f34c40a7d87f Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:27:30 -0400 Subject: [PATCH 17/39] refactor(backends): descriptor-drive the gfx1151 CWSR availability check Replace the hardcoded (sd-cpp||llamacpp||vllm)&&rocm recipe-list in is_recipe_installed and build_recipes_info with a rocm_requires_cwsr_fix descriptor flag (set on those three backends). The kernel CWSR detection (needs_gfx1151_cwsr_fix) stays in system_info as generic hardware detection; only "which backends' rocm build needs it" is now descriptor data. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../lemon/backends/backend_descriptor.h | 4 ++++ .../lemon/backends/llamacpp/llamacpp.h | 1 + src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 2 ++ src/cpp/include/lemon/backends/vllm/vllm.h | 4 ++++ src/cpp/server/system_info.cpp | 24 +++++++++---------- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index aad473eba..57dd5a7b7 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -75,6 +75,10 @@ struct BackendDescriptor { // that lemond should scrape and re-export (llama-server does). bool exposes_prometheus_metrics = false; + // True if this backend's ROCm build requires the gfx1151 (Strix Halo) kernel + // CWSR fix. Gates the availability/remediation check for the "rocm" backend. + bool rocm_requires_cwsr_fix = false; + // The config.json section name for this backend, falling back to the recipe. std::string effective_config_section() const { return config_section.empty() ? 
recipe : config_section; diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index a17c24961..8e5435f9f 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -48,6 +48,7 @@ inline const BackendDescriptor descriptor = { /*web_priority*/ 1, /*rocm_channels*/ {"stable", "nightly"}, /*exposes_prometheus_metrics*/ true, + /*rocm_requires_cwsr_fix*/ true, }; } // namespace llamacpp diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index b65fe4fd6..57ce30cbd 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -51,6 +51,8 @@ inline const BackendDescriptor descriptor = { /*web_display_name*/ "stable-diffusion.cpp", /*web_priority*/ 5, /*rocm_channels*/ {"stable"}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ true, }; } // namespace sdcpp diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h index 4c35ad1ec..84a596168 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm.h +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -32,6 +32,10 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ true, /*web_display_name*/ "", + /*web_priority*/ 0, + /*rocm_channels*/ {}, // single rocm artifact, no stable/nightly channels + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ true, }; } // namespace vllm diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index 6ae2ef03f..b117bf167 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -565,15 +565,13 @@ static bool device_matches_constraint(const std::string& device_family, // Generic installation check static bool is_recipe_installed(const std::string& recipe, const std::string& backend, std::string& 
error_message) { - bool is_llamacpp_rocm_backend = recipe == "llamacpp" && backend == "rocm"; - - // Special handling for ROCm backends on gfx1151 (Strix Halo) if kernel CWSR fix is missing - bool is_vllm_rocm_backend = recipe == "vllm" && backend == "rocm"; - if ((recipe == "sd-cpp" && backend == "rocm") || is_llamacpp_rocm_backend || is_vllm_rocm_backend) { - if (needs_gfx1151_cwsr_fix()) { - error_message = "Linux kernel missing support"; - return false; - } + // Special handling for ROCm backends on gfx1151 (Strix Halo) if the kernel + // CWSR fix is missing — which backends' rocm build needs it is a descriptor flag. + const auto* cwsr_desc = backends::descriptor_for(recipe); + if (backend == "rocm" && cwsr_desc && cwsr_desc->rocm_requires_cwsr_fix && + needs_gfx1151_cwsr_fix()) { + error_message = "Linux kernel missing support"; + return false; } auto* spec = try_get_spec_for_recipe(recipe); if (spec) { @@ -1365,11 +1363,11 @@ json SystemInfo::build_recipes_info(const json& devices) { : "Backend is supported but not installed."; backend["message"] = install_error.empty() ? default_message : install_error; - bool is_rocm_backend = (def.recipe == "sd-cpp" && def.backend == "rocm") || - (def.recipe == "llamacpp" && def.backend == "rocm") || - (def.recipe == "vllm" && def.backend == "rocm"); + const auto* cwsr_desc = backends::descriptor_for(def.recipe); + bool is_rocm_backend = def.backend == "rocm" && cwsr_desc && + cwsr_desc->rocm_requires_cwsr_fix; - // Special action for ROCm backends on llamacpp/sd-cpp/vllm if CWSR fix is missing + // Special action for ROCm backends that need the gfx1151 CWSR fix. 
if (is_rocm_backend && !install_error.empty() && needs_gfx1151_cwsr_fix()) { backend["action"] = "Visit https://lemonade-server.ai/gfx1151_linux.html"; From 94ebbab1c8e4fa5360201fb4bb36c0098bb2629a Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:32:42 -0400 Subject: [PATCH 18/39] refactor(backends): migrate install availability to a check_install ops hook is_recipe_installed now finds the managed binary generically and asks the backend's ops whether it's actually installed, instead of hardcoding the llamacpp-system HIP check and the flm PATH fallback: - check_install(backend, binary_found) ops hook; base = installed iff binary found. LlamaCppOps adds the ggml HIP-plugin requirement for the "system" build on AMD GPUs; FlmOps treats a PATH-installed flm as present. - is_ggml_hip_plugin_available moves into backends/llamacpp; find_flm_executable and run_flm_validate move into backends/fastflowlm. Removed from path_utils (+ their orphaned decls/comments). system_info no longer carries llamacpp/flm-specific availability knowledge. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 16 ++ .../backends/fastflowlm/fastflowlm_models.h | 6 + src/cpp/include/lemon/utils/path_utils.h | 23 --- .../backends/fastflowlm/fastflowlm_models.cpp | 124 +++++++++++++- .../backends/fastflowlm/fastflowlm_server.cpp | 12 +- .../backends/llamacpp/llamacpp_server.cpp | 59 +++++++ src/cpp/server/system_info.cpp | 38 ++--- src/cpp/server/utils/path_utils.cpp | 161 ------------------ 8 files changed, 224 insertions(+), 215 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index 854065f24..cf796cb37 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -99,6 +99,22 @@ class BackendOps { (void)backend; return file_version; } + + // Result of a backend-specific install check: whether the backend variant is + // usable, plus an optional error explaining why not. + struct InstallCheck { + bool installed = false; + std::string error; + }; + + // Decide whether a backend variant is installed, given whether its managed + // binary was found on disk. Default: installed iff the binary was found. + // llamacpp's "system" build also requires the ggml HIP plugin when an AMD GPU + // is present; flm can be a system PATH package even without a managed binary. + virtual InstallCheck check_install(const std::string& backend, bool binary_found) const { + (void)backend; + return {binary_found, ""}; + } }; // Shared default ops instance for backends that override nothing. 
diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h index 20c7a96b8..910e25be6 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -34,6 +34,12 @@ std::filesystem::path find_flm_config_path_from_repo_dir(const std::string& repo // Read the model's max context window from its FLM config.json (0 if unknown). int64_t read_flm_max_context_window(const ModelInfo& info); +// Locate the flm executable on PATH / install dirs ("" if not found). +std::string find_flm_executable(); + +// Run `flm validate` and report readiness; error_message on failure. +bool run_flm_validate(const std::string& flm_path, std::string& error_message); + // Detect the installed FLM version via `flm version` ("unknown" if unavailable). std::string flm_version(); diff --git a/src/cpp/include/lemon/utils/path_utils.h b/src/cpp/include/lemon/utils/path_utils.h index 96561186c..63f142ee6 100644 --- a/src/cpp/include/lemon/utils/path_utils.h +++ b/src/cpp/include/lemon/utils/path_utils.h @@ -35,22 +35,6 @@ bool is_safe_executable_path(const std::string& path); */ bool looks_like_path(const std::string& v); -/** - * Find the FLM executable (flm.exe on Windows, flm on Unix). - * Uses SearchPathA on Windows (same API as CreateProcessA) to search PATH, - * then falls back to the default installation directory. - * @return Full path to flm executable, or empty string if not found. - */ -std::string find_flm_executable(); - -/** - * Run 'flm validate' command and check if it succeeds. - * @param flm_path Optional path to flm executable. If empty, will search for it. - * @param error_message Output parameter for error message if validation fails. - * @return true if validation succeeds, false otherwise. 
- */ -bool run_flm_validate(const std::string& flm_path, std::string& error_message); - /** * Get an environment variable as UTF-8 text. */ @@ -73,13 +57,6 @@ std::string path_to_utf8(const std::filesystem::path& path); */ std::string find_executable_in_path(const std::string& executable_name); -/** - * Check if the HIP plugin for GGML backends is available on the system. - * This function checks common installation paths for libggml-hip.so. - * @return true if the HIP plugin is found, false otherwise. - */ -bool is_ggml_hip_plugin_available(); - /** * Set the lemonade cache directory. Must be called once at startup before * get_cache_dir(). After this call, get_cache_dir() returns this path. diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp index 0331cc895..a2c4ad52f 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -130,7 +130,7 @@ std::string find_flm_binary() { return BackendUtils::get_backend_binary_path(*spec, "npu"); } catch (...) { #ifndef _WIN32 - return lemon::utils::find_flm_executable(); + return find_flm_executable(); #else return ""; #endif @@ -540,7 +540,7 @@ std::string flm_version() { } // Find the flm executable using shared utility - std::string flm_path = lemon::utils::find_flm_executable(); + std::string flm_path = find_flm_executable(); if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) { return "unknown"; } @@ -599,6 +599,126 @@ std::string flm_version() { } +std::string find_flm_executable() { +#ifdef _WIN32 + // On Windows, only check the Lemonade install directory (auto-installed zip). + // No system PATH fallback - FLM should be installed via install_backend(). 
+ std::string install_dir = (fs::path(lemon::utils::get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string(); + if (fs::exists(install_dir)) { + for (const auto& entry : fs::recursive_directory_iterator(install_dir)) { + if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") { + std::string path = entry.path().string(); + if (lemon::utils::is_safe_executable_path(path)) { + return path; + } + } + } + } + return ""; +#else + // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`. + if (!lemon::utils::find_executable_in_path("flm").empty()) { + return "flm"; + } + return ""; +#endif +} + +bool run_flm_validate(const std::string& flm_path, std::string& error_message) { + std::string flm_exe = flm_path.empty() ? find_flm_executable() : flm_path; + if (flm_exe.empty()) { + error_message = "FLM executable not found"; + return false; + } + if (!lemon::utils::is_safe_executable_path(flm_exe)) { + error_message = "FLM path contains invalid characters"; + return false; + } + + std::string command = "\"" + flm_exe + "\" validate --json"; + std::string output; + int exit_code; +#ifdef _WIN32 + exit_code = lemon::utils::ProcessManager::run_command(command, output); +#else + FILE* pipe = popen(command.c_str(), "r"); + if (!pipe) { + error_message = "Failed to execute " + flm_exe; + return false; + } + + char buffer[1024]; + while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { + output += buffer; + } + + exit_code = pclose(pipe); + if (exit_code != -1) { + exit_code = WEXITSTATUS(exit_code); + } +#endif + + try { + if (!output.empty()) { + json j = lemon::utils::JsonUtils::parse(output); + if (j.is_object()) { + // Check for overall status + bool validation_ok = false; + if (j.contains("ready")) { + validation_ok = j["ready"].get(); + } + + if (validation_ok) { + error_message.clear(); + return true; + } + + std::vector errors; + + if (j.contains("amd_device_found") && !j["amd_device_found"].get()) { + 
errors.push_back("No AMD NPU device found."); + } + + if (j.contains("all_fw_ok") && !j["all_fw_ok"].get()) { + errors.push_back("NPU firmware is incompatible."); + } + if (j.contains("kernel_ok") && !j["kernel_ok"].get()) { + errors.push_back("Kernel version is incompatible."); + } + + if (j.contains("memlock_ok") && !j["memlock_ok"].get()) { + errors.push_back("Memlock limits are too low."); + } + + if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get()) { + errors.push_back("NPU driver version is too old."); + } + + if (errors.empty()) { + error_message = "NPU validation failed."; + } else { + error_message = ""; + for (size_t i = 0; i < errors.size(); ++i) { + error_message += errors[i] + (i == errors.size() - 1 ? "" : " "); + } + } + return false; + } + } + } catch (...) { + // Fallback for non-JSON output or parsing error + } + + if (exit_code != 0) { + error_message = "flm validate failed with exit code " + std::to_string(exit_code); + return false; + } + + error_message.clear(); + return true; +} + + } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 772fac2d3..e251e8240 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -170,7 +170,7 @@ void FastFlowLMServer::load(const std::string& model_name, // Validate NPU hardware/drivers std::string flm_path = get_flm_path(); std::string validate_error; - if (!utils::run_flm_validate(flm_path, validate_error)) { + if (!fastflowlm::run_flm_validate(flm_path, validate_error)) { throw std::runtime_error("FLM NPU validation failed: " + validate_error + "\nVisit " + DRIVER_INSTALL_URL + " for driver installation instructions."); } @@ -457,7 +457,7 @@ std::string FastFlowLMServer::get_flm_path() { } #else // On Linux, FLM is installed as a system package (in PATH) - std::string 
flm_path = utils::find_flm_executable(); + std::string flm_path = fastflowlm::find_flm_executable(); if (!flm_path.empty()) { LOG(INFO, "FastFlowLM") << "Found flm at: " << flm_path << std::endl; } else { @@ -515,6 +515,14 @@ class FlmOps : public BackendOps { } return file_version; } + + InstallCheck check_install(const std::string&, bool binary_found) const override { + // On Linux FLM is a system package on PATH, not in the managed install dir. + if (!binary_found && !find_flm_executable().empty()) { + return {true, ""}; + } + return {binary_found, ""}; + } }; } // namespace diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 0d8121a37..d98ce46e6 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -701,6 +701,52 @@ std::string system_llamacpp_version() { } +bool is_ggml_hip_plugin_available() { +#ifdef __linux__ + // Allow distros/packagers that install outside the FHS paths below + // (e.g. NixOS, custom prefixes) to point directly at libggml-hip.so. + if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) { + // Require the basename to look like the HIP plugin (libggml-hip*.so*, + // case-insensitive, versioned sonames allowed). This is a sanity check, + // not a security boundary: the path is not forwarded to ggml's loader, + // so we cannot verify it is actually loadable. It only guards against an + // accidental override pointing at an unrelated existing file. 
+ std::string name = fs::path(env).filename().string(); + std::transform(name.begin(), name.end(), name.begin(), + [](unsigned char c) { return std::tolower(c); }); + const bool name_matches = name.rfind("libggml-hip", 0) == 0 && + name.find(".so") != std::string::npos; + // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing + // filesystem overload: an odd or malformed path resolves to "not a + // regular file" (ec set) instead of raising a filesystem_error. + std::error_code hip_path_ec; + if (name_matches && fs::is_regular_file(env, hip_path_ec)) { + return true; + } + } + // On Linux x86_64, check common system library paths for the HIP plugin + std::vector possible_paths = { + // Debian/Ubuntu multiarch path (most common) + "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so", + // Arch AUR path + "/usr/lib/libggml-hip.so", + // Standard Linux paths + "/usr/lib/ggml/backends0/libggml-hip.so", + "/usr/lib64/ggml/backends0/libggml-hip.so" + }; + + // Check all possible paths + for (const auto& path : possible_paths) { + if (fs::exists(path)) { + return true; + } + } +#endif + + return false; +} + + // llamacpp model-management behavior: GGUF metadata + capability labels. class LlamaCppOps : public BackendOps { public: @@ -774,6 +820,19 @@ class LlamaCppOps : public BackendOps { } return file_version; } + + InstallCheck check_install(const std::string& backend, bool binary_found) const override { + // The system llama-server also needs the ggml HIP plugin for ROCm GPU + // acceleration when an AMD GPU (KFD) is present. 
+ if (binary_found && backend == "system") { +#ifdef __linux__ + if (std::filesystem::exists("/sys/class/kfd") && !is_ggml_hip_plugin_available()) { + return {false, "HIP plugin libggml-hip.so not installed"}; + } +#endif + } + return {binary_found, ""}; + } }; } // namespace diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index b117bf167..d77b80830 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -573,38 +573,22 @@ static bool is_recipe_installed(const std::string& recipe, const std::string& ba error_message = "Linux kernel missing support"; return false; } - auto* spec = try_get_spec_for_recipe(recipe); - if (spec) { + // Find the managed binary, then let the backend's ops decide installed-ness + // (llamacpp "system" also needs the HIP plugin; flm can be a PATH package). + bool binary_found = false; + if (auto* spec = try_get_spec_for_recipe(recipe)) { try { BackendUtils::get_backend_binary_path(*spec, backend); - - // For system llamacpp backend, also verify the HIP plugin is available - // This is required for ROCm GPU acceleration with dynamically loaded backends - if (recipe == "llamacpp" && backend == "system") { -#ifdef __linux__ - // Check if AMD GPU driver is loaded (KFD indicates amdgpu driver) - if (fs::exists("/sys/class/kfd")) { - // System has AMD GPU(s), so we need the HIP plugin - if (!is_ggml_hip_plugin_available()) { - error_message = "HIP plugin libggml-hip.so not installed"; - return false; - } - } -#endif - } - - return true; + binary_found = true; } catch (...) 
{ -#ifndef _WIN32 - // On Linux, FLM is installed as a system package (in PATH, not install dir) - if (recipe == "flm" && !utils::find_flm_executable().empty()) { - return true; - } -#endif - return false; + binary_found = false; } } - return false; + auto check = backends::ops_for(recipe)->check_install(backend, binary_found); + if (!check.installed && !check.error.empty()) { + error_message = check.error; + } + return check.installed; } static std::string get_recipe_version(const std::string& recipe, const std::string& backend) { diff --git a/src/cpp/server/utils/path_utils.cpp b/src/cpp/server/utils/path_utils.cpp index dc7492295..fb8591337 100644 --- a/src/cpp/server/utils/path_utils.cpp +++ b/src/cpp/server/utils/path_utils.cpp @@ -103,30 +103,6 @@ bool looks_like_path(const std::string& v) { } } -std::string find_flm_executable() { -#ifdef _WIN32 - // On Windows, only check the Lemonade install directory (auto-installed zip). - // No system PATH fallback - FLM should be installed via install_backend(). - std::string install_dir = (fs::path(get_downloaded_bin_dir()) / "flm" / "npu").make_preferred().string(); - if (fs::exists(install_dir)) { - for (const auto& entry : fs::recursive_directory_iterator(install_dir)) { - if (entry.is_regular_file() && entry.path().filename().string() == "flm.exe") { - std::string path = entry.path().string(); - if (is_safe_executable_path(path)) { - return path; - } - } - } - } - return ""; -#else - // Walk PATH directly — minimal Fedora/openSUSE containers do not ship `which`. - if (!find_executable_in_path("flm").empty()) { - return "flm"; - } - return ""; -#endif -} std::string find_executable_in_path(const std::string& executable_name) { if (!is_safe_executable_path(executable_name)) { @@ -180,50 +156,6 @@ std::string find_executable_in_path(const std::string& executable_name) { #endif } -bool is_ggml_hip_plugin_available() { -#ifdef __linux__ - // Allow distros/packagers that install outside the FHS paths below - // (e.g. 
NixOS, custom prefixes) to point directly at libggml-hip.so. - if (const char* env = std::getenv("LEMONADE_GGML_HIP_PATH"); env && *env) { - // Require the basename to look like the HIP plugin (libggml-hip*.so*, - // case-insensitive, versioned sonames allowed). This is a sanity check, - // not a security boundary: the path is not forwarded to ggml's loader, - // so we cannot verify it is actually loadable. It only guards against an - // accidental override pointing at an unrelated existing file. - std::string name = fs::path(env).filename().string(); - std::transform(name.begin(), name.end(), name.begin(), - [](unsigned char c) { return std::tolower(c); }); - const bool name_matches = name.rfind("libggml-hip", 0) == 0 && - name.find(".so") != std::string::npos; - // LEMONADE_GGML_HIP_PATH is user-controlled, so use the non-throwing - // filesystem overload: an odd or malformed path resolves to "not a - // regular file" (ec set) instead of raising a filesystem_error. - std::error_code hip_path_ec; - if (name_matches && fs::is_regular_file(env, hip_path_ec)) { - return true; - } - } - // On Linux x86_64, check common system library paths for the HIP plugin - std::vector possible_paths = { - // Debian/Ubuntu multiarch path (most common) - "/usr/lib/x86_64-linux-gnu/ggml/backends0/libggml-hip.so", - // Arch AUR path - "/usr/lib/libggml-hip.so", - // Standard Linux paths - "/usr/lib/ggml/backends0/libggml-hip.so", - "/usr/lib64/ggml/backends0/libggml-hip.so" - }; - - // Check all possible paths - for (const auto& path : possible_paths) { - if (fs::exists(path)) { - return true; - } - } -#endif - - return false; -} std::string get_cache_dir() { // If set_cache_dir() was called at startup, use that @@ -295,98 +227,5 @@ std::string get_downloaded_bin_dir() { return bin_dir; } -bool run_flm_validate(const std::string& flm_path, std::string& error_message) { - std::string flm_exe = flm_path.empty() ? 
find_flm_executable() : flm_path; - if (flm_exe.empty()) { - error_message = "FLM executable not found"; - return false; - } - if (!is_safe_executable_path(flm_exe)) { - error_message = "FLM path contains invalid characters"; - return false; - } - - std::string command = "\"" + flm_exe + "\" validate --json"; - std::string output; - int exit_code; -#ifdef _WIN32 - exit_code = ProcessManager::run_command(command, output); -#else - FILE* pipe = popen(command.c_str(), "r"); - if (!pipe) { - error_message = "Failed to execute " + flm_exe; - return false; - } - - char buffer[1024]; - while (fgets(buffer, sizeof(buffer), pipe) != nullptr) { - output += buffer; - } - - exit_code = pclose(pipe); - if (exit_code != -1) { - exit_code = WEXITSTATUS(exit_code); - } -#endif - - try { - if (!output.empty()) { - json j = JsonUtils::parse(output); - if (j.is_object()) { - // Check for overall status - bool validation_ok = false; - if (j.contains("ready")) { - validation_ok = j["ready"].get(); - } - - if (validation_ok) { - error_message.clear(); - return true; - } - - std::vector errors; - - if (j.contains("amd_device_found") && !j["amd_device_found"].get()) { - errors.push_back("No AMD NPU device found."); - } - - if (j.contains("all_fw_ok") && !j["all_fw_ok"].get()) { - errors.push_back("NPU firmware is incompatible."); - } - if (j.contains("kernel_ok") && !j["kernel_ok"].get()) { - errors.push_back("Kernel version is incompatible."); - } - - if (j.contains("memlock_ok") && !j["memlock_ok"].get()) { - errors.push_back("Memlock limits are too low."); - } - - if (j.contains("npu_driver_ok") && !j["npu_driver_ok"].get()) { - errors.push_back("NPU driver version is too old."); - } - - if (errors.empty()) { - error_message = "NPU validation failed."; - } else { - error_message = ""; - for (size_t i = 0; i < errors.size(); ++i) { - error_message += errors[i] + (i == errors.size() - 1 ? "" : " "); - } - } - return false; - } - } - } catch (...) 
{ - // Fallback for non-JSON output or parsing error - } - - if (exit_code != 0) { - error_message = "flm validate failed with exit code " + std::to_string(exit_code); - return false; - } - - error_message.clear(); - return true; -} } // namespace utils::lemon From 1ced08c910b10eaa7e472294cb68518d5a9b8e19 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:37:28 -0400 Subject: [PATCH 19/39] refactor(backends): descriptor-drive version comparison policy (Exact vs AtLeast) The update-required check special-cased recipe=="flm" to allow an installed version newer than the pin. Replace with a version_policy descriptor field (Exact default; flm = AtLeast for its system-managed package). system_info no longer names flm in the version-comparison logic. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_descriptor.h | 10 ++++++++++ src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 4 ++++ src/cpp/server/system_info.cpp | 7 ++++--- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index 57dd5a7b7..d640ca279 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -29,6 +29,12 @@ enum class SlotPolicy { Unmetered // never counts toward slots, never auto-evicted (cloud) }; +// How an installed backend version is compared against the expected pin. +enum class VersionPolicy { + Exact, // installed must match the expected version + AtLeast // installed >= expected is acceptable (system-managed packages, e.g. flm) +}; + inline const char* slot_policy_to_string(SlotPolicy p) { switch (p) { case SlotPolicy::Standard: return "standard"; @@ -79,6 +85,10 @@ struct BackendDescriptor { // CWSR fix. Gates the availability/remediation check for the "rocm" backend. 
bool rocm_requires_cwsr_fix = false; + // How the installed version is compared against the expected pin. Exact by + // default; system-managed packages (flm) accept any version >= expected. + VersionPolicy version_policy = VersionPolicy::Exact; + // The config.json section name for this backend, falling back to the recipe. std::string effective_config_section() const { return config_section.empty() ? recipe : config_section; diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index 24fc07470..b56c9e577 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -32,6 +32,10 @@ inline const BackendDescriptor descriptor = { /*experimental*/ false, /*web_display_name*/ "FastFlowLM NPU", /*web_priority*/ 3, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::AtLeast, // system-managed package }; } // namespace fastflowlm diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index d77b80830..4f5697b50 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -1401,9 +1401,10 @@ json SystemInfo::build_recipes_info(const json& devices) { return installed.compare(0, prefix.size(), prefix) == 0; }; #if !defined(_WIN32) - // On non-Windows, FLM is a system-managed package; a version newer - // than the minimum required is acceptable. - if (def.recipe == "flm") { + // System-managed packages (e.g. flm on Linux) accept a version newer + // than the minimum required. 
+ const auto* ver_desc = backends::descriptor_for(def.recipe); + if (ver_desc && ver_desc->version_policy == VersionPolicy::AtLeast) { auto installed_ver = utils::Version::parse(installed_version); auto expected_ver = utils::Version::parse(expected_version); // If either version cannot be parsed, fall back to exact equality check From e89b47cee5d07c7a3cae985e177264d02107f991 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:41:59 -0400 Subject: [PATCH 20/39] refactor(backends): move FLM model deletion into the fastflowlm folder The `flm remove` subprocess orchestration moves out of ModelManager::delete_model into backends/fastflowlm (flm_remove). model_manager keeps only the generic HF-cache deletion path; the flm branch is now a thin call into the backend. Verified: server_endpoints 69 pass. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../backends/fastflowlm/fastflowlm_models.h | 3 ++ .../backends/fastflowlm/fastflowlm_models.cpp | 29 +++++++++++ src/cpp/server/model_manager.cpp | 50 +------------------ 3 files changed, 34 insertions(+), 48 deletions(-) diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h index 910e25be6..87470300c 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -47,6 +47,9 @@ std::string flm_version(); void flm_download(const std::string& checkpoint, bool do_not_upgrade, DownloadProgressCallback progress_callback); +// Remove an installed FLM model by checkpoint via `flm remove`; throws on failure. 
+void flm_remove(const std::string& checkpoint); + } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp index a2c4ad52f..83d2080bc 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -8,6 +8,8 @@ #include "lemon/utils/json_utils.h" #include "lemon/utils/path_utils.h" #include +#include +#include #include "lemon/backends/backend_descriptor_registry.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" @@ -719,6 +721,33 @@ bool run_flm_validate(const std::string& flm_path, std::string& error_message) { } +void flm_remove(const std::string& checkpoint) { + if (checkpoint.empty()) { + throw std::runtime_error("FLM model has empty checkpoint field, cannot delete"); + } + std::string flm_path = find_flm_binary(); + if (flm_path.empty()) { + throw std::runtime_error("FLM executable not found"); + } + std::vector args = {"remove", checkpoint}; + auto handle = lemon::utils::ProcessManager::start_process(flm_path, args, "", false); + + int timeout_seconds = 60; + for (int i = 0; i < timeout_seconds * 10; ++i) { + if (!lemon::utils::ProcessManager::is_running(handle)) { + int exit_code = lemon::utils::ProcessManager::get_exit_code(handle); + if (exit_code != 0) { + throw std::runtime_error("FLM remove failed for " + checkpoint + + " (exit code " + std::to_string(exit_code) + ")"); + } + return; + } + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + lemon::utils::ProcessManager::stop_process(handle); + throw std::runtime_error("FLM remove timed out for " + checkpoint); +} + } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 9210afd02..abcbbe3b0 100644 --- a/src/cpp/server/model_manager.cpp 
+++ b/src/cpp/server/model_manager.cpp @@ -3800,55 +3800,9 @@ void ModelManager::delete_model(const std::string& model_name) { "Delete the file directly from: " + info.checkpoint()); } - // Handle FLM models separately + // FLM models have no local HF cache; deletion is the backend's `flm remove`. if (info.recipe == "flm") { - LOG(INFO, "ModelManager") << "Deleting FLM model: " << info.checkpoint() << std::endl; - - // Validate checkpoint is not empty - if (info.checkpoint().empty()) { - throw std::runtime_error("FLM model has empty checkpoint field, cannot delete"); - } - - // Find flm executable — on Windows flm.exe lives under the lemonade - // cache dir, not on PATH, so we must resolve the full path. - std::string flm_path = backends::fastflowlm::find_flm_binary(); - if (flm_path.empty()) { - throw std::runtime_error("FLM executable not found"); - } - - // Prepare arguments for 'flm remove' command - std::vector args = {"remove", info.checkpoint()}; - - LOG(INFO, "ProcessManager") << "Starting process: \"" << flm_path << "\""; - for (const auto& arg : args) { - LOG(INFO, "ProcessManager") << " \"" << arg << "\""; - } - LOG(INFO, "ProcessManager") << std::endl; - - // Run flm remove command - auto handle = utils::ProcessManager::start_process(flm_path, args, "", false); - - // Wait for process to complete - int timeout_seconds = 60; // 1 minute timeout for removal - for (int i = 0; i < timeout_seconds * 10; ++i) { - if (!utils::ProcessManager::is_running(handle)) { - int exit_code = utils::ProcessManager::get_exit_code(handle); - if (exit_code != 0) { - LOG(ERROR, "ModelManager") << "FLM remove failed with exit code: " << exit_code << std::endl; - throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove failed with exit code " + std::to_string(exit_code)); - } - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - - // Check if process is still running (timeout) - if 
(utils::ProcessManager::is_running(handle)) { - LOG(ERROR, "ModelManager") << "FLM remove timed out" << std::endl; - throw std::runtime_error("Failed to delete FLM model " + canonical_model_name + ": FLM remove timed out"); - } - - LOG(INFO, "ModelManager") << "Successfully deleted FLM model: " << canonical_model_name << std::endl; + backends::fastflowlm::flm_remove(info.checkpoint()); // Remove from user models if it's a user model if (is_user_model_name(canonical_model_name)) { From 55fa6f12dd12ff449f96cd03f8c6f5c42d8004ef Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:45:51 -0400 Subject: [PATCH 21/39] refactor(config): drive recipe_options() from descriptors, not per-recipe blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RuntimeConfig::recipe_options() had a hardcoded nested→flat translation block per recipe (llamacpp/whispercpp/moonshine/sdcpp/vllm). Replace with a single loop over the descriptors: each option's config.json key is derived from its name role (*_backend → "backend", *_args → variant "_args"/"args", *_device → "device", else the option name verbatim for sd-cpp's steps/cfg_scale/ width/height). Adding a backend no longer requires editing this function. Verified: server_endpoints 69 pass (config/params translation unchanged). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/server/runtime_config.cpp | 81 +++++++++++++------------------ 1 file changed, 34 insertions(+), 47 deletions(-) diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp index 0a14f006b..cfc3546b7 100644 --- a/src/cpp/server/runtime_config.cpp +++ b/src/cpp/server/runtime_config.cpp @@ -323,56 +323,43 @@ json RuntimeConfig::recipe_options(const std::string& backend) const { return val; }; - const std::string backend_args = backend + "_args"; - - if (config_.contains("llamacpp")) { - const auto& lc = config_["llamacpp"]; - if (lc.contains("backend")) result["llamacpp_backend"] = resolve_auto(lc["backend"]); - if (lc.contains(backend_args) && lc[backend_args] != "") { - result["llamacpp_args"] = lc[backend_args]; - } else if (lc.contains("args")) { - result["llamacpp_args"] = lc["args"]; - } - if (lc.contains("device")) result["llamacpp_device"] = lc["device"]; - } - - if (config_.contains("whispercpp")) { - const auto& wc = config_["whispercpp"]; - if (wc.contains("backend")) result["whispercpp_backend"] = resolve_auto(wc["backend"]); - if (wc.contains(backend_args) && wc[backend_args] != "") { - result["whispercpp_args"] = wc[backend_args]; - } else if (wc.contains("args")) { - result["whispercpp_args"] = wc["args"]; - } - } + auto ends_with = [](const std::string& s, const std::string& suf) { + return s.size() >= suf.size() && s.compare(s.size() - suf.size(), suf.size(), suf) == 0; + }; - if (config_.contains("moonshine")) { - const auto& ms = config_["moonshine"]; - if (ms.contains(backend_args) && ms[backend_args] != "") { - result["moonshine_args"] = ms[backend_args]; - } else if (ms.contains("args")) { - result["moonshine_args"] = ms["args"]; - } - } + const std::string backend_args = backend + "_args"; - if (config_.contains("sdcpp")) { - const auto& sd = config_["sdcpp"]; - if (sd.contains("backend")) result["sd-cpp_backend"] = resolve_auto(sd["backend"]); - if 
(sd.contains(backend_args) && sd[backend_args] != "") { - result["sdcpp_args"] = sd[backend_args]; - } else if (sd.contains("args")) { - result["sdcpp_args"] = sd["args"]; + // Translate each backend's nested config.json section into the flat + // recipe_options format, driven by the descriptor's option list — no + // per-recipe block. The flat key is the descriptor option name; the + // config.json key is derived from the option's role (its name suffix): + // *_backend -> "backend" *_args -> variant "_args" then "args" + // *_device -> "device" everything else -> the option name verbatim + // (sd-cpp's steps/cfg_scale/width/height/…) + for (const auto* desc : lemon::backends::all_descriptors()) { + const std::string section = desc->effective_config_section(); + if (!config_.contains(section) || !config_[section].is_object()) { + continue; + } + const auto& cfg = config_[section]; + for (const auto& opt : desc->options) { + if (ends_with(opt.name, "_backend")) { + if (cfg.contains("backend")) { + result[opt.name] = resolve_auto(cfg["backend"]); + } + } else if (ends_with(opt.name, "_args")) { + if (cfg.contains(backend_args) && cfg[backend_args] != "") { + result[opt.name] = cfg[backend_args]; + } else if (cfg.contains("args")) { + result[opt.name] = cfg["args"]; + } + } else { + const std::string ckey = ends_with(opt.name, "_device") ? 
"device" : opt.name; + if (cfg.contains(ckey)) { + result[opt.name] = cfg[ckey]; + } + } } - if (sd.contains("steps")) result["steps"] = sd["steps"]; - if (sd.contains("cfg_scale")) result["cfg_scale"] = sd["cfg_scale"]; - if (sd.contains("width")) result["width"] = sd["width"]; - if (sd.contains("height")) result["height"] = sd["height"]; - } - - if (config_.contains("vllm")) { - const auto& vl = config_["vllm"]; - if (vl.contains("backend")) result["vllm_backend"] = resolve_auto(vl["backend"]); - if (vl.contains("args")) result["vllm_args"] = vl["args"]; } if (config_.contains("ctx_size")) result["ctx_size"] = config_["ctx_size"]; From c3aff5976ef6d78074f97c49d9feddea61b4f018 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:51:43 -0400 Subject: [PATCH 22/39] =?UTF-8?q?polish(backends):=20build=20BackendSpec?= =?UTF-8?q?=20from=20the=20descriptor=20(dedup=20binary=20across=20descrip?= =?UTF-8?q?tor=E2=86=94server.h)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The backend binary name (and recipe) were duplicated between the descriptor (.h) and the BackendSpec literal (_server.h) — the cross-file redundancy. Remove the static SPEC member; each backend's spec() now builds the BackendSpec lazily from descriptor.binary (+ descriptor.recipe, or the explicit "ryzenai-server" install id where it differs) plus the class's get_install_params and split flag. In-class binary lookups go through spec(); server.cpp's sd upscale uses try_get_spec_for_recipe. Net: the binary name now lives in exactly one place (the descriptor). Lazy function-local statics also avoid any static-init-order coupling between the descriptor and the spec. Verified: builds green; system-info install detection unchanged. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../lemon/backends/fastflowlm/fastflowlm_server.h | 11 ----------- src/cpp/include/lemon/backends/kokoro/kokoro_server.h | 9 --------- .../include/lemon/backends/llamacpp/llamacpp_server.h | 9 --------- .../lemon/backends/moonshine/moonshine_server.h | 5 ----- .../include/lemon/backends/ryzenai/ryzenai_server.h | 9 --------- src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h | 9 --------- src/cpp/include/lemon/backends/vllm/vllm_server.h | 6 ------ .../lemon/backends/whispercpp/whispercpp_server.h | 9 --------- .../server/backends/fastflowlm/fastflowlm_server.cpp | 11 ++++++++--- src/cpp/server/backends/kokoro/kokoro_server.cpp | 11 ++++++++--- src/cpp/server/backends/llamacpp/llamacpp_server.cpp | 11 ++++++++--- .../server/backends/moonshine/moonshine_server.cpp | 11 ++++++++--- src/cpp/server/backends/ryzenai/ryzenai_server.cpp | 11 ++++++++--- src/cpp/server/backends/sdcpp/sdcpp_server.cpp | 11 ++++++++--- src/cpp/server/backends/vllm/vllm_server.cpp | 11 ++++++++--- .../server/backends/whispercpp/whispercpp_server.cpp | 11 ++++++++--- src/cpp/server/server.cpp | 2 +- 17 files changed, 65 insertions(+), 92 deletions(-) diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h index c422f4a4d..e4bce74d8 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h @@ -13,17 +13,6 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - // recipe - "flm", - // executable - #ifdef _WIN32 - "flm.exe" - #else - "flm" - #endif - , get_install_params - ); FastFlowLMServer(const std::string& log_level, ModelManager* model_manager = nullptr, BackendManager* backend_manager = 
nullptr); diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h index 9c628c076..ec8e74844 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h @@ -15,15 +15,6 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "kokoro", - #ifdef _WIN32 - "koko.exe" - #else - "koko" - #endif - , get_install_params - ); explicit KokoroServer(const std::string& log_level, ModelManager* model_manager, diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h index 8b28296c4..f1447c1ce 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h @@ -13,15 +13,6 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "llamacpp", - #ifdef _WIN32 - "llama-server.exe" - #else - "llama-server" - #endif - , get_install_params - ); LlamaCppServer(const std::string& log_level, ModelManager* model_manager, diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h index 611bfe51c..47ea21f58 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h @@ -14,11 +14,6 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = 
BackendSpec( - "moonshine", - "moonshine-server", - get_install_params - ); explicit MoonshineServer(const std::string& log_level, ModelManager* model_manager, diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h index f824cfde3..f3a6806e7 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai_server.h @@ -17,15 +17,6 @@ class RyzenAIServer : public WrappedServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "ryzenai-server", -#ifdef _WIN32 - "ryzenai-server.exe" -#else - "ryzenai-server" -#endif - , get_install_params - ); RyzenAIServer(const std::string& model_name, bool debug, ModelManager* model_manager, BackendManager* backend_manager); diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index 99be9e62c..65c470332 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -18,15 +18,6 @@ class SDServer : public WrappedServer, public IImageServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "sd-cpp", - #ifdef _WIN32 - "sd-server.exe" - #else - "sd-server" - #endif - , get_install_params - ); explicit SDServer(const std::string& log_level, ModelManager* model_manager, diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h index 700296b97..0293fa811 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm_server.h +++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h @@ -13,12 +13,6 @@ class VLLMServer : public WrappedServer { public: static InstallParams get_install_params(const std::string& backend, 
const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "vllm", - "vllm-server" - , get_install_params - , /*supports_split_archive=*/true - ); VLLMServer(const std::string& log_level, ModelManager* model_manager, diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h index 8dc88bbb4..9ddd4f2af 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h @@ -15,15 +15,6 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer { public: static InstallParams get_install_params(const std::string& backend, const std::string& version); - inline static const BackendSpec SPEC = BackendSpec( - "whispercpp", -#ifdef _WIN32 - "whisper-server.exe" -#else - "whisper-server" -#endif - , get_install_params - ); explicit WhisperServer(const std::string& log_level, ModelManager* model_manager, diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index e251e8240..fc5ecef9b 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/fastflowlm/fastflowlm_server.h" +#include "lemon/backends/fastflowlm/fastflowlm.h" #include "lemon/backends/fastflowlm/fastflowlm_models.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_ops.h" @@ -164,7 +165,7 @@ void FastFlowLMServer::load(const std::string& model_name, #ifdef _WIN32 // On Windows, auto-install FLM binary if needed (downloads zip and extracts) - backend_manager_->install_backend(SPEC.recipe, "npu"); + backend_manager_->install_backend(fastflowlm::spec()->recipe, "npu"); #endif // Validate NPU hardware/drivers @@ -448,7 +449,7 @@ std::string FastFlowLMServer::get_flm_path() { #ifdef _WIN32 // On Windows, use 
the standard install directory (auto-installed zip) try { - std::string path = BackendUtils::get_backend_binary_path(SPEC, "npu"); + std::string path = BackendUtils::get_backend_binary_path(*fastflowlm::spec(), "npu"); LOG(INFO, "FastFlowLM") << "Found flm at: " << path << std::endl; return path; } catch (const std::exception& e) { @@ -526,7 +527,11 @@ class FlmOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { return &FastFlowLMServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + FastFlowLMServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { static const FlmOps kOps; return &kOps; diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index 80d502ead..aa8ad871e 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/kokoro/kokoro_server.h" +#include "lemon/backends/kokoro/kokoro.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" @@ -74,7 +75,7 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in // Install kokoros if needed const std::string backend = default_kokoro_backend(); - backend_manager_->install_backend(SPEC.recipe, backend); + backend_manager_->install_backend(kokoro::spec()->recipe, backend); // Use pre-resolved model path fs::path model_path = fs::path(model_info.resolved_path()); @@ -94,7 +95,7 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in LOG(INFO, "KokoroServer") << "Using model: " << model_index["model"] << std::endl; // Get koko executable path - std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend); + std::string exe_path = BackendUtils::get_backend_binary_path(*kokoro::spec(), backend); // Choose a 
port port_ = choose_port(); @@ -239,7 +240,11 @@ class KokoroOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { return &KokoroServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + KokoroServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { static const KokoroOps kOps; return &kOps; diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index d98ce46e6..9b75b6eaa 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/llamacpp/llamacpp_server.h" +#include "lemon/backends/llamacpp/llamacpp.h" #include "lemon/backends/llamacpp/llamacpp_gguf.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_ops.h" @@ -290,7 +291,7 @@ void LlamaCppServer::load(const std::string& model_name, device_type_ = use_gpu ? DEVICE_GPU : DEVICE_CPU; // Install llama-server if needed (use per-model backend) - backend_manager_->install_backend(SPEC.recipe, llamacpp_backend); + backend_manager_->install_backend(llamacpp::spec()->recipe, llamacpp_backend); // Use pre-resolved GGUF path. Skipped for hf_load models because llama-server // sources the weights itself via -hf; those models may not have local files. 
@@ -310,7 +311,7 @@ void LlamaCppServer::load(const std::string& model_name, port_ = choose_port(); // Get executable path - std::string executable = BackendUtils::get_backend_binary_path(SPEC, llamacpp_backend); + std::string executable = BackendUtils::get_backend_binary_path(*llamacpp::spec(), llamacpp_backend); // Check for embeddings and reranking support based on model type bool supports_embeddings = (model_info.type == ModelType::EMBEDDING); @@ -836,7 +837,11 @@ class LlamaCppOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { return &LlamaCppServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + LlamaCppServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { static const LlamaCppOps kOps; return &kOps; diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index b294f46ee..b9c8ebd34 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/moonshine/moonshine_server.h" +#include "lemon/backends/moonshine/moonshine.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" @@ -72,7 +73,7 @@ void MoonshineServer::load(const std::string& model_name, device_type_ = DEVICE_CPU; // Install moonshine-server if needed - backend_manager_->install_backend(SPEC.recipe, "cpu"); + backend_manager_->install_backend(moonshine::spec()->recipe, "cpu"); // Resolve model path from ModelManager (standard HF cache) std::string model_path = model_info.resolved_path(); @@ -98,7 +99,7 @@ void MoonshineServer::load(const std::string& model_name, } // Get executable path - std::string executable = BackendUtils::get_backend_binary_path(SPEC, "cpu"); + std::string executable = 
BackendUtils::get_backend_binary_path(*moonshine::spec(), "cpu"); LOG(INFO, "MoonshineServer") << "Using executable: " << executable << std::endl; // moonshine-server binds three consecutive ports: HTTP, WS (+1), TCP (+2). @@ -341,7 +342,11 @@ std::unique_ptr create(const BackendContext& ctx) { } -const BackendSpec* spec() { return &MoonshineServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + MoonshineServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { return default_backend_ops(); } } // namespace moonshine } // namespace backends diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index f6ba8f457..c175301f6 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/ryzenai/ryzenai_server.h" +#include "lemon/backends/ryzenai/ryzenai.h" #include "lemon/backends/backend_registry.h" #include "lemon/model_manager.h" #include "lemon/backends/backend_ops.h" @@ -43,7 +44,7 @@ RyzenAIServer::~RyzenAIServer() { bool RyzenAIServer::is_available() { try { - return !backends::BackendUtils::get_backend_binary_path(SPEC, "npu").empty(); + return !backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu").empty(); } catch (...) 
{ return false; } @@ -60,7 +61,7 @@ void RyzenAIServer::load(const std::string& model_name, backend_manager_->install_backend("ryzenai-llm", "npu"); // Get the path to ryzenai-server - std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(SPEC, "npu"); + std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu"); if (ryzenai_server_path.empty()) { throw std::runtime_error("RyzenAI-Server executable not found even after installation attempt"); } @@ -208,7 +209,11 @@ class RyzenAiOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { return &::lemon::RyzenAIServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec("ryzenai-server", descriptor.binary, + ::lemon::RyzenAIServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { static const RyzenAiOps kOps; return &kOps; diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp index 718855d8f..98f19e5ea 100644 --- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/sdcpp/sdcpp_server.h" +#include "lemon/backends/sdcpp/sdcpp.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/backend_manager.h" @@ -211,7 +212,7 @@ void SDServer::load(const std::string& model_name, } // Install sd-server if needed - backend_manager_->install_backend(SPEC.recipe, backend); + backend_manager_->install_backend(sdcpp::spec()->recipe, backend); // Get model path std::string model_path = model_info.resolved_path("main"); @@ -233,7 +234,7 @@ void SDServer::load(const std::string& model_name, LOG(DEBUG, "SDServer") << "Using model: " << model_path << std::endl; // Get sd-server executable path - std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, backend); + std::string exe_path = 
BackendUtils::get_backend_binary_path(*sdcpp::spec(), backend); // Choose a port port_ = choose_port(); @@ -757,7 +758,11 @@ std::unique_ptr create(const BackendContext& ctx) { } -const BackendSpec* spec() { return &SDServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + SDServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { return default_backend_ops(); } } // namespace sdcpp } // namespace backends diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp index 1ab4e22fc..085cd0f2a 100644 --- a/src/cpp/server/backends/vllm/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/vllm/vllm_server.h" +#include "lemon/backends/vllm/vllm.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_utils.h" #include "lemon/model_manager.h" @@ -123,7 +124,7 @@ void VLLMServer::load(const std::string& model_name, RuntimeConfig::validate_backend_choice("vllm", vllm_backend); // Install vllm-server if needed - backend_manager_->install_backend(SPEC.recipe, vllm_backend); + backend_manager_->install_backend(vllm::spec()->recipe, vllm_backend); // vLLM uses HuggingFace model names, not local file paths. // The checkpoint field in server_models.json is the HF model ID. 
@@ -138,7 +139,7 @@ void VLLMServer::load(const std::string& model_name, port_ = choose_port(); // Get executable path - std::string executable = BackendUtils::get_backend_binary_path(SPEC, vllm_backend); + std::string executable = BackendUtils::get_backend_binary_path(*vllm::spec(), vllm_backend); // Build command line arguments std::vector args; @@ -322,7 +323,11 @@ std::unique_ptr create(const BackendContext& ctx) { } -const BackendSpec* spec() { return &VLLMServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + VLLMServer::get_install_params, /*split=*/true); + return &kSpec; +} const BackendOps* ops() { return default_backend_ops(); } } // namespace vllm } // namespace backends diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index dfa0ebea9..dcccaf7ac 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -1,4 +1,5 @@ #include "lemon/backends/whispercpp/whispercpp_server.h" +#include "lemon/backends/whispercpp/whispercpp.h" #include "lemon/backends/backend_registry.h" #include "lemon/backends/backend_ops.h" #include "lemon/backends/backend_utils.h" @@ -242,7 +243,7 @@ void WhisperServer::load(const std::string& model_name, device_type_ = DEVICE_CPU; } - backend_manager_->install_backend(SPEC.recipe, whispercpp_backend); + backend_manager_->install_backend(whispercpp::spec()->recipe, whispercpp_backend); std::string model_path = model_info.resolved_path(); if (model_path.empty()) { @@ -258,7 +259,7 @@ void WhisperServer::load(const std::string& model_name, } // Get whisper-server executable path - std::string exe_path = BackendUtils::get_backend_binary_path(SPEC, whispercpp_backend); + std::string exe_path = BackendUtils::get_backend_binary_path(*whispercpp::spec(), whispercpp_backend); // Choose a port port_ = choose_port(); @@ -733,7 
+734,11 @@ class WhisperOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { return &WhisperServer::SPEC; } +const BackendSpec* spec() { + static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, + WhisperServer::get_install_params, /*split=*/false); + return &kSpec; +} const BackendOps* ops() { static const WhisperOps kOps; return &kOps; diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index 384412753..e4b7122f8 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -3118,7 +3118,7 @@ void Server::handle_image_upscale(const httplib::Request& req, httplib::Response // as a separate request from generation, which lets the frontend show // the original and upscaled images side by side with independent timing. std::string exe_dir = lemon::backends::BackendUtils::get_backend_binary_path( - lemon::backends::SDServer::SPEC, backend); + *lemon::backends::try_get_spec_for_recipe("sd-cpp"), backend); std::filesystem::path cli_exe = std::filesystem::path(exe_dir).parent_path() / #ifdef _WIN32 "sd-cli.exe"; From de6d3b1b9df5663a31646554458f827439c7181c Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 17:58:11 -0400 Subject: [PATCH 23/39] polish(backends): drop redundant recipe from descriptor support rows The recipe was repeated on every support row (6x in llamacpp.h). Introduce a recipe-free BackendSupport struct; the owning descriptor's recipe is filled in by recipe_defs() when flattening to RecipeBackendDef. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_descriptor.h | 2 +- .../include/lemon/backends/fastflowlm/fastflowlm.h | 2 +- src/cpp/include/lemon/backends/kokoro/kokoro.h | 4 ++-- src/cpp/include/lemon/backends/llamacpp/llamacpp.h | 12 ++++++------ src/cpp/include/lemon/backends/moonshine/moonshine.h | 6 +++--- src/cpp/include/lemon/backends/ryzenai/ryzenai.h | 2 +- src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 10 +++++----- src/cpp/include/lemon/backends/vllm/vllm.h | 2 +- .../include/lemon/backends/whispercpp/whispercpp.h | 10 +++++----- src/cpp/include/lemon/recipe_backend_def.h | 10 ++++++++++ src/cpp/server/system_info.cpp | 3 ++- 11 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index d640ca279..efe938404 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -61,7 +61,7 @@ struct BackendDescriptor { bool dynamic_models = false; // true = class supplies models at runtime (cloud), not server_models.json std::vector<BackendOption> options; // backend-specific knobs (common ones are automatic) - std::vector<RecipeBackendDef> support; // which OS / GPU families it runs on ({} = no local gating) + std::vector<BackendSupport> support; // which OS / GPU families it runs on ({} = no local gating) std::vector<std::string> default_labels; // labels injected when a model omits them std::vector<std::string> required_checkpoints{"main"}; // unconditional files; conditional ones checked in load() diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index b56c9e577..0c621f053 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -24,7 +24,7 @@ inline const BackendDescriptor descriptor = { /*dynamic_models*/ false, /*options*/ {}, /*support*/ { - {"flm", "npu",
{"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, + {"npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h index 3ebb9efbd..b1e52eba4 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -24,8 +24,8 @@ inline const BackendDescriptor descriptor = { /*dynamic_models*/ false, /*options*/ {}, /*support*/ { - {"kokoro", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, - {"kokoro", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, }, /*default_labels*/ {}, // kokoro models carry "tts" explicitly in server_models.json /*required_checkpoints*/ {"main"}, diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index 8e5435f9f..02ed728d7 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -31,14 +31,14 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to llama-server", "Llama.cpp Backend Options"}, }, /*support*/ { - {"llamacpp", "system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"}, - {"llamacpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, - {"llamacpp", "cuda", {"windows", "linux"}, + {"system", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/ARM64 CPU, GPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + {"cuda", {"windows", "linux"}, {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"}, - {"llamacpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, 
{"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"}, - {"llamacpp", "rocm", {"windows", "linux"}, + {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}, {"amd_gpu", {}}}, "x86_64 CPU, AMD iGPU, AMD dGPU; ARM64 CPU/GPU (Linux)"}, + {"rocm", {"windows", "linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, - {"llamacpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64 CPU; ARM64 CPU (Linux)"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h index 5b8faafe2..2c9feed2b 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -23,9 +23,9 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to moonshine-server", ""}, }, /*support*/ { - {"moonshine", "cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"}, - {"moonshine", "cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"}, - {"moonshine", "cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"}, + {"cpu", {"windows"}, {{"cpu", {"x86_64"}}}, "x86_64/arm64 CPU"}, + {"cpu", {"linux"}, {{"cpu", {"x86_64", "arm64"}}}, "x86_64/arm64 CPU"}, + {"cpu", {"macos"}, {{"cpu", {"arm64"}}}, "x86_64/arm64 CPU"}, }, /*default_labels*/ {"transcription", "realtime-transcription"}, /*required_checkpoints*/ {"main"}, diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h index c290c4dd1..13ebb9a7c 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -24,7 +24,7 @@ inline const BackendDescriptor descriptor = { /*dynamic_models*/ false, 
/*options*/ {}, /*support*/ { - {"ryzenai-llm", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, + {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 57ce30cbd..8cf299a2c 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -36,13 +36,13 @@ inline const BackendDescriptor descriptor = { {"flow_shift", "", 0.0, "SIZE", "Flow shift", "Stable Diffusion Options"}, }, /*support*/ { - {"sd-cpp", "rocm", {"windows", "linux"}, + {"rocm", {"windows", "linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx1152", "gfx103X", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, - {"sd-cpp", "cuda", {"linux"}, + {"cuda", {"linux"}, {{"nvidia_gpu", {"sm_75", "sm_80", "sm_86", "sm_89", "sm_90", "sm_100", "sm_120", "sm_121"}}}, "NVIDIA GPUs (Turing or newer)**"}, - {"sd-cpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"}, - {"sd-cpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, - {"sd-cpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}, {"nvidia_gpu", {}}}, "Vulkan-capable GPUs"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, }, /*default_labels*/ {"image"}, /*required_checkpoints*/ {"main"}, // flux text_encoder+vae validated together in load() diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h index 84a596168..97c58c715 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm.h +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -25,7 +25,7 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to 
vllm-server", "vLLM Options"}, }, /*support*/ { - {"vllm", "rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"}, + {"rocm", {"linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Strix Halo iGPU (gfx1151)"}, }, /*default_labels*/ {}, /*required_checkpoints*/ {"main"}, diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h index 8c4a29815..e62ee029c 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -29,12 +29,12 @@ inline const BackendDescriptor descriptor = { "Custom arguments to pass to whisper-server", "Whisper.cpp Options"}, }, /*support*/ { - {"whispercpp", "npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, - {"whispercpp", "rocm", {"windows", "linux"}, + {"npu", {"windows"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, + {"rocm", {"windows", "linux"}, {{"amd_gpu", {"gfx1150", "gfx1151", "gfx110X", "gfx120X"}}}, "Supported AMD ROCm iGPU/dGPU families*"}, - {"whispercpp", "vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"}, - {"whispercpp", "cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, - {"whispercpp", "metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, + {"vulkan", {"windows", "linux"}, {{"cpu", {"x86_64"}}, {"amd_gpu", {}}}, "x86_64 CPU"}, + {"cpu", {"windows", "linux"}, {{"cpu", {"x86_64"}}}, "x86_64 CPU"}, + {"metal", {"macos"}, {{"metal", {}}}, "Apple Silicon GPU"}, }, /*default_labels*/ {"transcription", "realtime-transcription"}, /*required_checkpoints*/ {"main"}, // npu_cache validated in load() (npu variant only) diff --git a/src/cpp/include/lemon/recipe_backend_def.h b/src/cpp/include/lemon/recipe_backend_def.h index 829ff0f78..ec0af9a9d 100644 --- a/src/cpp/include/lemon/recipe_backend_def.h +++ b/src/cpp/include/lemon/recipe_backend_def.h @@ -26,4 +26,14 @@ struct 
RecipeBackendDef { std::string device_summary = ""; }; +// A backend descriptor's support row, without the recipe (it's always the +// owning descriptor's recipe — assembling a RecipeBackendDef fills it in). Keeps +// the descriptor literals from repeating their own recipe on every row. +struct BackendSupport { + std::string backend; + std::set<std::string> supported_os; + DeviceConstraints devices; + std::string device_summary = ""; +}; + } // namespace lemon diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index 4f5697b50..e30108fe7 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -427,7 +427,8 @@ static const std::vector<RecipeBackendDef>& recipe_defs() { std::vector<RecipeBackendDef> v; for (const auto* desc : lemon::backends::all_descriptors()) { for (const auto& row : desc->support) { - v.push_back(row); + // Fill in the recipe (the owning descriptor's) per support row. + v.push_back({desc->recipe, row.backend, row.supported_os, row.devices, row.device_summary}); } } return v; From 070fcbcbebf59014163d381ad3aefa425426995c Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 18:06:28 -0400 Subject: [PATCH 24/39] polish(backends): remove dead llamacpp-special branch in version lookup The preceding generic block already handles backend_versions[recipe] for any recipe, so the recipe=="llamacpp" branch was unreachable duplicate code. Removing it also drops a hardcoded backend name from shared code.
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/server/backend_manager.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/cpp/server/backend_manager.cpp b/src/cpp/server/backend_manager.cpp index 2983d49ca..83b61e80c 100644 --- a/src/cpp/server/backend_manager.cpp +++ b/src/cpp/server/backend_manager.cpp @@ -64,15 +64,6 @@ std::string get_backend_runtime_version(const json& backend_versions, return backend_versions[recipe][runtime_key].get<std::string>(); } - // Only fall back to llamacpp runtime version if the recipe is llamacpp - if (recipe == "llamacpp" && - backend_versions.contains("llamacpp") && - backend_versions["llamacpp"].is_object() && - backend_versions["llamacpp"].contains(runtime_key) && - backend_versions["llamacpp"][runtime_key].is_string()) { - return backend_versions["llamacpp"][runtime_key].get<std::string>(); - } - throw std::runtime_error("backend_versions.json is missing runtime version for: " + recipe + ":" + runtime_key); } From 554ab6c567c5ed70e8149bc98fea960b1eca786f Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 18:13:20 -0400 Subject: [PATCH 25/39] polish(router): replace flm/cloud recipe-string checks with slot policy find_flm_server_by_type -> find_coexisting_server_by_type matches on SlotPolicy::CoexistByType; count_pinned_servers_by_type skips SlotPolicy::Unmetered instead of recipe=="cloud". router.cpp now holds zero backend-name string literals; both behaviors are unchanged (flm is the only CoexistByType backend, cloud the only Unmetered one).
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/router.h | 2 +- src/cpp/server/router.cpp | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cpp/include/lemon/router.h b/src/cpp/include/lemon/router.h index e98a8b11d..a4f2d9629 100644 --- a/src/cpp/include/lemon/router.h +++ b/src/cpp/include/lemon/router.h @@ -167,7 +167,7 @@ class Router { bool has_npu_server() const; WrappedServer* find_npu_server() const; WrappedServer* find_npu_server_by_recipe(const std::string& recipe) const; - WrappedServer* find_flm_server_by_type(ModelType type) const; + WrappedServer* find_coexisting_server_by_type(ModelType type) const; void evict_all_npu_servers(); void evict_server(WrappedServer* server, int timeout_seconds = -1); void evict_all_servers(); diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp index 307c51294..514a9773e 100644 --- a/src/cpp/server/router.cpp +++ b/src/cpp/server/router.cpp @@ -225,10 +225,11 @@ WrappedServer* Router::find_npu_server_by_recipe(const std::string& recipe) cons return nullptr; } -WrappedServer* Router::find_flm_server_by_type(ModelType type) const { +WrappedServer* Router::find_coexisting_server_by_type(ModelType type) const { for (const auto& server : loaded_servers_) { if (server->is_backend_alive() && - server->get_recipe_options().get_recipe() == "flm" && + slot_policy_for_recipe(server->get_recipe_options().get_recipe()) == + SlotPolicy::CoexistByType && server->get_model_type() == type) { return server.get(); } @@ -455,7 +456,7 @@ void Router::load_model(const std::string& model_name, evict_server(peer); } // 2. 
Evict FLM of the SAME model type (max 1 per type: 1 LLM, 1 transcription, 1 embed) - WrappedServer* same_type_flm = find_flm_server_by_type(model_type); + WrappedServer* same_type_flm = find_coexisting_server_by_type(model_type); if (same_type_flm) { LOG(INFO, "Router") << "FLM " << model_type_to_string(model_type) << " slot occupied by: " << same_type_flm->get_model_name() @@ -1449,7 +1450,8 @@ void Router::responses_stream(const std::string& request_body, httplib::DataSink int Router::count_pinned_servers_by_type(ModelType type) const { int count = 0; for (const auto& server : loaded_servers_) { - if (server->get_recipe_options().get_recipe() == "cloud") { + // Unmetered servers (cloud) never occupy a slot, so they don't count. + if (is_unmetered_recipe(server->get_recipe_options().get_recipe())) { continue; } if (server->is_backend_alive() && server->get_model_type() == type && server->is_pinned()) { From b4547cdbb5355950506a1efb24d73fb4b3dcdb4f Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 18:18:21 -0400 Subject: [PATCH 26/39] polish(backends): descriptor flag for self-managed downloads, not recipe==flm Add BackendDescriptor::self_manages_downloads (true only for flm) and ModelManager::backend_self_manages_downloads(). The two load-time auto-download guards in server.cpp/ollama_api.cpp now consult it instead of hardcoding recipe != "flm". flm is the only backend with the flag set, so behavior is identical. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_descriptor.h | 5 +++++ src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 1 + src/cpp/include/lemon/model_manager.h | 4 ++++ src/cpp/server/model_manager.cpp | 5 +++++ src/cpp/server/ollama_api.cpp | 5 +++-- src/cpp/server/server.cpp | 3 ++- 6 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index efe938404..a5dc97603 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -89,6 +89,11 @@ struct BackendDescriptor { // default; system-managed packages (flm) accept any version >= expected. VersionPolicy version_policy = VersionPolicy::Exact; + // True if the backend pulls its own models on demand (flm self-pulls via its + // CLI) rather than being pre-downloaded from Hugging Face by the router. Such + // backends are skipped by the load-time auto-download path. + bool self_manages_downloads = false; + // The config.json section name for this backend, falling back to the recipe. std::string effective_config_section() const { return config_section.empty() ? 
recipe : config_section; diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index 0c621f053..d773f1bc4 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -36,6 +36,7 @@ inline const BackendDescriptor descriptor = { /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ false, /*version_policy*/ VersionPolicy::AtLeast, // system-managed package + /*self_manages_downloads*/ true, // flm pulls its own models via the flm CLI }; } // namespace fastflowlm diff --git a/src/cpp/include/lemon/model_manager.h b/src/cpp/include/lemon/model_manager.h index f12cd3e9a..abdbeae71 100644 --- a/src/cpp/include/lemon/model_manager.h +++ b/src/cpp/include/lemon/model_manager.h @@ -216,6 +216,10 @@ class ModelManager { // Check if model is downloaded bool is_model_downloaded(const std::string& model_name); + // True if the model's backend pulls its own models on demand (e.g. flm) and + // so should be skipped by the router's load-time auto-download path. + bool backend_self_manages_downloads(const std::string& recipe) const; + // Shared Hugging Face completeness check: true if all required checkpoints // are present and complete (per-backend file validation runs via ops). The // default BackendOps::is_downloaded delegates here for HF-backed backends. 
diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index abcbbe3b0..7ed3c737a 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -2277,6 +2277,11 @@ bool ModelManager::is_model_downloaded(const std::string& model_name) { return false; } +bool ModelManager::backend_self_manages_downloads(const std::string& recipe) const { + const auto* desc = backends::descriptor_for(recipe); + return desc && desc->self_manages_downloads; +} + void ModelManager::download_registered_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress_callback) { // The backend's ops own the download (shared HF engine by default; flm pulls // via the flm CLI; cloud is a no-op). diff --git a/src/cpp/server/ollama_api.cpp b/src/cpp/server/ollama_api.cpp index 7687caab4..0604a3935 100644 --- a/src/cpp/server/ollama_api.cpp +++ b/src/cpp/server/ollama_api.cpp @@ -238,8 +238,9 @@ void OllamaApi::auto_load_model(const std::string& model) { auto info = model_manager_->get_model_info(name); - // Download if not cached - if (info.recipe != "flm" && !model_manager_->is_model_downloaded(name)) { + // Download if not cached (backends that self-manage downloads pull on load) + if (!model_manager_->backend_self_manages_downloads(info.recipe) && + !model_manager_->is_model_downloaded(name)) { LOG(INFO, "OllamaApi") << "Model not cached, downloading..." 
<< std::endl; model_manager_->download_registered_model(info, true); info = model_manager_->get_model_info(name); diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index e4b7122f8..22af41fda 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -1575,7 +1575,8 @@ void Server::auto_load_model_if_needed(const std::string& requested_model) { // - If model is NOT downloaded: Download it from HuggingFace // - If model IS downloaded: Skip HuggingFace API check entirely (use cached version) // Only the /pull endpoint should check for updates (uses do_not_upgrade=false) - if (info.recipe != "flm" && !model_manager_->is_model_downloaded(requested_model)) { + if (!model_manager_->backend_self_manages_downloads(info.recipe) && + !model_manager_->is_model_downloaded(requested_model)) { LOG(INFO, "Server") << "Model not cached, downloading from Hugging Face..." << std::endl; LOG(INFO, "Server") << "This may take several minutes for large models." << std::endl; model_manager_->download_registered_model(info, true); From 71c1bb134a78127a5e1341a682178da22cfcb314 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 18:27:50 -0400 Subject: [PATCH 27/39] polish(backends): move local-import checkpoint scan into BackendOps resolve_and_register_local_model() had a recipe if/else scanning the imported directory for each backend's primary artifact (.gguf / .bin / genai_config.json dir). Replace with BackendOps::find_imported_checkpoint(dir): default "" registers the directory (sd-cpp/kokoro/moonshine); llamacpp reuses resolve_gguf_path, whisper finds the .bin, ryzenai finds genai_config.json's dir (and its resolve_checkpoint_path now reuses the same scan). server.cpp holds no per-recipe import logic. Verified via local_import smoke tests for llamacpp (ignores mmproj), whisper, and a default backend. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 9 +++ .../backends/llamacpp/llamacpp_server.cpp | 5 ++ .../backends/ryzenai/ryzenai_server.cpp | 10 +++- .../backends/whispercpp/whispercpp_server.cpp | 16 ++++++ src/cpp/server/server.cpp | 56 ++----------------- 5 files changed, 42 insertions(+), 54 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index cf796cb37..2daaee228 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -59,6 +59,15 @@ class BackendOps { virtual std::string resolve_checkpoint_path(const ModelInfo& info, const CheckpointResolveContext& ctx) const; + // Find the primary checkpoint artifact inside a freshly-imported local + // directory (a local_import pull), e.g. the .gguf / .bin file or the + // genai_config.json directory. Returns the absolute path to register, or "" + // to register the directory itself. Default: "" (register the directory). + virtual std::string find_imported_checkpoint(const std::string& import_dir) const { + (void)import_dir; + return ""; + } + // Models supplied at runtime rather than from server_models.json (descriptor // dynamic_models = true). Default: none. cloud/flm override. virtual std::vector discover_models(const BackendOpsContext& ctx) const { diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 9b75b6eaa..51e040893 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -792,6 +792,11 @@ class LlamaCppOps : public BackendOps { return BackendOps::resolve_checkpoint_path(info, ctx); } + std::string find_imported_checkpoint(const std::string& import_dir) const override { + // The primary artifact is the (non-mmproj) GGUF file. 
+ return resolve_gguf_path(import_dir, ""); + } + std::string validate_checkpoint_file(const std::string& resolved_path) const override { // A .gguf file in the cache must start with the GGUF magic, else it's a // truncated/corrupt download and the model is not really present. diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index c175301f6..7bee8e46d 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -195,7 +195,13 @@ class RyzenAiOps : public BackendOps { std::string resolve_checkpoint_path(const ModelInfo&, const CheckpointResolveContext& ctx) const override { // RyzenAI models are a directory containing genai_config.json. - std::filesystem::path dir = lemon::utils::path_from_utf8(ctx.model_cache_path); + std::string found = find_imported_checkpoint(ctx.model_cache_path); + return found.empty() ? ctx.model_cache_path : found; // dir if not found + } + + std::string find_imported_checkpoint(const std::string& import_dir) const override { + // The primary artifact is the directory holding genai_config.json. 
+ std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir); if (hf_cache::exists(dir)) { for (const auto& entry : std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { @@ -204,7 +210,7 @@ class RyzenAiOps : public BackendOps { } } } - return ctx.model_cache_path; // directory even if genai_config not found + return ""; // register the directory itself } }; } // namespace diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index dcccaf7ac..ef1d9a7e5 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -731,6 +731,22 @@ class WhisperOps : public BackendOps { } return BackendOps::resolve_checkpoint_path(info, ctx); } + + std::string find_imported_checkpoint(const std::string& import_dir) const override { + // The primary artifact is the .bin model file. + std::filesystem::path dir = lemon::utils::path_from_utf8(import_dir); + if (!hf_cache::exists(dir)) { + return ""; + } + for (const auto& entry : + std::filesystem::recursive_directory_iterator(dir, hf_cache::dir_options())) { + if (entry.is_regular_file() && + entry.path().filename().string().find(".bin") != std::string::npos) { + return lemon::utils::path_to_utf8(entry.path()); + } + } + return ""; + } }; } // namespace diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index 22af41fda..30cd919d3 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -3958,60 +3958,12 @@ void Server::resolve_and_register_local_model( std::string recipe = model_data.value("recipe", ""); bool vision = model_data.value("vision", false); - std::string resolved_checkpoint; + // The backend's ops locate its primary artifact within the imported + // directory (.gguf / .bin file, genai_config.json dir, …); "" means register + // the directory itself. 
+ std::string resolved_checkpoint = backends::ops_for(recipe)->find_imported_checkpoint(dest_path); std::string resolved_mmproj; - // For RyzenAI LLM models, find genai_config.json - if (recipe == "ryzenai-llm") { - for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { - if (entry.is_regular_file() && entry.path().filename() == "genai_config.json") { - resolved_checkpoint = entry.path().parent_path().string(); - break; - } - } - if (resolved_checkpoint.empty()) { - resolved_checkpoint = dest_path; - } - } - // For llamacpp models, find the GGUF file - else if (recipe == "llamacpp") { - std::string gguf_file_found; - - // If no variant or variant not found, search for any .gguf file (excluding mmproj) - if (gguf_file_found.empty()) { - for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - std::string filename_lower = filename; - std::transform(filename_lower.begin(), filename_lower.end(), filename_lower.begin(), ::tolower); - - if (filename_lower.find(".gguf") != std::string::npos && - filename_lower.find("mmproj") == std::string::npos) { - gguf_file_found = entry.path().string(); - break; - } - } - } - } - - resolved_checkpoint = gguf_file_found.empty() ? 
dest_path : gguf_file_found; - } - // For whispercpp, find .bin file - else if (recipe == "whispercpp") { - for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { - if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - if (filename.find(".bin") != std::string::npos) { - resolved_checkpoint = entry.path().string(); - break; - } - } - } - if (resolved_checkpoint.empty()) { - resolved_checkpoint = dest_path; - } - } - // Search for mmproj file if vision is enabled or mmproj hint provided if (vision || !mmproj.empty()) { for (const auto& entry : std::filesystem::recursive_directory_iterator(dest_path)) { From 7d822208838238e63de3895d5d703fd041ce89b6 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 19:18:40 -0400 Subject: [PATCH 28/39] fix(llamacpp): parenthesize numeric_limits::max() for MSVC On Windows the merged include chain pulls in the windows.h max() macro into this TU, turning std::numeric_limits::max() into a syntax error (C2589). Wrap the calls as (std::numeric_limits::max)() so the macro cannot expand. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp index aeb4f2260..671900adb 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp @@ -60,7 +60,7 @@ static bool read_gguf_string(std::istream& in, std::string& value) { } static bool skip_bytes(std::istream& in, uint64_t bytes) { - if (bytes > static_cast(std::numeric_limits::max())) return false; + if (bytes > static_cast((std::numeric_limits::max)())) return false; in.seekg(static_cast(bytes), std::ios::cur); return static_cast(in); } @@ -100,7 +100,7 @@ static bool read_gguf_integer_value(std::istream& in, uint32_t type, int64_t& va case 10: { uint64_t v = 0; if (!read_le(in, v)) return false; - if (v > static_cast(std::numeric_limits::max())) return false; + if (v > static_cast((std::numeric_limits::max)())) return false; value = static_cast(v); return true; } @@ -132,7 +132,7 @@ static bool skip_gguf_value(std::istream& in, uint32_t type) { if (elem_type == 9) return false; uint64_t elem_size = gguf_scalar_size(elem_type); if (elem_size == 0) return false; - if (count > std::numeric_limits::max() / elem_size) return false; + if (count > (std::numeric_limits::max)() / elem_size) return false; return skip_bytes(in, count * elem_size); } From 6492260882f219155f895a4e784608669990c26f Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Mon, 22 Jun 2026 19:18:40 -0400 Subject: [PATCH 29/39] fix(flm): mark backend dynamic_models so its models register flm models come from flm's model_list.json at runtime (0 entries in server_models.json), but the descriptor had dynamic_models=false, so build_cache skipped flm's ops->discover_models() and flm models (e.g. llama3.2-1b-FLM) never registered -> 404. 
The build_cache comment already documents flm as a dynamic-discovery backend alongside cloud; align the descriptor with that intent. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_descriptor.h | 2 +- src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index a5dc97603..bd46c98c4 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -58,7 +58,7 @@ struct BackendDescriptor { SlotPolicy slot_policy = SlotPolicy::Standard; // default; override effective_slot_policy() if variant-dependent bool selectable_backend = false; // auto-creates "_backend" option + "--" flag bool uses_ctx_size = false; // opt in to the shared ctx_size option - bool dynamic_models = false; // true = class supplies models at runtime (cloud), not server_models.json + bool dynamic_models = false; // true = ops supply models at runtime (cloud, flm), not server_models.json std::vector options; // backend-specific knobs (common ones are automatic) std::vector support; // which OS / GPU families it runs on ({} = no local gating) diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index d773f1bc4..7b812bfb9 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -21,7 +21,7 @@ inline const BackendDescriptor descriptor = { /*slot_policy*/ SlotPolicy::CoexistByType, /*selectable_backend*/ false, /*uses_ctx_size*/ true, - /*dynamic_models*/ false, + /*dynamic_models*/ true, // models come from flm's model_list.json, not server_models.json /*options*/ {}, /*support*/ { {"npu", {"windows", "linux"}, {{"amd_npu", {"XDNA2"}}}, "XDNA2 NPU"}, From 
f7ec14caf32ff196c7743a4c971a1619b8f8d132 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 15:15:08 -0400 Subject: [PATCH 30/39] polish(backends): move moonshine download file-selection into ops model_manager's download path hardcoded recipe == "moonshine" to fetch a variant directory of files. Add BackendOps::select_checkpoint_files (default nullopt = the GGUF/direct-file defaults) and override it in MoonshineOps. The download path no longer names a backend. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 13 ++++++ .../backends/moonshine/moonshine_server.cpp | 42 ++++++++++++++++++- src/cpp/server/model_manager.cpp | 26 ++++-------- 3 files changed, 63 insertions(+), 18 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index 2daaee228..49faa68b4 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback (server-side only) @@ -68,6 +69,18 @@ class BackendOps { return ""; } + // Select the repo-relative files to download for the main checkpoint + // `main_variant`, for backends whose artifact layout isn't a GGUF file. + // Return nullopt to use the default GGUF selection. (Direct single-file + // variants — .safetensors/.pth/.ckpt — are handled generically upstream.) + // moonshine overrides: its variant names a directory of files to fetch. + virtual std::optional> select_checkpoint_files( + const std::string& main_variant, const std::vector& repo_files) const { + (void)main_variant; + (void)repo_files; + return std::nullopt; + } + // Models supplied at runtime rather than from server_models.json (descriptor // dynamic_models = true). Default: none. cloud/flm override. 
virtual std::vector discover_models(const BackendOpsContext& ctx) const { diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index 41545837e..a84506e35 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -8,8 +8,10 @@ #include "lemon/utils/http_client.h" #include "lemon/utils/process_manager.h" #include "lemon/error_types.h" +#include #include #include +#include #include #include #include @@ -370,12 +372,50 @@ std::unique_ptr create(const BackendContext& ctx) { } +namespace { +class MoonshineOps : public BackendOps { +public: + std::optional> select_checkpoint_files( + const std::string& main_variant, const std::vector& repo_files) const override { + // A Moonshine variant names a directory (e.g. "medium-streaming-en/quantized"); + // download every file under it. + std::string folder_prefix = main_variant; + if (!folder_prefix.empty() && folder_prefix.back() != '/') { + folder_prefix += "/"; + } + auto starts_with_ci = [](const std::string& s, const std::string& p) { + if (s.size() < p.size()) return false; + for (size_t i = 0; i < p.size(); ++i) { + if (std::tolower(static_cast(s[i])) != + std::tolower(static_cast(p[i]))) { + return false; + } + } + return true; + }; + std::vector files; + for (const auto& f : repo_files) { + if (starts_with_ci(f, folder_prefix)) { + files.push_back(f); + } + } + if (files.empty()) { + throw std::runtime_error("No Moonshine model files found in folder: " + main_variant); + } + return files; + } +}; +} // namespace + const BackendSpec* spec() { static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, MoonshineServer::get_install_params, /*split=*/false); return &kSpec; } -const BackendOps* ops() { return default_backend_ops(); } +const BackendOps* ops() { + static const MoonshineOps kOps; + return &kOps; +} } // namespace moonshine } // namespace backends } // 
namespace lemon diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 7ed3c737a..824a1ecce 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -3535,7 +3535,11 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, bool is_direct_file = ends_with(main_variant, ".safetensors") || ends_with(main_variant, ".pth") || ends_with(main_variant, ".ckpt"); - bool is_moonshine = info.recipe == "moonshine"; + + // Backends with a bespoke artifact layout (moonshine = a directory of + // files) select their own download set; nullopt = the default paths. + auto backend_files = + backends::ops_for(info.recipe)->select_checkpoint_files(main_variant, repo_files); if (is_direct_file) { // For non-GGUF model files, download the specified file directly @@ -3545,22 +3549,10 @@ void ModelManager::download_from_huggingface(const ModelInfo& info, } else { throw std::runtime_error("Model file not found in repository: " + main_variant); } - } else if (is_moonshine) { - // Moonshine variant is a directory path (e.g., "medium-streaming-en/quantized") - // Download all files under that directory - std::string folder_prefix = main_variant; - if (!folder_prefix.empty() && folder_prefix.back() != '/') { - folder_prefix += "/"; - } - for (const auto& file : repo_files) { - if (starts_with_ignore_case(file, folder_prefix)) { - files_to_download[main_repo_id].push_back(file); - } - } - if (files_to_download[main_repo_id].empty()) { - throw std::runtime_error("No Moonshine model files found in folder: " + main_variant); - } - LOG(INFO, "ModelManager") << "Moonshine: downloading " << files_to_download[main_repo_id].size() + } else if (backend_files) { + files_to_download[main_repo_id] = std::move(*backend_files); + LOG(INFO, "ModelManager") << info.recipe << ": downloading " + << files_to_download[main_repo_id].size() << " files from " << main_variant << std::endl; } else { // GGUF model: Use identify_gguf_models to 
determine which files to download From 6cc95524b1c8249059ef9a9d784744074b9a9850 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 15:20:29 -0400 Subject: [PATCH 31/39] polish(backends): move FLM unavailable-state machine into flm ops system_info hardcoded a recipe == "flm" block to classify FLM's supported-but-unavailable state (.deb/driver manual setup) and emit troubleshoot links. Add BackendOps::classify_unavailable (default nullopt = the generic installable/no-fetch path) and implement it in FlmOps. system_info no longer names a backend in its install-state machine. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 23 ++++++++++ .../backends/fastflowlm/fastflowlm_server.cpp | 37 ++++++++++++++++ src/cpp/server/system_info.cpp | 42 +++++-------------- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index 49faa68b4..c03c111f3 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -137,6 +137,29 @@ class BackendOps { (void)backend; return {binary_found, ""}; } + + // The /system-info state for a backend variant that is supported but not + // currently available (install probe failed). + struct UnavailableState { + std::string state; // "installable" | "update_required" | "action_required" + std::string message; // shown to the user + std::string action; // remediation (a URL or an install command) + bool attach_installed_version = false; // surface the installed version too + }; + + // Classify a "supported but not available" backend variant for /system-info, + // given the install probe's error text and the generic install command the + // caller would otherwise use. Return nullopt to use the generic + // installable/no-fetch default. 
flm overrides: it is a system .deb + drivers + // needing manual setup, so its states and remediation links differ. + virtual std::optional classify_unavailable( + const std::string& backend, const std::string& install_error, + const std::string& default_install_command) const { + (void)backend; + (void)install_error; + (void)default_install_command; + return std::nullopt; + } }; // Shared default ops instance for backends that override nothing. diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index fc5ecef9b..4a84ecd10 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -524,6 +525,42 @@ class FlmOps : public BackendOps { } return {binary_found, ""}; } + + std::optional classify_unavailable( + const std::string&, const std::string& install_error, + const std::string& default_install_command) const override { + // FLM needs richer state to guide users through manual setup (installing + // the .deb, xrt drivers, etc.) rather than an automatic backend install. 
+ bool is_not_installed = install_error.empty() + || install_error.find("not installed") != std::string::npos + || install_error.find("not found") != std::string::npos; + bool is_version_mismatch = install_error.find("requires") != std::string::npos; + + UnavailableState s; + if (is_not_installed) { + s.state = "installable"; + } else if (is_version_mismatch) { + s.state = "update_required"; + } else { + s.state = "action_required"; + } + s.message = install_error; + s.attach_installed_version = !is_not_installed; + +#ifdef __linux__ + (void)default_install_command; + s.action = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot"; +#elif defined(_WIN32) + if (!is_not_installed && !is_version_mismatch) { + s.action = "Visit https://lemonade-server.ai/driver_install.html"; + } else { + s.action = default_install_command; + } +#else + s.action = default_install_command; +#endif + return s; + } }; } // namespace diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index 3f7849a05..2c67a8f94 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -1316,41 +1316,21 @@ json SystemInfo::build_recipes_info(const json& devices) { backend["message"] = message; backend["action"] = ""; } else if (!available) { - // FLM on Linux needs richer state to guide users through manual setup - // (installing .deb, xrt drivers, etc.) 
- if (def.recipe == "flm") { - bool is_not_installed = install_error.empty() - || install_error.find("not installed") != std::string::npos - || install_error.find("not found") != std::string::npos; - bool is_version_mismatch = install_error.find("requires") != std::string::npos; - - if (is_not_installed) { - backend["state"] = "installable"; - } else if (is_version_mismatch) { - backend["state"] = "update_required"; - } else { - backend["state"] = "action_required"; - } - backend["message"] = install_error; - - if (!is_not_installed) { + // Backends with bespoke unavailable-state guidance (flm: a system .deb + // + drivers needing manual setup) classify themselves; everyone else + // uses the generic installable/no-fetch default below. + const std::string default_install_command = get_install_command(def.recipe, def.backend); + if (auto st = backends::ops_for(def.recipe)->classify_unavailable( + def.backend, install_error, default_install_command)) { + backend["state"] = st->state; + backend["message"] = st->message; + backend["action"] = st->action; + if (st->attach_installed_version) { std::string installed_version = get_recipe_version(def.recipe, def.backend); if (!installed_version.empty() && installed_version != "unknown") { backend["version"] = installed_version; } } - -#ifdef __linux__ - backend["action"] = "Visit https://lemonade-server.ai/flm_npu_linux.html?mode=troubleshoot"; -#elif defined(_WIN32) - if (!is_not_installed && !is_version_mismatch) { - backend["action"] = "Visit https://lemonade-server.ai/driver_install.html"; - } else { - backend["action"] = get_install_command(def.recipe, def.backend); - } -#else - backend["action"] = get_install_command(def.recipe, def.backend); -#endif } else { auto* cfg = RuntimeConfig::global(); bool no_fetch = cfg && cfg->no_fetch_executables(); @@ -1369,7 +1349,7 @@ json SystemInfo::build_recipes_info(const json& devices) { && !install_error.empty() && needs_gfx1151_cwsr_fix()) { backend["action"] = "Visit 
https://lemonade-server.ai/gfx1151_linux.html"; } else { - backend["action"] = get_install_command(def.recipe, def.backend); + backend["action"] = default_install_command; } } } else { From d0368daf30f631f1a8fb89cc2abf9668a03a1112 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 15:22:11 -0400 Subject: [PATCH 32/39] polish(cli): drive bench backend override from descriptor, not recipe==llamacpp bench hardcoded recipe == "llamacpp" to send the llamacpp_backend override. Use the CLI-safe descriptor registry: any recipe with selectable_backend gets its _backend override (llamacpp and vllm today). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/cli/bench.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cpp/cli/bench.cpp b/src/cpp/cli/bench.cpp index 6cf1b1a5b..280b26d33 100644 --- a/src/cpp/cli/bench.cpp +++ b/src/cpp/cli/bench.cpp @@ -1,5 +1,6 @@ #include "lemon_cli/bench.h" #include "lemon_cli/lemonade_client.h" +#include "lemon/backends/backend_descriptor_registry.h" #include #include #include @@ -406,9 +407,10 @@ bool load_model_for_backend(lemonade::LemonadeClient& client, request_body["model_name"] = model; request_body["save_options"] = false; - // For llamacpp recipe, pass backend override - if (recipe == "llamacpp") { - request_body["llamacpp_backend"] = backend; + // For recipes that expose a selectable backend, pass the override. + if (const auto* desc = lemon::backends::descriptor_for(recipe); + desc && desc->selectable_backend) { + request_body[desc->effective_config_section() + "_backend"] = backend; } if (ctx_size > 0) { From e14fc2a1917677633d9d996c4acbd1d4511bd0d4 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 15:26:59 -0400 Subject: [PATCH 33/39] polish(backends): move GGUF :variant registration check into llamacpp ops model_manager hardcoded actual_recipe == "llamacpp" to require a :variant on GGUF checkpoints at registration. 
Add BackendOps::validate_registration_checkpoint (default accept) and implement the GGUF rule in LlamaCppOps. Verified: a GGUF checkpoint without :variant is still rejected; other recipes are unaffected. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_ops.h | 8 ++++++++ .../backends/llamacpp/llamacpp_server.cpp | 14 ++++++++++++++ src/cpp/server/model_manager.cpp | 19 +++++-------------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index c03c111f3..047c6795d 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -69,6 +69,14 @@ class BackendOps { return ""; } + // Validate a user-supplied checkpoint string when registering a new model. + // Return an error message if invalid, "" if acceptable. Default: accept. + // llamacpp requires a :variant on GGUF checkpoints. + virtual std::string validate_registration_checkpoint(const std::string& checkpoint) const { + (void)checkpoint; + return ""; + } + // Select the repo-relative files to download for the main checkpoint // `main_variant`, for backends whose artifact layout isn't a GGUF file. // Return nullopt to use the default GGUF selection. (Direct single-file diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index d822250fc..d441f998e 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -804,6 +804,20 @@ class LlamaCppOps : public BackendOps { return resolve_gguf_path(import_dir, ""); } + std::string validate_registration_checkpoint(const std::string& checkpoint) const override { + // A GGUF checkpoint must name its quant via CHECKPOINT:VARIANT. 
+ std::string lower = checkpoint; + std::transform(lower.begin(), lower.end(), lower.begin(), ::tolower); + if (lower.find("gguf") != std::string::npos && + checkpoint.find(':') == std::string::npos) { + return "You are required to provide a 'variant' in the checkpoint field when " + "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. " + "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or " + "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf"; + } + return ""; + } + std::string validate_checkpoint_file(const std::string& resolved_path) const override { // A .gguf file in the cache must start with the GGUF magic, else it's a // truncated/corrupt download and the model is not really present. diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 824a1ecce..c904bf55b 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -2681,20 +2681,11 @@ void ModelManager::download_model(const std::string& model_name, ); } - // Validate GGUF models (llamacpp recipe) require a variant - if (actual_recipe == "llamacpp") { - std::string checkpoint_lower = actual_checkpoint; - std::transform(checkpoint_lower.begin(), checkpoint_lower.end(), - checkpoint_lower.begin(), ::tolower); - if (checkpoint_lower.find("gguf") != std::string::npos && - actual_checkpoint.find(':') == std::string::npos) { - throw std::runtime_error( - "You are required to provide a 'variant' in the checkpoint field when " - "registering a GGUF model. The variant is provided as CHECKPOINT:VARIANT. " - "For example: Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:Q4_0 or " - "Qwen/Qwen2.5-Coder-3B-Instruct-GGUF:qwen2.5-coder-3b-instruct-q4_0.gguf" - ); - } + // Backend-specific checkpoint validation (llamacpp: GGUF needs :variant). 
+ if (auto err = backends::ops_for(actual_recipe)->validate_registration_checkpoint( + actual_checkpoint); + !err.empty()) { + throw std::runtime_error(err); } LOG(INFO, "ModelManager") << "Registering new user model: " << model_name << std::endl; From 5daebd52d396f09ffe835fa9a08418c367a7bbc2 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 15:36:47 -0400 Subject: [PATCH 34/39] polish(backends): make_server helper + collapse redundant namespaces DRY pass across the backend folders: - Add backends::make_server(ctx) for the standard (log_level, model_manager, backend_manager) construction; the 6 plain create() bodies now call it instead of repeating the three context fields. cloud/ryzenai keep bespoke create(). - Each *_server.h closed and re-opened namespace lemon::backends just to nest the per-backend namespace; nest it inline instead (8 headers). ryzenai is left as-is (its legacy RyzenAIServer lives in namespace lemon, not lemon::backends). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_registry.h | 8 ++++++++ src/cpp/include/lemon/backends/cloud/cloud_server.h | 5 ----- .../include/lemon/backends/fastflowlm/fastflowlm_server.h | 5 ----- src/cpp/include/lemon/backends/kokoro/kokoro_server.h | 5 ----- src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h | 5 ----- .../include/lemon/backends/moonshine/moonshine_server.h | 5 ----- src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h | 5 ----- src/cpp/include/lemon/backends/vllm/vllm_server.h | 5 ----- .../include/lemon/backends/whispercpp/whispercpp_server.h | 5 ----- src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp | 2 +- src/cpp/server/backends/kokoro/kokoro_server.cpp | 2 +- src/cpp/server/backends/llamacpp/llamacpp_server.cpp | 2 +- src/cpp/server/backends/moonshine/moonshine_server.cpp | 2 +- src/cpp/server/backends/sdcpp/sdcpp_server.cpp | 2 +- src/cpp/server/backends/vllm/vllm_server.cpp | 2 +- 15 files changed, 14 insertions(+), 46 
deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h index 75709781d..868a0a584 100644 --- a/src/cpp/include/lemon/backends/backend_registry.h +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -30,6 +30,14 @@ struct BackendContext { using BackendCreateFn = std::unique_ptr (*)(const BackendContext&); +// Convenience for the common create(): construct a server class from the +// standard (log_level, model_manager, backend_manager) context fields. Backends +// needing extra constructor arguments (cloud, ryzenai) build theirs by hand. +template +std::unique_ptr make_server(const BackendContext& ctx) { + return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); +} + // Binds a descriptor (what the backend is) to its server class's create() (how // it runs). The generated factory registry supplies one per backend. This API is // server-only: it references server classes via create(), so it is compiled into diff --git a/src/cpp/include/lemon/backends/cloud/cloud_server.h b/src/cpp/include/lemon/backends/cloud/cloud_server.h index 774c44300..a2dc3a6e9 100644 --- a/src/cpp/include/lemon/backends/cloud/cloud_server.h +++ b/src/cpp/include/lemon/backends/cloud/cloud_server.h @@ -108,11 +108,6 @@ class CloudServer : public WrappedServer { bool loaded_ = false; }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace cloud { // Factory for the cloud backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h index e4bce74d8..bdcb1d88a 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_server.h @@ -61,11 +61,6 @@ class FastFlowLMServer : public WrappedServer, public IEmbeddingsServer, public bool is_loaded_ = false; }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace fastflowlm { // Factory for the fastflowlm backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h index ec8e74844..6a9738252 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro_server.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro_server.h @@ -38,11 +38,6 @@ class KokoroServer : public WrappedServer, public ITextToSpeechServer { void audio_speech(const json& request, httplib::DataSink& sink) override; }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace kokoro { // Factory for the kokoro backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h index f1447c1ce..8a7a8405f 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp_server.h @@ -49,11 +49,6 @@ class LlamaCppServer : public WrappedServer, public IEmbeddingsServer, public IR json tokenize(const json& request) override; }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace llamacpp { // Factory for the llamacpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h index 47ea21f58..e6535a34b 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine_server.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine_server.h @@ -48,11 +48,6 @@ class MoonshineServer : public WrappedServer, public ITranscriptionServer, publi int tcp_port_ = 0; // Port for line-delimited JSON streaming }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace moonshine { // Factory for the moonshine backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index 65c470332..185108afc 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -86,11 +86,6 @@ class SDServer : public WrappedServer, public IImageServer { std::string resolve_size(const nlohmann::json& request) const; }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace sdcpp { // Factory for the sdcpp backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/vllm/vllm_server.h b/src/cpp/include/lemon/backends/vllm/vllm_server.h index 0293fa811..1ac9438ed 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm_server.h +++ b/src/cpp/include/lemon/backends/vllm/vllm_server.h @@ -41,11 +41,6 @@ class VLLMServer : public WrappedServer { }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace vllm { // Factory for the vllm backend (constructs the server class — lemond only). std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h index 9ddd4f2af..dc97cbd9f 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp_server.h @@ -67,11 +67,6 @@ class WhisperServer : public WrappedServer, public ITranscriptionServer { std::filesystem::path temp_dir_; // Directory for temporary audio files }; -} // namespace backends -} // namespace lemon - -namespace lemon { -namespace backends { namespace whispercpp { // Factory for the whispercpp backend (constructs the server class — lemond only). 
std::unique_ptr create(const BackendContext& ctx); diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 4a84ecd10..25bbc444d 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -477,7 +477,7 @@ namespace backends { namespace fastflowlm { std::unique_ptr create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); + return make_server(ctx); } namespace { diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index aa8ad871e..154973501 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -216,7 +216,7 @@ namespace backends { namespace kokoro { std::unique_ptr create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); + return make_server(ctx); } diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index d441f998e..b1fd4ee83 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -667,7 +667,7 @@ namespace backends { namespace llamacpp { std::unique_ptr create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); + return make_server(ctx); } namespace { diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index a84506e35..ced8e716f 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -368,7 +368,7 @@ namespace backends { namespace moonshine { std::unique_ptr create(const BackendContext& ctx) { - return 
std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); + return make_server(ctx); } diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp index 98f19e5ea..4e23c046e 100644 --- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -754,7 +754,7 @@ namespace backends { namespace sdcpp { std::unique_ptr create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); + return make_server(ctx); } diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp index 085cd0f2a..8bfaced2d 100644 --- a/src/cpp/server/backends/vllm/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -319,7 +319,7 @@ namespace backends { namespace vllm { std::unique_ptr create(const BackendContext& ctx) { - return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); + return make_server(ctx); } From 43ec4f235ccbf6343b96376e51f8d09825b27ddf Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 16:51:34 -0400 Subject: [PATCH 35/39] polish(backends): make_spec/single_ops helpers shrink spec()/ops() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-backend spec()/ops() are the name-based adapter the CMake codegen binds (::spec/ops), so the functions must exist — but their bodies were repetitive. Add make_spec(descriptor[, split]) (backend_utils.h, where BackendSpec is complete) and single_ops() (backend_registry.h, next to make_server) so the 7 standard spec() and 7 custom ops() collapse to one line each. ryzenai (install key != recipe) and cloud (no spec) keep bespoke spec(); sd-cpp/vllm keep default_backend_ops(). Pure refactor — registry binding, 71/71 endpoints, and all-backends-registered smoke unchanged. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/include/lemon/backends/backend_registry.h | 9 +++++++++ src/cpp/include/lemon/backends/backend_utils.h | 12 ++++++++++++ src/cpp/server/backends/cloud/cloud_server.cpp | 5 +---- .../server/backends/fastflowlm/fastflowlm_server.cpp | 11 ++--------- src/cpp/server/backends/kokoro/kokoro_server.cpp | 11 ++--------- src/cpp/server/backends/llamacpp/llamacpp_server.cpp | 11 ++--------- .../server/backends/moonshine/moonshine_server.cpp | 11 ++--------- src/cpp/server/backends/ryzenai/ryzenai_server.cpp | 5 +---- src/cpp/server/backends/sdcpp/sdcpp_server.cpp | 6 +----- src/cpp/server/backends/vllm/vllm_server.cpp | 6 +----- .../server/backends/whispercpp/whispercpp_server.cpp | 11 ++--------- 11 files changed, 35 insertions(+), 63 deletions(-) diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h index 868a0a584..240ddf728 100644 --- a/src/cpp/include/lemon/backends/backend_registry.h +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -38,6 +38,15 @@ std::unique_ptr make_server(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); } +// Construct-on-first-use singleton for a stateless ops class, giving the +// registry a stable pointer. Backends with no custom behavior return +// default_backend_ops() from their ops() instead. +template +const BackendOps* single_ops() { + static const T kOps; + return &kOps; +} + // Binds a descriptor (what the backend is) to its server class's create() (how // it runs). The generated factory registry supplies one per backend. 
This API is // server-only: it references server classes via create(), so it is compiled into diff --git a/src/cpp/include/lemon/backends/backend_utils.h b/src/cpp/include/lemon/backends/backend_utils.h index bfc37734d..bdbfe0869 100644 --- a/src/cpp/include/lemon/backends/backend_utils.h +++ b/src/cpp/include/lemon/backends/backend_utils.h @@ -5,6 +5,7 @@ #include #include #include +#include "lemon/backends/backend_descriptor.h" namespace fs = std::filesystem; @@ -42,6 +43,17 @@ namespace lemon::backends { std::string log_name() const { return recipe + " Server"; }; }; + // Build a backend's install/download spec from its descriptor's recipe/binary + // and the server class T's get_install_params. The construct-on-first-use + // static gives the registry a stable pointer. Backends whose install key + // differs from the recipe (ryzenai) or that have no installable artifact + // (cloud) build their BackendSpec by hand instead of using this. + template + const BackendSpec* make_spec(const BackendDescriptor& d, bool split = false) { + static const BackendSpec kSpec(d.recipe, d.binary, T::get_install_params, split); + return &kSpec; + } + // Return the backend spec for recipes that use the standard BackendSpec flow. // Returns nullptr for recipes that require custom handling (e.g., flm) or unknown recipes. 
const BackendSpec* try_get_spec_for_recipe(const std::string& recipe); diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp index f68bc0c92..3c61c213b 100644 --- a/src/cpp/server/backends/cloud/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -906,10 +906,7 @@ class CloudOps : public BackendOps { } // namespace const BackendSpec* spec() { return nullptr; } -const BackendOps* ops() { - static const CloudOps kOps; - return &kOps; -} +const BackendOps* ops() { return single_ops(); } } // namespace cloud } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 25bbc444d..050b5a961 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -564,15 +564,8 @@ class FlmOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - FastFlowLMServer::get_install_params, /*split=*/false); - return &kSpec; -} -const BackendOps* ops() { - static const FlmOps kOps; - return &kOps; -} +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } } // namespace fastflowlm } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index 154973501..95d46de6a 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -240,15 +240,8 @@ class KokoroOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - KokoroServer::get_install_params, /*split=*/false); - return &kSpec; -} -const BackendOps* ops() { - static const KokoroOps kOps; - 
return &kOps; -} +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } } // namespace kokoro } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index 6bdb0ae98..eb766e798 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -861,15 +861,8 @@ class LlamaCppOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - LlamaCppServer::get_install_params, /*split=*/false); - return &kSpec; -} -const BackendOps* ops() { - static const LlamaCppOps kOps; - return &kOps; -} +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } } // namespace llamacpp } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index ced8e716f..bcf263d67 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -407,15 +407,8 @@ class MoonshineOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - MoonshineServer::get_install_params, /*split=*/false); - return &kSpec; -} -const BackendOps* ops() { - static const MoonshineOps kOps; - return &kOps; -} +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } } // namespace moonshine } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index 7bee8e46d..69e1eed16 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ 
b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -220,10 +220,7 @@ const BackendSpec* spec() { ::lemon::RyzenAIServer::get_install_params, /*split=*/false); return &kSpec; } -const BackendOps* ops() { - static const RyzenAiOps kOps; - return &kOps; -} +const BackendOps* ops() { return single_ops(); } } // namespace ryzenai } // namespace backends } // namespace lemon diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp index 4e23c046e..a4b1787f9 100644 --- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -758,11 +758,7 @@ std::unique_ptr create(const BackendContext& ctx) { } -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - SDServer::get_install_params, /*split=*/false); - return &kSpec; -} +const BackendSpec* spec() { return make_spec(descriptor); } const BackendOps* ops() { return default_backend_ops(); } } // namespace sdcpp } // namespace backends diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp index 8bfaced2d..60a79c95f 100644 --- a/src/cpp/server/backends/vllm/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -323,11 +323,7 @@ std::unique_ptr create(const BackendContext& ctx) { } -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - VLLMServer::get_install_params, /*split=*/true); - return &kSpec; -} +const BackendSpec* spec() { return make_spec(descriptor, /*split=*/true); } const BackendOps* ops() { return default_backend_ops(); } } // namespace vllm } // namespace backends diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index bd245b9e5..d1222e551 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ 
-763,15 +763,8 @@ class WhisperOps : public BackendOps { }; } // namespace -const BackendSpec* spec() { - static const BackendSpec kSpec(descriptor.recipe, descriptor.binary, - WhisperServer::get_install_params, /*split=*/false); - return &kSpec; -} -const BackendOps* ops() { - static const WhisperOps kOps; - return &kOps; -} +const BackendSpec* spec() { return make_spec(descriptor); } +const BackendOps* ops() { return single_ops(); } } // namespace whispercpp } // namespace backends } // namespace lemon From 8f6f36e1e337adeedf5307562c248529ebc99eb9 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Thu, 25 Jun 2026 16:51:34 -0400 Subject: [PATCH 36/39] docs(nav): add Adding a Backend + Backends Reference to mkdocs nav The two backend dev docs added by this work (dev/adding-a-backend.md and the generated dev/backends-reference.md) were not wired into the Development nav. Co-Authored-By: Claude Opus 4.8 (1M context) --- mkdocs.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mkdocs.yml b/mkdocs.yml index 18201bba3..73ecc9981 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -83,6 +83,8 @@ nav: - Contribute: dev/contribute.md - Documentation Guide: dev/documentation.md - C++: dev/getting-started.md + - Adding a Backend: dev/adding-a-backend.md + - Backends Reference: dev/backends-reference.md - Desktop App: dev/app.md - Web UI: dev/web-ui.md - Lemonade Omni Models: dev/lemonade-omni.md From 2ac10fdfdf40f55802136ef152b905e094c71b83 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 26 Jun 2026 10:23:35 -0400 Subject: [PATCH 37/39] feat(config): generate defaults.json from descriptors via /internal/config/defaults Per-recipe config defaults are now declared in each backend descriptor (takes_args / arg_variants / bin_variants / config_extra -> config_defaults()) instead of hand-maintained blocks in defaults.json. 
The committed resources/defaults.json stays fully populated (so it remains the discoverable reference for factory defaults) but is now generated: - New GET /internal/config/defaults emits the canonical default config (ConfigFile::base_defaults(): global keys + descriptor-derived per-recipe sections, host/deployment-independent). Documented alongside /internal/config. - gen_backend_docs.py -> gen_backend_boilerplate.py, which mirrors that endpoint verbatim into resources/defaults.json (whole-file) in addition to the doc regions. The existing CI --check now also fails if defaults.json drifts. config_file keeps reading defaults.json at runtime; base_defaults() re-seeds the descriptor blocks so the descriptor stays authoritative even if the file lags. Verified: a fresh config.json reproduces every prior default; endpoints 71/71; generator --check clean; black clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/workflows/docs_and_style.yml | 2 +- docs/dev/backends-reference.md | 2 +- docs/dev/getting-started.md | 10 ++ docs/embeddable/runtime.md | 18 ++++ ...end_docs.py => gen_backend_boilerplate.py} | 55 ++++++++--- .../lemon/backends/backend_descriptor.h | 25 +++++ .../lemon/backends/fastflowlm/fastflowlm.h | 4 + .../include/lemon/backends/kokoro/kokoro.h | 9 ++ .../lemon/backends/llamacpp/llamacpp.h | 6 ++ .../lemon/backends/moonshine/moonshine.h | 10 ++ .../include/lemon/backends/ryzenai/ryzenai.h | 9 ++ src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 6 ++ src/cpp/include/lemon/backends/vllm/vllm.h | 6 ++ .../lemon/backends/whispercpp/whispercpp.h | 9 ++ src/cpp/include/lemon/config_file.h | 11 ++- src/cpp/include/lemon/server.h | 1 + src/cpp/resources/defaults.json | 92 +++++++++---------- src/cpp/server/config_file.cpp | 21 ++++- src/cpp/server/server.cpp | 17 ++++ 19 files changed, 249 insertions(+), 64 deletions(-) rename docs/tools/{gen_backend_docs.py => gen_backend_boilerplate.py} (90%) diff --git a/.github/workflows/docs_and_style.yml 
b/.github/workflows/docs_and_style.yml index 35aa4cf50..3354f50fd 100644 --- a/.github/workflows/docs_and_style.yml +++ b/.github/workflows/docs_and_style.yml @@ -39,7 +39,7 @@ jobs: - name: Build lemond run: cmake --build --preset default --target lemond - name: Check backend reference docs are up to date - run: python3 docs/tools/gen_backend_docs.py --check + run: python3 docs/tools/gen_backend_boilerplate.py --check markdown-link-check: runs-on: ubuntu-latest diff --git a/docs/dev/backends-reference.md b/docs/dev/backends-reference.md index 5faad0189..3993fe8fe 100644 --- a/docs/dev/backends-reference.md +++ b/docs/dev/backends-reference.md @@ -1,6 +1,6 @@ # Backend reference - diff --git a/docs/dev/getting-started.md b/docs/dev/getting-started.md index b8e487c4c..ef1769059 100644 --- a/docs/dev/getting-started.md +++ b/docs/dev/getting-started.md @@ -625,6 +625,7 @@ Internal endpoints accept connections from any address, so first-party clients o | `POST` | `/internal/shutdown` | Unloads all models and shuts down the server | | `POST` | `/internal/set` | Unified config setter (see below) | | `GET` | `/internal/config` | Returns the full runtime config snapshot | +| `GET` | `/internal/config/defaults` | Returns the canonical default config (factory defaults) | | `POST` | `/internal/cleanup-cache` | Cleans up orphaned files in the Hugging Face cache | | `POST` | `/internal/pin` | Pin or unpin a loaded model | @@ -676,6 +677,15 @@ Returns the full runtime configuration as a flat JSON object containing all serv curl http://localhost:13305/internal/config ``` +#### `GET /internal/config/defaults` + +Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or deployment overrides. The per-recipe sections come from the backend descriptors (each descriptor's `config_defaults()`), making this the authoritative source of the factory defaults. 
`docs/tools/gen_backend_boilerplate.py` reads this endpoint to regenerate the committed `src/cpp/resources/defaults.json`, and a CI `--check` fails if that file drifts from the descriptors. + +**Example:** +```bash +curl http://localhost:13305/internal/config/defaults +``` + ### Dependencies All dependencies are automatically fetched by CMake via FetchContent: diff --git a/docs/embeddable/runtime.md b/docs/embeddable/runtime.md index a50b8c4af..983038e95 100644 --- a/docs/embeddable/runtime.md +++ b/docs/embeddable/runtime.md @@ -114,6 +114,7 @@ Your app can manage its `lemond` instance at runtime by using `/internal` endpoi |--------|------|-------------| | `POST` | `/internal/set` | Unified config setter (see below) | | `GET` | `/internal/config` | Returns the full runtime config snapshot | +| `GET` | `/internal/config/defaults` | Returns the canonical default config (factory defaults) | | `POST` | `/internal/pin` | Pin or unpin a loaded model (prevents auto-eviction) | The settings defined in `config.json` can all be changed at runtime without restarting `lemond` with the `/internal/set` endpoint. See the [Configuration Guide](../guide/configuration/README.md) for details on all settings. @@ -137,6 +138,23 @@ Returns the full runtime configuration as a flat JSON object containing all serv curl http://localhost:8000/internal/config ``` +#### `GET /internal/config/defaults` + +Returns the canonical default configuration — the values a brand-new `config.json` is seeded with, independent of this instance's current config or any deployment override. The per-recipe sections are derived from the backend descriptors, so this is the authoritative source for "what are the factory defaults." It is what `docs/tools/gen_backend_boilerplate.py` reads to regenerate `src/cpp/resources/defaults.json`. 
+ +**Example:** +=== "Windows (cmd.exe)" + + ```cmd + curl http://localhost:8000/internal/config/defaults + ``` + +=== "Linux (bash)" + + ```bash + curl http://localhost:8000/internal/config/defaults + ``` + #### `POST /internal/set` Accepts a JSON object with one or more keys to update atomically. Returns `{"status":"success","updated":{...}}` on success, or `400` with an error message on validation failure. diff --git a/docs/tools/gen_backend_docs.py b/docs/tools/gen_backend_boilerplate.py similarity index 90% rename from docs/tools/gen_backend_docs.py rename to docs/tools/gen_backend_boilerplate.py index 8e5bf3133..b4e8ac8d9 100644 --- a/docs/tools/gen_backend_docs.py +++ b/docs/tools/gen_backend_boilerplate.py @@ -1,19 +1,24 @@ #!/usr/bin/env python3 -"""Generate backend reference docs from the self-describing backend descriptors. +"""Generate backend boilerplate (docs + config defaults) from the descriptors. The C++ backend descriptors (src/cpp/include/lemon/backends//.h) are the single source of truth for what each backend is. This script boots a `lemond` -server, reads the descriptor-generated ``/system-info`` ``recipes`` object and -``server_models.json``, and rewrites the marker-delimited regions of the target -doc(s). A CI step runs it with ``--check`` and fails if the committed docs drift. +server and regenerates the committed artifacts that would otherwise be +hand-maintained: -Usage: - python docs/tools/gen_backend_docs.py [--lemond PATH] [--check] + * Marker-delimited regions of the backend reference docs, from + ``/system-info`` ``recipes`` + ``server_models.json``. + * The whole of ``src/cpp/resources/defaults.json``, mirrored verbatim from + ``/internal/config/defaults`` (its per-recipe blocks come from each + descriptor's ``config_defaults()``). + +A CI step runs it with ``--check`` and fails if any committed artifact drifts. -``--check`` regenerates in memory and exits non-zero if the on-disk docs differ, -without modifying them. 
+Usage: + python docs/tools/gen_backend_boilerplate.py [--lemond PATH] [--check] -Only the regions between:: +``--check`` regenerates in memory and exits non-zero if any on-disk artifact +differs, without modifying it. For the docs, only the regions between:: @@ -108,6 +113,12 @@ def system_info(self) -> dict: def config(self) -> dict: return json.loads(self._get("/internal/config", timeout=10)) + def config_defaults_text(self) -> str: + # Verbatim text of the canonical default config (the server's own + # serialization) so the committed resources/defaults.json is byte-stable. + text = self._get("/internal/config/defaults", timeout=10).decode("utf-8") + return text if text.endswith("\n") else text + "\n" + def md_escape(text: str) -> str: return str(text).replace("|", "\\|") @@ -429,7 +440,7 @@ def render_models(recipes: dict) -> str: DEFAULT_TEMPLATE = """# Backend reference - @@ -504,6 +515,7 @@ def main() -> int: with Lemond(binary) as server: info = server.system_info() config = server.config() + defaults_text = server.config_defaults_text() recipes = info.get("recipes", {}) if not recipes: sys.exit("/system-info returned no recipes") @@ -562,7 +574,24 @@ def main() -> int: }, } + # Whole-file generated artifacts (not marker-delimited): resources/defaults.json + # is the canonical default config, mirrored verbatim from GET + # /internal/config/defaults (per-recipe blocks come from the descriptors). 
+ raw_targets: dict = { + REPO_ROOT / "src" / "cpp" / "resources" / "defaults.json": defaults_text, + } + stale = [] + for path, content in raw_targets.items(): + rel = path.relative_to(REPO_ROOT) + if args.check: + if not path.exists() or path.read_text() != content: + stale.append(str(rel)) + else: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content) + print(f"Wrote {rel}") + for path, spec in targets.items(): rel = path.relative_to(REPO_ROOT) current = path.read_text() if path.exists() else spec.get("template", "") @@ -580,11 +609,11 @@ def main() -> int: if args.check: if stale: sys.exit( - "Stale generated docs: " + "Stale generated files: " + ", ".join(stale) - + "\nRun: python docs/tools/gen_backend_docs.py" + + "\nRun: python docs/tools/gen_backend_boilerplate.py" ) - print("All generated docs are up to date.") + print("All generated files are up to date.") return 0 diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index bd46c98c4..03ca71e69 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -94,10 +94,35 @@ struct BackendDescriptor { // backends are skipped by the load-time auto-download path. bool self_manages_downloads = false; + // --- config.json per-recipe defaults schema --- + // The backend's section of config.json is derived from these fields, so a new + // backend's defaults live in its descriptor instead of a hand-maintained + // defaults.json block. (selectable_backend additionally emits `backend: "auto"`.) + bool takes_args = false; // emits `args: ""` + std::vector arg_variants; // each emits `_args: ""` + std::vector bin_variants; // each emits `_bin: "builtin"` + nlohmann::json config_extra = nlohmann::json::object(); // fixed extras (e.g. prefer_system, image defaults) + // The config.json section name for this backend, falling back to the recipe. 
std::string effective_config_section() const { return config_section.empty() ? recipe : config_section; } + + // Build this backend's config.json default section from the schema above. + // Returns an empty object when the backend has no configurable section. + nlohmann::json config_defaults() const { + nlohmann::json block = nlohmann::json::object(); + if (selectable_backend) block["backend"] = "auto"; + if (takes_args) block["args"] = ""; + for (const auto& v : arg_variants) block[v + "_args"] = ""; + for (const auto& v : bin_variants) block[v + "_bin"] = "builtin"; + if (config_extra.is_object()) { + for (auto it = config_extra.begin(); it != config_extra.end(); ++it) { + block[it.key()] = it.value(); + } + } + return block; + } }; } // namespace lemon diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index 7b812bfb9..24049ab31 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -37,6 +37,10 @@ inline const BackendDescriptor descriptor = { /*rocm_requires_cwsr_fix*/ false, /*version_policy*/ VersionPolicy::AtLeast, // system-managed package /*self_manages_downloads*/ true, // flm pulls its own models via the flm CLI + /*takes_args*/ true, + /*arg_variants*/ {}, + /*bin_variants*/ {}, + /*config_extra*/ nlohmann::json::object(), }; } // namespace fastflowlm diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h index b1e52eba4..5f3fbf97c 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -33,6 +33,15 @@ inline const BackendDescriptor descriptor = { /*experimental*/ false, /*web_display_name*/ "", /*web_priority*/ 6, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + 
/*takes_args*/ false, + /*arg_variants*/ {}, + /*bin_variants*/ {"cpu"}, + /*config_extra*/ nlohmann::json::object(), }; } // namespace kokoro diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index 02ed728d7..7c58a73f3 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -49,6 +49,12 @@ inline const BackendDescriptor descriptor = { /*rocm_channels*/ {"stable", "nightly"}, /*exposes_prometheus_metrics*/ true, /*rocm_requires_cwsr_fix*/ true, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"rocm", "vulkan", "cpu"}, + /*bin_variants*/ {"rocm", "vulkan", "cuda", "cpu"}, + /*config_extra*/ {{"prefer_system", true}}, }; } // namespace llamacpp diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h index 2c9feed2b..ae7313714 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -32,6 +32,16 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Speech-to-text", /*experimental*/ false, /*web_display_name*/ "", + /*web_priority*/ 0, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"cpu"}, + /*bin_variants*/ {"cpu"}, + /*config_extra*/ nlohmann::json::object(), }; } // namespace moonshine diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h index 13ebb9a7c..dbc15d7f3 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -32,6 +32,15 @@ inline const BackendDescriptor descriptor = { /*experimental*/ false, /*web_display_name*/ 
"Ryzen AI SW NPU", /*web_priority*/ 2, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ false, + /*arg_variants*/ {}, + /*bin_variants*/ {"server"}, + /*config_extra*/ nlohmann::json::object(), }; } // namespace ryzenai diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 8cf299a2c..986d26fbe 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -53,6 +53,12 @@ inline const BackendDescriptor descriptor = { /*rocm_channels*/ {"stable"}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ true, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"cpu", "rocm", "vulkan"}, + /*bin_variants*/ {"cpu", "rocm", "vulkan"}, + /*config_extra*/ {{"steps", 20}, {"cfg_scale", 7.0}, {"width", 512}, {"height", 512}}, }; } // namespace sdcpp diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h index 97c58c715..8984e15b3 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm.h +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -36,6 +36,12 @@ inline const BackendDescriptor descriptor = { /*rocm_channels*/ {}, // single rocm artifact, no stable/nightly channels /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ true, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {}, + /*bin_variants*/ {}, + /*config_extra*/ nlohmann::json::object(), }; } // namespace vllm diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h index e62ee029c..9c38b66d5 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h +++ 
b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -42,6 +42,15 @@ inline const BackendDescriptor descriptor = { /*experimental*/ false, /*web_display_name*/ "whisper.cpp", /*web_priority*/ 4, + /*rocm_channels*/ {}, + /*exposes_prometheus_metrics*/ false, + /*rocm_requires_cwsr_fix*/ false, + /*version_policy*/ VersionPolicy::Exact, + /*self_manages_downloads*/ false, + /*takes_args*/ true, + /*arg_variants*/ {"cpu", "npu"}, + /*bin_variants*/ {"cpu", "npu"}, + /*config_extra*/ nlohmann::json::object(), }; } // namespace whispercpp diff --git a/src/cpp/include/lemon/config_file.h b/src/cpp/include/lemon/config_file.h index ec56c17fb..8c46e125f 100644 --- a/src/cpp/include/lemon/config_file.h +++ b/src/cpp/include/lemon/config_file.h @@ -84,8 +84,15 @@ static inline bool config_migrate(json& config, /// Manages reading and writing config.json in the lemonade cache dir. class ConfigFile { public: - /// Returns the full default config loaded from installed resource JSON. - /// On Linux, an optional distro override at /usr/share/lemonade/defaults.json + /// The canonical default config: resources/defaults.json (global keys) with + /// each backend's per-recipe section seeded from its descriptor. Host- and + /// deployment-independent, so it is reproducible — this is what + /// GET /internal/config/defaults emits and gen_backend_boilerplate.py writes + /// back into resources/defaults.json. + static json base_defaults(); + + /// base_defaults() plus deployment overrides. On Linux, an optional distro + /// override at /usr/share/lemonade/defaults.json (and LEMONADE_DEFAULTS_PATH) /// is merged on top when present. 
static json get_defaults(); diff --git a/src/cpp/include/lemon/server.h b/src/cpp/include/lemon/server.h index d481f5b80..3e39357d3 100644 --- a/src/cpp/include/lemon/server.h +++ b/src/cpp/include/lemon/server.h @@ -73,6 +73,7 @@ class Server { // Unified config endpoints void handle_config_set(const httplib::Request& req, httplib::Response& res); void handle_config_get(const httplib::Request& req, httplib::Response& res); + void handle_config_defaults_get(const httplib::Request& req, httplib::Response& res); // Side-effect callback for RuntimeConfig::set(). Receives a nested JSON // mirroring the input shape, containing only entries that actually changed. diff --git a/src/cpp/resources/defaults.json b/src/cpp/resources/defaults.json index f79396266..ab86404dd 100644 --- a/src/cpp/resources/defaults.json +++ b/src/cpp/resources/defaults.json @@ -1,71 +1,71 @@ { + "cloud_providers": [], "config_version": 2, - "port": 13305, - "host": "localhost", - "websocket_port": "auto", - "log_level": "info", - "global_timeout": 600, - "max_loaded_models": 1, - "no_broadcast": false, - "extra_models_dir": "", - "models_dir": "auto", "ctx_size": -1, - "offline": false, - "no_fetch_executables": false, "disable_model_filtering": false, "enable_dgpu_gtt": false, - "rocm_channel": "stable", + "extra_models_dir": "", + "flm": { + "args": "" + }, + "global_timeout": 600, + "host": "localhost", + "kokoro": { + "cpu_bin": "builtin" + }, "llamacpp": { - "backend": "auto", "args": "", - "rocm_args": "", - "vulkan_args": "", + "backend": "auto", "cpu_args": "", + "cpu_bin": "builtin", + "cuda_bin": "builtin", "prefer_system": true, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin", - "cuda_bin": "builtin", - "cpu_bin": "builtin" + "vulkan_args": "", + "vulkan_bin": "builtin" }, - "whispercpp": { - "backend": "auto", + "log_level": "info", + "max_loaded_models": 1, + "models_dir": "auto", + "moonshine": { "args": "", "cpu_args": "", - "npu_args": "", - "cpu_bin": 
"builtin", - "npu_bin": "builtin" + "cpu_bin": "builtin" + }, + "no_broadcast": false, + "no_fetch_executables": false, + "offline": false, + "port": 13305, + "rocm_channel": "stable", + "ryzenai": { + "server_bin": "builtin" }, "sdcpp": { - "backend": "auto", "args": "", - "cpu_args": "", - "rocm_args": "", - "vulkan_args": "", - "steps": 20, + "backend": "auto", "cfg_scale": 7.0, - "width": 512, - "height": 512, + "cpu_args": "", "cpu_bin": "builtin", + "height": 512, + "rocm_args": "", "rocm_bin": "builtin", - "vulkan_bin": "builtin" - }, - "flm": { - "args": "" + "steps": 20, + "vulkan_args": "", + "vulkan_bin": "builtin", + "width": 512 }, "vllm": { - "backend": "auto", - "args": "" - }, - "ryzenai": { - "server_bin": "builtin" - }, - "kokoro": { - "cpu_bin": "builtin" + "args": "", + "backend": "auto" }, - "moonshine": { + "websocket_port": "auto", + "whispercpp": { "args": "", + "backend": "auto", "cpu_args": "", - "cpu_bin": "builtin" - }, - "cloud_providers": [] + "cpu_bin": "builtin", + "npu_args": "", + "npu_bin": "builtin" + } } diff --git a/src/cpp/server/config_file.cpp b/src/cpp/server/config_file.cpp index d8f6955af..2787c0167 100644 --- a/src/cpp/server/config_file.cpp +++ b/src/cpp/server/config_file.cpp @@ -1,4 +1,5 @@ #include "lemon/config_file.h" +#include "lemon/backends/backend_descriptor_registry.h" #include "lemon/utils/json_utils.h" #include "lemon/utils/path_utils.h" @@ -27,10 +28,28 @@ static json load_json_file(const fs::path& path) { } } -json ConfigFile::get_defaults() { +json ConfigFile::base_defaults() { json defaults = load_json_file(utils::path_from_utf8( utils::get_resource_path("resources/defaults.json"))); + // Seed each backend's config.json section from its descriptor. The per-recipe + // defaults are authored in the backend's descriptor; resources/defaults.json + // is the generated, committed mirror (see GET /internal/config/defaults and + // docs/tools/gen_backend_boilerplate.py). 
Re-seeding here keeps the descriptor + // authoritative even if the committed file lags. Empty result = no section. + for (const auto* d : backends::all_descriptors()) { + json block = d->config_defaults(); + if (!block.empty()) { + defaults[d->effective_config_section()] = block; + } + } + + return defaults; +} + +json ConfigFile::get_defaults() { + json defaults = base_defaults(); + #ifndef _WIN32 fs::path distro_defaults = "/usr/share/lemonade/defaults.json"; if (fs::exists(distro_defaults)) { diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index b4c529951..511aa080c 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -677,6 +677,9 @@ void Server::setup_routes(httplib::Server &web_server) { web_server.Get("/internal/config", [this](const httplib::Request& req, httplib::Response& res) { handle_config_get(req, res); }); + web_server.Get("/internal/config/defaults", [this](const httplib::Request& req, httplib::Response& res) { + handle_config_defaults_get(req, res); + }); web_server.Post("/internal/cleanup-cache", [this](const httplib::Request& req, httplib::Response& res) { handle_cleanup_cache(req, res); }); @@ -4468,6 +4471,20 @@ void Server::handle_config_get(const httplib::Request& /*req*/, httplib::Respons } } +void Server::handle_config_defaults_get(const httplib::Request& /*req*/, httplib::Response& res) { + try { + // The canonical default config (global keys + descriptor-derived per-recipe + // sections), independent of this host's config.json or deployment overrides. + // gen_backend_boilerplate.py reads this to regenerate resources/defaults.json. 
+ res.set_content(ConfigFile::base_defaults().dump(2), "application/json"); + } catch (const std::exception& e) { + LOG(ERROR, "Server") << "ERROR in handle_config_defaults_get: " << e.what() << std::endl; + res.status = 500; + nlohmann::json error = {{"error", e.what()}}; + res.set_content(error.dump(), "application/json"); + } +} + void Server::handle_bin_change(const std::string& section, const std::string& bin_key, const std::string& new_value) { From 283297aa7b4315269150191db733f4e6fcb252e1 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Fri, 26 Jun 2026 10:25:02 -0400 Subject: [PATCH 38/39] polish(cli): branch hf pull on repo_kind, not recipe==llamacpp The single-installable-unit path keyed off recipe != "llamacpp"; switch it to repo_kind != "gguf", the same server-provided classification the function already uses for the collection branch. Behavior-equivalent (collections are handled earlier, so by here repo_kind is gguf or onnx-ryzenai), and it drops the last backend-name literal from hf_pull. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/cpp/cli/hf_pull.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/cpp/cli/hf_pull.cpp b/src/cpp/cli/hf_pull.cpp index 8ed30ca0a..f5a84c051 100644 --- a/src/cpp/cli/hf_pull.cpp +++ b/src/cpp/cli/hf_pull.cpp @@ -255,11 +255,12 @@ int hf_pull_flow(lemonade::LemonadeClient& client, const auto& variants = variants_response["variants"]; std::string recipe = variants_response.value("recipe", std::string("llamacpp")); + std::string repo_kind = variants_response.value("repo_kind", std::string("gguf")); - // Non-llamacpp recipes (currently: ONNX RyzenAI) ship as a single - // installable unit — no per-variant menu, no `:variant` checkpoint - // suffix, no `-VARIANT` model name tail. - if (recipe != "llamacpp") { + // Non-GGUF repos (currently: ONNX RyzenAI) ship as a single installable + // unit — no per-variant menu, no `:variant` checkpoint suffix, no + // `-VARIANT` model name tail. 
(Collections returned earlier above.) + if (repo_kind != "gguf") { if (!variant.empty()) { std::cerr << "warning: variant '" << variant << "' ignored for " << recipe << " checkpoints" << std::endl; From ce221b466e6be489b1bbab987e8fac2ed49f6c03 Mon Sep 17 00:00:00 2001 From: jeremyfowers Date: Sat, 27 Jun 2026 21:24:07 -0400 Subject: [PATCH 39/39] refactor(backends): functional-only comments; complete, alphabetical website model list (#2320 review) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address Geramy's review on #2320: - Comments: sweep every backend file to functional-only, removing commentary that narrates self-explanatory code or references backends/flows it can't know about (net ~330 comment lines cut). Short purpose comments on declarations and genuine WHY/invariant notes are kept. - Website model list: drop the per-backend `web_priority` descriptor field (a 0-default that silently dropped backends like moonshine and vllm) and instead list every descriptor-backed recipe, ordered alphabetically by display name — a new backend can no longer be accidentally excluded. - CMakeLists: move the authoritative LEMON_BACKENDS list near the top; the codegen foreach stays where it was. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- CMakeLists.txt | 64 ++++---- docs/assets/models.js | 8 +- docs/tools/gen_backend_boilerplate.py | 14 +- .../lemon/backends/backend_descriptor.h | 4 +- src/cpp/include/lemon/backends/backend_ops.h | 46 ++---- .../include/lemon/backends/backend_registry.h | 6 +- .../include/lemon/backends/backend_utils.h | 4 +- .../lemon/backends/fastflowlm/fastflowlm.h | 1 - .../backends/fastflowlm/fastflowlm_models.h | 3 +- .../include/lemon/backends/kokoro/kokoro.h | 1 - .../lemon/backends/llamacpp/llamacpp.h | 1 - .../lemon/backends/moonshine/moonshine.h | 1 - .../include/lemon/backends/ryzenai/ryzenai.h | 1 - src/cpp/include/lemon/backends/sdcpp/sdcpp.h | 1 - .../lemon/backends/sdcpp/sdcpp_server.h | 14 -- src/cpp/include/lemon/backends/vllm/vllm.h | 1 - .../lemon/backends/whispercpp/whispercpp.h | 1 - src/cpp/include/lemon/config_file.h | 4 +- src/cpp/include/lemon/gguf_reader.h | 1 - src/cpp/include/lemon/model_types.h | 6 +- src/cpp/include/lemon/wrapped_server.h | 1 - src/cpp/server/backends/backend_ops.cpp | 3 +- src/cpp/server/backends/backend_utils.cpp | 9 +- .../server/backends/cloud/cloud_server.cpp | 148 ++++-------------- .../backends/fastflowlm/fastflowlm_models.cpp | 22 +-- .../backends/fastflowlm/fastflowlm_server.cpp | 29 +--- .../server/backends/kokoro/kokoro_server.cpp | 8 - .../backends/llamacpp/llamacpp_gguf.cpp | 65 +++----- .../backends/llamacpp/llamacpp_server.cpp | 8 - .../backends/moonshine/moonshine_server.cpp | 6 - .../backends/ryzenai/ryzenai_server.cpp | 13 +- .../server/backends/sdcpp/sdcpp_server.cpp | 15 +- src/cpp/server/backends/vllm/vllm_server.cpp | 14 +- .../backends/whispercpp/whispercpp_server.cpp | 38 +---- src/cpp/server/config_file.cpp | 8 +- src/cpp/server/model_manager.cpp | 6 +- src/cpp/server/recipe_options.cpp | 6 +- src/cpp/server/router.cpp | 6 +- src/cpp/server/runtime_config.cpp | 11 +- src/cpp/server/server.cpp | 1 - src/cpp/server/system_info.cpp | 20 +-- 
test/cpp/test_auto_tune.cpp | 52 +----- 42 files changed, 169 insertions(+), 502 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2b7dee8a4..78937c046 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,38 @@ include(GNUInstallDirs) ${CMAKE_SOURCE_DIR}/docs/man/man1/lemonade.1 DESTINATION ${CMAKE_INSTALL_MANDIR}/man1 ) + +# ============================================================ +# Self-describing backends registry +# ============================================================ +# The authoritative backend list. Each entry is "<recipe>|<stem>": +# recipe - the recipe string used in server_models.json (may contain dashes) +# stem - identifier-safe name and folder. Each backend lives in its own +# folder, shipping (in namespace lemon::backends::<stem>): +# include/lemon/backends/<stem>/<stem>.h inline const descriptor (CLI-safe data) +# include/lemon/backends/<stem>/<stem>_server.h WrappedServer subclass + create() decl +# server/backends/<stem>/<stem>_server.cpp implementation + create() def +# +# Adding a backend is one line here plus that folder. The codegen later in this +# file (see "Self-describing backends registry codegen") compiles the server +# source and regenerates the registry headers, which bind each descriptor to its +# create(). Because this list is a tracked input, editing it forces regeneration +# on the next build (a file(GLOB) would silently miss a newly added backend). The +# descriptor is a header-only inline const, so it links into both the lemonade +# CLI and lemond; only lemond links the server sources.
+set(LEMON_BACKENDS + # "<recipe>|<stem>" + "llamacpp|llamacpp" + "whispercpp|whispercpp" + "moonshine|moonshine" + "kokoro|kokoro" + "sd-cpp|sdcpp" + "flm|fastflowlm" + "ryzenai-llm|ryzenai" + "vllm|vllm" + "cloud|cloud" +) + # ============================================================ # Tauri app source paths (used by both runtime and installers) # ============================================================ @@ -639,35 +671,11 @@ elseif(UNIX) endif() # ============================================================ -# Self-describing backends registry +# Self-describing backends registry codegen # ============================================================ -# The authoritative backend list. Each entry is "<recipe>|<stem>": -# recipe - the recipe string used in server_models.json (may contain dashes) -# stem - identifier-safe name and folder. Each backend lives in its own -# folder, shipping (in namespace lemon::backends::<stem>): -# include/lemon/backends/<stem>/<stem>.h inline const descriptor (CLI-safe data) -# include/lemon/backends/<stem>/<stem>_server.h WrappedServer subclass + create() decl -# server/backends/<stem>/<stem>_server.cpp implementation + create() def -# -# Adding a backend is one line here plus that folder. The foreach below compiles -# the server source and regenerates the registry headers, which bind each -# descriptor to its create(). Because this list is a tracked input, editing it -# forces regeneration on the next build (a file(GLOB) would silently miss a -# newly added backend). The descriptor is a header-only inline const, so it links -# into both the lemonade CLI and lemond; only lemond links the server sources.
-set(LEMON_BACKENDS - # "<recipe>|<stem>" - "llamacpp|llamacpp" - "whispercpp|whispercpp" - "moonshine|moonshine" - "kokoro|kokoro" - "sd-cpp|sdcpp" - "flm|fastflowlm" - "ryzenai-llm|ryzenai" - "vllm|vllm" - "cloud|cloud" -) - +# Consumes LEMON_BACKENDS (defined near the top of this file): the foreach below +# compiles each backend's server source and regenerates the registry headers that +# bind every descriptor to its create(). set(LEMON_DESCRIPTOR_INCLUDES "") set(LEMON_DESCRIPTOR_ENTRIES "") set(LEMON_FACTORY_INCLUDES "") diff --git a/docs/assets/models.js b/docs/assets/models.js index d9814cccb..e3c7762d7 100644 --- a/docs/assets/models.js +++ b/docs/assets/models.js @@ -4,12 +4,14 @@ const RAW_BASE = 'https://raw.githubusercontent.com/lemonade-sdk/lemonade'; /* BEGIN GENERATED: models-js-recipes */ const RECIPE_PRIORITY = [ + 'flm', + 'kokoro', 'llamacpp', + 'moonshine', 'ryzenai-llm', - 'flm', - 'whispercpp', 'sd-cpp', - 'kokoro' + 'vllm', + 'whispercpp' ]; const RECIPE_DISPLAY_NAMES = { diff --git a/docs/tools/gen_backend_boilerplate.py b/docs/tools/gen_backend_boilerplate.py index b4e8ac8d9..940459288 100644 --- a/docs/tools/gen_backend_boilerplate.py +++ b/docs/tools/gen_backend_boilerplate.py @@ -279,12 +279,16 @@ def _js_key(recipe: str) -> str: return recipe if re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", recipe) else f"'{recipe}'" +def _web_display_name(info: dict, recipe: str) -> str: + return info.get("web_display_name") or info.get("display_name", recipe) + + def render_models_js(recipes: dict) -> str: - # RECIPE_PRIORITY: recipes with web_priority > 0, in that order (legacy oga-* - # recipes have no descriptor and are intentionally dropped). + # RECIPE_PRIORITY: every descriptor-backed recipe, ordered alphabetically by + # display name. Listing all of them (rather than an opt-in subset) means a new + # backend can never be silently dropped from the website.
prioritized = sorted( - (r for r, i in recipes.items() if i.get("web_priority", 0) > 0), - key=lambda r: recipes[r]["web_priority"], + recipes, key=lambda r: _web_display_name(recipes[r], r).lower() ) pri_lines = ",\n".join(f" '{r}'" for r in prioritized) @@ -292,7 +296,7 @@ def render_models_js(recipes: dict) -> str: # fallback (matching the curated map, which omits redundant entries). name_lines = [] for r, info in _ordered(recipes): - name = info.get("web_display_name") or info.get("display_name", r) + name = _web_display_name(info, r) if name and name != _js_to_title(r): name_lines.append(f" {_js_key(r)}: '{name}'") names = ",\n".join(name_lines) diff --git a/src/cpp/include/lemon/backends/backend_descriptor.h b/src/cpp/include/lemon/backends/backend_descriptor.h index 03ca71e69..51b357154 100644 --- a/src/cpp/include/lemon/backends/backend_descriptor.h +++ b/src/cpp/include/lemon/backends/backend_descriptor.h @@ -20,8 +20,7 @@ struct BackendOption { std::string group; // CLI help group, e.g. "General Options" }; -// How a backend shares the accelerator. Replaces the router's recipe-string -// checks for NPU exclusivity and LRU slot accounting. +// How a backend shares the accelerator. enum class SlotPolicy { Standard, // counts toward the LRU slots, no device exclusivity (llamacpp, sd-cpp) ExclusiveNpu, // evict ALL npu servers before loading (ryzenai-llm, whispercpp-npu) @@ -69,7 +68,6 @@ struct BackendDescriptor { std::string modality; // "Text generation" | "Speech-to-text" | "Text-to-speech" | "Image generation" bool experimental = false; // true renders "(experimental)" next to the recipe in generated docs std::string web_display_name; // name used on the docs website ("" = fall back to display_name) - int web_priority = 0; // model-grouping order on the docs website (lower = higher; 0 = unlisted) // ROCm release channels this backend publishes (e.g. {"stable","nightly"}). 
// Empty = the backend has no ROCm channels (its "rocm" build is a single diff --git a/src/cpp/include/lemon/backends/backend_ops.h b/src/cpp/include/lemon/backends/backend_ops.h index 047c6795d..29bfa0b2d 100644 --- a/src/cpp/include/lemon/backends/backend_ops.h +++ b/src/cpp/include/lemon/backends/backend_ops.h @@ -3,7 +3,7 @@ #include #include #include -#include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback (server-side only) +#include "lemon/model_manager.h" // ModelInfo, DownloadProgressCallback namespace lemon { @@ -11,16 +11,14 @@ class CloudProviderRegistry; namespace backends { -// Context handed to BackendOps methods — the bits of server state model -// management needs without a running subprocess. Grows as migrations require. +// Context handed to BackendOps methods: the server state model management needs +// without a running subprocess. struct BackendOpsContext { ModelManager* model_manager = nullptr; CloudProviderRegistry* cloud_registry = nullptr; // for dynamic cloud discovery }; -// Inputs for resolving a checkpoint's on-disk path. The model manager computes -// the HF-cache locations generically; each backend's ops decide how to find its -// artifact within (a .gguf file, a genai_config.json directory, a .bin, …). +// Inputs for resolving a checkpoint's on-disk path. struct CheckpointResolveContext { std::string hf_cache; // HF cache root dir std::string model_cache_path; // hf_cache/ @@ -35,13 +33,10 @@ struct CheckpointResolveContext { // running subprocess: checkpoint-path resolution, download, dynamic discovery, // per-model metadata, version detection, availability. One singleton per // backend, exposed via lemon::backends::::ops() and bound in the registry -// (see BackendRegistration::ops). -// -// The base class is the shared default behavior (the common HF-backed case); -// each backend folder overrides ONLY the policy points it needs, so shared -// logic is inherited rather than copied. 
Methods are added here incrementally as -// switchboards in model_manager / system_info are migrated; every method has a -// default so adding one never forces edits to backends that don't override it. +// (see BackendRegistration::ops). The base class provides shared default +// behavior; backends override only the policy points they need. Every method +// has a default, so adding one never forces edits to backends that don't +// override it. class BackendOps { public: virtual ~BackendOps() = default; @@ -55,8 +50,7 @@ class BackendOps { // Resolve a checkpoint to its absolute on-disk path (file or directory). // Default: the shared HF behavior — locate the variant/aux file in the active - // snapshot, else fall back to the model cache directory. Backends with a - // bespoke artifact layout (GGUF file, genai_config.json dir, .bin, …) override. + // snapshot, else fall back to the model cache directory. virtual std::string resolve_checkpoint_path(const ModelInfo& info, const CheckpointResolveContext& ctx) const; @@ -71,7 +65,6 @@ class BackendOps { // Validate a user-supplied checkpoint string when registering a new model. // Return an error message if invalid, "" if acceptable. Default: accept. - // llamacpp requires a :variant on GGUF checkpoints. virtual std::string validate_registration_checkpoint(const std::string& checkpoint) const { (void)checkpoint; return ""; @@ -81,7 +74,6 @@ class BackendOps { // `main_variant`, for backends whose artifact layout isn't a GGUF file. // Return nullopt to use the default GGUF selection. (Direct single-file // variants — .safetensors/.pth/.ckpt — are handled generically upstream.) - // moonshine overrides: its variant names a directory of files to fetch. 
virtual std::optional> select_checkpoint_files( const std::string& main_variant, const std::vector& repo_files) const { (void)main_variant; @@ -90,40 +82,35 @@ class BackendOps { } // Models supplied at runtime rather than from server_models.json (descriptor - // dynamic_models = true). Default: none. cloud/flm override. + // dynamic_models = true). Default: none. virtual std::vector discover_models(const BackendOpsContext& ctx) const { (void)ctx; return {}; } // Whether a model's local artifacts are present. Default: the shared HF - // checkpoint-completeness check (ModelManager::checkpoints_complete). cloud - // (always true) and flm (installed-set membership) override. + // checkpoint-completeness check (ModelManager::checkpoints_complete). virtual bool is_downloaded(const ModelInfo& info, const BackendOpsContext& ctx) const; // Validate a resolved checkpoint file for the cache. Returns "" if valid, or - // a reason it should be treated as not-downloaded. Default: always valid; - // llamacpp checks GGUF magic. + // a reason it should be treated as not-downloaded. Default: always valid. virtual std::string validate_checkpoint_file(const std::string& resolved_path) const { (void)resolved_path; return ""; } // Download a model's artifacts. Default: the shared Hugging Face download. - // cloud (no-op) and flm (flm pull) override. virtual void download_model(const ModelInfo& info, bool do_not_upgrade, DownloadProgressCallback progress, const BackendOpsContext& ctx) const; // Whether the model cache must be rebuilt after this backend downloads a - // model (e.g. flm, whose model list changes). Default: false. + // model. Default: false. virtual bool invalidates_cache_after_download() const { return false; } // Resolve a backend's installed version for a given backend variant. The // caller passes the version read from the on-disk version.txt (or "" if - // absent); the default returns it unchanged. 
Backends that detect their - // version another way override: llamacpp's "system" build runs - // `llama-server --version`; flm queries `flm version` when no file is present. + // absent); the default returns it unchanged. virtual std::string resolve_version(const std::string& backend, const std::string& file_version) const { (void)backend; @@ -139,8 +126,6 @@ class BackendOps { // Decide whether a backend variant is installed, given whether its managed // binary was found on disk. Default: installed iff the binary was found. - // llamacpp's "system" build also requires the ggml HIP plugin when an AMD GPU - // is present; flm can be a system PATH package even without a managed binary. virtual InstallCheck check_install(const std::string& backend, bool binary_found) const { (void)backend; return {binary_found, ""}; @@ -158,8 +143,7 @@ class BackendOps { // Classify a "supported but not available" backend variant for /system-info, // given the install probe's error text and the generic install command the // caller would otherwise use. Return nullopt to use the generic - // installable/no-fetch default. flm overrides: it is a system .deb + drivers - // needing manual setup, so its states and remediation links differ. + // installable/no-fetch default. virtual std::optional classify_unavailable( const std::string& backend, const std::string& install_error, const std::string& default_install_command) const { diff --git a/src/cpp/include/lemon/backends/backend_registry.h b/src/cpp/include/lemon/backends/backend_registry.h index 240ddf728..4f4e7b6f8 100644 --- a/src/cpp/include/lemon/backends/backend_registry.h +++ b/src/cpp/include/lemon/backends/backend_registry.h @@ -18,8 +18,7 @@ namespace backends { struct BackendSpec; // install/download spec, defined in backend_utils.h -// Everything a backend's create() needs to build an instance. Mirrors the -// arguments the old router factory passed to each backend constructor. 
+// Everything a backend's create() needs to build an instance. struct BackendContext { std::string log_level; ModelManager* model_manager = nullptr; @@ -31,8 +30,7 @@ struct BackendContext { using BackendCreateFn = std::unique_ptr (*)(const BackendContext&); // Convenience for the common create(): construct a server class from the -// standard (log_level, model_manager, backend_manager) context fields. Backends -// needing extra constructor arguments (cloud, ryzenai) build theirs by hand. +// standard (log_level, model_manager, backend_manager) context fields. template std::unique_ptr make_server(const BackendContext& ctx) { return std::make_unique(ctx.log_level, ctx.model_manager, ctx.backend_manager); diff --git a/src/cpp/include/lemon/backends/backend_utils.h b/src/cpp/include/lemon/backends/backend_utils.h index bdbfe0869..8e2e532a5 100644 --- a/src/cpp/include/lemon/backends/backend_utils.h +++ b/src/cpp/include/lemon/backends/backend_utils.h @@ -45,9 +45,7 @@ namespace lemon::backends { // Build a backend's install/download spec from its descriptor's recipe/binary // and the server class T's get_install_params. The construct-on-first-use - // static gives the registry a stable pointer. Backends whose install key - // differs from the recipe (ryzenai) or that have no installable artifact - // (cloud) build their BackendSpec by hand instead of using this. + // static gives the registry a stable pointer. 
template const BackendSpec* make_spec(const BackendDescriptor& d, bool split = false) { static const BackendSpec kSpec(d.recipe, d.binary, T::get_install_params, split); diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h index 24049ab31..dcdf345ac 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm.h @@ -31,7 +31,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ false, /*web_display_name*/ "FastFlowLM NPU", - /*web_priority*/ 3, /*rocm_channels*/ {}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ false, diff --git a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h index 87470300c..ee690e16a 100644 --- a/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h +++ b/src/cpp/include/lemon/backends/fastflowlm/fastflowlm_models.h @@ -22,8 +22,7 @@ std::vector flm_installed_checkpoints(); std::vector flm_discover_models(); // FLM-specific model-file helpers. FLM stores models under FLM_MODEL_PATH / -// platform-default roots and describes them with a config.json; this knowledge -// lives in the fastflowlm backend folder rather than in the shared model manager. +// platform-default roots and describes them with a config.json. // Derive the on-disk repo directory name from an FLM model URL. 
std::string repo_dir_from_url(const std::string& url); diff --git a/src/cpp/include/lemon/backends/kokoro/kokoro.h b/src/cpp/include/lemon/backends/kokoro/kokoro.h index 5f3fbf97c..3c34dc268 100644 --- a/src/cpp/include/lemon/backends/kokoro/kokoro.h +++ b/src/cpp/include/lemon/backends/kokoro/kokoro.h @@ -32,7 +32,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text-to-speech", /*experimental*/ false, /*web_display_name*/ "", - /*web_priority*/ 6, /*rocm_channels*/ {}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ false, diff --git a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h index 7c58a73f3..f0308cfb2 100644 --- a/src/cpp/include/lemon/backends/llamacpp/llamacpp.h +++ b/src/cpp/include/lemon/backends/llamacpp/llamacpp.h @@ -45,7 +45,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ false, /*web_display_name*/ "llama.cpp GPU", - /*web_priority*/ 1, /*rocm_channels*/ {"stable", "nightly"}, /*exposes_prometheus_metrics*/ true, /*rocm_requires_cwsr_fix*/ true, diff --git a/src/cpp/include/lemon/backends/moonshine/moonshine.h b/src/cpp/include/lemon/backends/moonshine/moonshine.h index ae7313714..171b8a51f 100644 --- a/src/cpp/include/lemon/backends/moonshine/moonshine.h +++ b/src/cpp/include/lemon/backends/moonshine/moonshine.h @@ -32,7 +32,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Speech-to-text", /*experimental*/ false, /*web_display_name*/ "", - /*web_priority*/ 0, /*rocm_channels*/ {}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ false, diff --git a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h index dbc15d7f3..6df5511e4 100644 --- a/src/cpp/include/lemon/backends/ryzenai/ryzenai.h +++ b/src/cpp/include/lemon/backends/ryzenai/ryzenai.h @@ -31,7 +31,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text 
generation", /*experimental*/ false, /*web_display_name*/ "Ryzen AI SW NPU", - /*web_priority*/ 2, /*rocm_channels*/ {}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ false, diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h index 986d26fbe..dadd24fe9 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp.h @@ -49,7 +49,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Image generation", /*experimental*/ false, /*web_display_name*/ "stable-diffusion.cpp", - /*web_priority*/ 5, /*rocm_channels*/ {"stable"}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ true, diff --git a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h index 185108afc..7e2acf048 100644 --- a/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h +++ b/src/cpp/include/lemon/backends/sdcpp/sdcpp_server.h @@ -46,20 +46,6 @@ class SDServer : public WrappedServer, public IImageServer { // // sd-server's HTTP API does not expose an upscaling endpoint, so we use the // sd-cli binary's -M upscale mode as a subprocess. - // - // Called by Server::handle_image_upscale (server.cpp), which is registered - // as the route handler for POST /api/v1/images/upscale (see register_post - // in Server::Server). 
- // - // Endpoint: POST /api/v1/images/upscale - // Request body (JSON): - // { "image": "", "model": "" } - // Success response (200): - // { "created": , "data": [{ "b64_json": "" }] } - // Error responses: - // 400 - missing "image" or "model" field - // 404 - model name not found in server_models.json - // 500 - upscale subprocess failed or sd-cli binary not found static std::string upscale_via_cli( const std::string& b64_image, const std::string& upscale_model_path, diff --git a/src/cpp/include/lemon/backends/vllm/vllm.h b/src/cpp/include/lemon/backends/vllm/vllm.h index 8984e15b3..b62fbd83f 100644 --- a/src/cpp/include/lemon/backends/vllm/vllm.h +++ b/src/cpp/include/lemon/backends/vllm/vllm.h @@ -32,7 +32,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Text generation", /*experimental*/ true, /*web_display_name*/ "", - /*web_priority*/ 0, /*rocm_channels*/ {}, // single rocm artifact, no stable/nightly channels /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ true, diff --git a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h index 9c38b66d5..28c617ec2 100644 --- a/src/cpp/include/lemon/backends/whispercpp/whispercpp.h +++ b/src/cpp/include/lemon/backends/whispercpp/whispercpp.h @@ -41,7 +41,6 @@ inline const BackendDescriptor descriptor = { /*modality*/ "Speech-to-text", /*experimental*/ false, /*web_display_name*/ "whisper.cpp", - /*web_priority*/ 4, /*rocm_channels*/ {}, /*exposes_prometheus_metrics*/ false, /*rocm_requires_cwsr_fix*/ false, diff --git a/src/cpp/include/lemon/config_file.h b/src/cpp/include/lemon/config_file.h index 8c46e125f..f0353a345 100644 --- a/src/cpp/include/lemon/config_file.h +++ b/src/cpp/include/lemon/config_file.h @@ -86,9 +86,7 @@ class ConfigFile { public: /// The canonical default config: resources/defaults.json (global keys) with /// each backend's per-recipe section seeded from its descriptor. 
Host- and - /// deployment-independent, so it is reproducible — this is what - /// GET /internal/config/defaults emits and gen_backend_boilerplate.py writes - /// back into resources/defaults.json. + /// deployment-independent. static json base_defaults(); /// base_defaults() plus deployment overrides. On Linux, an optional distro diff --git a/src/cpp/include/lemon/gguf_reader.h b/src/cpp/include/lemon/gguf_reader.h index 8eb4be3ad..865058067 100644 --- a/src/cpp/include/lemon/gguf_reader.h +++ b/src/cpp/include/lemon/gguf_reader.h @@ -219,7 +219,6 @@ inline bool read_gguf_metadata(GgufMetadata& out, const std::string& path) { uint32_t type = 0; if (!read_gguf_string(in, key) || !read_gguf_le(in, type)) return false; - // Read architecture if (key == "general.architecture" && type == 8) { if (!read_gguf_string(in, out.architecture)) return false; if (pending_context_length > 0) { diff --git a/src/cpp/include/lemon/model_types.h b/src/cpp/include/lemon/model_types.h index c92bedb37..855912f16 100644 --- a/src/cpp/include/lemon/model_types.h +++ b/src/cpp/include/lemon/model_types.h @@ -140,11 +140,7 @@ inline ModelType get_model_type_from_labels(const std::vector& labe } // Fallback device type for recipes with no registered backend descriptor -// (collections and unknown recipes). The authoritative per-backend default lives -// in BackendDescriptor::default_device; ModelManager::device_type_for_recipe -// consults the descriptor registry first and only falls back here. Kept in this -// low-level header (which must not depend on the backend registry) for that -// fallback alone — it intentionally carries no per-backend knowledge. +// (collections and unknown recipes); the descriptor registry is authoritative. 
inline DeviceType get_device_type_from_recipe(const std::string& recipe) { (void)recipe; return DEVICE_NONE; diff --git a/src/cpp/include/lemon/wrapped_server.h b/src/cpp/include/lemon/wrapped_server.h index 41e91595b..3ad8465f7 100644 --- a/src/cpp/include/lemon/wrapped_server.h +++ b/src/cpp/include/lemon/wrapped_server.h @@ -308,7 +308,6 @@ class WrappedServer : public ICompletionServer { // No-op by default } - // ICompletionServer implementation - forward requests to the wrapped server. // Default to an "unsupported" error so non-chat backends (TTS, image, // transcription) inherit a sensible response instead of stubbing each one. virtual json chat_completion(const json& request) override { diff --git a/src/cpp/server/backends/backend_ops.cpp b/src/cpp/server/backends/backend_ops.cpp index 2f4cdf48c..0fc96623a 100644 --- a/src/cpp/server/backends/backend_ops.cpp +++ b/src/cpp/server/backends/backend_ops.cpp @@ -15,8 +15,7 @@ using lemon::utils::path_to_utf8; // Default checkpoint resolution: the shared Hugging Face behavior. Locate the // requested variant (or auxiliary file like mmproj) within the active snapshot, -// falling back to the main repo and finally the model cache directory. Backends -// with bespoke layouts override resolve_checkpoint_path(). +// falling back to the main repo and finally the model cache directory. std::string BackendOps::resolve_checkpoint_path(const ModelInfo& info, const CheckpointResolveContext& ctx) const { (void)info; diff --git a/src/cpp/server/backends/backend_utils.cpp b/src/cpp/server/backends/backend_utils.cpp index 42c0d1709..8ccf12632 100644 --- a/src/cpp/server/backends/backend_utils.cpp +++ b/src/cpp/server/backends/backend_utils.cpp @@ -34,8 +34,7 @@ using json = nlohmann::json; namespace lemon::backends { const BackendSpec* try_get_spec_for_recipe(const std::string& recipe) { - // Each backend exposes its install/download spec through the registry - // (see ::spec()); no per-recipe branches or server includes here. 
+ // Each backend exposes its install/download spec through the registry. return spec_for(recipe); } @@ -545,8 +544,6 @@ namespace lemon::backends { // Remove the downloaded archive on ANY exit from here on — success // OR exception, including a throw from commit_staged_install() below // (a swap/rename failure) — so the cache archive is never leaked. - // Mirrors StagingGuard above; replaces the per-throw fs::remove(zip_path) - // calls that did not cover the commit_staged_install throw path. struct ZipGuard { const std::string& path; ~ZipGuard() { @@ -754,9 +751,7 @@ namespace lemon::backends { LOG(ERROR, spec.log_name()) << "Extraction completed but executable not found" << std::endl; throw std::runtime_error("Extraction failed: executable not found"); } - // Swap succeeded: staging was consumed by the rename, so disarm the - // guard (its cleanup would now be a no-op, but disarm to make intent - // explicit and skip a pointless filesystem call). + // Swap succeeded: staging was consumed by the rename, so disarm the guard. staging_guard.active = false; LOG(DEBUG, spec.log_name()) << "Executable verified at: " << exe_path << std::endl; diff --git a/src/cpp/server/backends/cloud/cloud_server.cpp b/src/cpp/server/backends/cloud/cloud_server.cpp index 3c61c213b..0db3c3344 100644 --- a/src/cpp/server/backends/cloud/cloud_server.cpp +++ b/src/cpp/server/backends/cloud/cloud_server.cpp @@ -23,19 +23,8 @@ bool id_contains(const std::string& id, const std::string& needle) { return id.find(needle) != std::string::npos; } -// Pattern-based fallback for /v1/models entries that don't publish any -// capability metadata (notably OpenAI, whose response is just -// {id, object, owned_by, created}). 
The patterns cover the model -// families we currently know about: -// - Image/video: flux, stable-diffusion, sdxl, sd-, dall-e, gpt-image, -// chatgpt-image, sora -// - Audio: whisper, tts, *-transcribe, gpt-realtime, gpt-audio -// - Reranking: rerank -// - Embeddings: embed, bge-, nomic- -// - Classifiers: moderation -// Anything else falls through to LLM. New providers that publish -// capability metadata (see is_chat_model below) bypass this entirely -// and don't need new patterns. +// Id-pattern fallback for /v1/models entries that don't publish capability +// metadata (notably OpenAI). Anything unmatched falls through to LLM. ModelType infer_type(const std::string& id) { if (id_contains(id, "flux") || id_contains(id, "stable-diffusion") || id_contains(id, "sdxl") || id_contains(id, "sd-") || @@ -61,21 +50,12 @@ ModelType infer_type(const std::string& id) { } // Decide whether a /v1/models entry should be surfaced as a chat model. -// -// Strategy: trust provider-supplied capability metadata when it exists, -// fall back to id pattern matching only when there is none. This keeps -// the substring list bounded — adding a new provider that publishes -// capabilities does not require adding new patterns. -// -// Signals checked, in priority order: +// Trust provider capability metadata first, in priority order, falling back +// to infer_type(id) for bare responses: // 1. supports_chat: bool — Fireworks -// 2. capabilities: [string] — generic ("chat", "chat.completions", -// "embeddings", "image_generation", ...) -// 3. architecture.modality — OpenRouter ("text->text", -// "text+image->text", "text->image", ...) -// Anything that produces text via chat is -// considered chat-capable. -// 4. infer_type(id) == LLM — fallback for bare responses (OpenAI). +// 2. capabilities: [string] — generic +// 3. architecture.modality — OpenRouter +// 4. 
type: string — Together AI bool is_chat_model(const json& m) { if (!m.is_object() || !m.contains("id") || !m["id"].is_string()) { return false; @@ -142,24 +122,11 @@ std::vector chat_labels() { } // Detect capability labels (vision / tool-calling / reasoning) from a -// /v1/models entry and normalise the divergent fields providers use into -// lemonade's shared label vocabulary, so cloud models gate inputs exactly -// like local ones (the UI offers image upload iff "vision" is present, etc.). -// -// Strategy mirrors is_chat_model: trust structured provider metadata first, -// fall back to id patterns only for providers that publish none (OpenAI). -// When a signal is absent the capability defaults OFF — under-offering an -// input is safer than letting the client attach an image the provider rejects -// (the per-model override exists for the cases auto-detection can't cover). -// -// Recognised signals: -// vision — supports_image_input (Fireworks); supports_vision/vision bools; -// architecture.input_modalities ⊇ "image" (OpenRouter); -// modalities/input_modalities ⊇ "image". -// tools — supports_tools (Fireworks); supported_parameters ⊇ "tools" -// (OpenRouter); capabilities ⊇ "tools"/"function_calling"; -// function_calling/supports_function_calling bools. -// reason — supported_parameters ⊇ "reasoning"; reasoning/supports_reasoning. +// /v1/models entry, normalising the divergent fields providers use into +// lemonade's shared label vocabulary so cloud models gate inputs like local +// ones. When a signal is absent the capability defaults OFF — under-offering +// an input is safer than letting the client attach an image the provider +// rejects (the per-model override covers cases auto-detection can't). 
std::vector capability_labels(const json& m) { std::vector labels; if (!m.is_object()) return labels; @@ -175,7 +142,6 @@ std::vector capability_labels(const json& m) { return false; }; - // ---- vision ---- bool vision = flag("supports_image_input") || flag("supports_vision") || flag("vision") || array_has(m.value("modalities", json::array()), "image") || @@ -185,7 +151,6 @@ std::vector capability_labels(const json& m) { "image"); } - // ---- tool-calling ---- const json params = m.value("supported_parameters", json::array()); const json caps = m.value("capabilities", json::array()); bool tools = flag("supports_tools") || flag("function_calling") || @@ -194,14 +159,12 @@ std::vector capability_labels(const json& m) { array_has(caps, "tools") || array_has(caps, "function_calling") || array_has(caps, "tool_calling"); - // ---- reasoning ---- bool reasoning = flag("reasoning") || flag("supports_reasoning") || array_has(params, "reasoning") || array_has(params, "include_reasoning"); - // ---- id-pattern fallback for metadata-barren providers (OpenAI) ---- - // Only consulted when the entry carries no structured capability hints at - // all, so an authoritative "false" from a provider is never overridden. + // Id-pattern fallback, consulted only when the entry carries no structured + // capability hints, so an authoritative "false" from a provider stands. const bool has_meta = m.contains("supports_image_input") || m.contains("supports_vision") || m.contains("vision") || m.contains("supports_tools") || @@ -274,34 +237,10 @@ std::pair parse_cloud_cost(const json& m) { return cost; } -// Build the user-facing model name from a provider's upstream id, applying -// two universal cleanup rules (no provider-specific code): -// -// 1. Collapse "accounts//models/" -> "/". This is a -// content-pattern match (the GCP-style resource-path convention used -// by Fireworks). 
Any provider that adopts the same shape benefits -// automatically; providers using flat ids ("gpt-4o") or other -// namespaces ("meta-llama/Llama-3.3-70B-Instruct-Turbo") pass through -// untouched. -// -// 2. If the cleaned id leads with "/", strip it before adding -// the wrapping "/" prefix — otherwise Fireworks's first- -// party models ("fireworks/...") would render as -// "fireworks/fireworks/...". -// -// The provider namespace is joined with a "." separator (matching the -// "user."/"extra." namespacing used elsewhere); the cleaned upstream id keeps -// its own native "/" separators. -// -// Examples: -// provider="fireworks", id="accounts/fireworks/models/deepseek-v4-pro" -// -> "fireworks.deepseek-v4-pro" -// provider="fireworks", id="accounts/trilogy/models/cogsci-..." -// -> "fireworks.trilogy/cogsci-..." -// provider="openai", id="gpt-4o" -// -> "openai.gpt-4o" -// provider="together", id="meta-llama/Llama-3.3-70B-Instruct-Turbo" -// -> "together.meta-llama/Llama-3.3-70B-Instruct-Turbo" +// Build the user-facing model name "." by +// applying two content-pattern cleanup rules (no provider-specific code). +// Example: provider="fireworks", id="accounts/fireworks/models/deepseek-v4-pro" +// -> "fireworks.deepseek-v4-pro". std::string build_public_name(const std::string& provider, const std::string& upstream_id) { std::string cleaned = upstream_id; @@ -500,11 +439,8 @@ json CloudServer::post_with_auth(const std::string& path, const json& request, try { auto response = utils::HttpClient::post(url, request.dump(), headers, timeout_seconds); if (response.status_code == 200) { - // Telemetry: the chat/completions handler in server.cpp parses - // the `usage` field off the returned JSON and calls - // Router::update_telemetry / update_prompt_tokens. CloudServer - // returns the body unchanged so that path picks up the same - // prompt/completion counts every other backend reports. 
+ // Return the body unchanged so the server.cpp handler picks up the + // `usage` telemetry like every other backend. return json::parse(response.body); } @@ -549,12 +485,8 @@ void CloudServer::forward_streaming_request(const std::string& endpoint, bool sse, long timeout_seconds, TelemetryCallback telemetry_callback) { - // Telemetry from cloud streaming responses: OpenAI-shape SSE puts the - // usage block in the final pre-[DONE] chunk. We don't parse it here — - // the Router-level streaming path delivers cleaner numbers than we can - // reconstruct from chunked output, and matching local backends here - // would only diverge subtly. Passing the callback through preserves the - // contract for callers that pass one in. + // Streaming telemetry is left to the Router-level path, which produces + // cleaner numbers than reconstructing them from chunked SSE output. (void) telemetry_callback; auto sse_error = [](const std::string& message, const std::string& type, const json& extra = json::object()) { @@ -638,7 +570,6 @@ void CloudServer::forward_streaming_request(const std::string& endpoint, if (length == 0) return true; if (first_chunk) { first_chunk = false; - // Skip leading whitespace before classifying. size_t i = 0; while (i < length && std::isspace(static_cast(data[i]))) ++i; if (i < length && (data[i] == 'd' || data[i] == ':')) { @@ -777,32 +708,20 @@ std::vector CloudServer::discover_models(const std::string& provider, } for (const auto& m : *model_array) { - // Chat-only by design. CloudServer implements chat_completion / - // completion against OpenAI v1; embeddings, audio, reranking, and - // image use diverging wire formats across providers and belong in - // sibling backends. is_chat_model() trusts provider-supplied - // capability metadata first (supports_chat, capabilities, - // architecture.modality) and falls back to id pattern matching for - // bare responses, so the router never sees a cloud model it cannot - // dispatch. 
+ // Chat-only by design; embeddings/audio/reranking/image belong in + // sibling backends with diverging wire formats. if (!is_chat_model(m)) { continue; } std::string upstream_id = m["id"].get(); ModelInfo info; - // Public name = ".". The cleanup - // rules in build_public_name() are content-pattern based and apply - // universally to any provider — see the function comment for the - // examples and rationale. info.model_name = build_public_name(provider, upstream_id); info.checkpoints["main"] = upstream_id; info.recipe = "cloud"; info.cloud_provider = provider; - // Discovered models are "suggested" because the user explicitly - // configured this provider — they wouldn't have a working API key - // otherwise. Without this, the Model Manager UI's default - // suggested-only filter hides every cloud model. + // Mark suggested so the Model Manager's default suggested-only filter + // doesn't hide every cloud model the user explicitly configured. info.suggested = true; info.downloaded = true; // Cloud models have no local artifacts. info.size = 0.0; @@ -812,9 +731,7 @@ std::vector CloudServer::discover_models(const std::string& provider, for (auto& cap : capability_labels(m)) { info.labels.push_back(std::move(cap)); } - // Static metadata the providers publish (all three give context_length; - // OpenRouter/Together also give pricing). Surfaced in /models, /health - // and the discover response — display only, never affects routing. + // Display-only metadata; never affects routing. if (m.contains("context_length") && m["context_length"].is_number_integer()) { info.max_context_window = m["context_length"].get(); } @@ -849,24 +766,21 @@ class CloudOps : public BackendOps { public: std::string resolve_checkpoint_path(const ModelInfo&, const CheckpointResolveContext&) const override { - // Cloud-offloaded models have no local artifacts; the checkpoint is the - // upstream provider's model id, used directly when forwarding requests. 
+ // Cloud models have no local artifacts; the checkpoint is the upstream + // provider's model id, used directly when forwarding requests. return ""; } - // Cloud models have no local artifacts — always "downloaded". bool is_downloaded(const ModelInfo&, const BackendOpsContext&) const override { return true; } - // "Downloading" a cloud model is a no-op. void download_model(const ModelInfo&, bool, DownloadProgressCallback, const BackendOpsContext&) const override {} // Discover models from each installed cloud provider with a resolvable - // credential. Per AGENTS.md invariant #11 the registry persists only - // {provider, base_url}; keys come from env vars / process memory. Failures - // are logged, never propagated, so one offline provider can't block discovery. + // credential. Failures are logged, never propagated, so one offline + // provider can't block discovery. std::vector discover_models(const BackendOpsContext& ctx) const override { std::vector out; if (ctx.cloud_registry == nullptr) { diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp index 83d2080bc..8a424234e 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_models.cpp @@ -145,7 +145,6 @@ std::vector flm_installed_checkpoints() { std::string flm_path = find_flm_binary(); if (flm_path.empty()) return installed_models; - // Run 'flm list --filter installed --quiet --json' to get only installed models std::string output; #ifdef _WIN32 std::string command = "\"" + flm_path + "\" list --filter installed --quiet --json 2>NUL"; @@ -180,7 +179,7 @@ std::vector flm_installed_checkpoints() { // Fallback to legacy parsing if JSON parsing fails } - // Legacy parsing - cleaner format without emojis + // Legacy parsing // Expected format: // Models: // - modelname:tag @@ -188,11 +187,9 @@ std::vector flm_installed_checkpoints() { std::istringstream stream(output); 
std::string line; while (std::getline(stream, line)) { - // Trim whitespace line.erase(0, line.find_first_not_of(" \t\r\n")); line.erase(line.find_last_not_of(" \t\r\n") + 1); - // Skip the "Models:" header line or empty lines if (line == "Models:" || line.empty()) { continue; } @@ -200,7 +197,6 @@ std::vector flm_installed_checkpoints() { // Parse model checkpoint (format: " - modelname:tag") if (line.find("- ") == 0) { std::string checkpoint = line.substr(2); - // Trim any remaining whitespace checkpoint.erase(0, checkpoint.find_first_not_of(" \t")); checkpoint.erase(checkpoint.find_last_not_of(" \t") + 1); if (!checkpoint.empty()) { @@ -223,7 +219,6 @@ std::vector flm_discover_models() { LOG(INFO, "ModelManager") << "FLM binary found at: " << flm_path << std::endl; - // Run 'flm list --json' to get all available models std::string output; #ifdef _WIN32 std::string command = "\"" + flm_path + "\" list --json"; @@ -260,7 +255,6 @@ std::vector flm_discover_models() { // Format display name: replace : with -, append -FLM // e.g., "llama3.2:1b" -> "llama3.2-1b-FLM" std::string display_name = checkpoint; - // Replace : with - std::replace(display_name.begin(), display_name.end(), ':', '-'); std::string model_name = display_name + "-FLM"; @@ -285,7 +279,6 @@ std::vector flm_discover_models() { info.size = m["footprint"].get(); } - // Labels from FLM metadata if (m.contains("label") && m["label"].is_array()) { for (const auto& l : m["label"]) { if (l.is_string()) { @@ -294,7 +287,6 @@ std::vector flm_discover_models() { } } - // Populate type and device fields (multi-model support) info.type = get_model_type_from_labels(info.labels); const BackendDescriptor* flm_desc = descriptor_for("flm"); info.device = flm_desc ? 
flm_desc->default_device : DEVICE_NPU; @@ -317,7 +309,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, DownloadProgressCallback progress_callback) { LOG(INFO, "ModelManager") << "Pulling FLM model: " << checkpoint << std::endl; - // Ensure FLM is ready (single source of truth) auto status = SystemInfoCache::get_flm_status(); if (!status.is_ready()) { throw std::runtime_error(status.error_string()); @@ -328,7 +319,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, throw std::runtime_error("FLM executable not found"); } - // Prepare arguments std::vector args = {"pull", checkpoint}; if (!do_not_upgrade) { args.push_back("--force"); @@ -346,14 +336,11 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, std::string current_filename; bool cancelled = false; - // Run flm pull command and parse output int exit_code = lemon::utils::ProcessManager::run_process_with_output( flm_path, args, [&](const std::string& line) -> bool { - // Always print the line to console LOG(INFO, "FLM") << line << std::endl; - // Parse FLM output to extract progress information // Pattern: "[FLM] Downloading X/Y: filename" if (line.find("[FLM] Downloading ") != std::string::npos && line.find("/") != std::string::npos && @@ -370,7 +357,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, total_files = std::stoi(line.substr(slash + 1, colon - slash - 1)); current_filename = line.substr(colon + 2); // Skip ": " - // Send progress update if (progress_callback) { DownloadProgress progress; progress.file = current_filename; @@ -395,7 +381,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, else if (line.find("[FLM] Downloading: ") != std::string::npos && line.find("%") != std::string::npos) { - // Extract percentage and bytes size_t start = line.find("Downloading: ") + 13; size_t pct_end = line.find("%", start); @@ -440,7 +425,6 @@ void flm_download(const std::string& checkpoint, bool 
do_not_upgrade, bytes_total = parse_size(total_str); } - // Send progress update with byte-level info if (progress_callback) { DownloadProgress progress; progress.file = current_filename; @@ -518,7 +502,6 @@ void flm_download(const std::string& checkpoint, bool do_not_upgrade, throw std::runtime_error("FLM pull failed with exit code: " + std::to_string(exit_code)); } - // Send completion event if (progress_callback) { DownloadProgress progress; progress.complete = true; @@ -541,7 +524,6 @@ std::string flm_version() { return cached_version; } - // Find the flm executable using shared utility std::string flm_path = find_flm_executable(); if (flm_path.empty() || !lemon::utils::is_safe_executable_path(flm_path)) { return "unknown"; @@ -588,7 +570,6 @@ std::string flm_version() { size_t pos = output.find("FLM v"); // Keep the 'v' prefix so it matches backend_versions.json (e.g. "v0.9.34"). std::string version = output.substr(pos + 4); - // Trim whitespace and newlines size_t end = version.find_first_of(" \t\n\r"); if (end != std::string::npos) { version = version.substr(0, end); @@ -664,7 +645,6 @@ bool run_flm_validate(const std::string& flm_path, std::string& error_message) { if (!output.empty()) { json j = lemon::utils::JsonUtils::parse(output); if (j.is_object()) { - // Check for overall status bool validation_ok = false; if (j.contains("ready")) { validation_ok = j["ready"].get(); diff --git a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp index 050b5a961..94916d5d2 100644 --- a/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp +++ b/src/cpp/server/backends/fastflowlm/fastflowlm_server.cpp @@ -32,7 +32,6 @@ namespace fs = std::filesystem; namespace lemon { namespace backends { -// URL to direct users to for driver updates static const std::string DRIVER_INSTALL_URL = "https://lemonade-server.ai/driver_install.html"; @@ -78,7 +77,6 @@ FastFlowLMServer::~FastFlowLMServer() { std::string 
FastFlowLMServer::download_model(const std::string& checkpoint, bool do_not_upgrade) { LOG(INFO, "FastFlowLM") << "Pulling model with FLM: " << checkpoint << std::endl; - // Use flm pull command to download the model std::string flm_path = get_flm_path(); if (flm_path.empty()) { throw std::runtime_error("FLM not found"); @@ -95,7 +93,6 @@ std::string FastFlowLMServer::download_model(const std::string& checkpoint, bool } LOG(INFO, "ProcessManager") << std::endl; - // Run flm pull command (with debug output if enabled) auto handle = utils::ProcessManager::start_process(flm_path, args, "", is_debug()); // Wait for process to complete (handles both fast exits and long downloads). @@ -157,7 +154,6 @@ void FastFlowLMServer::load(const std::string& model_name, bool do_not_upgrade) { LOG(INFO, "FastFlowLM") << "Loading model: " << model_name << std::endl; - // Get FLM-specific options from RecipeOptions int ctx_size = options.get_option("ctx_size"); std::cout << "[FastFlowLM] Options: ctx_size=" << ctx_size << std::endl; @@ -165,11 +161,9 @@ void FastFlowLMServer::load(const std::string& model_name, // We use checkpoint_ (base class field) for FLM API calls #ifdef _WIN32 - // On Windows, auto-install FLM binary if needed (downloads zip and extracts) backend_manager_->install_backend(fastflowlm::spec()->recipe, "npu"); #endif - // Validate NPU hardware/drivers std::string flm_path = get_flm_path(); std::string validate_error; if (!fastflowlm::run_flm_validate(flm_path, validate_error)) { @@ -177,17 +171,13 @@ void FastFlowLMServer::load(const std::string& model_name, "\nVisit " + DRIVER_INSTALL_URL + " for driver installation instructions."); } - // Download model if needed download_model(model_info.checkpoint(), do_not_upgrade); - // Choose a port port_ = choose_port(); - // Construct flm serve command based on model type // Bind to localhost only for security std::vector args; if (model_type_ == ModelType::TRANSCRIPTION) { - // ASR mode: flm serve --asr 1 args = { 
"serve", "--asr", "1", @@ -196,7 +186,6 @@ void FastFlowLMServer::load(const std::string& model_name, "--quiet" }; } else if (model_type_ == ModelType::EMBEDDING) { - // Embedding mode: flm serve --embed 1 args = { "serve", "--embed", "1", @@ -205,7 +194,6 @@ void FastFlowLMServer::load(const std::string& model_name, "--quiet" }; } else { - // LLM mode (default): flm serve --ctx-len N args = { "serve", model_info.checkpoint(), @@ -226,7 +214,6 @@ void FastFlowLMServer::load(const std::string& model_name, set_process_handle(utils::ProcessManager::start_process(flm_path, args, "", is_debug(), true)); LOG(INFO, "ProcessManager") << "Process started successfully" << std::endl; - // Wait for flm-server to be ready bool ready = wait_for_ready(); if (!ready) { const ProcessHandle handle = consume_process_handle_for_cleanup(); @@ -277,14 +264,12 @@ bool FastFlowLMServer::wait_for_ready() { return false; } - // Try to reach the /api/tags endpoint if (utils::HttpClient::is_reachable(tags_url, 1)) { LOG(INFO, "FastFlowLM") << server_name_ + " is ready!" 
<< std::endl; start_backend_watchdog("/api/tags"); return true; } - // Sleep 1 second between attempts std::this_thread::sleep_for(std::chrono::seconds(1)); } @@ -303,7 +288,7 @@ json FastFlowLMServer::chat_completion(const json& request) { // FLM requires the checkpoint name in the request (e.g., "gemma3:4b") // (whereas llama-server ignores the model name field) json modified_request = request; - modified_request["model"] = checkpoint_; // Use base class checkpoint field + modified_request["model"] = checkpoint_; return forward_request("/v1/chat/completions", modified_request); } @@ -318,7 +303,7 @@ json FastFlowLMServer::completion(const json& request) { // FLM requires the checkpoint name in the request (e.g., "lfm2:1.2b") // (whereas llama-server ignores the model name field) json modified_request = request; - modified_request["model"] = checkpoint_; // Use base class checkpoint field + modified_request["model"] = checkpoint_; return forward_request("/v1/completions", modified_request); } @@ -349,7 +334,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) { } try { - // Extract audio data from request (same format as WhisperServer) if (!request.contains("file_data")) { throw std::runtime_error("Missing 'file_data' in request"); } @@ -357,7 +341,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) { std::string audio_data = request["file_data"].get(); std::string filename = request.value("filename", "audio.wav"); - // Determine content type from filename extension std::filesystem::path filepath(filename); std::string ext = filepath.extension().string(); std::string content_type = "audio/wav"; @@ -367,10 +350,8 @@ json FastFlowLMServer::audio_transcriptions(const json& request) { else if (ext == ".flac") content_type = "audio/flac"; else if (ext == ".webm") content_type = "audio/webm"; - // Build multipart fields for FLM's /v1/audio/transcriptions endpoint std::vector fields; - // Audio file field fields.push_back({ "file", 
audio_data, @@ -381,7 +362,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) { // Model field (required by OpenAI API format) fields.push_back({"model", checkpoint_, "", ""}); - // Optional parameters if (request.contains("language")) { fields.push_back({"language", request["language"].get(), "", ""}); } @@ -408,7 +388,6 @@ json FastFlowLMServer::audio_transcriptions(const json& request) { } json FastFlowLMServer::responses(const json& request) { - // Responses API is not supported for FLM backend return ErrorResponse::from_exception( UnsupportedOperationException("Responses API", "flm") ); @@ -420,7 +399,6 @@ void FastFlowLMServer::forward_streaming_request(const std::string& endpoint, bool sse, long timeout_seconds, TelemetryCallback telemetry_callback) { - // Streaming is only supported for LLM models if (model_type_ == ModelType::TRANSCRIPTION || model_type_ == ModelType::EMBEDDING) { std::string error_msg = "data: {\"error\":{\"message\":\"Streaming not supported for FLM " + model_type_to_string(model_type_) + " model\",\"type\":\"unsupported_operation\"}}\n\n"; @@ -433,10 +411,9 @@ void FastFlowLMServer::forward_streaming_request(const std::string& endpoint, // not the Lemonade model name (e.g., "Gemma3-4b-it-FLM") try { json request = json::parse(request_body); - request["model"] = checkpoint_; // Use base class checkpoint field + request["model"] = checkpoint_; std::string modified_body = request.dump(); - // Call base class with modified request WrappedServer::forward_streaming_request(endpoint, modified_body, sink, sse, timeout_seconds, telemetry_callback); } catch (const json::exception& e) { diff --git a/src/cpp/server/backends/kokoro/kokoro_server.cpp b/src/cpp/server/backends/kokoro/kokoro_server.cpp index 95d46de6a..a78e0a954 100644 --- a/src/cpp/server/backends/kokoro/kokoro_server.cpp +++ b/src/cpp/server/backends/kokoro/kokoro_server.cpp @@ -73,11 +73,9 @@ KokoroServer::~KokoroServer() { void KokoroServer::load(const 
std::string& model_name, const ModelInfo& model_info, const RecipeOptions& options, bool do_not_upgrade) { LOG(INFO, "KokoroServer") << "Loading model: " << model_name << std::endl; - // Install kokoros if needed const std::string backend = default_kokoro_backend(); backend_manager_->install_backend(kokoro::spec()->recipe, backend); - // Use pre-resolved model path fs::path model_path = fs::path(model_info.resolved_path()); if (model_path.empty() || !fs::exists(model_path)) { throw std::runtime_error("Model file not found for checkpoint: " + model_info.checkpoint()); @@ -94,10 +92,8 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in LOG(INFO, "KokoroServer") << "Using model: " << model_index["model"] << std::endl; - // Get koko executable path std::string exe_path = BackendUtils::get_backend_binary_path(*kokoro::spec(), backend); - // Choose a port port_ = choose_port(); if (port_ == 0) { throw std::runtime_error("Failed to find an available port"); @@ -110,7 +106,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in env_vars.push_back({"ESPEAK_DATA_PATH", (exe_dir / "espeak-ng-data").string()}); #ifndef _WIN32 std::string lib_path = exe_dir.string(); - // Preserve existing LD_LIBRARY_PATH if it exists const char* existing_ld_path = std::getenv("LD_LIBRARY_PATH"); if (existing_ld_path && strlen(existing_ld_path) > 0) { lib_path = lib_path + ":" + std::string(existing_ld_path); @@ -120,7 +115,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in LOG(INFO, "KokoroServer") << "Setting LD_LIBRARY_PATH=" << lib_path << std::endl; #endif - // Build command line arguments // Note: Don't include exe_path here - ProcessManager::start_process already handles it fs::path model_dir = model_path.parent_path(); std::vector args = { @@ -131,7 +125,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in "--port", std::to_string(port_) }; - // Launch the 
subprocess ProcessHandle started_handle = utils::ProcessManager::start_process( exe_path, args, @@ -148,7 +141,6 @@ void KokoroServer::load(const std::string& model_name, const ModelInfo& model_in LOG(INFO, "KokoroServer") << "Process started with PID: " << started_handle.pid << std::endl; - // Wait for server to be ready if (!wait_for_ready("/")) { unload(); throw std::runtime_error("koko failed to start or become ready"); diff --git a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp index 81cc1c555..f41c2bd08 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_gguf.cpp @@ -30,7 +30,7 @@ std::string to_lower(std::string s) { std::string resolve_gguf_path(const std::string& model_cache_path, const std::string& variant) { fs::path model_cache_path_fs = path_from_utf8(model_cache_path); if (!hf_cache::exists(model_cache_path_fs)) { - return model_cache_path; // Return directory path even if not found + return model_cache_path; } // Collect the (sorted, mmproj-excluded) GGUF files under a search root. @@ -64,11 +64,8 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st const std::string variant_lower = to_lower(variant); - // Resolve the requested GGUF variant within a candidate list of files. - // Returns the matched absolute path, or "" if this candidate set does not - // contain the variant. Factored into a lambda so the search can be retried - // against a broader set of snapshots (see #2300 below) without duplicating - // the matching logic. + // Factored into a lambda so the search can be retried against a broader set + // of snapshots (see #2300 below) without duplicating the matching logic. 
auto resolve_gguf_variant = [&](const std::vector& gguf_files) -> std::string { if (gguf_files.empty()) { return ""; @@ -115,26 +112,14 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st } } - // Case 5: Local quant-token fallback. + // Case 5: Local quant-token fallback. Some repos put the quant token in + // the middle of the filename (e.g. ...-IQ4_XS-Q8nextn.gguf for variant + // IQ4_XS), so the suffix cases above miss it; mirror the downloader's + // variant enumeration over the local cache instead. // - // Keep the existing resolver cases above as the primary logic: exact - // filenames, suffix matches, and folder-based sharding are more - // specific and preserve the CHECKPOINT:VARIANT contract. - // - // Some GGUF repositories name files with the quant token in the middle, - // for example: - // Qwen3.6-27B-MTP-IMAT-IQ4_XS-Q8nextn.gguf - // for variant: - // IQ4_XS - // That file does not end with IQ4_XS.gguf, so mirror the downloader's - // GGUF variant enumeration over the files that are already present in - // the local HF cache before declaring the model missing. - // - // HF cache paths have an extra snapshots// prefix that is not - // part of the repository-relative filename. Strip it before calling - // enumerate_gguf_variants(); otherwise the enumerator treats - // "snapshots" as a top-level sharded-folder variant and never extracts - // the quant token from the actual GGUF filename. + // Strip the HF cache snapshots// prefix before calling + // enumerate_gguf_variants(), otherwise it treats "snapshots" as a + // sharded-folder variant and never extracts the quant token. std::vector relative_gguf_files; std::map absolute_by_relative; auto repo_relative_from_cache_relative = [](std::string rel) { @@ -191,9 +176,8 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st return ""; } - // No match in this candidate set. 
Do not fall back to another - // quantization in the same Hugging Face repo; otherwise a custom - // download with a different quant can make a built-in model appear + // Don't fall back to another quantization in the same HF repo; a custom + // download with a different quant could make a built-in model appear // downloaded and allow deleting the wrong file. return ""; }; @@ -215,25 +199,19 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st }; if (active_gguf_files.empty() && whole_cache_gguf_files().empty()) { - return model_cache_path; // Return directory if no GGUF found anywhere + return model_cache_path; } std::string resolved_path = resolve_gguf_variant(active_gguf_files); - // #2300: a sibling variant that shares this HF repo can live in a snapshot - // other than the one refs/main points at. refs/main advances to the - // snapshot of whichever variant was pulled or updated last, leaving the - // other variants' symlinks behind in earlier snapshots; after a restart the - // refs/main-only search above then reports them as missing. If the active - // snapshot did not contain the requested variant, broaden the search to - // every snapshot in this repo's cache before declaring it missing. Blobs are - // content-addressed and shared, so reading an older snapshot is safe, and - // resolving against the active snapshot first preserves the CHECKPOINT:VARIANT - // contract (a different quant is never substituted while the exact one exists). - // - // The whole-cache set is a superset of the active set, so the two are equal - // only when refs/main's snapshot is the sole snapshot holding GGUFs — in - // which case the broader search is identical and skipped. + // #2300: a requested variant can live in a snapshot other than the one + // refs/main points at (refs/main advances to whichever variant was pulled + // last, stranding the others in earlier snapshots), so the active-only + // search above can report it missing. 
Broaden to every snapshot before + // giving up; blobs are content-addressed so reading an older snapshot is + // safe, and searching the active snapshot first preserves CHECKPOINT:VARIANT. + // The whole-cache set is a superset of the active set, so when they're equal + // the broader search is identical and skipped. if (resolved_path.empty()) { const std::vector& all_files = whole_cache_gguf_files(); if (all_files != active_gguf_files) { @@ -247,4 +225,3 @@ std::string resolve_gguf_path(const std::string& model_cache_path, const std::st } // namespace llamacpp } // namespace backends } // namespace lemon - diff --git a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp index eb766e798..ef1a67aa6 100644 --- a/src/cpp/server/backends/llamacpp/llamacpp_server.cpp +++ b/src/cpp/server/backends/llamacpp/llamacpp_server.cpp @@ -48,7 +48,6 @@ using namespace lemon::utils; namespace lemon { namespace backends { -// Embedding model batch configuration set to 8192 as default static const int EMBEDDING_BATCH_SIZE = 8192; static const int EMBEDDING_UBATCH_SIZE = 8192; @@ -272,7 +271,6 @@ void LlamaCppServer::load(const std::string& model_name, bool do_not_upgrade) { LOG(INFO, "LlamaCpp") << "Loading model: " << model_name << std::endl; - // Llamacpp Backend logging LOG(DEBUG, "LlamaCpp") << "Per-model settings: " << options.to_log_string() << std::endl; int ctx_size = options.get_option("ctx_size"); @@ -309,13 +307,10 @@ void LlamaCppServer::load(const std::string& model_name, std::string mmproj_path = model_info.resolved_path("mmproj"); std::string draft_path = model_info.resolved_path("draft"); - // Choose port port_ = choose_port(); - // Get executable path std::string executable = BackendUtils::get_backend_binary_path(*llamacpp::spec(), llamacpp_backend); - // Check for embeddings and reranking support based on model type bool supports_embeddings = (model_info.type == ModelType::EMBEDDING); bool supports_reranking 
= (model_info.type == ModelType::RERANKING); @@ -365,7 +360,6 @@ void LlamaCppServer::load(const std::string& model_name, } push_reserved(reserved_flags, "--mmproj", std::vector{"-mm", "-mmu", "--mmproj-url", "--no-mmproj", "--mmproj-auto", "--no-mmproj-auto", "--mmproj-offload", "--no-mmproj-offload"}); - // Add draft model if present if (!draft_path.empty()) { push_arg(args, reserved_flags, "--model-draft", draft_path); } @@ -571,7 +565,6 @@ void LlamaCppServer::load(const std::string& model_name, set_process_handle(ProcessManager::start_process( process_executable, args, working_dir, inherit_llama_output, true, env_vars)); - // Wait for server to be ready if (!wait_for_ready("/health")) { const ProcessHandle handle = consume_process_handle_for_cleanup(); if (has_process_handle(handle)) { @@ -744,7 +737,6 @@ bool is_ggml_hip_plugin_available() { "/usr/lib64/ggml/backends0/libggml-hip.so" }; - // Check all possible paths for (const auto& path : possible_paths) { if (fs::exists(path)) { return true; diff --git a/src/cpp/server/backends/moonshine/moonshine_server.cpp b/src/cpp/server/backends/moonshine/moonshine_server.cpp index bcf263d67..f6f6c644c 100644 --- a/src/cpp/server/backends/moonshine/moonshine_server.cpp +++ b/src/cpp/server/backends/moonshine/moonshine_server.cpp @@ -74,10 +74,8 @@ void MoonshineServer::load(const std::string& model_name, device_type_ = DEVICE_CPU; - // Install moonshine-server if needed backend_manager_->install_backend(moonshine::spec()->recipe, "cpu"); - // Resolve model path from ModelManager (standard HF cache) std::string model_path = model_info.resolved_path(); if (model_path.empty() || !fs::exists(model_path)) { throw std::runtime_error("Model directory not found for checkpoint: " + model_info.checkpoint()); @@ -100,7 +98,6 @@ void MoonshineServer::load(const std::string& model_name, } } - // Get executable path std::string executable = BackendUtils::get_backend_binary_path(*moonshine::spec(), "cpu"); LOG(INFO, 
"MoonshineServer") << "Using executable: " << executable << std::endl; @@ -159,12 +156,10 @@ void MoonshineServer::load(const std::string& model_name, args.insert(args.end(), custom_args_vec.begin(), custom_args_vec.end()); } - // Set environment variables std::vector> env_vars; // Prevent system/user Python packages from leaking into the bundled environment env_vars.push_back({"PYTHONNOUSERSITE", "1"}); - // Launch the subprocess bool inherit_output = (log_level_ == "info") || is_debug(); ProcessHandle started_handle = utils::ProcessManager::start_process( executable, @@ -182,7 +177,6 @@ void MoonshineServer::load(const std::string& model_name, LOG(INFO, "MoonshineServer") << "Process started with PID: " << started_handle.pid << std::endl; - // Wait for server to be ready if (!wait_for_ready("/health")) { unload(); throw std::runtime_error("moonshine-server failed to start or become ready"); diff --git a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp index 69e1eed16..9b21edfca 100644 --- a/src/cpp/server/backends/ryzenai/ryzenai_server.cpp +++ b/src/cpp/server/backends/ryzenai/ryzenai_server.cpp @@ -57,10 +57,8 @@ void RyzenAIServer::load(const std::string& model_name, LOG(DEBUG, "RyzenAI") << "Loading model: " << model_name << std::endl; int ctx_size = options.get_option("ctx_size"); - // Install/check RyzenAI-Server (will download if not found) backend_manager_->install_backend("ryzenai-llm", "npu"); - // Get the path to ryzenai-server std::string ryzenai_server_path = backends::BackendUtils::get_backend_binary_path(*backends::ryzenai::spec(), "npu"); if (ryzenai_server_path.empty()) { throw std::runtime_error("RyzenAI-Server executable not found even after installation attempt"); @@ -68,7 +66,6 @@ void RyzenAIServer::load(const std::string& model_name, LOG(DEBUG, "RyzenAI") << "Found ryzenai-server at: " << ryzenai_server_path << std::endl; - // Model path should have been set via set_model_path() before 
calling load() if (model_path_.empty()) { throw std::runtime_error("Model path is required for RyzenAI-Server. Call set_model_path() before load()"); } @@ -81,10 +78,8 @@ void RyzenAIServer::load(const std::string& model_name, LOG(DEBUG, "RyzenAI") << "Model path: " << model_path_ << std::endl; - // Find available port port_ = choose_port(); - // Build command line arguments std::vector args = { "-m", model_path_, "--port", std::to_string(port_), @@ -95,7 +90,6 @@ void RyzenAIServer::load(const std::string& model_name, args.push_back("--verbose"); } - // Log the full command line LOG(DEBUG, "RyzenAI") << "Starting: \"" << ryzenai_server_path << "\""; for (const auto& arg : args) { LOG(DEBUG, "RyzenAI") << " \"" << arg << "\""; @@ -119,7 +113,6 @@ void RyzenAIServer::load(const std::string& model_name, LOG(DEBUG, "ProcessManager") << "Process started successfully, PID: " << started_handle.pid << std::endl; - // Wait for server to be ready if (!wait_for_ready("/health")) { const ProcessHandle handle = consume_process_handle_for_cleanup(); if (has_process_handle(handle)) { @@ -150,7 +143,6 @@ json RyzenAIServer::chat_completion(const json& request) { throw ModelNotLoadedException("RyzenAI-Server"); } - // Forward to /v1/chat/completions endpoint return forward_request("/v1/chat/completions", request); } @@ -159,7 +151,6 @@ json RyzenAIServer::completion(const json& request) { throw ModelNotLoadedException("RyzenAI-Server"); } - // Forward to /v1/completions endpoint return forward_request("/v1/completions", request); } @@ -168,7 +159,6 @@ json RyzenAIServer::responses(const json& request) { throw ModelNotLoadedException("RyzenAI-Server"); } - // Forward to /v1/responses endpoint return forward_request("/v1/responses", request); } @@ -179,8 +169,7 @@ namespace backends { namespace ryzenai { std::unique_ptr create(const BackendContext& ctx) { - // RyzenAI resolves its model path before load (set_model_path), matching the - // original router factory's special-casing. 
+ // RyzenAI requires its model path resolved before load() via set_model_path(). auto server = std::make_unique<::lemon::RyzenAIServer>( ctx.model_info->model_name, ctx.log_level == "debug", ctx.model_manager, ctx.backend_manager); diff --git a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp index a4b1787f9..9dfec350b 100644 --- a/src/cpp/server/backends/sdcpp/sdcpp_server.cpp +++ b/src/cpp/server/backends/sdcpp/sdcpp_server.cpp @@ -211,10 +211,8 @@ void SDServer::load(const std::string& model_name, device_type_ = DEVICE_CPU; } - // Install sd-server if needed backend_manager_->install_backend(sdcpp::spec()->recipe, backend); - // Get model path std::string model_path = model_info.resolved_path("main"); std::string llm_path = model_info.resolved_path("text_encoder"); std::string vae_path = model_info.resolved_path("vae"); @@ -233,10 +231,8 @@ void SDServer::load(const std::string& model_name, LOG(DEBUG, "SDServer") << "Using model: " << model_path << std::endl; - // Get sd-server executable path std::string exe_path = BackendUtils::get_backend_binary_path(*sdcpp::spec(), backend); - // Choose a port port_ = choose_port(); if (port_ == 0) { throw std::runtime_error("Failed to find an available port"); @@ -244,7 +240,6 @@ void SDServer::load(const std::string& model_name, LOG(INFO, "SDServer") << "Starting server on port " << port_ << " (backend: " << backend << ")" << std::endl; - // Build command line arguments std::vector args = { "--listen-port", std::to_string(port_) }; @@ -295,7 +290,6 @@ void SDServer::load(const std::string& model_name, args.insert(args.end(), custom_args_vec.begin(), custom_args_vec.end()); } - // Set up environment variables std::vector> env_vars; fs::path exe_dir = fs::path(exe_path).parent_path(); #ifdef _WIN32 @@ -304,7 +298,6 @@ void SDServer::load(const std::string& model_name, #endif #ifndef _WIN32 - // For Linux, always set LD_LIBRARY_PATH to include executable directory std::string 
lib_path = exe_dir.string(); if (resolved_backend == "rocm-stable") { @@ -328,8 +321,6 @@ void SDServer::load(const std::string& model_name, // ROCm builds on Windows require hipblaslt.dll, rocblas.dll, amdhip64.dll, etc. // These DLLs are distributed alongside sd-server.exe but need PATH to be set for loading if (is_rocm_backend(resolved_backend)) { - // Add executable directory to PATH for ROCm runtime DLLs - // This allows the sd-server.exe to find required HIP/ROCm libraries at runtime std::string new_path = path_to_utf8(exe_dir); if (resolved_backend == "rocm-stable") { @@ -368,7 +359,6 @@ void SDServer::load(const std::string& model_name, BackendUtils::apply_cuda_env_vars(env_vars, "SDServer"); } - // Launch the server process std::string process_exe_path = exe_path; std::string working_dir; #ifdef _WIN32 @@ -392,7 +382,6 @@ void SDServer::load(const std::string& model_name, LOG(INFO, "SDServer") << "Process started with PID: " << started_handle.pid << std::endl; - // Wait for server to be ready if (!wait_for_ready("/")) { unload(); throw std::runtime_error("sd-server failed to start or become ready"); @@ -546,7 +535,7 @@ json SDServer::responses(const json& /* request */) { } json SDServer::image_generations(const json& request) { - // Build request - sd-server uses OpenAI-compatible format. + // sd-server uses OpenAI-compatible format. // // See PR #1173: https://github.com/leejet/stable-diffusion.cpp/pull/1173 // for the convention. 
@@ -596,7 +585,6 @@ json SDServer::image_edits(const json& request) { fields.push_back({"size", size, "", ""}); } - // Decode base64 image data back to binary for multipart upload if (request.contains("image_data")) { std::string image_binary = JsonUtils::base64_decode( request["image_data"].get()); @@ -634,7 +622,6 @@ json SDServer::image_variations(const json& request) { fields.push_back({"size", size, "", ""}); } - // Decode base64 image data back to binary for multipart upload if (request.contains("image_data")) { std::string image_binary = JsonUtils::base64_decode( request["image_data"].get()); diff --git a/src/cpp/server/backends/vllm/vllm_server.cpp b/src/cpp/server/backends/vllm/vllm_server.cpp index 60a79c95f..335f0660a 100644 --- a/src/cpp/server/backends/vllm/vllm_server.cpp +++ b/src/cpp/server/backends/vllm/vllm_server.cpp @@ -123,11 +123,9 @@ void VLLMServer::load(const std::string& model_name, RuntimeConfig::validate_backend_choice("vllm", vllm_backend); - // Install vllm-server if needed backend_manager_->install_backend(vllm::spec()->recipe, vllm_backend); - // vLLM uses HuggingFace model names, not local file paths. - // The checkpoint field in server_models.json is the HF model ID. + // vLLM uses HuggingFace model IDs, not local file paths. 
std::string model_id = model_info.checkpoint(); if (model_id.empty()) { throw std::runtime_error("Model checkpoint (HuggingFace ID) not found for: " + model_name); @@ -135,13 +133,10 @@ void VLLMServer::load(const std::string& model_name, LOG(DEBUG, "vLLM") << "Using model: " << model_id << std::endl; - // Choose port port_ = choose_port(); - // Get executable path std::string executable = BackendUtils::get_backend_binary_path(*vllm::spec(), vllm_backend); - // Build command line arguments std::vector args; args.push_back("--model"); args.push_back(model_id); @@ -175,7 +170,6 @@ void VLLMServer::load(const std::string& model_name, << "'; letting vLLM auto-select kernel" << std::endl; } - // enable prompt caching args.push_back("--enable-prefix-caching"); // Avoid vLLM's default gpu_memory_utilization=0.92 on shared-memory systems. @@ -186,7 +180,6 @@ void VLLMServer::load(const std::string& model_name, args.push_back("4G"); } - // Append custom vllm_args if provided if (!vllm_args.empty()) { LOG(DEBUG, "vLLM") << "Adding custom arguments: " << vllm_args << std::endl; std::istringstream iss(vllm_args); @@ -198,16 +191,13 @@ void VLLMServer::load(const std::string& model_name, LOG(INFO, "vLLM") << "Starting vllm-server on port " << get_backend_port() << "..." << std::endl; - // Set environment variables std::vector> env_vars; - // The vllm-server launcher script handles LD_LIBRARY_PATH for ROCm libs. - // Set FLASH_ATTENTION_TRITON_AMD_ENABLE for ROCm flash attention. + // Enable ROCm flash attention (the launcher script handles LD_LIBRARY_PATH). 
env_vars.push_back({"FLASH_ATTENTION_TRITON_AMD_ENABLE", "TRUE"}); // Prevent system/user Python packages from leaking into the bundled vLLM environment env_vars.push_back({"PYTHONNOUSERSITE", "1"}); - // Start process bool inherit_output = (log_level_ == "info") || is_debug(); set_process_handle(ProcessManager::start_process(executable, args, "", inherit_output, true, env_vars)); diff --git a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp index d1222e551..2d9a683b0 100644 --- a/src/cpp/server/backends/whispercpp/whispercpp_server.cpp +++ b/src/cpp/server/backends/whispercpp/whispercpp_server.cpp @@ -68,7 +68,6 @@ WhisperServer::WhisperServer(const std::string& log_level, ModelManager* model_m WhisperServer::~WhisperServer() { unload(); - // Clean up temp directory try { if (fs::exists(temp_dir_)) { fs::remove_all(temp_dir_); @@ -127,7 +126,6 @@ InstallParams WhisperServer::get_install_params(const std::string& backend, cons return params; } -// Helper to determine NPU compiled cache info based on model info from server_models.json static std::pair get_npu_cache_info(const ModelInfo& model_info) { std::string npu_cache = model_info.checkpoint("npu_cache"); std::string npu_cache_repo = ""; @@ -147,7 +145,6 @@ static std::pair get_npu_cache_info(const ModelInfo& m return {npu_cache_repo, npu_cache_filename}; } - // No NPU cache configured for this model in server_models.json LOG(INFO, "WhisperServer") << "No NPU cache configured for model: " << model_info.model_name << std::endl; return {"", ""}; } @@ -185,19 +182,16 @@ void WhisperServer::download_npu_compiled_cache(const std::string& model_path, throw std::runtime_error("npu_cache path escapes model directory"); } - // Check if cache already exists if (fs::exists(cache_path) && !do_not_upgrade) { LOG(INFO, "WhisperServer") << "NPU cache already exists: " << cache_path << std::endl; return; } try { - // Download .rai file directly from HuggingFace 
using HttpClient std::string hf_url = "https://huggingface.co/" + cache_repo + "/resolve/main/" + cache_filename; LOG(INFO, "WhisperServer") << "Downloading from: " << hf_url << std::endl; - // Download directly to the target location auto download_result = utils::HttpClient::download_file( hf_url, cache_path.string(), @@ -253,15 +247,13 @@ void WhisperServer::load(const std::string& model_name, LOG(INFO, "WhisperServer") << "Using model: " << model_path << std::endl; LOG(INFO, "WhisperServer") << "Using backend: " << whispercpp_backend << std::endl; - // For NPU backend, download the compiled cache (.rai file). This is a must-have for NPU backend. + // For NPU backend, download the compiled cache (.rai file). if (whispercpp_backend == "npu") { download_npu_compiled_cache(model_path, model_info, do_not_upgrade); } - // Get whisper-server executable path std::string exe_path = BackendUtils::get_backend_binary_path(*whispercpp::spec(), whispercpp_backend); - // Choose a port port_ = choose_port(); if (port_ == 0) { throw std::runtime_error("Failed to find an available port"); @@ -269,7 +261,7 @@ void WhisperServer::load(const std::string& model_name, LOG(INFO, "WhisperServer") << "Starting server on port " << port_ << std::endl; - // Build command line arguments. Lemonade manages the model path and port; + // Lemonade manages the model path and port; // optional whisper-server flags like --convert come from whispercpp_args. 
// Note: Don't include exe_path here - ProcessManager::start_process already handles it std::vector args = { @@ -298,12 +290,10 @@ void WhisperServer::load(const std::string& model_name, // Note: whisper-server doesn't support --debug flag - // Set up environment variables for shared library loading std::vector> env_vars; fs::path exe_dir = fs::path(exe_path).parent_path(); #ifndef _WIN32 - // set LD_LIBRARY_PATH to include executable directory std::string lib_path = exe_dir.string(); // ROCm whisper-server needs the TheRock ROCm libs (libamd_comgr.so.3, etc.) @@ -330,7 +320,6 @@ void WhisperServer::load(const std::string& model_name, } #endif - // Launch the subprocess ProcessHandle started_handle = utils::ProcessManager::start_process( exe_path, args, @@ -347,7 +336,6 @@ void WhisperServer::load(const std::string& model_name, LOG(INFO, "WhisperServer") << "Process started with PID: " << started_handle.pid << std::endl; - // Wait for server to be ready if (!wait_for_ready("/health")) { unload(); throw std::runtime_error("whisper-server failed to start or become ready"); @@ -399,14 +387,13 @@ json WhisperServer::responses(const json& request) { // Audio file handling helpers std::string WhisperServer::save_audio_to_temp(const std::string& audio_data, const std::string& filename) { - // Generate unique filename std::random_device rd; std::mt19937 gen(rd()); std::uniform_int_distribution<> dis(0, 999999); std::string ext = fs::path(filename).extension().string(); if (ext.empty()) { - ext = ".audio"; // Default extension + ext = ".audio"; } std::stringstream ss; @@ -414,7 +401,6 @@ std::string WhisperServer::save_audio_to_temp(const std::string& audio_data, fs::path temp_file = temp_dir_ / ss.str(); - // Write audio data to file std::ofstream outfile(temp_file, std::ios::binary); if (!outfile) { throw std::runtime_error("Failed to create temporary audio file: " + temp_file.string()); @@ -458,12 +444,10 @@ void WhisperServer::validate_audio_file(const std::string& 
path) { json WhisperServer::build_transcription_request(const json& request, bool translate) { json whisper_req; - // Required fields if (request.contains("file_path")) { whisper_req["file"] = request["file_path"]; } - // Optional fields if (request.contains("language") && !translate) { // For transcription, respect language hint whisper_req["language"] = request["language"]; @@ -481,10 +465,9 @@ json WhisperServer::build_transcription_request(const json& request, bool transl if (request.contains("response_format")) { whisper_req["response_format"] = request["response_format"]; } else { - whisper_req["response_format"] = "json"; // Default + whisper_req["response_format"] = "json"; } - // Add translate flag if needed if (translate) { whisper_req["translate"] = true; } @@ -492,11 +475,9 @@ json WhisperServer::build_transcription_request(const json& request, bool transl return whisper_req; } -// Forward audio file to whisper-server using multipart form-data json WhisperServer::forward_multipart_audio_request(const std::string& file_path, const json& params, bool translate) { - // Read the audio file content std::ifstream file(file_path, std::ios::binary); if (!file) { throw std::runtime_error("Could not open audio file: " + file_path); @@ -509,12 +490,10 @@ json WhisperServer::forward_multipart_audio_request(const std::string& file_path LOG(DEBUG, "WhisperServer") << "Audio file size: " << file_content.size() << " bytes" << std::endl; - // Determine content type based on file extension fs::path filepath(file_path); std::string ext = filepath.extension().string(); - std::string content_type = "audio/wav"; // Default + std::string content_type = "audio/wav"; - // Map common audio extensions to MIME types if (ext == ".mp3") content_type = "audio/mpeg"; else if (ext == ".wav") content_type = "audio/wav"; else if (ext == ".m4a") content_type = "audio/mp4"; @@ -531,7 +510,6 @@ json WhisperServer::forward_multipart_audio_request(const std::string& file_path 
audio_file.content_type = content_type; fields.push_back(audio_file); - // Add optional parameters as form fields std::string response_format = params.value("response_format", "json"); utils::MultipartField fmt_field; fmt_field.name = "response_format"; @@ -585,7 +563,6 @@ json WhisperServer::forward_multipart_audio_request(const std::string& file_path std::to_string(res.status_code) + ": " + res.body); } - // Try to parse as JSON try { return json::parse(res.body); } catch (const json::parse_error&) { @@ -604,10 +581,9 @@ json WhisperServer::forward_multipart_audio_data(const std::string& audio_data, LOG(DEBUG, "WhisperServer") << "Audio data size: " << audio_data.size() << " bytes (no file I/O)" << std::endl; - // Determine content type based on filename extension fs::path filepath(filename); std::string ext = filepath.extension().string(); - std::string content_type = "audio/wav"; // Default + std::string content_type = "audio/wav"; if (ext == ".mp3") content_type = "audio/mpeg"; else if (ext == ".wav") content_type = "audio/wav"; @@ -683,7 +659,6 @@ json WhisperServer::forward_multipart_audio_data(const std::string& audio_data, // ITranscriptionServer implementation json WhisperServer::audio_transcriptions(const json& request) { try { - // Extract audio data from request if (!request.contains("file_data")) { throw std::runtime_error("Missing 'file_data' in request"); } @@ -691,7 +666,6 @@ json WhisperServer::audio_transcriptions(const json& request) { std::string audio_data = request["file_data"].get(); std::string filename = request.value("filename", "audio.wav"); - // Send directly to whisper-server without file I/O return forward_multipart_audio_data(audio_data, filename, request, false); } catch (const std::exception& e) { diff --git a/src/cpp/server/config_file.cpp b/src/cpp/server/config_file.cpp index 2787c0167..dce9d17d9 100644 --- a/src/cpp/server/config_file.cpp +++ b/src/cpp/server/config_file.cpp @@ -32,11 +32,9 @@ json ConfigFile::base_defaults() { 
json defaults = load_json_file(utils::path_from_utf8( utils::get_resource_path("resources/defaults.json"))); - // Seed each backend's config.json section from its descriptor. The per-recipe - // defaults are authored in the backend's descriptor; resources/defaults.json - // is the generated, committed mirror (see GET /internal/config/defaults and - // docs/tools/gen_backend_boilerplate.py). Re-seeding here keeps the descriptor - // authoritative even if the committed file lags. Empty result = no section. + // Seed each backend's config.json section from its descriptor. + // resources/defaults.json is the generated, committed mirror; re-seeding here + // keeps the descriptor authoritative even if that file lags. for (const auto* d : backends::all_descriptors()) { json block = d->config_defaults(); if (!block.empty()) { diff --git a/src/cpp/server/model_manager.cpp b/src/cpp/server/model_manager.cpp index 02679e803..276449b6f 100644 --- a/src/cpp/server/model_manager.cpp +++ b/src/cpp/server/model_manager.cpp @@ -110,8 +110,6 @@ namespace lemon { // Properties which are defined by the user for model registration. static const std::vector USER_DEFINED_MODEL_PROPS = std::vector{"checkpoints", "checkpoint", "recipe", "mmproj", "size", "image_defaults", "components", "recipe_options"}; -// Helper functions for string operations — use shared implementations from gguf_reader_detail - static constexpr const char USER_MODEL_PREFIX[] = "user."; static constexpr size_t USER_MODEL_PREFIX_LEN = sizeof(USER_MODEL_PREFIX) - 1; static constexpr const char EXTRA_MODEL_PREFIX[] = "extra."; @@ -1084,7 +1082,7 @@ std::string ModelManager::resolve_model_path(const ModelInfo& info, const std::s // Compute the HF cache location for this checkpoint's repo, then let the // backend's ops find its artifact within (a .gguf file, a genai_config.json - // directory, a .bin, …) — no per-recipe switchboard here. + // directory, a .bin, …). 
backends::CheckpointResolveContext ctx; ctx.hf_cache = hf_cache; ctx.repo_id = checkpoint_to_repo_id(checkpoint); @@ -2160,7 +2158,7 @@ void ModelManager::register_user_model(const std::string& model_name, std::string recipe = model_data.value("recipe", ""); // Inject the backend's default labels for models that omit them (e.g. sd-cpp - // -> image, whispercpp/moonshine -> transcription). Sourced from the descriptor. + // -> image, whispercpp/moonshine -> transcription). if (const auto* desc = lemon::backends::descriptor_for(recipe)) { for (const auto& label : desc->default_labels) { labels.insert(label); diff --git a/src/cpp/server/recipe_options.cpp b/src/cpp/server/recipe_options.cpp index 70c188e34..65d1474fa 100644 --- a/src/cpp/server/recipe_options.cpp +++ b/src/cpp/server/recipe_options.cpp @@ -240,9 +240,9 @@ void RecipeOptions::set_option(const std::string& opt, const json& value) { #ifdef LEMONADE_CLI // CLI_OPTIONS used only by the lemonade CLI client for add_cli_options. // ctx_size/merge_args are the common flags; everything else is derived from -// descriptor options that declare a CLI flag, so the CLI never needs editing -// when a backend is added. Image-gen params (steps/cfg_scale/width/height) have -// no cli_flag in their descriptor, so they stay recipe-level only as before. +// descriptor options that declare a CLI flag. Image-gen params +// (steps/cfg_scale/width/height) have no cli_flag in their descriptor, so they +// stay recipe-level only. static const json& get_cli_options() { static const json cli_options = [] { json o = json::object(); diff --git a/src/cpp/server/router.cpp b/src/cpp/server/router.cpp index 514a9773e..eb9c9d18f 100644 --- a/src/cpp/server/router.cpp +++ b/src/cpp/server/router.cpp @@ -145,8 +145,7 @@ bool Router::reload_model_after_watchdog_reset(const std::string& requested_mode } // Slot/eviction policy for a recipe, from its descriptor (default Standard). 
-// This is the recipe-static policy used for pre-load slot decisions, mirroring -// the historical use of get_device_type_from_recipe at load time. +// This is the recipe-static policy used for pre-load slot decisions. static SlotPolicy slot_policy_for_recipe(const std::string& recipe) { if (const auto* desc = backends::descriptor_for(recipe)) { return desc->slot_policy; @@ -324,8 +323,7 @@ std::unique_ptr Router::create_backend_server(const ModelInfo& mo ctx.cloud_registry = cloud_registry_; ctx.model_info = &model_info; - // The backend registry binds each recipe's descriptor to its create(). It is - // the single source of truth for backend construction (see LEMON_BACKENDS). + // The backend registry binds each recipe to its create() (see LEMON_BACKENDS). std::unique_ptr new_server = backends::create_server(model_info.recipe, ctx); if (new_server) { LOG(DEBUG, "Router") << "Created backend for recipe '" << model_info.recipe diff --git a/src/cpp/server/runtime_config.cpp b/src/cpp/server/runtime_config.cpp index cc9bd6189..a6f7ffa7f 100644 --- a/src/cpp/server/runtime_config.cpp +++ b/src/cpp/server/runtime_config.cpp @@ -32,7 +32,7 @@ RuntimeConfig* RuntimeConfig::global() { // A valid config.json backend section is the config_section of any descriptor // that runs a local subprocess (binary != ""). Cloud has no binary, so it is not -// a backend section. Derived from descriptors — no hand-maintained list. +// a backend section. static bool is_backend_name(const std::string& key) { for (const auto* desc : lemon::backends::all_descriptors()) { if (!desc->binary.empty() && desc->effective_config_section() == key) { @@ -291,8 +291,7 @@ std::string RuntimeConfig::rocm_channel_for_recipe(const std::string& recipe) co std::string channel = rocm_channel(); // Clamp to a channel the backend actually publishes. A backend that lists // only {"stable"} (e.g. sd-cpp, which has no nightly artifacts) falls back to - // its first channel when "nightly" is requested. 
Driven by the descriptor's - // rocm_channels, so no per-recipe special case lives here. + // its first channel when "nightly" is requested. const auto* desc = lemon::backends::descriptor_for(recipe); if (desc && !desc->rocm_channels.empty()) { const auto& channels = desc->rocm_channels; @@ -365,9 +364,9 @@ json RuntimeConfig::recipe_options(const std::string& backend) const { const std::string backend_args = backend + "_args"; // Translate each backend's nested config.json section into the flat - // recipe_options format, driven by the descriptor's option list — no - // per-recipe block. The flat key is the descriptor option name; the - // config.json key is derived from the option's role (its name suffix): + // recipe_options format, driven by the descriptor's option list. The flat + // key is the descriptor option name; the config.json key is derived from the + // option's role (its name suffix): // *_backend -> "backend" *_args -> variant "_args" then "args" // *_device -> "device" everything else -> the option name verbatim // (sd-cpp's steps/cfg_scale/width/height/…) diff --git a/src/cpp/server/server.cpp b/src/cpp/server/server.cpp index 511aa080c..d4f7ad1ee 100644 --- a/src/cpp/server/server.cpp +++ b/src/cpp/server/server.cpp @@ -4475,7 +4475,6 @@ void Server::handle_config_defaults_get(const httplib::Request& /*req*/, httplib try { // The canonical default config (global keys + descriptor-derived per-recipe // sections), independent of this host's config.json or deployment overrides. - // gen_backend_boilerplate.py reads this to regenerate resources/defaults.json. 
res.set_content(ConfigFile::base_defaults().dump(2), "application/json"); } catch (const std::exception& e) { LOG(ERROR, "Server") << "ERROR in handle_config_defaults_get: " << e.what() << std::endl; diff --git a/src/cpp/server/system_info.cpp b/src/cpp/server/system_info.cpp index f7cccc162..22ef1c749 100644 --- a/src/cpp/server/system_info.cpp +++ b/src/cpp/server/system_info.cpp @@ -419,15 +419,13 @@ std::vector query_dxg_amd_gpus(const std::string& gpu_type) { // // Empty family set {} means "all families of that device type" // The recipe/backend support matrix is assembled from every backend descriptor's -// `support` rows (see lemon/backends/*_descriptor.cpp). Concatenated in registry -// order; within a recipe, row order is the backend preference order. This is the -// single source of truth — there is no separate hand-maintained table. +// `support` rows. Concatenated in registry order; within a recipe, row order is +// the backend preference order. static const std::vector& recipe_defs() { static const std::vector defs = [] { std::vector v; for (const auto* desc : lemon::backends::all_descriptors()) { for (const auto& row : desc->support) { - // Fill in the recipe (the owning descriptor's) per support row. v.push_back({desc->recipe, row.backend, row.supported_os, row.devices, row.device_summary}); } } @@ -579,7 +577,7 @@ static bool device_matches_constraint(const std::string& device_family, // Generic installation check static bool is_recipe_installed(const std::string& recipe, const std::string& backend, std::string& error_message) { // Special handling for ROCm backends on gfx1151 (Strix Halo) if the kernel - // CWSR fix is missing — which backends' rocm build needs it is a descriptor flag. + // CWSR fix is missing (a per-descriptor flag). 
const auto* cwsr_desc = backends::descriptor_for(recipe); if (backend == "rocm" && cwsr_desc && cwsr_desc->rocm_requires_cwsr_fix && needs_gfx1151_cwsr_fix()) { @@ -607,7 +605,7 @@ static bool is_recipe_installed(const std::string& recipe, const std::string& ba static std::string get_recipe_version(const std::string& recipe, const std::string& backend) { // Read the on-disk version.txt generically, then let the backend's ops // override (llamacpp "system" runs llama-server --version; flm queries the - // CLI when no file is present). No per-recipe branches here. + // CLI when no file is present). auto* spec = try_get_spec_for_recipe(recipe); std::string file_version; if (spec) { @@ -1471,8 +1469,7 @@ json SystemInfo::build_recipes_info(const json& devices) { // Enrich each recipe entry with descriptor metadata so clients (the desktop // app, the docs generator) can render display names and per-recipe option - // schemas without hardcoding them. This is the single source the frontend - // reads instead of its own per-recipe TypeScript tables. + // schemas without hardcoding them. int recipe_order = 0; for (const auto* desc : lemon::backends::all_descriptors()) { auto it = recipes.find(desc->recipe); @@ -1488,11 +1485,9 @@ json SystemInfo::build_recipes_info(const json& devices) { entry["modality"] = desc->modality; entry["experimental"] = desc->experimental; entry["web_display_name"] = desc->web_display_name.empty() ? desc->display_name : desc->web_display_name; - entry["web_priority"] = desc->web_priority; entry["slot_policy"] = slot_policy_to_string(desc->slot_policy); // Machine-independent support matrix (OS + device families + friendly - // device summary per backend), straight from the descriptor — used by the - // docs generator to render the README support matrix etc. + // device summary per backend), straight from the descriptor. 
json support = json::array(); for (const auto& row : desc->support) { json devices = json::array(); @@ -1586,8 +1581,7 @@ SystemInfo::SupportedBackendsResult SystemInfo::get_supported_backends(const std std::string SystemInfo::check_recipe_supported(const std::string& recipe) { // A backend whose descriptor declares no support rows has no local // hardware/OS gating (e.g. cloud offload): availability is determined at - // runtime (provider creds via the CloudProviderRegistry / API key), checked - // elsewhere in filter_models_by_backend / CloudServer::load. + // runtime (provider creds via the CloudProviderRegistry / API key). const auto* desc = lemon::backends::descriptor_for(recipe); if (desc && desc->support.empty()) { return ""; diff --git a/test/cpp/test_auto_tune.cpp b/test/cpp/test_auto_tune.cpp index 75dbeba74..c1f976f83 100644 --- a/test/cpp/test_auto_tune.cpp +++ b/test/cpp/test_auto_tune.cpp @@ -1,12 +1,5 @@ // Standalone test for GGUF array storage and weighted KV cache computation. // -// Covers: -// - GgufMetadata raw array fields (head_count_kv_per_layer, sliding_window_pattern) -// - Post-loop derivation of scalar convenience fields -// - compute_weighted_kv_cache_bytes_per_token() with per-layer arrays -// - full_attention_interval exact count (floor((n-1)/interval) + 1) -// - SWA precise weighted sum vs proportional approximation -// // Compile: g++ -std=c++17 -I src/cpp/include test/cpp/test_auto_tune.cpp -o test_auto_tune #include "lemon/gguf_reader.h" @@ -23,17 +16,13 @@ static void check(const char* name, bool ok) { if (!ok) ++g_failures; } -// Floating-point equality with tolerance static bool approx_eq(double a, double b, double tol = 0.001) { return std::fabs(a - b) < tol; } -// ── Helpers to simulate post-loop derivation ───────────────────────── - /// Simulate what read_gguf_metadata does after the KV loop: /// derive head_count_kv and swa_layer_count from raw arrays/scalars. 
static void derive_scalars(GgufMetadata& m) { - // head_count_kv derivation if (!m.head_count_kv_per_layer.empty()) { for (int64_t v : m.head_count_kv_per_layer) m.head_count_kv += v; @@ -42,19 +31,16 @@ static void derive_scalars(GgufMetadata& m) { m.head_count_kv_per_layer.assign(m.block_count, m.head_count_kv_scalar); } - // swa_layer_count derivation if (!m.sliding_window_pattern.empty()) { for (bool v : m.sliding_window_pattern) if (v) m.swa_layer_count++; } } -// ── Test: scalar head_count_kv derivation ───────────────────────────── - static void test_scalar_head_count_kv() { GgufMetadata m; m.block_count = 32; - m.head_count_kv_scalar = 4; // 4 KV heads per block + m.head_count_kv_scalar = 4; derive_scalars(m); @@ -68,8 +54,6 @@ static void test_scalar_head_count_kv() { [](int64_t v) { return v == 4; })); } -// ── Test: array head_count_kv derivation ────────────────────────────── - static void test_array_head_count_kv() { GgufMetadata m; m.block_count = 4; @@ -83,8 +67,6 @@ static void test_array_head_count_kv() { m.head_count_kv == 48); } -// ── Test: sliding_window_pattern derivation ─────────────────────────── - static void test_swa_pattern_derivation() { GgufMetadata m; m.block_count = 8; @@ -96,13 +78,11 @@ static void test_swa_pattern_derivation() { m.swa_layer_count == 4); } -// ── Test: standard MHA/GQA (no scaling) ────────────────────────────── - static void test_standard_mha() { GgufMetadata m; m.block_count = 32; m.key_length = 128; - m.head_count_kv_per_layer.assign(32, 4); // 4 KV heads per block + m.head_count_kv_per_layer.assign(32, 4); derive_scalars(m); // Expected: 128 total heads * 128 key_len * 2[F16] * 2[K+V] = 65536 @@ -112,8 +92,6 @@ static void test_standard_mha() { approx_eq(bytes, 128.0 * 128.0 * 4.0)); } -// ── Test: SWA with per-layer arrays (precise weighted sum) ─────────── - static void test_swa_precise() { GgufMetadata m; m.block_count = 4; @@ -140,10 +118,8 @@ static void test_swa_precise() { double scale = 0; 
lemon::compute_weighted_kv_cache_bytes_per_token(m, &scale); - // Unweighted: (8+4+8+4) * 256 = 4608 - // scale = 5120 / 4608 ≈ 1.1111... wait, that's > 1 which is wrong - // Actually: weighted = 8*256 + 4*128 + 8*256 + 4*128 = 2048+512+2048+512 = 5120 - // unweighted = (8+4+8+4) * 256 = 24 * 256 = 6144 + // weighted = 8*256 + 4*128 + 8*256 + 4*128 = 5120 + // unweighted = (8+4+8+4) * 256 = 6144 // scale = 5120 / 6144 ≈ 0.8333 check("swa-precise: scale factor < 1.0", scale > 0.0 && scale < 1.0); @@ -151,8 +127,6 @@ static void test_swa_precise() { approx_eq(scale, 5120.0 / 6144.0)); } -// ── Test: SWA with scalar fallback (proportional approximation) ────── - static void test_swa_scalar_fallback() { GgufMetadata m; m.block_count = 4; @@ -160,7 +134,7 @@ static void test_swa_scalar_fallback() { m.key_length_swa = 128; // Scalar case: no per-layer array, no sliding_window_pattern - m.head_count_kv_scalar = 8; // uniform 8 heads per block + m.head_count_kv_scalar = 8; derive_scalars(m); // After derivation, per_layer IS populated from scalar, and swa_pattern is empty. 
@@ -175,14 +149,12 @@ static void test_swa_scalar_fallback() { approx_eq(bytes, 32.0 * 256.0 * 4.0)); } -// ── Test: SWA with scalar + manually set swa_layer_count ───────────── - static void test_swa_scalar_with_count() { GgufMetadata m; m.block_count = 4; m.key_length = 256; m.key_length_swa = 128; - m.swa_layer_count = 2; // 2 out of 4 layers are SWA + m.swa_layer_count = 2; m.head_count_kv_scalar = 8; @@ -196,8 +168,6 @@ static void test_swa_scalar_with_count() { approx_eq(bytes, 32.0 * 256.0 * 4.0 * 0.75)); } -// ── Test: full_attention_interval exact count ───────────────────────── - static void test_full_attention_interval() { // For each (blocks, interval), verify the exact count: // floor((blocks - 1) / interval) + 1 @@ -251,8 +221,6 @@ static void test_full_attention_interval() { } } -// ── Test: full_attention_interval formula vs old approximation ──────── - static void test_fai_improvement() { // Demonstrate that the exact formula differs meaningfully from 1/interval // for non-divisible block counts. @@ -278,8 +246,6 @@ static void test_fai_improvement() { scale > old_approx); } -// ── Test: missing metadata returns 0 ────────────────────────────────── - static void test_missing_metadata() { GgufMetadata m_empty; double bytes = lemon::compute_weighted_kv_cache_bytes_per_token(m_empty); @@ -298,11 +264,8 @@ static void test_missing_metadata() { check("missing: no key_length returns 0", bytes == 0.0); } -// ── Test: varying head counts with SWA ──────────────────────────────── - static void test_varying_heads_swa() { // Model where SWA layers have FEWER heads than full layers. - // This is where the precise weighted sum matters most. 
GgufMetadata m; m.block_count = 6; m.key_length = 256; @@ -326,14 +289,11 @@ static void test_varying_heads_swa() { // Old proportional approximation (with uniform head count = total/6 = 6): // factor = 1 - 3/6 + 3/6 * 64/256 = 1 - 0.5 + 0.125 = 0.625 // bytes = 60 * 256 * 4 * 0.625 = 38400 - // The precise value (52224) is significantly different! double old_approx = 60.0 * 256.0 * 4.0 * 0.625; check("varying-heads-swa: precise differs from proportional", !approx_eq(bytes, old_approx, 1000.0)); } -// ── Main ────────────────────────────────────────────────────────────── - int main() { test_scalar_head_count_kv(); test_array_head_count_kv();